Instructions to use roonbug/vpuqmqt0 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use roonbug/vpuqmqt0 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="roonbug/vpuqmqt0")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("roonbug/vpuqmqt0")
model = AutoModelForCausalLM.from_pretrained("roonbug/vpuqmqt0")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use roonbug/vpuqmqt0 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "roonbug/vpuqmqt0"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/vpuqmqt0",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/roonbug/vpuqmqt0
- SGLang
How to use roonbug/vpuqmqt0 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "roonbug/vpuqmqt0" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/vpuqmqt0",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "roonbug/vpuqmqt0" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/vpuqmqt0",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use roonbug/vpuqmqt0 with Docker Model Runner:
docker model run hf.co/roonbug/vpuqmqt0
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.2, | |
| "eval_steps": 100, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.1780160423368216, | |
| "epoch": 0.016, | |
| "grad_norm": 13.0, | |
| "learning_rate": 6.000000000000001e-07, | |
| "loss": 1.8406, | |
| "mean_token_accuracy": 0.6489301804453135, | |
| "num_tokens": 340696.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.1818634796887637, | |
| "epoch": 0.032, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 1.2666666666666669e-06, | |
| "loss": 1.8191, | |
| "mean_token_accuracy": 0.6528046734631061, | |
| "num_tokens": 675930.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.195955842360854, | |
| "epoch": 0.048, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 1.9333333333333336e-06, | |
| "loss": 1.7817, | |
| "mean_token_accuracy": 0.6578715395182371, | |
| "num_tokens": 1007956.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.2672166559845208, | |
| "epoch": 0.064, | |
| "grad_norm": 6.5625, | |
| "learning_rate": 2.6e-06, | |
| "loss": 1.7272, | |
| "mean_token_accuracy": 0.6625342659652234, | |
| "num_tokens": 1340560.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.2964693307876587, | |
| "epoch": 0.08, | |
| "grad_norm": 5.0, | |
| "learning_rate": 3.266666666666667e-06, | |
| "loss": 1.5867, | |
| "mean_token_accuracy": 0.6811951555311679, | |
| "num_tokens": 1679432.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.2539724007248878, | |
| "epoch": 0.096, | |
| "grad_norm": 3.375, | |
| "learning_rate": 3.9333333333333335e-06, | |
| "loss": 1.3962, | |
| "mean_token_accuracy": 0.7081016473472118, | |
| "num_tokens": 2021570.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.2302619956433773, | |
| "epoch": 0.112, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.600000000000001e-06, | |
| "loss": 1.2974, | |
| "mean_token_accuracy": 0.7239668637514114, | |
| "num_tokens": 2359297.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.169797332212329, | |
| "epoch": 0.128, | |
| "grad_norm": 1.875, | |
| "learning_rate": 5.2666666666666665e-06, | |
| "loss": 1.2306, | |
| "mean_token_accuracy": 0.7328833002597094, | |
| "num_tokens": 2683168.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.0165224198251963, | |
| "epoch": 0.144, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 5.933333333333335e-06, | |
| "loss": 1.1129, | |
| "mean_token_accuracy": 0.7561763934791088, | |
| "num_tokens": 3019426.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.9457759071141482, | |
| "epoch": 0.16, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 6.600000000000001e-06, | |
| "loss": 1.0448, | |
| "mean_token_accuracy": 0.7662712432444095, | |
| "num_tokens": 3354390.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_biology_entropy": 2.4009983863830566, | |
| "eval_biology_loss": 3.090766429901123, | |
| "eval_biology_mean_token_accuracy": 0.5075433547496796, | |
| "eval_biology_num_tokens": 3354390.0, | |
| "eval_biology_runtime": 38.801, | |
| "eval_biology_samples_per_second": 12.886, | |
| "eval_biology_steps_per_second": 3.222, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_chemistry_entropy": 1.1818295245170594, | |
| "eval_chemistry_loss": 1.4003204107284546, | |
| "eval_chemistry_mean_token_accuracy": 0.7158680257797241, | |
| "eval_chemistry_num_tokens": 3354390.0, | |
| "eval_chemistry_runtime": 48.2819, | |
| "eval_chemistry_samples_per_second": 10.356, | |
| "eval_chemistry_steps_per_second": 2.589, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_math_entropy": 0.8902422695159912, | |
| "eval_math_loss": 1.2323389053344727, | |
| "eval_math_mean_token_accuracy": 0.750218500137329, | |
| "eval_math_num_tokens": 3354390.0, | |
| "eval_math_runtime": 49.6484, | |
| "eval_math_samples_per_second": 10.071, | |
| "eval_math_steps_per_second": 2.518, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_physics_entropy": 0.9318590335845948, | |
| "eval_physics_loss": 1.0569193363189697, | |
| "eval_physics_mean_token_accuracy": 0.7675474114418029, | |
| "eval_physics_num_tokens": 3354390.0, | |
| "eval_physics_runtime": 57.1057, | |
| "eval_physics_samples_per_second": 8.756, | |
| "eval_physics_steps_per_second": 2.189, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.9224597703665495, | |
| "epoch": 0.176, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 7.266666666666668e-06, | |
| "loss": 1.0192, | |
| "mean_token_accuracy": 0.7696165222674608, | |
| "num_tokens": 3684374.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.8823430232703686, | |
| "epoch": 0.192, | |
| "grad_norm": 1.125, | |
| "learning_rate": 7.933333333333334e-06, | |
| "loss": 0.9854, | |
| "mean_token_accuracy": 0.7739298477768898, | |
| "num_tokens": 4023915.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.8393984897062182, | |
| "epoch": 0.208, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 8.6e-06, | |
| "loss": 0.9379, | |
| "mean_token_accuracy": 0.7841779347509146, | |
| "num_tokens": 4352045.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.8179828137159347, | |
| "epoch": 0.224, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 9.266666666666667e-06, | |
| "loss": 0.9191, | |
| "mean_token_accuracy": 0.7869349300861359, | |
| "num_tokens": 4681371.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.805096386373043, | |
| "epoch": 0.24, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.933333333333334e-06, | |
| "loss": 0.9012, | |
| "mean_token_accuracy": 0.7888233289122581, | |
| "num_tokens": 5021784.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.798224457167089, | |
| "epoch": 0.256, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 1.0600000000000002e-05, | |
| "loss": 0.8922, | |
| "mean_token_accuracy": 0.7903898701071739, | |
| "num_tokens": 5367308.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.805234762467444, | |
| "epoch": 0.272, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 1.1266666666666668e-05, | |
| "loss": 0.8993, | |
| "mean_token_accuracy": 0.7877147275954485, | |
| "num_tokens": 5699101.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.7946224914863705, | |
| "epoch": 0.288, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 1.1933333333333335e-05, | |
| "loss": 0.8976, | |
| "mean_token_accuracy": 0.7911442808806897, | |
| "num_tokens": 6022837.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.7837886592373252, | |
| "epoch": 0.304, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 1.2600000000000001e-05, | |
| "loss": 0.8805, | |
| "mean_token_accuracy": 0.7934082143008709, | |
| "num_tokens": 6343826.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.778077344968915, | |
| "epoch": 0.32, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.3266666666666668e-05, | |
| "loss": 0.8781, | |
| "mean_token_accuracy": 0.7940127164125442, | |
| "num_tokens": 6670164.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_biology_entropy": 2.1669651889801025, | |
| "eval_biology_loss": 2.686201810836792, | |
| "eval_biology_mean_token_accuracy": 0.5460970797538758, | |
| "eval_biology_num_tokens": 6670164.0, | |
| "eval_biology_runtime": 38.7901, | |
| "eval_biology_samples_per_second": 12.89, | |
| "eval_biology_steps_per_second": 3.222, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_chemistry_entropy": 0.9981456413269043, | |
| "eval_chemistry_loss": 1.1842883825302124, | |
| "eval_chemistry_mean_token_accuracy": 0.7467401041984558, | |
| "eval_chemistry_num_tokens": 6670164.0, | |
| "eval_chemistry_runtime": 48.3154, | |
| "eval_chemistry_samples_per_second": 10.349, | |
| "eval_chemistry_steps_per_second": 2.587, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_math_entropy": 0.7862319254875183, | |
| "eval_math_loss": 1.0977362394332886, | |
| "eval_math_mean_token_accuracy": 0.7683374147415161, | |
| "eval_math_num_tokens": 6670164.0, | |
| "eval_math_runtime": 49.687, | |
| "eval_math_samples_per_second": 10.063, | |
| "eval_math_steps_per_second": 2.516, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_physics_entropy": 0.7679825134277344, | |
| "eval_physics_loss": 0.886142373085022, | |
| "eval_physics_mean_token_accuracy": 0.7957743234634399, | |
| "eval_physics_num_tokens": 6670164.0, | |
| "eval_physics_runtime": 57.1202, | |
| "eval_physics_samples_per_second": 8.753, | |
| "eval_physics_steps_per_second": 2.188, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.7617403082549572, | |
| "epoch": 0.336, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.3933333333333334e-05, | |
| "loss": 0.8531, | |
| "mean_token_accuracy": 0.7977134332060813, | |
| "num_tokens": 7003404.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.737130863033235, | |
| "epoch": 0.352, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 1.46e-05, | |
| "loss": 0.8339, | |
| "mean_token_accuracy": 0.8024804938584567, | |
| "num_tokens": 7342593.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.7299130430445075, | |
| "epoch": 0.368, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.5266666666666667e-05, | |
| "loss": 0.8252, | |
| "mean_token_accuracy": 0.805871631577611, | |
| "num_tokens": 7675569.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.7597114410251379, | |
| "epoch": 0.384, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 1.5933333333333336e-05, | |
| "loss": 0.8558, | |
| "mean_token_accuracy": 0.7943590730428696, | |
| "num_tokens": 8012218.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.7247350050136447, | |
| "epoch": 0.4, | |
| "grad_norm": 1.125, | |
| "learning_rate": 1.66e-05, | |
| "loss": 0.8193, | |
| "mean_token_accuracy": 0.8040182612836361, | |
| "num_tokens": 8350120.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.7144488081336021, | |
| "epoch": 0.416, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.726666666666667e-05, | |
| "loss": 0.7998, | |
| "mean_token_accuracy": 0.8067968346178531, | |
| "num_tokens": 8688773.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.7226355630904436, | |
| "epoch": 0.432, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 1.7933333333333333e-05, | |
| "loss": 0.8125, | |
| "mean_token_accuracy": 0.8042349684983492, | |
| "num_tokens": 9022094.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.6943683221936225, | |
| "epoch": 0.448, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.86e-05, | |
| "loss": 0.7924, | |
| "mean_token_accuracy": 0.8109067149460316, | |
| "num_tokens": 9360608.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.6898288525640964, | |
| "epoch": 0.464, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 1.926666666666667e-05, | |
| "loss": 0.7814, | |
| "mean_token_accuracy": 0.8110810052603483, | |
| "num_tokens": 9688896.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.6964065950363875, | |
| "epoch": 0.48, | |
| "grad_norm": 1.125, | |
| "learning_rate": 1.9933333333333334e-05, | |
| "loss": 0.7896, | |
| "mean_token_accuracy": 0.8094950247555971, | |
| "num_tokens": 10025864.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_biology_entropy": 2.1021561794281007, | |
| "eval_biology_loss": 2.546415090560913, | |
| "eval_biology_mean_token_accuracy": 0.5616441056728363, | |
| "eval_biology_num_tokens": 10025864.0, | |
| "eval_biology_runtime": 38.8255, | |
| "eval_biology_samples_per_second": 12.878, | |
| "eval_biology_steps_per_second": 3.22, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_chemistry_entropy": 0.9578249802589417, | |
| "eval_chemistry_loss": 1.1104036569595337, | |
| "eval_chemistry_mean_token_accuracy": 0.7582236580848694, | |
| "eval_chemistry_num_tokens": 10025864.0, | |
| "eval_chemistry_runtime": 48.5149, | |
| "eval_chemistry_samples_per_second": 10.306, | |
| "eval_chemistry_steps_per_second": 2.577, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_math_entropy": 0.7676350421905518, | |
| "eval_math_loss": 1.0549193620681763, | |
| "eval_math_mean_token_accuracy": 0.7745008988380432, | |
| "eval_math_num_tokens": 10025864.0, | |
| "eval_math_runtime": 49.8707, | |
| "eval_math_samples_per_second": 10.026, | |
| "eval_math_steps_per_second": 2.506, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_physics_entropy": 0.7307665984630585, | |
| "eval_physics_loss": 0.8242064118385315, | |
| "eval_physics_mean_token_accuracy": 0.8064617681503295, | |
| "eval_physics_num_tokens": 10025864.0, | |
| "eval_physics_runtime": 57.1693, | |
| "eval_physics_samples_per_second": 8.746, | |
| "eval_physics_steps_per_second": 2.186, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.7168749757111073, | |
| "epoch": 0.496, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 1.9933333333333334e-05, | |
| "loss": 0.8032, | |
| "mean_token_accuracy": 0.8051175128668546, | |
| "num_tokens": 10362573.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.6993480321019888, | |
| "epoch": 0.512, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 1.985925925925926e-05, | |
| "loss": 0.7903, | |
| "mean_token_accuracy": 0.8099954195320607, | |
| "num_tokens": 10694440.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.7138712629675865, | |
| "epoch": 0.528, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.9785185185185187e-05, | |
| "loss": 0.8101, | |
| "mean_token_accuracy": 0.805506169050932, | |
| "num_tokens": 11022357.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.6928766580298543, | |
| "epoch": 0.544, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 1.971111111111111e-05, | |
| "loss": 0.7916, | |
| "mean_token_accuracy": 0.8087165944278241, | |
| "num_tokens": 11345911.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.682861409150064, | |
| "epoch": 0.56, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.963703703703704e-05, | |
| "loss": 0.7735, | |
| "mean_token_accuracy": 0.8126827124506235, | |
| "num_tokens": 11687225.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.6824749782681465, | |
| "epoch": 0.576, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.9562962962962964e-05, | |
| "loss": 0.7723, | |
| "mean_token_accuracy": 0.8146394658833742, | |
| "num_tokens": 12015720.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.6805058639496565, | |
| "epoch": 0.592, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 1.948888888888889e-05, | |
| "loss": 0.776, | |
| "mean_token_accuracy": 0.811661035567522, | |
| "num_tokens": 12343393.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.6700849516317249, | |
| "epoch": 0.608, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 1.9414814814814817e-05, | |
| "loss": 0.7658, | |
| "mean_token_accuracy": 0.8141583666205406, | |
| "num_tokens": 12671474.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.6893410481512546, | |
| "epoch": 0.624, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 1.9340740740740743e-05, | |
| "loss": 0.7782, | |
| "mean_token_accuracy": 0.8111887093633413, | |
| "num_tokens": 12996077.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.6852772971615195, | |
| "epoch": 0.64, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.926666666666667e-05, | |
| "loss": 0.7799, | |
| "mean_token_accuracy": 0.8112590182572603, | |
| "num_tokens": 13325291.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_biology_entropy": 2.0124985208511355, | |
| "eval_biology_loss": 2.4155702590942383, | |
| "eval_biology_mean_token_accuracy": 0.5735694291591644, | |
| "eval_biology_num_tokens": 13325291.0, | |
| "eval_biology_runtime": 38.6667, | |
| "eval_biology_samples_per_second": 12.931, | |
| "eval_biology_steps_per_second": 3.233, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_chemistry_entropy": 0.8911373369693756, | |
| "eval_chemistry_loss": 1.0692609548568726, | |
| "eval_chemistry_mean_token_accuracy": 0.7647108516693115, | |
| "eval_chemistry_num_tokens": 13325291.0, | |
| "eval_chemistry_runtime": 48.2089, | |
| "eval_chemistry_samples_per_second": 10.372, | |
| "eval_chemistry_steps_per_second": 2.593, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_math_entropy": 0.7330298454761505, | |
| "eval_math_loss": 1.0336663722991943, | |
| "eval_math_mean_token_accuracy": 0.7786638278961182, | |
| "eval_math_num_tokens": 13325291.0, | |
| "eval_math_runtime": 49.7905, | |
| "eval_math_samples_per_second": 10.042, | |
| "eval_math_steps_per_second": 2.511, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_physics_entropy": 0.681670261144638, | |
| "eval_physics_loss": 0.7883204817771912, | |
| "eval_physics_mean_token_accuracy": 0.8129381031990052, | |
| "eval_physics_num_tokens": 13325291.0, | |
| "eval_physics_runtime": 57.2563, | |
| "eval_physics_samples_per_second": 8.733, | |
| "eval_physics_steps_per_second": 2.183, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.6723099524155259, | |
| "epoch": 0.656, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 1.9192592592592593e-05, | |
| "loss": 0.757, | |
| "mean_token_accuracy": 0.8151150114834309, | |
| "num_tokens": 13671434.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.6621538577601314, | |
| "epoch": 0.672, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.911851851851852e-05, | |
| "loss": 0.7563, | |
| "mean_token_accuracy": 0.8172798678278923, | |
| "num_tokens": 13993668.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.6673029117286206, | |
| "epoch": 0.688, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 1.9044444444444446e-05, | |
| "loss": 0.7529, | |
| "mean_token_accuracy": 0.8164214458316564, | |
| "num_tokens": 14334907.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.6810694945976138, | |
| "epoch": 0.704, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.8970370370370372e-05, | |
| "loss": 0.7695, | |
| "mean_token_accuracy": 0.8127802673727273, | |
| "num_tokens": 14670639.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.6582699194550514, | |
| "epoch": 0.72, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.8896296296296295e-05, | |
| "loss": 0.7544, | |
| "mean_token_accuracy": 0.8171826928853989, | |
| "num_tokens": 14987995.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.6564427128061652, | |
| "epoch": 0.736, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 1.8822222222222225e-05, | |
| "loss": 0.7446, | |
| "mean_token_accuracy": 0.8173684533685446, | |
| "num_tokens": 15324659.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.6495672106742859, | |
| "epoch": 0.752, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 1.874814814814815e-05, | |
| "loss": 0.7416, | |
| "mean_token_accuracy": 0.8195757914334536, | |
| "num_tokens": 15657342.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.6738376861438156, | |
| "epoch": 0.768, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.8674074074074075e-05, | |
| "loss": 0.7642, | |
| "mean_token_accuracy": 0.8124835971742869, | |
| "num_tokens": 15988993.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.6495399951934815, | |
| "epoch": 0.784, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.86e-05, | |
| "loss": 0.7324, | |
| "mean_token_accuracy": 0.8208612345159054, | |
| "num_tokens": 16333857.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.6567428983747959, | |
| "epoch": 0.8, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.8525925925925928e-05, | |
| "loss": 0.7488, | |
| "mean_token_accuracy": 0.817277068644762, | |
| "num_tokens": 16664690.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_biology_entropy": 2.039887357711792, | |
| "eval_biology_loss": 2.4157471656799316, | |
| "eval_biology_mean_token_accuracy": 0.5770127189159393, | |
| "eval_biology_num_tokens": 16664690.0, | |
| "eval_biology_runtime": 38.6815, | |
| "eval_biology_samples_per_second": 12.926, | |
| "eval_biology_steps_per_second": 3.232, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_chemistry_entropy": 0.8955893061161041, | |
| "eval_chemistry_loss": 1.048737645149231, | |
| "eval_chemistry_mean_token_accuracy": 0.768093888759613, | |
| "eval_chemistry_num_tokens": 16664690.0, | |
| "eval_chemistry_runtime": 48.42, | |
| "eval_chemistry_samples_per_second": 10.326, | |
| "eval_chemistry_steps_per_second": 2.582, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_math_entropy": 0.7318526220321655, | |
| "eval_math_loss": 1.0199034214019775, | |
| "eval_math_mean_token_accuracy": 0.7799897599220276, | |
| "eval_math_num_tokens": 16664690.0, | |
| "eval_math_runtime": 49.8882, | |
| "eval_math_samples_per_second": 10.022, | |
| "eval_math_steps_per_second": 2.506, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_physics_entropy": 0.6712099099159241, | |
| "eval_physics_loss": 0.7689476609230042, | |
| "eval_physics_mean_token_accuracy": 0.8163677668571472, | |
| "eval_physics_num_tokens": 16664690.0, | |
| "eval_physics_runtime": 57.4517, | |
| "eval_physics_samples_per_second": 8.703, | |
| "eval_physics_steps_per_second": 2.176, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.6573688389733434, | |
| "epoch": 0.816, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 1.8451851851851855e-05, | |
| "loss": 0.7477, | |
| "mean_token_accuracy": 0.8164514761418105, | |
| "num_tokens": 16989307.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.6648645078763366, | |
| "epoch": 0.832, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.8377777777777778e-05, | |
| "loss": 0.749, | |
| "mean_token_accuracy": 0.8170726090669632, | |
| "num_tokens": 17317524.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.6526606786996126, | |
| "epoch": 0.848, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 1.8303703703703704e-05, | |
| "loss": 0.7388, | |
| "mean_token_accuracy": 0.8195879191160202, | |
| "num_tokens": 17650424.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.6565553491935134, | |
| "epoch": 0.864, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.822962962962963e-05, | |
| "loss": 0.7464, | |
| "mean_token_accuracy": 0.8177060768008232, | |
| "num_tokens": 17985547.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.6623865978792309, | |
| "epoch": 0.88, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 1.8155555555555557e-05, | |
| "loss": 0.7495, | |
| "mean_token_accuracy": 0.8162953305989504, | |
| "num_tokens": 18315392.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.6179373754188419, | |
| "epoch": 0.896, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 1.8081481481481484e-05, | |
| "loss": 0.7023, | |
| "mean_token_accuracy": 0.8264330130070447, | |
| "num_tokens": 18667124.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.6555242039263248, | |
| "epoch": 0.912, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.800740740740741e-05, | |
| "loss": 0.7393, | |
| "mean_token_accuracy": 0.8174499638378621, | |
| "num_tokens": 19002186.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.6485247412696481, | |
| "epoch": 0.928, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 1.7933333333333333e-05, | |
| "loss": 0.7363, | |
| "mean_token_accuracy": 0.8195535041391849, | |
| "num_tokens": 19335013.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.63772834520787, | |
| "epoch": 0.944, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 1.785925925925926e-05, | |
| "loss": 0.7269, | |
| "mean_token_accuracy": 0.821834321692586, | |
| "num_tokens": 19662733.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.6372630735859275, | |
| "epoch": 0.96, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.7785185185185186e-05, | |
| "loss": 0.7318, | |
| "mean_token_accuracy": 0.8204116970300674, | |
| "num_tokens": 19996789.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_biology_entropy": 2.0030080833435058, | |
| "eval_biology_loss": 2.361283302307129, | |
| "eval_biology_mean_token_accuracy": 0.5793048655986786, | |
| "eval_biology_num_tokens": 19996789.0, | |
| "eval_biology_runtime": 38.6981, | |
| "eval_biology_samples_per_second": 12.921, | |
| "eval_biology_steps_per_second": 3.23, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_chemistry_entropy": 0.8840513801574708, | |
| "eval_chemistry_loss": 1.0323643684387207, | |
| "eval_chemistry_mean_token_accuracy": 0.7703397722244263, | |
| "eval_chemistry_num_tokens": 19996789.0, | |
| "eval_chemistry_runtime": 48.2039, | |
| "eval_chemistry_samples_per_second": 10.373, | |
| "eval_chemistry_steps_per_second": 2.593, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_math_entropy": 0.7270034260749817, | |
| "eval_math_loss": 1.009481430053711, | |
| "eval_math_mean_token_accuracy": 0.7820751585960388, | |
| "eval_math_num_tokens": 19996789.0, | |
| "eval_math_runtime": 49.571, | |
| "eval_math_samples_per_second": 10.087, | |
| "eval_math_steps_per_second": 2.522, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_physics_entropy": 0.6698693735599518, | |
| "eval_physics_loss": 0.7564050555229187, | |
| "eval_physics_mean_token_accuracy": 0.8185040464401245, | |
| "eval_physics_num_tokens": 19996789.0, | |
| "eval_physics_runtime": 56.9517, | |
| "eval_physics_samples_per_second": 8.779, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.6384279150515795, | |
| "epoch": 0.976, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.7711111111111113e-05, | |
| "loss": 0.7283, | |
| "mean_token_accuracy": 0.82187210470438, | |
| "num_tokens": 20332636.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.6450063675642014, | |
| "epoch": 0.992, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 1.763703703703704e-05, | |
| "loss": 0.7272, | |
| "mean_token_accuracy": 0.8209770727902651, | |
| "num_tokens": 20662209.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.6167608626186848, | |
| "epoch": 1.008, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 1.7562962962962962e-05, | |
| "loss": 0.7008, | |
| "mean_token_accuracy": 0.825862829759717, | |
| "num_tokens": 21005731.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.6117491278797388, | |
| "epoch": 1.024, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1.7488888888888892e-05, | |
| "loss": 0.7058, | |
| "mean_token_accuracy": 0.8266724064946175, | |
| "num_tokens": 21335407.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.6068670526146889, | |
| "epoch": 1.04, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 1.7414814814814815e-05, | |
| "loss": 0.6989, | |
| "mean_token_accuracy": 0.8283324401825667, | |
| "num_tokens": 21668490.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.601451874896884, | |
| "epoch": 1.056, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.7340740740740742e-05, | |
| "loss": 0.6864, | |
| "mean_token_accuracy": 0.829675118252635, | |
| "num_tokens": 22006039.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.6137524953112006, | |
| "epoch": 1.072, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 1.726666666666667e-05, | |
| "loss": 0.7008, | |
| "mean_token_accuracy": 0.8262542523443699, | |
| "num_tokens": 22348206.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.6093455260619521, | |
| "epoch": 1.088, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 1.7192592592592595e-05, | |
| "loss": 0.6956, | |
| "mean_token_accuracy": 0.8260304640978575, | |
| "num_tokens": 22676163.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.6026462253183127, | |
| "epoch": 1.104, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 1.711851851851852e-05, | |
| "loss": 0.6967, | |
| "mean_token_accuracy": 0.8273924000561237, | |
| "num_tokens": 23009279.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.6122452523559332, | |
| "epoch": 1.12, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 1.7044444444444445e-05, | |
| "loss": 0.7, | |
| "mean_token_accuracy": 0.8257350366562605, | |
| "num_tokens": 23346828.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_biology_entropy": 1.8616097135543823, | |
| "eval_biology_loss": 2.4014246463775635, | |
| "eval_biology_mean_token_accuracy": 0.578062358379364, | |
| "eval_biology_num_tokens": 23346828.0, | |
| "eval_biology_runtime": 38.9662, | |
| "eval_biology_samples_per_second": 12.832, | |
| "eval_biology_steps_per_second": 3.208, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_chemistry_entropy": 0.8116880433559418, | |
| "eval_chemistry_loss": 1.0317949056625366, | |
| "eval_chemistry_mean_token_accuracy": 0.7715164208412171, | |
| "eval_chemistry_num_tokens": 23346828.0, | |
| "eval_chemistry_runtime": 48.496, | |
| "eval_chemistry_samples_per_second": 10.31, | |
| "eval_chemistry_steps_per_second": 2.578, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_math_entropy": 0.6895896532535553, | |
| "eval_math_loss": 1.0153776407241821, | |
| "eval_math_mean_token_accuracy": 0.7820956745147705, | |
| "eval_math_num_tokens": 23346828.0, | |
| "eval_math_runtime": 49.7815, | |
| "eval_math_samples_per_second": 10.044, | |
| "eval_math_steps_per_second": 2.511, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_physics_entropy": 0.6192426791191101, | |
| "eval_physics_loss": 0.7497905492782593, | |
| "eval_physics_mean_token_accuracy": 0.8200045080184937, | |
| "eval_physics_num_tokens": 23346828.0, | |
| "eval_physics_runtime": 57.1287, | |
| "eval_physics_samples_per_second": 8.752, | |
| "eval_physics_steps_per_second": 2.188, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.588855667039752, | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 1.697037037037037e-05, | |
| "loss": 0.6772, | |
| "mean_token_accuracy": 0.8326364874839782, | |
| "num_tokens": 23689161.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.6099351227283478, | |
| "epoch": 1.152, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.6896296296296298e-05, | |
| "loss": 0.7039, | |
| "mean_token_accuracy": 0.8282962709665298, | |
| "num_tokens": 24016439.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.6140455640852451, | |
| "epoch": 1.168, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 1.6822222222222224e-05, | |
| "loss": 0.7027, | |
| "mean_token_accuracy": 0.8253309100866317, | |
| "num_tokens": 24344431.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.6041248327121138, | |
| "epoch": 1.184, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 1.6748148148148147e-05, | |
| "loss": 0.6955, | |
| "mean_token_accuracy": 0.8286070462316275, | |
| "num_tokens": 24668087.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5895567566156388, | |
| "epoch": 1.2, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 1.6674074074074077e-05, | |
| "loss": 0.6822, | |
| "mean_token_accuracy": 0.8305550657212735, | |
| "num_tokens": 25000629.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.6073450578376651, | |
| "epoch": 1.216, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 1.66e-05, | |
| "loss": 0.7023, | |
| "mean_token_accuracy": 0.8261869914829731, | |
| "num_tokens": 25324324.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.6059388216584921, | |
| "epoch": 1.232, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 1.6525925925925927e-05, | |
| "loss": 0.6863, | |
| "mean_token_accuracy": 0.8298216536641121, | |
| "num_tokens": 25658795.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.5957553267478943, | |
| "epoch": 1.248, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 1.6451851851851853e-05, | |
| "loss": 0.6864, | |
| "mean_token_accuracy": 0.830686765909195, | |
| "num_tokens": 25993177.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.5959657493978738, | |
| "epoch": 1.264, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.637777777777778e-05, | |
| "loss": 0.6826, | |
| "mean_token_accuracy": 0.8305899318307638, | |
| "num_tokens": 26329726.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.6204709148034453, | |
| "epoch": 1.28, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.6303703703703706e-05, | |
| "loss": 0.7071, | |
| "mean_token_accuracy": 0.8238335218280554, | |
| "num_tokens": 26664093.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_biology_entropy": 1.7922346639633178, | |
| "eval_biology_loss": 2.4233291149139404, | |
| "eval_biology_mean_token_accuracy": 0.5778127768039704, | |
| "eval_biology_num_tokens": 26664093.0, | |
| "eval_biology_runtime": 39.1057, | |
| "eval_biology_samples_per_second": 12.786, | |
| "eval_biology_steps_per_second": 3.196, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_chemistry_entropy": 0.7962674243450165, | |
| "eval_chemistry_loss": 1.0286256074905396, | |
| "eval_chemistry_mean_token_accuracy": 0.7727626013755798, | |
| "eval_chemistry_num_tokens": 26664093.0, | |
| "eval_chemistry_runtime": 48.3131, | |
| "eval_chemistry_samples_per_second": 10.349, | |
| "eval_chemistry_steps_per_second": 2.587, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_math_entropy": 0.6702238636016846, | |
| "eval_math_loss": 1.0157709121704102, | |
| "eval_math_mean_token_accuracy": 0.7829344019889831, | |
| "eval_math_num_tokens": 26664093.0, | |
| "eval_math_runtime": 49.7647, | |
| "eval_math_samples_per_second": 10.047, | |
| "eval_math_steps_per_second": 2.512, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_physics_entropy": 0.6072953283786774, | |
| "eval_physics_loss": 0.7434503436088562, | |
| "eval_physics_mean_token_accuracy": 0.8212288799285888, | |
| "eval_physics_num_tokens": 26664093.0, | |
| "eval_physics_runtime": 57.1238, | |
| "eval_physics_samples_per_second": 8.753, | |
| "eval_physics_steps_per_second": 2.188, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.593550406768918, | |
| "epoch": 1.296, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 1.622962962962963e-05, | |
| "loss": 0.6825, | |
| "mean_token_accuracy": 0.8320997886359691, | |
| "num_tokens": 26994877.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 0.6156477816402912, | |
| "epoch": 1.312, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.6155555555555556e-05, | |
| "loss": 0.703, | |
| "mean_token_accuracy": 0.8259445391595364, | |
| "num_tokens": 27328198.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 0.5969830378890038, | |
| "epoch": 1.328, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.6081481481481482e-05, | |
| "loss": 0.6901, | |
| "mean_token_accuracy": 0.8290002550929785, | |
| "num_tokens": 27658818.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 0.5997996777296066, | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 1.600740740740741e-05, | |
| "loss": 0.6904, | |
| "mean_token_accuracy": 0.8284455709159374, | |
| "num_tokens": 27991108.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 0.6052390130236744, | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 1.5933333333333336e-05, | |
| "loss": 0.6852, | |
| "mean_token_accuracy": 0.8274706263095141, | |
| "num_tokens": 28327775.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.6018602728843689, | |
| "epoch": 1.376, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.5859259259259262e-05, | |
| "loss": 0.6827, | |
| "mean_token_accuracy": 0.8304568257182836, | |
| "num_tokens": 28666116.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 0.5851238770410419, | |
| "epoch": 1.392, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 1.5785185185185185e-05, | |
| "loss": 0.6746, | |
| "mean_token_accuracy": 0.8333170894533396, | |
| "num_tokens": 29004293.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 0.58795285820961, | |
| "epoch": 1.408, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 1.571111111111111e-05, | |
| "loss": 0.6857, | |
| "mean_token_accuracy": 0.8305332105606794, | |
| "num_tokens": 29330131.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 0.6189510561525822, | |
| "epoch": 1.424, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 1.5637037037037038e-05, | |
| "loss": 0.7072, | |
| "mean_token_accuracy": 0.8251793116331101, | |
| "num_tokens": 29661850.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 0.5798567572608591, | |
| "epoch": 1.44, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 1.5562962962962965e-05, | |
| "loss": 0.6695, | |
| "mean_token_accuracy": 0.8342564977705479, | |
| "num_tokens": 29999392.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_biology_entropy": 1.8147089042663573, | |
| "eval_biology_loss": 2.417410135269165, | |
| "eval_biology_mean_token_accuracy": 0.5787473826408386, | |
| "eval_biology_num_tokens": 29999392.0, | |
| "eval_biology_runtime": 38.9072, | |
| "eval_biology_samples_per_second": 12.851, | |
| "eval_biology_steps_per_second": 3.213, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_chemistry_entropy": 0.7944381227493286, | |
| "eval_chemistry_loss": 1.0228049755096436, | |
| "eval_chemistry_mean_token_accuracy": 0.7732948322296143, | |
| "eval_chemistry_num_tokens": 29999392.0, | |
| "eval_chemistry_runtime": 48.4429, | |
| "eval_chemistry_samples_per_second": 10.321, | |
| "eval_chemistry_steps_per_second": 2.58, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_math_entropy": 0.6757729785442352, | |
| "eval_math_loss": 1.0138108730316162, | |
| "eval_math_mean_token_accuracy": 0.7829539861679077, | |
| "eval_math_num_tokens": 29999392.0, | |
| "eval_math_runtime": 49.813, | |
| "eval_math_samples_per_second": 10.038, | |
| "eval_math_steps_per_second": 2.509, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_physics_entropy": 0.607404308795929, | |
| "eval_physics_loss": 0.737089991569519, | |
| "eval_physics_mean_token_accuracy": 0.8226114134788514, | |
| "eval_physics_num_tokens": 29999392.0, | |
| "eval_physics_runtime": 57.3115, | |
| "eval_physics_samples_per_second": 8.724, | |
| "eval_physics_steps_per_second": 2.181, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.5827234297990799, | |
| "epoch": 1.456, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.548888888888889e-05, | |
| "loss": 0.6728, | |
| "mean_token_accuracy": 0.8342432040721178, | |
| "num_tokens": 30341004.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 0.613773494027555, | |
| "epoch": 1.472, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.5414814814814814e-05, | |
| "loss": 0.7035, | |
| "mean_token_accuracy": 0.8268327709287405, | |
| "num_tokens": 30667689.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 0.5988023646175862, | |
| "epoch": 1.488, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 1.5340740740740744e-05, | |
| "loss": 0.6897, | |
| "mean_token_accuracy": 0.8293631616979837, | |
| "num_tokens": 30999697.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 0.5896453000605106, | |
| "epoch": 1.504, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 1.5266666666666667e-05, | |
| "loss": 0.6728, | |
| "mean_token_accuracy": 0.8324251122772693, | |
| "num_tokens": 31332775.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 0.605005569756031, | |
| "epoch": 1.52, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.5192592592592594e-05, | |
| "loss": 0.6942, | |
| "mean_token_accuracy": 0.8275358382612467, | |
| "num_tokens": 31666521.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 0.5999037871137262, | |
| "epoch": 1.536, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.5118518518518519e-05, | |
| "loss": 0.6904, | |
| "mean_token_accuracy": 0.8276842717081309, | |
| "num_tokens": 31998232.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 0.5867987772449851, | |
| "epoch": 1.552, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 1.5044444444444445e-05, | |
| "loss": 0.6754, | |
| "mean_token_accuracy": 0.8337812848389149, | |
| "num_tokens": 32328085.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 0.5942975046113134, | |
| "epoch": 1.568, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.497037037037037e-05, | |
| "loss": 0.6807, | |
| "mean_token_accuracy": 0.831771444156766, | |
| "num_tokens": 32656603.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 0.6016290852800011, | |
| "epoch": 1.584, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 1.4896296296296298e-05, | |
| "loss": 0.6947, | |
| "mean_token_accuracy": 0.8264373868703843, | |
| "num_tokens": 32983769.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 0.5999371835961937, | |
| "epoch": 1.6, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 1.4822222222222225e-05, | |
| "loss": 0.6882, | |
| "mean_token_accuracy": 0.8293469067662954, | |
| "num_tokens": 33309282.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_biology_entropy": 1.788435049057007, | |
| "eval_biology_loss": 2.369239330291748, | |
| "eval_biology_mean_token_accuracy": 0.582524644613266, | |
| "eval_biology_num_tokens": 33309282.0, | |
| "eval_biology_runtime": 38.8968, | |
| "eval_biology_samples_per_second": 12.855, | |
| "eval_biology_steps_per_second": 3.214, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_chemistry_entropy": 0.7873289968967437, | |
| "eval_chemistry_loss": 1.0177081823349, | |
| "eval_chemistry_mean_token_accuracy": 0.774118812084198, | |
| "eval_chemistry_num_tokens": 33309282.0, | |
| "eval_chemistry_runtime": 48.3649, | |
| "eval_chemistry_samples_per_second": 10.338, | |
| "eval_chemistry_steps_per_second": 2.585, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_math_entropy": 0.6668027784824372, | |
| "eval_math_loss": 1.0126802921295166, | |
| "eval_math_mean_token_accuracy": 0.7838437123298645, | |
| "eval_math_num_tokens": 33309282.0, | |
| "eval_math_runtime": 49.8108, | |
| "eval_math_samples_per_second": 10.038, | |
| "eval_math_steps_per_second": 2.509, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_physics_entropy": 0.5990195829868317, | |
| "eval_physics_loss": 0.7323749661445618, | |
| "eval_physics_mean_token_accuracy": 0.8238483490943909, | |
| "eval_physics_num_tokens": 33309282.0, | |
| "eval_physics_runtime": 57.3705, | |
| "eval_physics_samples_per_second": 8.715, | |
| "eval_physics_steps_per_second": 2.179, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 0.5766303434967994, | |
| "epoch": 1.616, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.474814814814815e-05, | |
| "loss": 0.6687, | |
| "mean_token_accuracy": 0.8350894570350647, | |
| "num_tokens": 33647616.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 0.5906545480713248, | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.4674074074074076e-05, | |
| "loss": 0.6798, | |
| "mean_token_accuracy": 0.8322980519384146, | |
| "num_tokens": 33977586.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 0.5677616313099861, | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 1.46e-05, | |
| "loss": 0.658, | |
| "mean_token_accuracy": 0.8373314294964075, | |
| "num_tokens": 34312436.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 0.5948906594887375, | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.4525925925925927e-05, | |
| "loss": 0.682, | |
| "mean_token_accuracy": 0.8317620534449816, | |
| "num_tokens": 34641549.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 0.5764713797718286, | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 1.4451851851851852e-05, | |
| "loss": 0.6715, | |
| "mean_token_accuracy": 0.8332184217870235, | |
| "num_tokens": 34977031.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 0.5952808676287532, | |
| "epoch": 1.696, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.4377777777777779e-05, | |
| "loss": 0.6834, | |
| "mean_token_accuracy": 0.8309958126395941, | |
| "num_tokens": 35300962.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 0.5965396504849195, | |
| "epoch": 1.712, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.4303703703703703e-05, | |
| "loss": 0.6786, | |
| "mean_token_accuracy": 0.829968997463584, | |
| "num_tokens": 35642662.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 0.5757137715816498, | |
| "epoch": 1.728, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 1.4229629629629632e-05, | |
| "loss": 0.6695, | |
| "mean_token_accuracy": 0.833732133358717, | |
| "num_tokens": 35980993.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 0.5878799825906753, | |
| "epoch": 1.744, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.4155555555555556e-05, | |
| "loss": 0.6717, | |
| "mean_token_accuracy": 0.8321238547563553, | |
| "num_tokens": 36326797.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 0.577763288281858, | |
| "epoch": 1.76, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.4081481481481483e-05, | |
| "loss": 0.6698, | |
| "mean_token_accuracy": 0.8338598430156707, | |
| "num_tokens": 36654797.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_biology_entropy": 1.76575110912323, | |
| "eval_biology_loss": 2.379521369934082, | |
| "eval_biology_mean_token_accuracy": 0.5822040309906006, | |
| "eval_biology_num_tokens": 36654797.0, | |
| "eval_biology_runtime": 38.7201, | |
| "eval_biology_samples_per_second": 12.913, | |
| "eval_biology_steps_per_second": 3.228, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_chemistry_entropy": 0.7854319989681244, | |
| "eval_chemistry_loss": 1.01557457447052, | |
| "eval_chemistry_mean_token_accuracy": 0.7744541010856628, | |
| "eval_chemistry_num_tokens": 36654797.0, | |
| "eval_chemistry_runtime": 48.2464, | |
| "eval_chemistry_samples_per_second": 10.363, | |
| "eval_chemistry_steps_per_second": 2.591, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_math_entropy": 0.6641156001091003, | |
| "eval_math_loss": 1.0110862255096436, | |
| "eval_math_mean_token_accuracy": 0.783713164806366, | |
| "eval_math_num_tokens": 36654797.0, | |
| "eval_math_runtime": 49.5907, | |
| "eval_math_samples_per_second": 10.083, | |
| "eval_math_steps_per_second": 2.521, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_physics_entropy": 0.5953411047458649, | |
| "eval_physics_loss": 0.7290456295013428, | |
| "eval_physics_mean_token_accuracy": 0.8248090887069702, | |
| "eval_physics_num_tokens": 36654797.0, | |
| "eval_physics_runtime": 57.3959, | |
| "eval_physics_samples_per_second": 8.711, | |
| "eval_physics_steps_per_second": 2.178, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 0.5980557221919298, | |
| "epoch": 1.776, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.400740740740741e-05, | |
| "loss": 0.6911, | |
| "mean_token_accuracy": 0.8282759781926871, | |
| "num_tokens": 36989310.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 0.5839062621816993, | |
| "epoch": 1.792, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.3933333333333334e-05, | |
| "loss": 0.6708, | |
| "mean_token_accuracy": 0.8331681247800589, | |
| "num_tokens": 37318535.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 0.5731059337034822, | |
| "epoch": 1.808, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.385925925925926e-05, | |
| "loss": 0.6585, | |
| "mean_token_accuracy": 0.8363862674683332, | |
| "num_tokens": 37658889.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 0.5865196855738759, | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 1.3785185185185186e-05, | |
| "loss": 0.6698, | |
| "mean_token_accuracy": 0.8333104524761439, | |
| "num_tokens": 37995198.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 0.5773211907595396, | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.3711111111111112e-05, | |
| "loss": 0.6726, | |
| "mean_token_accuracy": 0.8339518435299397, | |
| "num_tokens": 38325542.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 0.5834932073950767, | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 1.3637037037037037e-05, | |
| "loss": 0.6632, | |
| "mean_token_accuracy": 0.8345976937562227, | |
| "num_tokens": 38668148.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 0.5739704865962267, | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 1.3562962962962965e-05, | |
| "loss": 0.6687, | |
| "mean_token_accuracy": 0.8333972290158271, | |
| "num_tokens": 39001652.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 0.5951925914734602, | |
| "epoch": 1.888, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 1.3488888888888888e-05, | |
| "loss": 0.682, | |
| "mean_token_accuracy": 0.8303560864180326, | |
| "num_tokens": 39333474.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 0.6042962603271007, | |
| "epoch": 1.904, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.3414814814814817e-05, | |
| "loss": 0.6915, | |
| "mean_token_accuracy": 0.8288168527185917, | |
| "num_tokens": 39663055.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 0.601089458540082, | |
| "epoch": 1.92, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.3340740740740741e-05, | |
| "loss": 0.6888, | |
| "mean_token_accuracy": 0.8287999380379916, | |
| "num_tokens": 39989824.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_biology_entropy": 1.759890214920044, | |
| "eval_biology_loss": 2.368649959564209, | |
| "eval_biology_mean_token_accuracy": 0.5835101284980774, | |
| "eval_biology_num_tokens": 39989824.0, | |
| "eval_biology_runtime": 38.6774, | |
| "eval_biology_samples_per_second": 12.927, | |
| "eval_biology_steps_per_second": 3.232, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_chemistry_entropy": 0.776658641576767, | |
| "eval_chemistry_loss": 1.0122178792953491, | |
| "eval_chemistry_mean_token_accuracy": 0.7752061448097229, | |
| "eval_chemistry_num_tokens": 39989824.0, | |
| "eval_chemistry_runtime": 48.1689, | |
| "eval_chemistry_samples_per_second": 10.38, | |
| "eval_chemistry_steps_per_second": 2.595, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_math_entropy": 0.6634243364334107, | |
| "eval_math_loss": 1.0101300477981567, | |
| "eval_math_mean_token_accuracy": 0.7840519022941589, | |
| "eval_math_num_tokens": 39989824.0, | |
| "eval_math_runtime": 49.5286, | |
| "eval_math_samples_per_second": 10.095, | |
| "eval_math_steps_per_second": 2.524, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_physics_entropy": 0.5912484722137451, | |
| "eval_physics_loss": 0.7268282175064087, | |
| "eval_physics_mean_token_accuracy": 0.825360511302948, | |
| "eval_physics_num_tokens": 39989824.0, | |
| "eval_physics_runtime": 56.9412, | |
| "eval_physics_samples_per_second": 8.781, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 0.5859691947698593, | |
| "epoch": 1.936, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 1.3266666666666668e-05, | |
| "loss": 0.676, | |
| "mean_token_accuracy": 0.8319578696042299, | |
| "num_tokens": 40325070.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 0.5632861316204071, | |
| "epoch": 1.952, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.3192592592592594e-05, | |
| "loss": 0.6508, | |
| "mean_token_accuracy": 0.83807716332376, | |
| "num_tokens": 40659614.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 0.5868391951546073, | |
| "epoch": 1.968, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 1.311851851851852e-05, | |
| "loss": 0.6756, | |
| "mean_token_accuracy": 0.8323294088244438, | |
| "num_tokens": 40991508.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 0.5867437845095992, | |
| "epoch": 1.984, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 1.3044444444444446e-05, | |
| "loss": 0.676, | |
| "mean_token_accuracy": 0.8312037277966737, | |
| "num_tokens": 41328779.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 0.593030778504908, | |
| "epoch": 2.0, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 1.297037037037037e-05, | |
| "loss": 0.6752, | |
| "mean_token_accuracy": 0.8317596733570098, | |
| "num_tokens": 41664296.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 0.5428176861256361, | |
| "epoch": 2.016, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 1.2896296296296299e-05, | |
| "loss": 0.6304, | |
| "mean_token_accuracy": 0.8421510916203261, | |
| "num_tokens": 42001193.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 0.5524626910686493, | |
| "epoch": 2.032, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.2822222222222222e-05, | |
| "loss": 0.633, | |
| "mean_token_accuracy": 0.8409431543201208, | |
| "num_tokens": 42340757.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 0.5630420710891485, | |
| "epoch": 2.048, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.274814814814815e-05, | |
| "loss": 0.6561, | |
| "mean_token_accuracy": 0.8348792966455221, | |
| "num_tokens": 42670324.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 0.5547426689416171, | |
| "epoch": 2.064, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.2674074074074075e-05, | |
| "loss": 0.6368, | |
| "mean_token_accuracy": 0.8413027279078961, | |
| "num_tokens": 43010231.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 0.5610322959721088, | |
| "epoch": 2.08, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.2600000000000001e-05, | |
| "loss": 0.6516, | |
| "mean_token_accuracy": 0.8378075629472732, | |
| "num_tokens": 43340227.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_biology_entropy": 1.6757129163742066, | |
| "eval_biology_loss": 2.434731960296631, | |
| "eval_biology_mean_token_accuracy": 0.5802675273418426, | |
| "eval_biology_num_tokens": 43340227.0, | |
| "eval_biology_runtime": 38.7759, | |
| "eval_biology_samples_per_second": 12.895, | |
| "eval_biology_steps_per_second": 3.224, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_chemistry_entropy": 0.7416200067996979, | |
| "eval_chemistry_loss": 1.027290940284729, | |
| "eval_chemistry_mean_token_accuracy": 0.774878448009491, | |
| "eval_chemistry_num_tokens": 43340227.0, | |
| "eval_chemistry_runtime": 48.262, | |
| "eval_chemistry_samples_per_second": 10.36, | |
| "eval_chemistry_steps_per_second": 2.59, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_math_entropy": 0.6465173182487488, | |
| "eval_math_loss": 1.023693561553955, | |
| "eval_math_mean_token_accuracy": 0.7827271037101745, | |
| "eval_math_num_tokens": 43340227.0, | |
| "eval_math_runtime": 49.631, | |
| "eval_math_samples_per_second": 10.074, | |
| "eval_math_steps_per_second": 2.519, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_physics_entropy": 0.5699445073604583, | |
| "eval_physics_loss": 0.7285439372062683, | |
| "eval_physics_mean_token_accuracy": 0.8254043416976928, | |
| "eval_physics_num_tokens": 43340227.0, | |
| "eval_physics_runtime": 57.0739, | |
| "eval_physics_samples_per_second": 8.761, | |
| "eval_physics_steps_per_second": 2.19, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 0.5561112010851502, | |
| "epoch": 2.096, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.2525925925925928e-05, | |
| "loss": 0.6449, | |
| "mean_token_accuracy": 0.838719493150711, | |
| "num_tokens": 43673099.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 0.5452175224199891, | |
| "epoch": 2.112, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.2451851851851853e-05, | |
| "loss": 0.6321, | |
| "mean_token_accuracy": 0.8417959384620189, | |
| "num_tokens": 44005986.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 0.5562603289261461, | |
| "epoch": 2.128, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 1.237777777777778e-05, | |
| "loss": 0.653, | |
| "mean_token_accuracy": 0.8379485800862312, | |
| "num_tokens": 44321465.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 0.5499181509017944, | |
| "epoch": 2.144, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 1.2303703703703704e-05, | |
| "loss": 0.638, | |
| "mean_token_accuracy": 0.8397632710635662, | |
| "num_tokens": 44661027.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 0.5740855507552624, | |
| "epoch": 2.16, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 1.222962962962963e-05, | |
| "loss": 0.6641, | |
| "mean_token_accuracy": 0.8336062435060739, | |
| "num_tokens": 44994236.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 0.5468110611662269, | |
| "epoch": 2.176, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.2155555555555555e-05, | |
| "loss": 0.6341, | |
| "mean_token_accuracy": 0.8411031287163496, | |
| "num_tokens": 45334280.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 0.5445626365020871, | |
| "epoch": 2.192, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.2081481481481484e-05, | |
| "loss": 0.6379, | |
| "mean_token_accuracy": 0.8422058593481779, | |
| "num_tokens": 45666954.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 0.5418444711714983, | |
| "epoch": 2.208, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.2007407407407408e-05, | |
| "loss": 0.6314, | |
| "mean_token_accuracy": 0.8422281835228205, | |
| "num_tokens": 45999546.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 0.54025251083076, | |
| "epoch": 2.224, | |
| "grad_norm": 1.0, | |
| "learning_rate": 1.1933333333333335e-05, | |
| "loss": 0.6343, | |
| "mean_token_accuracy": 0.8418932400643826, | |
| "num_tokens": 46333809.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 0.5499276254326105, | |
| "epoch": 2.24, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.185925925925926e-05, | |
| "loss": 0.6369, | |
| "mean_token_accuracy": 0.8407698534429073, | |
| "num_tokens": 46673328.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_biology_entropy": 1.6533626160621644, | |
| "eval_biology_loss": 2.441138744354248, | |
| "eval_biology_mean_token_accuracy": 0.5818718819618225, | |
| "eval_biology_num_tokens": 46673328.0, | |
| "eval_biology_runtime": 38.7889, | |
| "eval_biology_samples_per_second": 12.89, | |
| "eval_biology_steps_per_second": 3.223, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_chemistry_entropy": 0.7358436925411225, | |
| "eval_chemistry_loss": 1.0279189348220825, | |
| "eval_chemistry_mean_token_accuracy": 0.7754086136817933, | |
| "eval_chemistry_num_tokens": 46673328.0, | |
| "eval_chemistry_runtime": 48.2677, | |
| "eval_chemistry_samples_per_second": 10.359, | |
| "eval_chemistry_steps_per_second": 2.59, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_math_entropy": 0.6436889851093293, | |
| "eval_math_loss": 1.0210638046264648, | |
| "eval_math_mean_token_accuracy": 0.7834169192314148, | |
| "eval_math_num_tokens": 46673328.0, | |
| "eval_math_runtime": 49.6493, | |
| "eval_math_samples_per_second": 10.071, | |
| "eval_math_steps_per_second": 2.518, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_physics_entropy": 0.5687800683975219, | |
| "eval_physics_loss": 0.7271425127983093, | |
| "eval_physics_mean_token_accuracy": 0.8259177951812744, | |
| "eval_physics_num_tokens": 46673328.0, | |
| "eval_physics_runtime": 57.064, | |
| "eval_physics_samples_per_second": 8.762, | |
| "eval_physics_steps_per_second": 2.191, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 0.5672985427081585, | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.1785185185185186e-05, | |
| "loss": 0.6585, | |
| "mean_token_accuracy": 0.8361219819635153, | |
| "num_tokens": 47001509.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 0.5440221125259995, | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.1711111111111113e-05, | |
| "loss": 0.6308, | |
| "mean_token_accuracy": 0.842396317794919, | |
| "num_tokens": 47340142.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 0.5601490139961243, | |
| "epoch": 2.288, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 1.1637037037037037e-05, | |
| "loss": 0.6515, | |
| "mean_token_accuracy": 0.8371975239366293, | |
| "num_tokens": 47677550.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 0.5608395885676145, | |
| "epoch": 2.304, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 1.1562962962962964e-05, | |
| "loss": 0.65, | |
| "mean_token_accuracy": 0.8390717066824436, | |
| "num_tokens": 47993851.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 0.5507894741371274, | |
| "epoch": 2.32, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 1.1488888888888889e-05, | |
| "loss": 0.6392, | |
| "mean_token_accuracy": 0.8389977443963289, | |
| "num_tokens": 48332577.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 0.5349721314385534, | |
| "epoch": 2.336, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.1414814814814817e-05, | |
| "loss": 0.6212, | |
| "mean_token_accuracy": 0.8431226223707199, | |
| "num_tokens": 48676623.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 0.5349597102031112, | |
| "epoch": 2.352, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 1.1340740740740742e-05, | |
| "loss": 0.6199, | |
| "mean_token_accuracy": 0.8435559894889593, | |
| "num_tokens": 49007770.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 0.5474015891551971, | |
| "epoch": 2.368, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.1266666666666668e-05, | |
| "loss": 0.6384, | |
| "mean_token_accuracy": 0.8399505577981472, | |
| "num_tokens": 49350016.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 0.5547215724363923, | |
| "epoch": 2.384, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 1.1192592592592593e-05, | |
| "loss": 0.6461, | |
| "mean_token_accuracy": 0.8389234948903322, | |
| "num_tokens": 49679074.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 0.5506776092574001, | |
| "epoch": 2.4, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 1.111851851851852e-05, | |
| "loss": 0.6381, | |
| "mean_token_accuracy": 0.8396286979317665, | |
| "num_tokens": 50020648.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_biology_entropy": 1.6101444239616394, | |
| "eval_biology_loss": 2.4393153190612793, | |
| "eval_biology_mean_token_accuracy": 0.5827285711765289, | |
| "eval_biology_num_tokens": 50020648.0, | |
| "eval_biology_runtime": 38.6357, | |
| "eval_biology_samples_per_second": 12.941, | |
| "eval_biology_steps_per_second": 3.235, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_chemistry_entropy": 0.7312388877868652, | |
| "eval_chemistry_loss": 1.029062271118164, | |
| "eval_chemistry_mean_token_accuracy": 0.7757972526550293, | |
| "eval_chemistry_num_tokens": 50020648.0, | |
| "eval_chemistry_runtime": 48.0523, | |
| "eval_chemistry_samples_per_second": 10.405, | |
| "eval_chemistry_steps_per_second": 2.601, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_math_entropy": 0.6399386277198792, | |
| "eval_math_loss": 1.0237832069396973, | |
| "eval_math_mean_token_accuracy": 0.7829466652870178, | |
| "eval_math_num_tokens": 50020648.0, | |
| "eval_math_runtime": 49.5118, | |
| "eval_math_samples_per_second": 10.099, | |
| "eval_math_steps_per_second": 2.525, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_physics_entropy": 0.5676034677028656, | |
| "eval_physics_loss": 0.7261826395988464, | |
| "eval_physics_mean_token_accuracy": 0.825977876663208, | |
| "eval_physics_num_tokens": 50020648.0, | |
| "eval_physics_runtime": 56.9466, | |
| "eval_physics_samples_per_second": 8.78, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 0.546076669357717, | |
| "epoch": 2.416, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.1044444444444444e-05, | |
| "loss": 0.6307, | |
| "mean_token_accuracy": 0.8414915602654218, | |
| "num_tokens": 50370135.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 0.5511986341327428, | |
| "epoch": 2.432, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 1.0970370370370371e-05, | |
| "loss": 0.6434, | |
| "mean_token_accuracy": 0.8396999359130859, | |
| "num_tokens": 50695363.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 0.568221763893962, | |
| "epoch": 2.448, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 1.0896296296296298e-05, | |
| "loss": 0.6577, | |
| "mean_token_accuracy": 0.8357432372868061, | |
| "num_tokens": 51023498.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 0.5589647406712175, | |
| "epoch": 2.464, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 1.0822222222222222e-05, | |
| "loss": 0.6484, | |
| "mean_token_accuracy": 0.8391309097409249, | |
| "num_tokens": 51349816.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 0.5509988136589528, | |
| "epoch": 2.48, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 1.074814814814815e-05, | |
| "loss": 0.6417, | |
| "mean_token_accuracy": 0.8385909989476203, | |
| "num_tokens": 51680507.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 0.5563702458515764, | |
| "epoch": 2.496, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 1.0674074074074074e-05, | |
| "loss": 0.6462, | |
| "mean_token_accuracy": 0.8388838239014149, | |
| "num_tokens": 52017752.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 0.5431887688115239, | |
| "epoch": 2.512, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 1.0600000000000002e-05, | |
| "loss": 0.6342, | |
| "mean_token_accuracy": 0.8411614701151848, | |
| "num_tokens": 52345439.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 0.543870740942657, | |
| "epoch": 2.528, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 1.0525925925925927e-05, | |
| "loss": 0.6315, | |
| "mean_token_accuracy": 0.8416834581643343, | |
| "num_tokens": 52683886.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 0.5500559687614441, | |
| "epoch": 2.544, | |
| "grad_norm": 1.125, | |
| "learning_rate": 1.0451851851851853e-05, | |
| "loss": 0.6438, | |
| "mean_token_accuracy": 0.8382218111306429, | |
| "num_tokens": 53004529.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 0.5530376594513655, | |
| "epoch": 2.56, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 1.0377777777777778e-05, | |
| "loss": 0.6366, | |
| "mean_token_accuracy": 0.8400206513702869, | |
| "num_tokens": 53336280.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_biology_entropy": 1.6161543126106261, | |
| "eval_biology_loss": 2.4465079307556152, | |
| "eval_biology_mean_token_accuracy": 0.5826826608180999, | |
| "eval_biology_num_tokens": 53336280.0, | |
| "eval_biology_runtime": 38.7006, | |
| "eval_biology_samples_per_second": 12.92, | |
| "eval_biology_steps_per_second": 3.23, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_chemistry_entropy": 0.7235212452411651, | |
| "eval_chemistry_loss": 1.0302128791809082, | |
| "eval_chemistry_mean_token_accuracy": 0.776062783241272, | |
| "eval_chemistry_num_tokens": 53336280.0, | |
| "eval_chemistry_runtime": 48.1934, | |
| "eval_chemistry_samples_per_second": 10.375, | |
| "eval_chemistry_steps_per_second": 2.594, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_math_entropy": 0.6339258260726929, | |
| "eval_math_loss": 1.025081753730774, | |
| "eval_math_mean_token_accuracy": 0.7834318013191223, | |
| "eval_math_num_tokens": 53336280.0, | |
| "eval_math_runtime": 49.912, | |
| "eval_math_samples_per_second": 10.018, | |
| "eval_math_steps_per_second": 2.504, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_physics_entropy": 0.5578962779045105, | |
| "eval_physics_loss": 0.7250717878341675, | |
| "eval_physics_mean_token_accuracy": 0.8264809098243714, | |
| "eval_physics_num_tokens": 53336280.0, | |
| "eval_physics_runtime": 56.9593, | |
| "eval_physics_samples_per_second": 8.778, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 0.537515789270401, | |
| "epoch": 2.576, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.0303703703703705e-05, | |
| "loss": 0.6322, | |
| "mean_token_accuracy": 0.8432505313307047, | |
| "num_tokens": 53667647.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 0.5506160443648696, | |
| "epoch": 2.592, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 1.0229629629629631e-05, | |
| "loss": 0.6402, | |
| "mean_token_accuracy": 0.8399668127298355, | |
| "num_tokens": 53997614.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 0.5386643601581454, | |
| "epoch": 2.608, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 1.0155555555555556e-05, | |
| "loss": 0.6243, | |
| "mean_token_accuracy": 0.8430452451109887, | |
| "num_tokens": 54330195.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 0.5495816670358181, | |
| "epoch": 2.624, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.0081481481481484e-05, | |
| "loss": 0.6364, | |
| "mean_token_accuracy": 0.8401540901511908, | |
| "num_tokens": 54664342.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 0.5469144558534026, | |
| "epoch": 2.64, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 1.0007407407407407e-05, | |
| "loss": 0.636, | |
| "mean_token_accuracy": 0.8406256098300219, | |
| "num_tokens": 54993626.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 0.5694095639511942, | |
| "epoch": 2.656, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 9.933333333333334e-06, | |
| "loss": 0.656, | |
| "mean_token_accuracy": 0.8346509717404842, | |
| "num_tokens": 55339962.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 0.5597302883863449, | |
| "epoch": 2.672, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 9.85925925925926e-06, | |
| "loss": 0.6462, | |
| "mean_token_accuracy": 0.838615670055151, | |
| "num_tokens": 55670567.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 0.5369519403204321, | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.785185185185187e-06, | |
| "loss": 0.6323, | |
| "mean_token_accuracy": 0.843082357198, | |
| "num_tokens": 56003156.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 0.5516389394178987, | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 9.711111111111111e-06, | |
| "loss": 0.6369, | |
| "mean_token_accuracy": 0.8410698171705008, | |
| "num_tokens": 56342926.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 0.5484106032177806, | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.637037037037038e-06, | |
| "loss": 0.6328, | |
| "mean_token_accuracy": 0.8402947820723057, | |
| "num_tokens": 56677521.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_biology_entropy": 1.5972739911079408, | |
| "eval_biology_loss": 2.458798408508301, | |
| "eval_biology_mean_token_accuracy": 0.5806125638484955, | |
| "eval_biology_num_tokens": 56677521.0, | |
| "eval_biology_runtime": 38.6662, | |
| "eval_biology_samples_per_second": 12.931, | |
| "eval_biology_steps_per_second": 3.233, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_chemistry_entropy": 0.7181811017990112, | |
| "eval_chemistry_loss": 1.0299792289733887, | |
| "eval_chemistry_mean_token_accuracy": 0.7751972675323486, | |
| "eval_chemistry_num_tokens": 56677521.0, | |
| "eval_chemistry_runtime": 48.1483, | |
| "eval_chemistry_samples_per_second": 10.385, | |
| "eval_chemistry_steps_per_second": 2.596, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_math_entropy": 0.6283527569770813, | |
| "eval_math_loss": 1.0243655443191528, | |
| "eval_math_mean_token_accuracy": 0.7836487565040589, | |
| "eval_math_num_tokens": 56677521.0, | |
| "eval_math_runtime": 49.508, | |
| "eval_math_samples_per_second": 10.099, | |
| "eval_math_steps_per_second": 2.525, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_physics_entropy": 0.5540005767345428, | |
| "eval_physics_loss": 0.7235716581344604, | |
| "eval_physics_mean_token_accuracy": 0.8267813692092896, | |
| "eval_physics_num_tokens": 56677521.0, | |
| "eval_physics_runtime": 56.9575, | |
| "eval_physics_samples_per_second": 8.778, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 0.5626651704311371, | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 9.562962962962965e-06, | |
| "loss": 0.6551, | |
| "mean_token_accuracy": 0.8361094355583191, | |
| "num_tokens": 57003161.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 0.5559584245085716, | |
| "epoch": 2.752, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 9.48888888888889e-06, | |
| "loss": 0.6447, | |
| "mean_token_accuracy": 0.8389039475470781, | |
| "num_tokens": 57335850.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 0.5382809387519956, | |
| "epoch": 2.768, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 9.414814814814816e-06, | |
| "loss": 0.6267, | |
| "mean_token_accuracy": 0.8432002298533916, | |
| "num_tokens": 57672649.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 0.5509585844352841, | |
| "epoch": 2.784, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.34074074074074e-06, | |
| "loss": 0.6395, | |
| "mean_token_accuracy": 0.8397889394313097, | |
| "num_tokens": 58007431.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 0.5838387541472911, | |
| "epoch": 2.8, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 9.266666666666667e-06, | |
| "loss": 0.6711, | |
| "mean_token_accuracy": 0.8319451794028282, | |
| "num_tokens": 58332730.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 0.5306204471737146, | |
| "epoch": 2.816, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 9.192592592592594e-06, | |
| "loss": 0.6171, | |
| "mean_token_accuracy": 0.8446537777781487, | |
| "num_tokens": 58672106.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 0.553738858550787, | |
| "epoch": 2.832, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 9.118518518518518e-06, | |
| "loss": 0.6486, | |
| "mean_token_accuracy": 0.8376123756170273, | |
| "num_tokens": 58997592.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 0.5568923223763704, | |
| "epoch": 2.848, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 9.044444444444445e-06, | |
| "loss": 0.6446, | |
| "mean_token_accuracy": 0.8393427152186632, | |
| "num_tokens": 59326336.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 0.5415080957114696, | |
| "epoch": 2.864, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 8.970370370370372e-06, | |
| "loss": 0.6308, | |
| "mean_token_accuracy": 0.8419267870485783, | |
| "num_tokens": 59668586.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 0.5516748385503888, | |
| "epoch": 2.88, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 8.896296296296298e-06, | |
| "loss": 0.6441, | |
| "mean_token_accuracy": 0.840485867485404, | |
| "num_tokens": 59993572.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_biology_entropy": 1.6067570729255676, | |
| "eval_biology_loss": 2.442582130432129, | |
| "eval_biology_mean_token_accuracy": 0.5815401375293732, | |
| "eval_biology_num_tokens": 59993572.0, | |
| "eval_biology_runtime": 38.7204, | |
| "eval_biology_samples_per_second": 12.913, | |
| "eval_biology_steps_per_second": 3.228, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_chemistry_entropy": 0.7207075932025909, | |
| "eval_chemistry_loss": 1.0278831720352173, | |
| "eval_chemistry_mean_token_accuracy": 0.7756774797439575, | |
| "eval_chemistry_num_tokens": 59993572.0, | |
| "eval_chemistry_runtime": 48.2116, | |
| "eval_chemistry_samples_per_second": 10.371, | |
| "eval_chemistry_steps_per_second": 2.593, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_math_entropy": 0.629777349948883, | |
| "eval_math_loss": 1.0234665870666504, | |
| "eval_math_mean_token_accuracy": 0.7837573509216309, | |
| "eval_math_num_tokens": 59993572.0, | |
| "eval_math_runtime": 49.5638, | |
| "eval_math_samples_per_second": 10.088, | |
| "eval_math_steps_per_second": 2.522, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_physics_entropy": 0.5562911832332611, | |
| "eval_physics_loss": 0.7223864793777466, | |
| "eval_physics_mean_token_accuracy": 0.8269883937835694, | |
| "eval_physics_num_tokens": 59993572.0, | |
| "eval_physics_runtime": 56.9994, | |
| "eval_physics_samples_per_second": 8.772, | |
| "eval_physics_steps_per_second": 2.193, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 0.5429861357435584, | |
| "epoch": 2.896, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 8.822222222222223e-06, | |
| "loss": 0.6347, | |
| "mean_token_accuracy": 0.8413365628570318, | |
| "num_tokens": 60324128.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 0.5474321844056249, | |
| "epoch": 2.912, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 8.74814814814815e-06, | |
| "loss": 0.635, | |
| "mean_token_accuracy": 0.8426228888332844, | |
| "num_tokens": 60657399.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 0.5416806817054749, | |
| "epoch": 2.928, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 8.674074074074074e-06, | |
| "loss": 0.6306, | |
| "mean_token_accuracy": 0.8423144549131394, | |
| "num_tokens": 60984711.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 0.5390040006488561, | |
| "epoch": 2.944, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 8.6e-06, | |
| "loss": 0.6304, | |
| "mean_token_accuracy": 0.8424391083419323, | |
| "num_tokens": 61321359.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 0.5532678855583072, | |
| "epoch": 2.96, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 8.525925925925927e-06, | |
| "loss": 0.6378, | |
| "mean_token_accuracy": 0.8402687277644872, | |
| "num_tokens": 61659042.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 0.5464650699868798, | |
| "epoch": 2.976, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 8.451851851851852e-06, | |
| "loss": 0.6345, | |
| "mean_token_accuracy": 0.8401576526463032, | |
| "num_tokens": 61993595.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 0.5306900983676315, | |
| "epoch": 2.992, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 8.377777777777779e-06, | |
| "loss": 0.6196, | |
| "mean_token_accuracy": 0.8441225662827492, | |
| "num_tokens": 62334012.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 0.5364003209397197, | |
| "epoch": 3.008, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 8.303703703703705e-06, | |
| "loss": 0.6242, | |
| "mean_token_accuracy": 0.8451750382781029, | |
| "num_tokens": 62660928.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 0.5261571481823921, | |
| "epoch": 3.024, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 8.229629629629632e-06, | |
| "loss": 0.614, | |
| "mean_token_accuracy": 0.8461304292082786, | |
| "num_tokens": 62992670.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 0.5176527475938201, | |
| "epoch": 3.04, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 8.155555555555556e-06, | |
| "loss": 0.6074, | |
| "mean_token_accuracy": 0.8469121795147657, | |
| "num_tokens": 63335157.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_biology_entropy": 1.5792802815437317, | |
| "eval_biology_loss": 2.4808876514434814, | |
| "eval_biology_mean_token_accuracy": 0.5793148455619812, | |
| "eval_biology_num_tokens": 63335157.0, | |
| "eval_biology_runtime": 38.7055, | |
| "eval_biology_samples_per_second": 12.918, | |
| "eval_biology_steps_per_second": 3.23, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_chemistry_entropy": 0.7061369748115539, | |
| "eval_chemistry_loss": 1.0390231609344482, | |
| "eval_chemistry_mean_token_accuracy": 0.7745346717834473, | |
| "eval_chemistry_num_tokens": 63335157.0, | |
| "eval_chemistry_runtime": 48.1872, | |
| "eval_chemistry_samples_per_second": 10.376, | |
| "eval_chemistry_steps_per_second": 2.594, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_math_entropy": 0.6224793126583099, | |
| "eval_math_loss": 1.034122347831726, | |
| "eval_math_mean_token_accuracy": 0.7828424015045166, | |
| "eval_math_num_tokens": 63335157.0, | |
| "eval_math_runtime": 49.5497, | |
| "eval_math_samples_per_second": 10.091, | |
| "eval_math_steps_per_second": 2.523, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_physics_entropy": 0.5448903846740722, | |
| "eval_physics_loss": 0.7253366708755493, | |
| "eval_physics_mean_token_accuracy": 0.8264335384368896, | |
| "eval_physics_num_tokens": 63335157.0, | |
| "eval_physics_runtime": 56.9581, | |
| "eval_physics_samples_per_second": 8.778, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 0.5329983660951256, | |
| "epoch": 3.056, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 8.081481481481483e-06, | |
| "loss": 0.6183, | |
| "mean_token_accuracy": 0.8438076838850975, | |
| "num_tokens": 63662314.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 0.5316095747053623, | |
| "epoch": 3.072, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 8.007407407407408e-06, | |
| "loss": 0.6223, | |
| "mean_token_accuracy": 0.8441695164889097, | |
| "num_tokens": 63997780.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 0.541679815761745, | |
| "epoch": 3.088, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 7.933333333333334e-06, | |
| "loss": 0.6328, | |
| "mean_token_accuracy": 0.8417537044733763, | |
| "num_tokens": 64325274.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 0.5170316396281123, | |
| "epoch": 3.104, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 7.859259259259259e-06, | |
| "loss": 0.6062, | |
| "mean_token_accuracy": 0.8478735946118832, | |
| "num_tokens": 64659683.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 0.5163595724850893, | |
| "epoch": 3.12, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 7.785185185185185e-06, | |
| "loss": 0.603, | |
| "mean_token_accuracy": 0.8484522052109241, | |
| "num_tokens": 64998212.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 0.5422403154894709, | |
| "epoch": 3.136, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 7.711111111111112e-06, | |
| "loss": 0.6357, | |
| "mean_token_accuracy": 0.8405985131859779, | |
| "num_tokens": 65328436.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 0.5195852382108569, | |
| "epoch": 3.152, | |
| "grad_norm": 1.125, | |
| "learning_rate": 7.637037037037037e-06, | |
| "loss": 0.602, | |
| "mean_token_accuracy": 0.8487723391503096, | |
| "num_tokens": 65659346.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 0.5302856534719467, | |
| "epoch": 3.168, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 7.562962962962963e-06, | |
| "loss": 0.6216, | |
| "mean_token_accuracy": 0.8445643980056048, | |
| "num_tokens": 65986382.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 0.5176609115675092, | |
| "epoch": 3.184, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 7.48888888888889e-06, | |
| "loss": 0.6056, | |
| "mean_token_accuracy": 0.8478365700691939, | |
| "num_tokens": 66324308.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 0.5267078908160329, | |
| "epoch": 3.2, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 7.4148148148148155e-06, | |
| "loss": 0.6149, | |
| "mean_token_accuracy": 0.8465773615986109, | |
| "num_tokens": 66658712.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_biology_entropy": 1.5625373516082763, | |
| "eval_biology_loss": 2.4981629848480225, | |
| "eval_biology_mean_token_accuracy": 0.57876149559021, | |
| "eval_biology_num_tokens": 66658712.0, | |
| "eval_biology_runtime": 38.9285, | |
| "eval_biology_samples_per_second": 12.844, | |
| "eval_biology_steps_per_second": 3.211, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_chemistry_entropy": 0.70146466588974, | |
| "eval_chemistry_loss": 1.0409005880355835, | |
| "eval_chemistry_mean_token_accuracy": 0.7748316297531128, | |
| "eval_chemistry_num_tokens": 66658712.0, | |
| "eval_chemistry_runtime": 48.3893, | |
| "eval_chemistry_samples_per_second": 10.333, | |
| "eval_chemistry_steps_per_second": 2.583, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_math_entropy": 0.6218927059173583, | |
| "eval_math_loss": 1.0327448844909668, | |
| "eval_math_mean_token_accuracy": 0.7830894327163697, | |
| "eval_math_num_tokens": 66658712.0, | |
| "eval_math_runtime": 49.899, | |
| "eval_math_samples_per_second": 10.02, | |
| "eval_math_steps_per_second": 2.505, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_physics_entropy": 0.5445813267230988, | |
| "eval_physics_loss": 0.725453794002533, | |
| "eval_physics_mean_token_accuracy": 0.8266869735717773, | |
| "eval_physics_num_tokens": 66658712.0, | |
| "eval_physics_runtime": 57.3527, | |
| "eval_physics_samples_per_second": 8.718, | |
| "eval_physics_steps_per_second": 2.179, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.956042200464073e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |