Instructions to use arunasank/6bk0jo2e with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use arunasank/6bk0jo2e with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="arunasank/6bk0jo2e")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("arunasank/6bk0jo2e")
model = AutoModelForImageTextToText.from_pretrained("arunasank/6bk0jo2e")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use arunasank/6bk0jo2e with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "arunasank/6bk0jo2e"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "arunasank/6bk0jo2e",
    "messages": [
      {
        "role": "user",
        "content": [
          { "type": "text", "text": "Describe this image in one sentence." },
          { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
        ]
      }
    ]
  }'
Use Docker
docker model run hf.co/arunasank/6bk0jo2e
- SGLang
How to use arunasank/6bk0jo2e with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "arunasank/6bk0jo2e" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "arunasank/6bk0jo2e",
    "messages": [
      {
        "role": "user",
        "content": [
          { "type": "text", "text": "Describe this image in one sentence." },
          { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
        ]
      }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "arunasank/6bk0jo2e" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "arunasank/6bk0jo2e",
    "messages": [
      {
        "role": "user",
        "content": [
          { "type": "text", "text": "Describe this image in one sentence." },
          { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
        ]
      }
    ]
  }'
- Docker Model Runner
How to use arunasank/6bk0jo2e with Docker Model Runner:
docker model run hf.co/arunasank/6bk0jo2e
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.2, | |
| "eval_steps": 100, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 0.9806057829409838, | |
| "epoch": 0.016, | |
| "grad_norm": 157.0, | |
| "learning_rate": 1.8e-07, | |
| "loss": 21.7295, | |
| "mean_token_accuracy": 0.695003604888916, | |
| "num_tokens": 280941.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.0097382467240095, | |
| "epoch": 0.032, | |
| "grad_norm": 171.0, | |
| "learning_rate": 3.8e-07, | |
| "loss": 21.935, | |
| "mean_token_accuracy": 0.6911116372793913, | |
| "num_tokens": 558056.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.0059957668185233, | |
| "epoch": 0.048, | |
| "grad_norm": 146.0, | |
| "learning_rate": 5.800000000000001e-07, | |
| "loss": 21.4243, | |
| "mean_token_accuracy": 0.6972350142896175, | |
| "num_tokens": 836753.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.0606450594961643, | |
| "epoch": 0.064, | |
| "grad_norm": 129.0, | |
| "learning_rate": 7.8e-07, | |
| "loss": 22.2256, | |
| "mean_token_accuracy": 0.6895487096160651, | |
| "num_tokens": 1126446.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.1183118436485529, | |
| "epoch": 0.08, | |
| "grad_norm": 99.5, | |
| "learning_rate": 9.800000000000001e-07, | |
| "loss": 21.2808, | |
| "mean_token_accuracy": 0.6964452721178531, | |
| "num_tokens": 1413596.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.1463136691600084, | |
| "epoch": 0.096, | |
| "grad_norm": 83.0, | |
| "learning_rate": 1.1800000000000001e-06, | |
| "loss": 20.5501, | |
| "mean_token_accuracy": 0.7033276192843914, | |
| "num_tokens": 1701193.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.1966488853096962, | |
| "epoch": 0.112, | |
| "grad_norm": 75.5, | |
| "learning_rate": 1.3800000000000001e-06, | |
| "loss": 20.5984, | |
| "mean_token_accuracy": 0.7012545391917229, | |
| "num_tokens": 1979232.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.2089795324951411, | |
| "epoch": 0.128, | |
| "grad_norm": 69.0, | |
| "learning_rate": 1.5800000000000001e-06, | |
| "loss": 20.2753, | |
| "mean_token_accuracy": 0.7075193412601948, | |
| "num_tokens": 2274177.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.1860636565834284, | |
| "epoch": 0.144, | |
| "grad_norm": 70.0, | |
| "learning_rate": 1.7800000000000001e-06, | |
| "loss": 19.7829, | |
| "mean_token_accuracy": 0.710378497838974, | |
| "num_tokens": 2548445.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.1838629063218833, | |
| "epoch": 0.16, | |
| "grad_norm": 85.5, | |
| "learning_rate": 1.98e-06, | |
| "loss": 19.4179, | |
| "mean_token_accuracy": 0.7149847097694874, | |
| "num_tokens": 2824418.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_biology_entropy": 1.474690469264984, | |
| "eval_biology_loss": 1.3576592206954956, | |
| "eval_biology_mean_token_accuracy": 0.6709716210365295, | |
| "eval_biology_num_tokens": 2824418.0, | |
| "eval_biology_runtime": 22.204, | |
| "eval_biology_samples_per_second": 22.518, | |
| "eval_biology_steps_per_second": 5.63, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_chemistry_entropy": 1.247537253856659, | |
| "eval_chemistry_loss": 1.1488269567489624, | |
| "eval_chemistry_mean_token_accuracy": 0.7154391117095947, | |
| "eval_chemistry_num_tokens": 2824418.0, | |
| "eval_chemistry_runtime": 26.8594, | |
| "eval_chemistry_samples_per_second": 18.615, | |
| "eval_chemistry_steps_per_second": 4.654, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_math_entropy": 1.2081865873336792, | |
| "eval_math_loss": 1.2300448417663574, | |
| "eval_math_mean_token_accuracy": 0.710663827419281, | |
| "eval_math_num_tokens": 2824418.0, | |
| "eval_math_runtime": 27.5187, | |
| "eval_math_samples_per_second": 18.169, | |
| "eval_math_steps_per_second": 4.542, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_cyber_entropy": 3.049607857465744, | |
| "eval_cyber_loss": 3.3160624504089355, | |
| "eval_cyber_mean_token_accuracy": 0.4259996695816517, | |
| "eval_cyber_num_tokens": 2824418.0, | |
| "eval_cyber_runtime": 26.1305, | |
| "eval_cyber_samples_per_second": 15.193, | |
| "eval_cyber_steps_per_second": 3.827, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.1945379309356212, | |
| "epoch": 0.176, | |
| "grad_norm": 78.5, | |
| "learning_rate": 2.1800000000000003e-06, | |
| "loss": 19.3659, | |
| "mean_token_accuracy": 0.7157268539071083, | |
| "num_tokens": 3110313.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.1879339709877967, | |
| "epoch": 0.192, | |
| "grad_norm": 62.5, | |
| "learning_rate": 2.38e-06, | |
| "loss": 19.2957, | |
| "mean_token_accuracy": 0.7150671608746052, | |
| "num_tokens": 3394170.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.1658376209437846, | |
| "epoch": 0.208, | |
| "grad_norm": 64.5, | |
| "learning_rate": 2.5800000000000003e-06, | |
| "loss": 18.9243, | |
| "mean_token_accuracy": 0.7201551966369152, | |
| "num_tokens": 3673600.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.1712732832878827, | |
| "epoch": 0.224, | |
| "grad_norm": 62.25, | |
| "learning_rate": 2.7800000000000005e-06, | |
| "loss": 18.8985, | |
| "mean_token_accuracy": 0.7195578265935183, | |
| "num_tokens": 3953732.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.1372143357992173, | |
| "epoch": 0.24, | |
| "grad_norm": 59.25, | |
| "learning_rate": 2.9800000000000003e-06, | |
| "loss": 18.3388, | |
| "mean_token_accuracy": 0.7276442721486092, | |
| "num_tokens": 4243655.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.1075575590133666, | |
| "epoch": 0.256, | |
| "grad_norm": 55.5, | |
| "learning_rate": 3.1800000000000005e-06, | |
| "loss": 17.7317, | |
| "mean_token_accuracy": 0.7332122329622507, | |
| "num_tokens": 4531471.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.1275247156620025, | |
| "epoch": 0.272, | |
| "grad_norm": 53.0, | |
| "learning_rate": 3.3800000000000007e-06, | |
| "loss": 18.2164, | |
| "mean_token_accuracy": 0.7298861864954234, | |
| "num_tokens": 4810284.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.1071187134832143, | |
| "epoch": 0.288, | |
| "grad_norm": 49.25, | |
| "learning_rate": 3.58e-06, | |
| "loss": 17.7625, | |
| "mean_token_accuracy": 0.7337470225989818, | |
| "num_tokens": 5095104.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.1132069051265716, | |
| "epoch": 0.304, | |
| "grad_norm": 54.25, | |
| "learning_rate": 3.7800000000000002e-06, | |
| "loss": 17.7962, | |
| "mean_token_accuracy": 0.7336070898920297, | |
| "num_tokens": 5383732.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.0761282254010438, | |
| "epoch": 0.32, | |
| "grad_norm": 51.0, | |
| "learning_rate": 3.980000000000001e-06, | |
| "loss": 17.3026, | |
| "mean_token_accuracy": 0.7393171060830355, | |
| "num_tokens": 5676334.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_biology_entropy": 1.4338086099624634, | |
| "eval_biology_loss": 1.3705029487609863, | |
| "eval_biology_mean_token_accuracy": 0.6675603828430176, | |
| "eval_biology_num_tokens": 5676334.0, | |
| "eval_biology_runtime": 22.0314, | |
| "eval_biology_samples_per_second": 22.695, | |
| "eval_biology_steps_per_second": 5.674, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_chemistry_entropy": 1.1902209458351136, | |
| "eval_chemistry_loss": 1.1403467655181885, | |
| "eval_chemistry_mean_token_accuracy": 0.7157996978759765, | |
| "eval_chemistry_num_tokens": 5676334.0, | |
| "eval_chemistry_runtime": 26.8655, | |
| "eval_chemistry_samples_per_second": 18.611, | |
| "eval_chemistry_steps_per_second": 4.653, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_math_entropy": 1.1017972359657286, | |
| "eval_math_loss": 1.1044487953186035, | |
| "eval_math_mean_token_accuracy": 0.7332207527160645, | |
| "eval_math_num_tokens": 5676334.0, | |
| "eval_math_runtime": 27.5365, | |
| "eval_math_samples_per_second": 18.158, | |
| "eval_math_steps_per_second": 4.539, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_cyber_entropy": 2.916593015193939, | |
| "eval_cyber_loss": 3.2743771076202393, | |
| "eval_cyber_mean_token_accuracy": 0.4284310387074947, | |
| "eval_cyber_num_tokens": 5676334.0, | |
| "eval_cyber_runtime": 26.1262, | |
| "eval_cyber_samples_per_second": 15.195, | |
| "eval_cyber_steps_per_second": 3.828, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.0972535338252782, | |
| "epoch": 0.336, | |
| "grad_norm": 48.0, | |
| "learning_rate": 4.18e-06, | |
| "loss": 17.6967, | |
| "mean_token_accuracy": 0.7339770793914795, | |
| "num_tokens": 5958480.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.0661637954413892, | |
| "epoch": 0.352, | |
| "grad_norm": 52.0, | |
| "learning_rate": 4.38e-06, | |
| "loss": 17.0451, | |
| "mean_token_accuracy": 0.7420999370515347, | |
| "num_tokens": 6242161.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.0574691709131003, | |
| "epoch": 0.368, | |
| "grad_norm": 43.75, | |
| "learning_rate": 4.58e-06, | |
| "loss": 17.0476, | |
| "mean_token_accuracy": 0.7413399379700423, | |
| "num_tokens": 6523679.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.0383559666574, | |
| "epoch": 0.384, | |
| "grad_norm": 52.75, | |
| "learning_rate": 4.78e-06, | |
| "loss": 16.6413, | |
| "mean_token_accuracy": 0.7455471660941839, | |
| "num_tokens": 6810978.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.0844071809202434, | |
| "epoch": 0.4, | |
| "grad_norm": 54.25, | |
| "learning_rate": 4.980000000000001e-06, | |
| "loss": 17.5456, | |
| "mean_token_accuracy": 0.7374324273318053, | |
| "num_tokens": 7096903.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.0462343256920577, | |
| "epoch": 0.416, | |
| "grad_norm": 46.25, | |
| "learning_rate": 5.18e-06, | |
| "loss": 16.7333, | |
| "mean_token_accuracy": 0.7437998823821544, | |
| "num_tokens": 7377181.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.0309648185968399, | |
| "epoch": 0.432, | |
| "grad_norm": 53.5, | |
| "learning_rate": 5.380000000000001e-06, | |
| "loss": 16.3635, | |
| "mean_token_accuracy": 0.7494884602725506, | |
| "num_tokens": 7650523.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.011606451496482, | |
| "epoch": 0.448, | |
| "grad_norm": 55.5, | |
| "learning_rate": 5.580000000000001e-06, | |
| "loss": 16.2941, | |
| "mean_token_accuracy": 0.7498481426388025, | |
| "num_tokens": 7936788.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.0099118243902923, | |
| "epoch": 0.464, | |
| "grad_norm": 46.25, | |
| "learning_rate": 5.78e-06, | |
| "loss": 16.2711, | |
| "mean_token_accuracy": 0.7521415069699288, | |
| "num_tokens": 8223147.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.0334131706506013, | |
| "epoch": 0.48, | |
| "grad_norm": 43.25, | |
| "learning_rate": 5.98e-06, | |
| "loss": 16.4655, | |
| "mean_token_accuracy": 0.748864620923996, | |
| "num_tokens": 8506030.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_biology_entropy": 1.422005220413208, | |
| "eval_biology_loss": 1.3784860372543335, | |
| "eval_biology_mean_token_accuracy": 0.6655148763656616, | |
| "eval_biology_num_tokens": 8506030.0, | |
| "eval_biology_runtime": 22.0223, | |
| "eval_biology_samples_per_second": 22.704, | |
| "eval_biology_steps_per_second": 5.676, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_chemistry_entropy": 1.1605761876106262, | |
| "eval_chemistry_loss": 1.1291848421096802, | |
| "eval_chemistry_mean_token_accuracy": 0.7179730429649352, | |
| "eval_chemistry_num_tokens": 8506030.0, | |
| "eval_chemistry_runtime": 26.8904, | |
| "eval_chemistry_samples_per_second": 18.594, | |
| "eval_chemistry_steps_per_second": 4.648, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_math_entropy": 1.0216443300247193, | |
| "eval_math_loss": 1.0201935768127441, | |
| "eval_math_mean_token_accuracy": 0.749815523147583, | |
| "eval_math_num_tokens": 8506030.0, | |
| "eval_math_runtime": 27.5335, | |
| "eval_math_samples_per_second": 18.16, | |
| "eval_math_steps_per_second": 4.54, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_cyber_entropy": 2.940396952629089, | |
| "eval_cyber_loss": 3.153137683868408, | |
| "eval_cyber_mean_token_accuracy": 0.4395553506910801, | |
| "eval_cyber_num_tokens": 8506030.0, | |
| "eval_cyber_runtime": 26.1452, | |
| "eval_cyber_samples_per_second": 15.184, | |
| "eval_cyber_steps_per_second": 3.825, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.9862126674503088, | |
| "epoch": 0.496, | |
| "grad_norm": 56.25, | |
| "learning_rate": 6.18e-06, | |
| "loss": 15.7919, | |
| "mean_token_accuracy": 0.7557632889598608, | |
| "num_tokens": 8788726.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.9811401419341564, | |
| "epoch": 0.512, | |
| "grad_norm": 46.25, | |
| "learning_rate": 6.380000000000001e-06, | |
| "loss": 15.7454, | |
| "mean_token_accuracy": 0.7576492365449667, | |
| "num_tokens": 9078039.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.9647420089691877, | |
| "epoch": 0.528, | |
| "grad_norm": 44.5, | |
| "learning_rate": 6.5800000000000005e-06, | |
| "loss": 15.4619, | |
| "mean_token_accuracy": 0.7620009411126375, | |
| "num_tokens": 9373860.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.9784351203590631, | |
| "epoch": 0.544, | |
| "grad_norm": 51.75, | |
| "learning_rate": 6.780000000000001e-06, | |
| "loss": 15.6775, | |
| "mean_token_accuracy": 0.7592026349157095, | |
| "num_tokens": 9660940.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.9894711822271347, | |
| "epoch": 0.56, | |
| "grad_norm": 62.25, | |
| "learning_rate": 6.98e-06, | |
| "loss": 15.8268, | |
| "mean_token_accuracy": 0.754706758633256, | |
| "num_tokens": 9932302.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.9637547507882118, | |
| "epoch": 0.576, | |
| "grad_norm": 45.0, | |
| "learning_rate": 7.180000000000001e-06, | |
| "loss": 15.4676, | |
| "mean_token_accuracy": 0.7605381533503532, | |
| "num_tokens": 10215462.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.9582533340901136, | |
| "epoch": 0.592, | |
| "grad_norm": 42.5, | |
| "learning_rate": 7.3800000000000005e-06, | |
| "loss": 15.343, | |
| "mean_token_accuracy": 0.7618525486439467, | |
| "num_tokens": 10504396.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.9651506002992392, | |
| "epoch": 0.608, | |
| "grad_norm": 42.75, | |
| "learning_rate": 7.58e-06, | |
| "loss": 15.4775, | |
| "mean_token_accuracy": 0.7616991735994816, | |
| "num_tokens": 10793126.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.951299836859107, | |
| "epoch": 0.624, | |
| "grad_norm": 46.0, | |
| "learning_rate": 7.78e-06, | |
| "loss": 15.2327, | |
| "mean_token_accuracy": 0.7628108691424131, | |
| "num_tokens": 11081768.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.9370664428919554, | |
| "epoch": 0.64, | |
| "grad_norm": 45.75, | |
| "learning_rate": 7.980000000000002e-06, | |
| "loss": 14.9056, | |
| "mean_token_accuracy": 0.7668638564646244, | |
| "num_tokens": 11370320.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_biology_entropy": 1.4101261868476869, | |
| "eval_biology_loss": 1.385290503501892, | |
| "eval_biology_mean_token_accuracy": 0.6643283500671386, | |
| "eval_biology_num_tokens": 11370320.0, | |
| "eval_biology_runtime": 22.0311, | |
| "eval_biology_samples_per_second": 22.695, | |
| "eval_biology_steps_per_second": 5.674, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_chemistry_entropy": 1.134839651107788, | |
| "eval_chemistry_loss": 1.117846965789795, | |
| "eval_chemistry_mean_token_accuracy": 0.721014030456543, | |
| "eval_chemistry_num_tokens": 11370320.0, | |
| "eval_chemistry_runtime": 26.9193, | |
| "eval_chemistry_samples_per_second": 18.574, | |
| "eval_chemistry_steps_per_second": 4.644, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_math_entropy": 0.9476065578460693, | |
| "eval_math_loss": 0.9578045606613159, | |
| "eval_math_mean_token_accuracy": 0.760874231338501, | |
| "eval_math_num_tokens": 11370320.0, | |
| "eval_math_runtime": 27.5198, | |
| "eval_math_samples_per_second": 18.169, | |
| "eval_math_steps_per_second": 4.542, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_cyber_entropy": 2.8283654510974885, | |
| "eval_cyber_loss": 3.0569260120391846, | |
| "eval_cyber_mean_token_accuracy": 0.4431245893239975, | |
| "eval_cyber_num_tokens": 11370320.0, | |
| "eval_cyber_runtime": 26.384, | |
| "eval_cyber_samples_per_second": 15.047, | |
| "eval_cyber_steps_per_second": 3.79, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.9207184508442878, | |
| "epoch": 0.656, | |
| "grad_norm": 44.5, | |
| "learning_rate": 8.18e-06, | |
| "loss": 14.748, | |
| "mean_token_accuracy": 0.7685097701847553, | |
| "num_tokens": 11657835.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.9384444292634726, | |
| "epoch": 0.672, | |
| "grad_norm": 44.5, | |
| "learning_rate": 8.380000000000001e-06, | |
| "loss": 15.0013, | |
| "mean_token_accuracy": 0.7654238797724247, | |
| "num_tokens": 11949262.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.9148579228669405, | |
| "epoch": 0.688, | |
| "grad_norm": 35.75, | |
| "learning_rate": 8.580000000000001e-06, | |
| "loss": 14.6923, | |
| "mean_token_accuracy": 0.7695376992225647, | |
| "num_tokens": 12227640.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.9156919397413731, | |
| "epoch": 0.704, | |
| "grad_norm": 36.5, | |
| "learning_rate": 8.78e-06, | |
| "loss": 14.6672, | |
| "mean_token_accuracy": 0.7705338027328252, | |
| "num_tokens": 12516641.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.9278485044836998, | |
| "epoch": 0.72, | |
| "grad_norm": 42.5, | |
| "learning_rate": 8.98e-06, | |
| "loss": 14.8435, | |
| "mean_token_accuracy": 0.7673114899545908, | |
| "num_tokens": 12793343.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.9052219696342945, | |
| "epoch": 0.736, | |
| "grad_norm": 41.0, | |
| "learning_rate": 9.180000000000002e-06, | |
| "loss": 14.5171, | |
| "mean_token_accuracy": 0.7729556966573, | |
| "num_tokens": 13077981.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.8888643320649863, | |
| "epoch": 0.752, | |
| "grad_norm": 41.25, | |
| "learning_rate": 9.38e-06, | |
| "loss": 14.1497, | |
| "mean_token_accuracy": 0.776068452000618, | |
| "num_tokens": 13358957.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.8620530396699906, | |
| "epoch": 0.768, | |
| "grad_norm": 42.5, | |
| "learning_rate": 9.58e-06, | |
| "loss": 13.834, | |
| "mean_token_accuracy": 0.7819891981780529, | |
| "num_tokens": 13653412.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.9176285572350025, | |
| "epoch": 0.784, | |
| "grad_norm": 39.0, | |
| "learning_rate": 9.780000000000001e-06, | |
| "loss": 14.6357, | |
| "mean_token_accuracy": 0.7709478087723255, | |
| "num_tokens": 13940856.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.9128546692430973, | |
| "epoch": 0.8, | |
| "grad_norm": 40.25, | |
| "learning_rate": 9.980000000000001e-06, | |
| "loss": 14.5874, | |
| "mean_token_accuracy": 0.7713334109634161, | |
| "num_tokens": 14230754.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_biology_entropy": 1.4446520280838013, | |
| "eval_biology_loss": 1.392669439315796, | |
| "eval_biology_mean_token_accuracy": 0.6637401723861694, | |
| "eval_biology_num_tokens": 14230754.0, | |
| "eval_biology_runtime": 22.2391, | |
| "eval_biology_samples_per_second": 22.483, | |
| "eval_biology_steps_per_second": 5.621, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_chemistry_entropy": 1.1566239352226257, | |
| "eval_chemistry_loss": 1.1047961711883545, | |
| "eval_chemistry_mean_token_accuracy": 0.7233014287948608, | |
| "eval_chemistry_num_tokens": 14230754.0, | |
| "eval_chemistry_runtime": 26.8042, | |
| "eval_chemistry_samples_per_second": 18.654, | |
| "eval_chemistry_steps_per_second": 4.663, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_math_entropy": 0.9341866765022278, | |
| "eval_math_loss": 0.9057817459106445, | |
| "eval_math_mean_token_accuracy": 0.7704657621383667, | |
| "eval_math_num_tokens": 14230754.0, | |
| "eval_math_runtime": 27.5093, | |
| "eval_math_samples_per_second": 18.176, | |
| "eval_math_steps_per_second": 4.544, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_cyber_entropy": 2.902767553329468, | |
| "eval_cyber_loss": 2.947915554046631, | |
| "eval_cyber_mean_token_accuracy": 0.45555895671248436, | |
| "eval_cyber_num_tokens": 14230754.0, | |
| "eval_cyber_runtime": 26.1612, | |
| "eval_cyber_samples_per_second": 15.175, | |
| "eval_cyber_steps_per_second": 3.822, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.8841698631644249, | |
| "epoch": 0.816, | |
| "grad_norm": 38.0, | |
| "learning_rate": 1.018e-05, | |
| "loss": 14.1477, | |
| "mean_token_accuracy": 0.7752781912684441, | |
| "num_tokens": 14519893.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.875461632013321, | |
| "epoch": 0.832, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.038e-05, | |
| "loss": 14.0361, | |
| "mean_token_accuracy": 0.7779335591942071, | |
| "num_tokens": 14805088.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.899658627063036, | |
| "epoch": 0.848, | |
| "grad_norm": 37.25, | |
| "learning_rate": 1.0580000000000002e-05, | |
| "loss": 14.3925, | |
| "mean_token_accuracy": 0.7728543490171432, | |
| "num_tokens": 15086306.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.8889949310570955, | |
| "epoch": 0.864, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.0780000000000002e-05, | |
| "loss": 14.1314, | |
| "mean_token_accuracy": 0.7759746141731739, | |
| "num_tokens": 15370985.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.8936371214687824, | |
| "epoch": 0.88, | |
| "grad_norm": 36.5, | |
| "learning_rate": 1.0980000000000002e-05, | |
| "loss": 14.2807, | |
| "mean_token_accuracy": 0.7754444174468518, | |
| "num_tokens": 15653836.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.8719608142971993, | |
| "epoch": 0.896, | |
| "grad_norm": 34.75, | |
| "learning_rate": 1.1180000000000001e-05, | |
| "loss": 13.9767, | |
| "mean_token_accuracy": 0.7785432428121567, | |
| "num_tokens": 15932179.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.8601628458127379, | |
| "epoch": 0.912, | |
| "grad_norm": 35.25, | |
| "learning_rate": 1.138e-05, | |
| "loss": 13.7333, | |
| "mean_token_accuracy": 0.77998266518116, | |
| "num_tokens": 16219842.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.848052042350173, | |
| "epoch": 0.928, | |
| "grad_norm": 34.25, | |
| "learning_rate": 1.1580000000000001e-05, | |
| "loss": 13.5598, | |
| "mean_token_accuracy": 0.783201026916504, | |
| "num_tokens": 16499842.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.8647568510845304, | |
| "epoch": 0.944, | |
| "grad_norm": 98.5, | |
| "learning_rate": 1.178e-05, | |
| "loss": 13.9513, | |
| "mean_token_accuracy": 0.7791629247367382, | |
| "num_tokens": 16781882.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.8681454580277205, | |
| "epoch": 0.96, | |
| "grad_norm": 28.375, | |
| "learning_rate": 1.198e-05, | |
| "loss": 13.8619, | |
| "mean_token_accuracy": 0.7806598395109177, | |
| "num_tokens": 17067407.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_biology_entropy": 1.403841501235962, | |
| "eval_biology_loss": 1.3980714082717896, | |
| "eval_biology_mean_token_accuracy": 0.6626365647315979, | |
| "eval_biology_num_tokens": 17067407.0, | |
| "eval_biology_runtime": 22.2675, | |
| "eval_biology_samples_per_second": 22.454, | |
| "eval_biology_steps_per_second": 5.614, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_chemistry_entropy": 1.099705493927002, | |
| "eval_chemistry_loss": 1.0918222665786743, | |
| "eval_chemistry_mean_token_accuracy": 0.7260931057929992, | |
| "eval_chemistry_num_tokens": 17067407.0, | |
| "eval_chemistry_runtime": 26.9078, | |
| "eval_chemistry_samples_per_second": 18.582, | |
| "eval_chemistry_steps_per_second": 4.645, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_math_entropy": 0.8664147562980652, | |
| "eval_math_loss": 0.8665754795074463, | |
| "eval_math_mean_token_accuracy": 0.7780344748497009, | |
| "eval_math_num_tokens": 17067407.0, | |
| "eval_math_runtime": 27.5569, | |
| "eval_math_samples_per_second": 18.144, | |
| "eval_math_steps_per_second": 4.536, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_cyber_entropy": 2.6499598491191865, | |
| "eval_cyber_loss": 2.8601412773132324, | |
| "eval_cyber_mean_token_accuracy": 0.4642623996734619, | |
| "eval_cyber_num_tokens": 17067407.0, | |
| "eval_cyber_runtime": 26.1765, | |
| "eval_cyber_samples_per_second": 15.166, | |
| "eval_cyber_steps_per_second": 3.82, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.8494564741849899, | |
| "epoch": 0.976, | |
| "grad_norm": 40.25, | |
| "learning_rate": 1.218e-05, | |
| "loss": 13.6376, | |
| "mean_token_accuracy": 0.7825958080589771, | |
| "num_tokens": 17350994.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.8730685204267502, | |
| "epoch": 0.992, | |
| "grad_norm": 38.75, | |
| "learning_rate": 1.2380000000000002e-05, | |
| "loss": 13.8595, | |
| "mean_token_accuracy": 0.7786977473646403, | |
| "num_tokens": 17637514.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.8468878531828523, | |
| "epoch": 1.008, | |
| "grad_norm": 32.5, | |
| "learning_rate": 1.2580000000000002e-05, | |
| "loss": 13.534, | |
| "mean_token_accuracy": 0.7822502862662077, | |
| "num_tokens": 17926570.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.8290399981662631, | |
| "epoch": 1.024, | |
| "grad_norm": 29.375, | |
| "learning_rate": 1.2780000000000001e-05, | |
| "loss": 13.2779, | |
| "mean_token_accuracy": 0.7864516761153937, | |
| "num_tokens": 18207652.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.8298395985737443, | |
| "epoch": 1.04, | |
| "grad_norm": 31.0, | |
| "learning_rate": 1.2980000000000001e-05, | |
| "loss": 13.1281, | |
| "mean_token_accuracy": 0.7878676626831294, | |
| "num_tokens": 18484931.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.8254991695284843, | |
| "epoch": 1.056, | |
| "grad_norm": 49.0, | |
| "learning_rate": 1.3180000000000001e-05, | |
| "loss": 13.2747, | |
| "mean_token_accuracy": 0.7866876818239689, | |
| "num_tokens": 18773457.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.8410865612328052, | |
| "epoch": 1.072, | |
| "grad_norm": 38.75, | |
| "learning_rate": 1.3380000000000002e-05, | |
| "loss": 13.3101, | |
| "mean_token_accuracy": 0.7853217396885157, | |
| "num_tokens": 19055365.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.8255538143217563, | |
| "epoch": 1.088, | |
| "grad_norm": 28.875, | |
| "learning_rate": 1.3580000000000002e-05, | |
| "loss": 13.2174, | |
| "mean_token_accuracy": 0.7872007485479117, | |
| "num_tokens": 19345730.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.823124579153955, | |
| "epoch": 1.104, | |
| "grad_norm": 29.375, | |
| "learning_rate": 1.378e-05, | |
| "loss": 13.1696, | |
| "mean_token_accuracy": 0.7877223126590251, | |
| "num_tokens": 19637390.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.8028364922851324, | |
| "epoch": 1.12, | |
| "grad_norm": 34.5, | |
| "learning_rate": 1.398e-05, | |
| "loss": 12.7597, | |
| "mean_token_accuracy": 0.792793495580554, | |
| "num_tokens": 19923914.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_biology_entropy": 1.3986766724586486, | |
| "eval_biology_loss": 1.407199740409851, | |
| "eval_biology_mean_token_accuracy": 0.6613863172531128, | |
| "eval_biology_num_tokens": 19923914.0, | |
| "eval_biology_runtime": 21.9946, | |
| "eval_biology_samples_per_second": 22.733, | |
| "eval_biology_steps_per_second": 5.683, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_chemistry_entropy": 1.0769947800636293, | |
| "eval_chemistry_loss": 1.0871174335479736, | |
| "eval_chemistry_mean_token_accuracy": 0.7282235732078552, | |
| "eval_chemistry_num_tokens": 19923914.0, | |
| "eval_chemistry_runtime": 26.8846, | |
| "eval_chemistry_samples_per_second": 18.598, | |
| "eval_chemistry_steps_per_second": 4.65, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_math_entropy": 0.8362808737754822, | |
| "eval_math_loss": 0.8373622894287109, | |
| "eval_math_mean_token_accuracy": 0.7839989976882935, | |
| "eval_math_num_tokens": 19923914.0, | |
| "eval_math_runtime": 27.4992, | |
| "eval_math_samples_per_second": 18.182, | |
| "eval_math_steps_per_second": 4.546, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_cyber_entropy": 2.5681421542167664, | |
| "eval_cyber_loss": 2.8721120357513428, | |
| "eval_cyber_mean_token_accuracy": 0.4651792038977146, | |
| "eval_cyber_num_tokens": 19923914.0, | |
| "eval_cyber_runtime": 26.1144, | |
| "eval_cyber_samples_per_second": 15.202, | |
| "eval_cyber_steps_per_second": 3.829, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.8095206459984183, | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.418e-05, | |
| "loss": 12.8823, | |
| "mean_token_accuracy": 0.7904491990804672, | |
| "num_tokens": 20201892.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.8196006739512086, | |
| "epoch": 1.152, | |
| "grad_norm": 30.75, | |
| "learning_rate": 1.4380000000000001e-05, | |
| "loss": 13.0652, | |
| "mean_token_accuracy": 0.7910903133451939, | |
| "num_tokens": 20490282.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.8046272564679384, | |
| "epoch": 1.168, | |
| "grad_norm": 29.5, | |
| "learning_rate": 1.4580000000000001e-05, | |
| "loss": 12.8751, | |
| "mean_token_accuracy": 0.7915120176970959, | |
| "num_tokens": 20785786.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.8037027461454272, | |
| "epoch": 1.184, | |
| "grad_norm": 30.5, | |
| "learning_rate": 1.478e-05, | |
| "loss": 12.8554, | |
| "mean_token_accuracy": 0.7912269696593285, | |
| "num_tokens": 21074205.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.79942841604352, | |
| "epoch": 1.2, | |
| "grad_norm": 30.75, | |
| "learning_rate": 1.498e-05, | |
| "loss": 12.7343, | |
| "mean_token_accuracy": 0.7923291265964508, | |
| "num_tokens": 21369159.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.807464637234807, | |
| "epoch": 1.216, | |
| "grad_norm": 35.0, | |
| "learning_rate": 1.5180000000000002e-05, | |
| "loss": 12.8367, | |
| "mean_token_accuracy": 0.7913754984736443, | |
| "num_tokens": 21649178.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.7876615423709155, | |
| "epoch": 1.232, | |
| "grad_norm": 30.25, | |
| "learning_rate": 1.5380000000000002e-05, | |
| "loss": 12.556, | |
| "mean_token_accuracy": 0.7947213523089885, | |
| "num_tokens": 21930239.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.7889078231528401, | |
| "epoch": 1.248, | |
| "grad_norm": 29.75, | |
| "learning_rate": 1.5580000000000003e-05, | |
| "loss": 12.5585, | |
| "mean_token_accuracy": 0.7938403252512216, | |
| "num_tokens": 22216387.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.8203166201710701, | |
| "epoch": 1.264, | |
| "grad_norm": 30.0, | |
| "learning_rate": 1.578e-05, | |
| "loss": 13.0401, | |
| "mean_token_accuracy": 0.7889407943934202, | |
| "num_tokens": 22501002.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.7915117274969816, | |
| "epoch": 1.28, | |
| "grad_norm": 33.0, | |
| "learning_rate": 1.5980000000000003e-05, | |
| "loss": 12.6967, | |
| "mean_token_accuracy": 0.7933882053941488, | |
| "num_tokens": 22779682.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_biology_entropy": 1.4146036610603332, | |
| "eval_biology_loss": 1.413214921951294, | |
| "eval_biology_mean_token_accuracy": 0.659837914943695, | |
| "eval_biology_num_tokens": 22779682.0, | |
| "eval_biology_runtime": 22.0253, | |
| "eval_biology_samples_per_second": 22.701, | |
| "eval_biology_steps_per_second": 5.675, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_chemistry_entropy": 1.0714722080230712, | |
| "eval_chemistry_loss": 1.0812031030654907, | |
| "eval_chemistry_mean_token_accuracy": 0.7291303877830505, | |
| "eval_chemistry_num_tokens": 22779682.0, | |
| "eval_chemistry_runtime": 26.8892, | |
| "eval_chemistry_samples_per_second": 18.595, | |
| "eval_chemistry_steps_per_second": 4.649, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_math_entropy": 0.8192751173973083, | |
| "eval_math_loss": 0.8122938275337219, | |
| "eval_math_mean_token_accuracy": 0.789606306552887, | |
| "eval_math_num_tokens": 22779682.0, | |
| "eval_math_runtime": 27.5274, | |
| "eval_math_samples_per_second": 18.164, | |
| "eval_math_steps_per_second": 4.541, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_cyber_entropy": 2.5670096004009246, | |
| "eval_cyber_loss": 2.8539652824401855, | |
| "eval_cyber_mean_token_accuracy": 0.46533648878335954, | |
| "eval_cyber_num_tokens": 22779682.0, | |
| "eval_cyber_runtime": 26.1723, | |
| "eval_cyber_samples_per_second": 15.169, | |
| "eval_cyber_steps_per_second": 3.821, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.786592660844326, | |
| "epoch": 1.296, | |
| "grad_norm": 35.5, | |
| "learning_rate": 1.618e-05, | |
| "loss": 12.5036, | |
| "mean_token_accuracy": 0.7954838387668133, | |
| "num_tokens": 23057744.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 0.7977361943572759, | |
| "epoch": 1.312, | |
| "grad_norm": 34.5, | |
| "learning_rate": 1.638e-05, | |
| "loss": 12.7511, | |
| "mean_token_accuracy": 0.7935790359973908, | |
| "num_tokens": 23344644.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 0.8038571482524276, | |
| "epoch": 1.328, | |
| "grad_norm": 38.75, | |
| "learning_rate": 1.658e-05, | |
| "loss": 12.847, | |
| "mean_token_accuracy": 0.7909597154706717, | |
| "num_tokens": 23622405.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 0.781531005539, | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 28.75, | |
| "learning_rate": 1.6780000000000002e-05, | |
| "loss": 12.4551, | |
| "mean_token_accuracy": 0.795590429380536, | |
| "num_tokens": 23899771.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 0.7783096175640821, | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 27.0, | |
| "learning_rate": 1.698e-05, | |
| "loss": 12.4462, | |
| "mean_token_accuracy": 0.7967745348811149, | |
| "num_tokens": 24187023.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.8302321504801512, | |
| "epoch": 1.376, | |
| "grad_norm": 30.625, | |
| "learning_rate": 1.718e-05, | |
| "loss": 13.2594, | |
| "mean_token_accuracy": 0.7850385505706072, | |
| "num_tokens": 24466132.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 0.7808034917339682, | |
| "epoch": 1.392, | |
| "grad_norm": 34.5, | |
| "learning_rate": 1.7380000000000003e-05, | |
| "loss": 12.4747, | |
| "mean_token_accuracy": 0.7949298892170191, | |
| "num_tokens": 24748043.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 0.7715026669204235, | |
| "epoch": 1.408, | |
| "grad_norm": 36.0, | |
| "learning_rate": 1.758e-05, | |
| "loss": 12.3399, | |
| "mean_token_accuracy": 0.7984749253839254, | |
| "num_tokens": 25036674.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 0.7645593881607056, | |
| "epoch": 1.424, | |
| "grad_norm": 27.25, | |
| "learning_rate": 1.7780000000000003e-05, | |
| "loss": 12.1973, | |
| "mean_token_accuracy": 0.7993213057518005, | |
| "num_tokens": 25324579.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 0.7820997565984726, | |
| "epoch": 1.44, | |
| "grad_norm": 33.0, | |
| "learning_rate": 1.798e-05, | |
| "loss": 12.5051, | |
| "mean_token_accuracy": 0.7951443370431661, | |
| "num_tokens": 25606824.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_biology_entropy": 1.3808940649032593, | |
| "eval_biology_loss": 1.4220765829086304, | |
| "eval_biology_mean_token_accuracy": 0.6588086094856263, | |
| "eval_biology_num_tokens": 25606824.0, | |
| "eval_biology_runtime": 22.0118, | |
| "eval_biology_samples_per_second": 22.715, | |
| "eval_biology_steps_per_second": 5.679, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_chemistry_entropy": 1.0482762174606324, | |
| "eval_chemistry_loss": 1.073889136314392, | |
| "eval_chemistry_mean_token_accuracy": 0.7307762913703918, | |
| "eval_chemistry_num_tokens": 25606824.0, | |
| "eval_chemistry_runtime": 26.8657, | |
| "eval_chemistry_samples_per_second": 18.611, | |
| "eval_chemistry_steps_per_second": 4.653, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_math_entropy": 0.7846709032058716, | |
| "eval_math_loss": 0.7932249903678894, | |
| "eval_math_mean_token_accuracy": 0.792679114818573, | |
| "eval_math_num_tokens": 25606824.0, | |
| "eval_math_runtime": 27.514, | |
| "eval_math_samples_per_second": 18.173, | |
| "eval_math_steps_per_second": 4.543, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_cyber_entropy": 2.5483840811252594, | |
| "eval_cyber_loss": 2.8718831539154053, | |
| "eval_cyber_mean_token_accuracy": 0.4638554835319519, | |
| "eval_cyber_num_tokens": 25606824.0, | |
| "eval_cyber_runtime": 26.216, | |
| "eval_cyber_samples_per_second": 15.143, | |
| "eval_cyber_steps_per_second": 3.814, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.7723285494372248, | |
| "epoch": 1.456, | |
| "grad_norm": 28.625, | |
| "learning_rate": 1.8180000000000002e-05, | |
| "loss": 12.303, | |
| "mean_token_accuracy": 0.7964108034968376, | |
| "num_tokens": 25886396.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 0.7762986140325665, | |
| "epoch": 1.472, | |
| "grad_norm": 28.875, | |
| "learning_rate": 1.8380000000000004e-05, | |
| "loss": 12.4134, | |
| "mean_token_accuracy": 0.7955747056752444, | |
| "num_tokens": 26163618.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 0.7938198037445545, | |
| "epoch": 1.488, | |
| "grad_norm": 29.625, | |
| "learning_rate": 1.858e-05, | |
| "loss": 12.75, | |
| "mean_token_accuracy": 0.7917455974966288, | |
| "num_tokens": 26438338.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 0.7594615155830979, | |
| "epoch": 1.504, | |
| "grad_norm": 34.0, | |
| "learning_rate": 1.878e-05, | |
| "loss": 12.14, | |
| "mean_token_accuracy": 0.8009778898209333, | |
| "num_tokens": 26729255.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 0.7861603863537312, | |
| "epoch": 1.52, | |
| "grad_norm": 27.125, | |
| "learning_rate": 1.898e-05, | |
| "loss": 12.4626, | |
| "mean_token_accuracy": 0.7956234533339739, | |
| "num_tokens": 27017935.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 0.7631909586489201, | |
| "epoch": 1.536, | |
| "grad_norm": 24.625, | |
| "learning_rate": 1.918e-05, | |
| "loss": 12.1955, | |
| "mean_token_accuracy": 0.7989141892641782, | |
| "num_tokens": 27306339.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 0.7708934009075165, | |
| "epoch": 1.552, | |
| "grad_norm": 27.75, | |
| "learning_rate": 1.938e-05, | |
| "loss": 12.1963, | |
| "mean_token_accuracy": 0.7984277427196502, | |
| "num_tokens": 27591959.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 0.7459486592561007, | |
| "epoch": 1.568, | |
| "grad_norm": 28.625, | |
| "learning_rate": 1.9580000000000002e-05, | |
| "loss": 11.9228, | |
| "mean_token_accuracy": 0.8039638720452785, | |
| "num_tokens": 27884398.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 0.7573445823043585, | |
| "epoch": 1.584, | |
| "grad_norm": 27.0, | |
| "learning_rate": 1.978e-05, | |
| "loss": 12.0883, | |
| "mean_token_accuracy": 0.8011246718466282, | |
| "num_tokens": 28171274.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 0.7612122105434537, | |
| "epoch": 1.6, | |
| "grad_norm": 27.5, | |
| "learning_rate": 1.9980000000000002e-05, | |
| "loss": 12.0981, | |
| "mean_token_accuracy": 0.7986274570226669, | |
| "num_tokens": 28457624.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_biology_entropy": 1.3829385170936583, | |
| "eval_biology_loss": 1.4260554313659668, | |
| "eval_biology_mean_token_accuracy": 0.6589009766578674, | |
| "eval_biology_num_tokens": 28457624.0, | |
| "eval_biology_runtime": 22.024, | |
| "eval_biology_samples_per_second": 22.703, | |
| "eval_biology_steps_per_second": 5.676, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_chemistry_entropy": 1.0377137541770936, | |
| "eval_chemistry_loss": 1.0700007677078247, | |
| "eval_chemistry_mean_token_accuracy": 0.7318583874702453, | |
| "eval_chemistry_num_tokens": 28457624.0, | |
| "eval_chemistry_runtime": 26.9114, | |
| "eval_chemistry_samples_per_second": 18.579, | |
| "eval_chemistry_steps_per_second": 4.645, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_math_entropy": 0.775481684923172, | |
| "eval_math_loss": 0.7736496329307556, | |
| "eval_math_mean_token_accuracy": 0.7959285154342651, | |
| "eval_math_num_tokens": 28457624.0, | |
| "eval_math_runtime": 27.53, | |
| "eval_math_samples_per_second": 18.162, | |
| "eval_math_steps_per_second": 4.541, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_cyber_entropy": 2.5157601726055145, | |
| "eval_cyber_loss": 2.8350298404693604, | |
| "eval_cyber_mean_token_accuracy": 0.4679137858748436, | |
| "eval_cyber_num_tokens": 28457624.0, | |
| "eval_cyber_runtime": 26.1345, | |
| "eval_cyber_samples_per_second": 15.191, | |
| "eval_cyber_steps_per_second": 3.826, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 0.7650468161329627, | |
| "epoch": 1.616, | |
| "grad_norm": 24.875, | |
| "learning_rate": 1.9980000000000002e-05, | |
| "loss": 12.1883, | |
| "mean_token_accuracy": 0.798528803884983, | |
| "num_tokens": 28743099.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 0.7713425377383828, | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 25.75, | |
| "learning_rate": 1.995777777777778e-05, | |
| "loss": 12.2948, | |
| "mean_token_accuracy": 0.7963836405426263, | |
| "num_tokens": 29017297.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 0.7499153949320316, | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 25.75, | |
| "learning_rate": 1.9935555555555557e-05, | |
| "loss": 11.9856, | |
| "mean_token_accuracy": 0.803160610422492, | |
| "num_tokens": 29303707.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 0.7566261947154999, | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 25.0, | |
| "learning_rate": 1.9913333333333335e-05, | |
| "loss": 12.034, | |
| "mean_token_accuracy": 0.7999875675886869, | |
| "num_tokens": 29597156.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 0.7669804213568568, | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 24.25, | |
| "learning_rate": 1.9891111111111112e-05, | |
| "loss": 12.2025, | |
| "mean_token_accuracy": 0.7990686308592558, | |
| "num_tokens": 29883879.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 0.7553620956838131, | |
| "epoch": 1.696, | |
| "grad_norm": 28.125, | |
| "learning_rate": 1.986888888888889e-05, | |
| "loss": 12.1827, | |
| "mean_token_accuracy": 0.8000101692974567, | |
| "num_tokens": 30165760.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 0.7463795414194465, | |
| "epoch": 1.712, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.9846666666666668e-05, | |
| "loss": 11.9561, | |
| "mean_token_accuracy": 0.8028988271951676, | |
| "num_tokens": 30460367.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 0.7401833109557628, | |
| "epoch": 1.728, | |
| "grad_norm": 27.375, | |
| "learning_rate": 1.9824444444444445e-05, | |
| "loss": 11.7133, | |
| "mean_token_accuracy": 0.8052776392549277, | |
| "num_tokens": 30739137.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 0.7436290748417378, | |
| "epoch": 1.744, | |
| "grad_norm": 27.375, | |
| "learning_rate": 1.9802222222222226e-05, | |
| "loss": 11.8806, | |
| "mean_token_accuracy": 0.8036583166569471, | |
| "num_tokens": 31022663.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 0.7478637570515275, | |
| "epoch": 1.76, | |
| "grad_norm": 25.25, | |
| "learning_rate": 1.978e-05, | |
| "loss": 11.9202, | |
| "mean_token_accuracy": 0.8017146904021502, | |
| "num_tokens": 31306494.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_biology_entropy": 1.3845259475708007, | |
| "eval_biology_loss": 1.4283864498138428, | |
| "eval_biology_mean_token_accuracy": 0.657891107082367, | |
| "eval_biology_num_tokens": 31306494.0, | |
| "eval_biology_runtime": 21.9927, | |
| "eval_biology_samples_per_second": 22.735, | |
| "eval_biology_steps_per_second": 5.684, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_chemistry_entropy": 1.0231492972373963, | |
| "eval_chemistry_loss": 1.063183307647705, | |
| "eval_chemistry_mean_token_accuracy": 0.7330445971488952, | |
| "eval_chemistry_num_tokens": 31306494.0, | |
| "eval_chemistry_runtime": 26.8519, | |
| "eval_chemistry_samples_per_second": 18.621, | |
| "eval_chemistry_steps_per_second": 4.655, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_math_entropy": 0.7524698441028594, | |
| "eval_math_loss": 0.7613377571105957, | |
| "eval_math_mean_token_accuracy": 0.798446418762207, | |
| "eval_math_num_tokens": 31306494.0, | |
| "eval_math_runtime": 27.5284, | |
| "eval_math_samples_per_second": 18.163, | |
| "eval_math_steps_per_second": 4.541, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_cyber_entropy": 2.3844272685050965, | |
| "eval_cyber_loss": 2.8584094047546387, | |
| "eval_cyber_mean_token_accuracy": 0.47087193533778193, | |
| "eval_cyber_num_tokens": 31306494.0, | |
| "eval_cyber_runtime": 26.2154, | |
| "eval_cyber_samples_per_second": 15.144, | |
| "eval_cyber_steps_per_second": 3.815, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 0.7506109833717346, | |
| "epoch": 1.776, | |
| "grad_norm": 22.875, | |
| "learning_rate": 1.975777777777778e-05, | |
| "loss": 11.957, | |
| "mean_token_accuracy": 0.803263409435749, | |
| "num_tokens": 31595542.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 0.7545284632593393, | |
| "epoch": 1.792, | |
| "grad_norm": 25.25, | |
| "learning_rate": 1.9735555555555556e-05, | |
| "loss": 12.055, | |
| "mean_token_accuracy": 0.8008169520646333, | |
| "num_tokens": 31881189.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 0.7454792723059654, | |
| "epoch": 1.808, | |
| "grad_norm": 22.625, | |
| "learning_rate": 1.9713333333333337e-05, | |
| "loss": 11.8818, | |
| "mean_token_accuracy": 0.8028201397508383, | |
| "num_tokens": 32164196.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 0.7103133289143443, | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 23.625, | |
| "learning_rate": 1.969111111111111e-05, | |
| "loss": 11.3018, | |
| "mean_token_accuracy": 0.8097421944141387, | |
| "num_tokens": 32441530.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 0.7296694969758392, | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 23.125, | |
| "learning_rate": 1.9668888888888892e-05, | |
| "loss": 11.722, | |
| "mean_token_accuracy": 0.8063848353922367, | |
| "num_tokens": 32723145.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 0.746064018085599, | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 26.375, | |
| "learning_rate": 1.9646666666666666e-05, | |
| "loss": 11.9198, | |
| "mean_token_accuracy": 0.8034628454595805, | |
| "num_tokens": 33011263.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 0.7246854526922106, | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 25.125, | |
| "learning_rate": 1.9624444444444447e-05, | |
| "loss": 11.6702, | |
| "mean_token_accuracy": 0.8065225839614868, | |
| "num_tokens": 33298921.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 0.7422073289752007, | |
| "epoch": 1.888, | |
| "grad_norm": 33.0, | |
| "learning_rate": 1.9602222222222225e-05, | |
| "loss": 11.8223, | |
| "mean_token_accuracy": 0.8029078282415867, | |
| "num_tokens": 33576243.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 0.7377389714121818, | |
| "epoch": 1.904, | |
| "grad_norm": 21.875, | |
| "learning_rate": 1.9580000000000002e-05, | |
| "loss": 11.7626, | |
| "mean_token_accuracy": 0.803905576467514, | |
| "num_tokens": 33850968.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 0.720432554371655, | |
| "epoch": 1.92, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.955777777777778e-05, | |
| "loss": 11.4648, | |
| "mean_token_accuracy": 0.8074128460139036, | |
| "num_tokens": 34128558.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_biology_entropy": 1.426067009449005, | |
| "eval_biology_loss": 1.4298174381256104, | |
| "eval_biology_mean_token_accuracy": 0.6572316522598267, | |
| "eval_biology_num_tokens": 34128558.0, | |
| "eval_biology_runtime": 21.9751, | |
| "eval_biology_samples_per_second": 22.753, | |
| "eval_biology_steps_per_second": 5.688, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_chemistry_entropy": 1.052424753189087, | |
| "eval_chemistry_loss": 1.058487057685852, | |
| "eval_chemistry_mean_token_accuracy": 0.7344707479476928, | |
| "eval_chemistry_num_tokens": 34128558.0, | |
| "eval_chemistry_runtime": 26.8451, | |
| "eval_chemistry_samples_per_second": 18.625, | |
| "eval_chemistry_steps_per_second": 4.656, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_math_entropy": 0.7641568143367767, | |
| "eval_math_loss": 0.7467027306556702, | |
| "eval_math_mean_token_accuracy": 0.8014610476493835, | |
| "eval_math_num_tokens": 34128558.0, | |
| "eval_math_runtime": 27.5216, | |
| "eval_math_samples_per_second": 18.168, | |
| "eval_math_steps_per_second": 4.542, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_cyber_entropy": 2.4904652881622313, | |
| "eval_cyber_loss": 2.6954379081726074, | |
| "eval_cyber_mean_token_accuracy": 0.4830169627070427, | |
| "eval_cyber_num_tokens": 34128558.0, | |
| "eval_cyber_runtime": 26.1923, | |
| "eval_cyber_samples_per_second": 15.157, | |
| "eval_cyber_steps_per_second": 3.818, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 0.731533533334732, | |
| "epoch": 1.936, | |
| "grad_norm": 25.125, | |
| "learning_rate": 1.9535555555555557e-05, | |
| "loss": 11.6804, | |
| "mean_token_accuracy": 0.8052534744143486, | |
| "num_tokens": 34408056.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 0.7303263584151864, | |
| "epoch": 1.952, | |
| "grad_norm": 23.375, | |
| "learning_rate": 1.9513333333333335e-05, | |
| "loss": 11.5676, | |
| "mean_token_accuracy": 0.8064503286033868, | |
| "num_tokens": 34684679.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 0.7569911142811179, | |
| "epoch": 1.968, | |
| "grad_norm": 25.375, | |
| "learning_rate": 1.9491111111111113e-05, | |
| "loss": 12.1009, | |
| "mean_token_accuracy": 0.8005167040973902, | |
| "num_tokens": 34971038.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 0.7218442076817155, | |
| "epoch": 1.984, | |
| "grad_norm": 23.375, | |
| "learning_rate": 1.946888888888889e-05, | |
| "loss": 11.5014, | |
| "mean_token_accuracy": 0.808349072188139, | |
| "num_tokens": 35262281.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 0.7173755820840597, | |
| "epoch": 2.0, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.9446666666666668e-05, | |
| "loss": 11.4742, | |
| "mean_token_accuracy": 0.8086670659482479, | |
| "num_tokens": 35560864.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 0.6936481088399887, | |
| "epoch": 2.016, | |
| "grad_norm": 23.125, | |
| "learning_rate": 1.9424444444444446e-05, | |
| "loss": 10.8847, | |
| "mean_token_accuracy": 0.8153551481664181, | |
| "num_tokens": 35846704.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 0.6694988587871193, | |
| "epoch": 2.032, | |
| "grad_norm": 22.5, | |
| "learning_rate": 1.9402222222222223e-05, | |
| "loss": 10.711, | |
| "mean_token_accuracy": 0.816493459790945, | |
| "num_tokens": 36128775.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 0.6576118635013699, | |
| "epoch": 2.048, | |
| "grad_norm": 23.25, | |
| "learning_rate": 1.938e-05, | |
| "loss": 10.4997, | |
| "mean_token_accuracy": 0.8206516925245524, | |
| "num_tokens": 36419504.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 0.6648308178409934, | |
| "epoch": 2.064, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.935777777777778e-05, | |
| "loss": 10.5449, | |
| "mean_token_accuracy": 0.8189927719533443, | |
| "num_tokens": 36706816.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 0.6633218213915825, | |
| "epoch": 2.08, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.9335555555555556e-05, | |
| "loss": 10.5216, | |
| "mean_token_accuracy": 0.8182109944522381, | |
| "num_tokens": 36988475.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_biology_entropy": 1.240901198387146, | |
| "eval_biology_loss": 1.4575155973434448, | |
| "eval_biology_mean_token_accuracy": 0.6538263387680053, | |
| "eval_biology_num_tokens": 36988475.0, | |
| "eval_biology_runtime": 22.0164, | |
| "eval_biology_samples_per_second": 22.71, | |
| "eval_biology_steps_per_second": 5.678, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_chemistry_entropy": 0.9047480673789978, | |
| "eval_chemistry_loss": 1.077072262763977, | |
| "eval_chemistry_mean_token_accuracy": 0.7334306511878967, | |
| "eval_chemistry_num_tokens": 36988475.0, | |
| "eval_chemistry_runtime": 26.8608, | |
| "eval_chemistry_samples_per_second": 18.614, | |
| "eval_chemistry_steps_per_second": 4.654, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_math_entropy": 0.6772888927459717, | |
| "eval_math_loss": 0.7423775792121887, | |
| "eval_math_mean_token_accuracy": 0.8029003148078918, | |
| "eval_math_num_tokens": 36988475.0, | |
| "eval_math_runtime": 27.5039, | |
| "eval_math_samples_per_second": 18.179, | |
| "eval_math_steps_per_second": 4.545, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_cyber_entropy": 2.240402947664261, | |
| "eval_cyber_loss": 2.8395872116088867, | |
| "eval_cyber_mean_token_accuracy": 0.4778119161725044, | |
| "eval_cyber_num_tokens": 36988475.0, | |
| "eval_cyber_runtime": 26.1457, | |
| "eval_cyber_samples_per_second": 15.184, | |
| "eval_cyber_steps_per_second": 3.825, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 0.6708741160109639, | |
| "epoch": 2.096, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.9313333333333334e-05, | |
| "loss": 10.6881, | |
| "mean_token_accuracy": 0.8182591505348682, | |
| "num_tokens": 37270131.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 0.6532387970015406, | |
| "epoch": 2.112, | |
| "grad_norm": 24.0, | |
| "learning_rate": 1.9291111111111115e-05, | |
| "loss": 10.4792, | |
| "mean_token_accuracy": 0.8189583510160446, | |
| "num_tokens": 37563537.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 0.6555240735411644, | |
| "epoch": 2.128, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.926888888888889e-05, | |
| "loss": 10.369, | |
| "mean_token_accuracy": 0.8218111298978329, | |
| "num_tokens": 37843959.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 0.6665401035919786, | |
| "epoch": 2.144, | |
| "grad_norm": 24.25, | |
| "learning_rate": 1.924666666666667e-05, | |
| "loss": 10.5665, | |
| "mean_token_accuracy": 0.8194981347769499, | |
| "num_tokens": 38133092.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 0.6592796456068755, | |
| "epoch": 2.16, | |
| "grad_norm": 21.5, | |
| "learning_rate": 1.9224444444444444e-05, | |
| "loss": 10.5062, | |
| "mean_token_accuracy": 0.8200013760477305, | |
| "num_tokens": 38421229.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 0.639416103810072, | |
| "epoch": 2.176, | |
| "grad_norm": 23.0, | |
| "learning_rate": 1.9202222222222225e-05, | |
| "loss": 10.1779, | |
| "mean_token_accuracy": 0.8243647638708353, | |
| "num_tokens": 38708043.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 0.667258214391768, | |
| "epoch": 2.192, | |
| "grad_norm": 23.375, | |
| "learning_rate": 1.918e-05, | |
| "loss": 10.6186, | |
| "mean_token_accuracy": 0.8171255987137556, | |
| "num_tokens": 38996930.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 0.6627653013914824, | |
| "epoch": 2.208, | |
| "grad_norm": 23.75, | |
| "learning_rate": 1.915777777777778e-05, | |
| "loss": 10.6227, | |
| "mean_token_accuracy": 0.818722078576684, | |
| "num_tokens": 39279481.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 0.6583162900060415, | |
| "epoch": 2.224, | |
| "grad_norm": 24.0, | |
| "learning_rate": 1.9135555555555555e-05, | |
| "loss": 10.4429, | |
| "mean_token_accuracy": 0.8204963516443968, | |
| "num_tokens": 39569030.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 0.6620556140318513, | |
| "epoch": 2.24, | |
| "grad_norm": 23.625, | |
| "learning_rate": 1.9113333333333336e-05, | |
| "loss": 10.591, | |
| "mean_token_accuracy": 0.8184640970081091, | |
| "num_tokens": 39854873.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_biology_entropy": 1.1904019894599915, | |
| "eval_biology_loss": 1.473069667816162, | |
| "eval_biology_mean_token_accuracy": 0.6529810581207275, | |
| "eval_biology_num_tokens": 39854873.0, | |
| "eval_biology_runtime": 22.0119, | |
| "eval_biology_samples_per_second": 22.715, | |
| "eval_biology_steps_per_second": 5.679, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_chemistry_entropy": 0.8868081021308899, | |
| "eval_chemistry_loss": 1.0847948789596558, | |
| "eval_chemistry_mean_token_accuracy": 0.7322362198829651, | |
| "eval_chemistry_num_tokens": 39854873.0, | |
| "eval_chemistry_runtime": 27.1533, | |
| "eval_chemistry_samples_per_second": 18.414, | |
| "eval_chemistry_steps_per_second": 4.603, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_math_entropy": 0.6937474160194397, | |
| "eval_math_loss": 0.7352772951126099, | |
| "eval_math_mean_token_accuracy": 0.8037003560066223, | |
| "eval_math_num_tokens": 39854873.0, | |
| "eval_math_runtime": 27.5567, | |
| "eval_math_samples_per_second": 18.144, | |
| "eval_math_steps_per_second": 4.536, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_cyber_entropy": 2.280633035302162, | |
| "eval_cyber_loss": 2.848487615585327, | |
| "eval_cyber_mean_token_accuracy": 0.46778192803263663, | |
| "eval_cyber_num_tokens": 39854873.0, | |
| "eval_cyber_runtime": 26.2399, | |
| "eval_cyber_samples_per_second": 15.13, | |
| "eval_cyber_steps_per_second": 3.811, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 0.6573172532021999, | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 22.75, | |
| "learning_rate": 1.9091111111111113e-05, | |
| "loss": 10.4628, | |
| "mean_token_accuracy": 0.8198426373302936, | |
| "num_tokens": 40141190.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 0.6748596677556634, | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 26.25, | |
| "learning_rate": 1.906888888888889e-05, | |
| "loss": 10.7759, | |
| "mean_token_accuracy": 0.8157621681690216, | |
| "num_tokens": 40415203.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 0.6660139387473464, | |
| "epoch": 2.288, | |
| "grad_norm": 25.0, | |
| "learning_rate": 1.904666666666667e-05, | |
| "loss": 10.5722, | |
| "mean_token_accuracy": 0.8170963436365127, | |
| "num_tokens": 40702393.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 0.6447202865034342, | |
| "epoch": 2.304, | |
| "grad_norm": 24.875, | |
| "learning_rate": 1.9024444444444446e-05, | |
| "loss": 10.2772, | |
| "mean_token_accuracy": 0.8228711977601051, | |
| "num_tokens": 40982775.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 0.667175211571157, | |
| "epoch": 2.32, | |
| "grad_norm": 25.375, | |
| "learning_rate": 1.9002222222222224e-05, | |
| "loss": 10.6322, | |
| "mean_token_accuracy": 0.817449289560318, | |
| "num_tokens": 41263356.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 0.6582919212058187, | |
| "epoch": 2.336, | |
| "grad_norm": 24.625, | |
| "learning_rate": 1.898e-05, | |
| "loss": 10.5061, | |
| "mean_token_accuracy": 0.8195808235555887, | |
| "num_tokens": 41545235.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 0.683755399286747, | |
| "epoch": 2.352, | |
| "grad_norm": 24.25, | |
| "learning_rate": 1.895777777777778e-05, | |
| "loss": 10.8267, | |
| "mean_token_accuracy": 0.8143463153392076, | |
| "num_tokens": 41833417.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 0.6577698297798633, | |
| "epoch": 2.368, | |
| "grad_norm": 24.625, | |
| "learning_rate": 1.8935555555555556e-05, | |
| "loss": 10.5268, | |
| "mean_token_accuracy": 0.8191198598593473, | |
| "num_tokens": 42117030.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 0.6793028621003032, | |
| "epoch": 2.384, | |
| "grad_norm": 28.25, | |
| "learning_rate": 1.8913333333333334e-05, | |
| "loss": 10.7829, | |
| "mean_token_accuracy": 0.8163190931081772, | |
| "num_tokens": 42410990.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 0.6641744881868362, | |
| "epoch": 2.4, | |
| "grad_norm": 24.5, | |
| "learning_rate": 1.8891111111111115e-05, | |
| "loss": 10.5965, | |
| "mean_token_accuracy": 0.8189876776188612, | |
| "num_tokens": 42691890.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_biology_entropy": 1.2293707489967347, | |
| "eval_biology_loss": 1.4714155197143555, | |
| "eval_biology_mean_token_accuracy": 0.6514331855773926, | |
| "eval_biology_num_tokens": 42691890.0, | |
| "eval_biology_runtime": 21.9938, | |
| "eval_biology_samples_per_second": 22.734, | |
| "eval_biology_steps_per_second": 5.683, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_chemistry_entropy": 0.9099567327499389, | |
| "eval_chemistry_loss": 1.0838559865951538, | |
| "eval_chemistry_mean_token_accuracy": 0.7319689979553222, | |
| "eval_chemistry_num_tokens": 42691890.0, | |
| "eval_chemistry_runtime": 26.8774, | |
| "eval_chemistry_samples_per_second": 18.603, | |
| "eval_chemistry_steps_per_second": 4.651, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_math_entropy": 0.6841141791343689, | |
| "eval_math_loss": 0.726224958896637, | |
| "eval_math_mean_token_accuracy": 0.8050775575637817, | |
| "eval_math_num_tokens": 42691890.0, | |
| "eval_math_runtime": 27.5383, | |
| "eval_math_samples_per_second": 18.157, | |
| "eval_math_steps_per_second": 4.539, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_cyber_entropy": 2.1712605100870133, | |
| "eval_cyber_loss": 2.742515802383423, | |
| "eval_cyber_mean_token_accuracy": 0.48678219854831695, | |
| "eval_cyber_num_tokens": 42691890.0, | |
| "eval_cyber_runtime": 26.1725, | |
| "eval_cyber_samples_per_second": 15.169, | |
| "eval_cyber_steps_per_second": 3.821, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 0.681360544078052, | |
| "epoch": 2.416, | |
| "grad_norm": 23.875, | |
| "learning_rate": 1.886888888888889e-05, | |
| "loss": 10.8018, | |
| "mean_token_accuracy": 0.8153177864849568, | |
| "num_tokens": 42971588.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 0.6504227627068758, | |
| "epoch": 2.432, | |
| "grad_norm": 23.625, | |
| "learning_rate": 1.884666666666667e-05, | |
| "loss": 10.4029, | |
| "mean_token_accuracy": 0.8209474917501212, | |
| "num_tokens": 43253821.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 0.6501767633482813, | |
| "epoch": 2.448, | |
| "grad_norm": 24.375, | |
| "learning_rate": 1.8824444444444445e-05, | |
| "loss": 10.3415, | |
| "mean_token_accuracy": 0.8225077040493488, | |
| "num_tokens": 43550902.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 0.6623956672847271, | |
| "epoch": 2.464, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.8802222222222226e-05, | |
| "loss": 10.5246, | |
| "mean_token_accuracy": 0.8181775715202093, | |
| "num_tokens": 43844259.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 0.6859173832461238, | |
| "epoch": 2.48, | |
| "grad_norm": 23.125, | |
| "learning_rate": 1.878e-05, | |
| "loss": 10.9239, | |
| "mean_token_accuracy": 0.8137104224413634, | |
| "num_tokens": 44115701.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 0.6711945479735733, | |
| "epoch": 2.496, | |
| "grad_norm": 21.875, | |
| "learning_rate": 1.875777777777778e-05, | |
| "loss": 10.6685, | |
| "mean_token_accuracy": 0.8173221621662379, | |
| "num_tokens": 44405520.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 0.6624481493607164, | |
| "epoch": 2.512, | |
| "grad_norm": 23.375, | |
| "learning_rate": 1.873555555555556e-05, | |
| "loss": 10.4945, | |
| "mean_token_accuracy": 0.8196087624877691, | |
| "num_tokens": 44686477.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 0.6599110793322325, | |
| "epoch": 2.528, | |
| "grad_norm": 22.75, | |
| "learning_rate": 1.8713333333333336e-05, | |
| "loss": 10.4873, | |
| "mean_token_accuracy": 0.8193403802812099, | |
| "num_tokens": 44969760.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 0.6503490032628179, | |
| "epoch": 2.544, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.8691111111111114e-05, | |
| "loss": 10.4441, | |
| "mean_token_accuracy": 0.8206405211240053, | |
| "num_tokens": 45255326.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 0.6536685146391392, | |
| "epoch": 2.56, | |
| "grad_norm": 22.875, | |
| "learning_rate": 1.866888888888889e-05, | |
| "loss": 10.4091, | |
| "mean_token_accuracy": 0.8202110458165407, | |
| "num_tokens": 45532525.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_biology_entropy": 1.2134844155311584, | |
| "eval_biology_loss": 1.478155255317688, | |
| "eval_biology_mean_token_accuracy": 0.6511134657859802, | |
| "eval_biology_num_tokens": 45532525.0, | |
| "eval_biology_runtime": 21.9488, | |
| "eval_biology_samples_per_second": 22.78, | |
| "eval_biology_steps_per_second": 5.695, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_chemistry_entropy": 0.8961781821250916, | |
| "eval_chemistry_loss": 1.0829113721847534, | |
| "eval_chemistry_mean_token_accuracy": 0.7327736926078796, | |
| "eval_chemistry_num_tokens": 45532525.0, | |
| "eval_chemistry_runtime": 26.8437, | |
| "eval_chemistry_samples_per_second": 18.626, | |
| "eval_chemistry_steps_per_second": 4.657, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_math_entropy": 0.6742748956680298, | |
| "eval_math_loss": 0.7170487642288208, | |
| "eval_math_mean_token_accuracy": 0.8080672206878662, | |
| "eval_math_num_tokens": 45532525.0, | |
| "eval_math_runtime": 27.5073, | |
| "eval_math_samples_per_second": 18.177, | |
| "eval_math_steps_per_second": 4.544, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_cyber_entropy": 2.198235506415367, | |
| "eval_cyber_loss": 2.8126513957977295, | |
| "eval_cyber_mean_token_accuracy": 0.4754572454094887, | |
| "eval_cyber_num_tokens": 45532525.0, | |
| "eval_cyber_runtime": 26.3904, | |
| "eval_cyber_samples_per_second": 15.043, | |
| "eval_cyber_steps_per_second": 3.789, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 0.659245578199625, | |
| "epoch": 2.576, | |
| "grad_norm": 23.125, | |
| "learning_rate": 1.864666666666667e-05, | |
| "loss": 10.4939, | |
| "mean_token_accuracy": 0.8194193851202727, | |
| "num_tokens": 45817478.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 0.6425925368443132, | |
| "epoch": 2.592, | |
| "grad_norm": 22.375, | |
| "learning_rate": 1.8624444444444446e-05, | |
| "loss": 10.2816, | |
| "mean_token_accuracy": 0.8222486432641745, | |
| "num_tokens": 46109575.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 0.6715872915461659, | |
| "epoch": 2.608, | |
| "grad_norm": 25.25, | |
| "learning_rate": 1.8602222222222224e-05, | |
| "loss": 10.6569, | |
| "mean_token_accuracy": 0.8173692885786294, | |
| "num_tokens": 46391461.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 0.6294447083026171, | |
| "epoch": 2.624, | |
| "grad_norm": 22.5, | |
| "learning_rate": 1.858e-05, | |
| "loss": 10.0396, | |
| "mean_token_accuracy": 0.8279551289975643, | |
| "num_tokens": 46683117.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 0.6628140497952699, | |
| "epoch": 2.64, | |
| "grad_norm": 23.375, | |
| "learning_rate": 1.855777777777778e-05, | |
| "loss": 10.5068, | |
| "mean_token_accuracy": 0.8184260647743941, | |
| "num_tokens": 46965534.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 0.6376811485737562, | |
| "epoch": 2.656, | |
| "grad_norm": 23.0, | |
| "learning_rate": 1.8535555555555557e-05, | |
| "loss": 10.0941, | |
| "mean_token_accuracy": 0.8235533174127341, | |
| "num_tokens": 47245852.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 0.6615891676396132, | |
| "epoch": 2.672, | |
| "grad_norm": 24.625, | |
| "learning_rate": 1.8513333333333335e-05, | |
| "loss": 10.5681, | |
| "mean_token_accuracy": 0.8179556384682656, | |
| "num_tokens": 47524916.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 0.6560400146991014, | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 22.0, | |
| "learning_rate": 1.8491111111111112e-05, | |
| "loss": 10.4122, | |
| "mean_token_accuracy": 0.820205406472087, | |
| "num_tokens": 47807131.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 0.6595821080729365, | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 23.875, | |
| "learning_rate": 1.846888888888889e-05, | |
| "loss": 10.5383, | |
| "mean_token_accuracy": 0.8188040845096112, | |
| "num_tokens": 48099654.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 0.6555765904486179, | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.8446666666666667e-05, | |
| "loss": 10.3768, | |
| "mean_token_accuracy": 0.8199018821120262, | |
| "num_tokens": 48375019.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_biology_entropy": 1.1961839065551758, | |
| "eval_biology_loss": 1.474403977394104, | |
| "eval_biology_mean_token_accuracy": 0.6521351528167725, | |
| "eval_biology_num_tokens": 48375019.0, | |
| "eval_biology_runtime": 22.2564, | |
| "eval_biology_samples_per_second": 22.465, | |
| "eval_biology_steps_per_second": 5.616, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_chemistry_entropy": 0.8893180413246154, | |
| "eval_chemistry_loss": 1.0786951780319214, | |
| "eval_chemistry_mean_token_accuracy": 0.7333630976676941, | |
| "eval_chemistry_num_tokens": 48375019.0, | |
| "eval_chemistry_runtime": 26.8892, | |
| "eval_chemistry_samples_per_second": 18.595, | |
| "eval_chemistry_steps_per_second": 4.649, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_math_entropy": 0.6758732006549836, | |
| "eval_math_loss": 0.7107370495796204, | |
| "eval_math_mean_token_accuracy": 0.8093916850090027, | |
| "eval_math_num_tokens": 48375019.0, | |
| "eval_math_runtime": 27.5333, | |
| "eval_math_samples_per_second": 18.16, | |
| "eval_math_steps_per_second": 4.54, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_cyber_entropy": 2.2349812412261962, | |
| "eval_cyber_loss": 2.779714822769165, | |
| "eval_cyber_mean_token_accuracy": 0.4819659352302551, | |
| "eval_cyber_num_tokens": 48375019.0, | |
| "eval_cyber_runtime": 26.1417, | |
| "eval_cyber_samples_per_second": 15.186, | |
| "eval_cyber_steps_per_second": 3.825, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 0.6559532942250371, | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 24.0, | |
| "learning_rate": 1.842444444444445e-05, | |
| "loss": 10.4594, | |
| "mean_token_accuracy": 0.8196242570877075, | |
| "num_tokens": 48659284.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 0.6569495009258389, | |
| "epoch": 2.752, | |
| "grad_norm": 22.25, | |
| "learning_rate": 1.8402222222222223e-05, | |
| "loss": 10.4467, | |
| "mean_token_accuracy": 0.8201438017189503, | |
| "num_tokens": 48943804.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 0.6529736818745733, | |
| "epoch": 2.768, | |
| "grad_norm": 21.875, | |
| "learning_rate": 1.8380000000000004e-05, | |
| "loss": 10.4356, | |
| "mean_token_accuracy": 0.8206765007227659, | |
| "num_tokens": 49230939.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 0.6720895063132047, | |
| "epoch": 2.784, | |
| "grad_norm": 23.0, | |
| "learning_rate": 1.8357777777777778e-05, | |
| "loss": 10.6977, | |
| "mean_token_accuracy": 0.8159397479146719, | |
| "num_tokens": 49504425.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 0.6508316185325385, | |
| "epoch": 2.8, | |
| "grad_norm": 23.125, | |
| "learning_rate": 1.833555555555556e-05, | |
| "loss": 10.351, | |
| "mean_token_accuracy": 0.8222934223711491, | |
| "num_tokens": 49782661.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 0.6627168050035834, | |
| "epoch": 2.816, | |
| "grad_norm": 24.5, | |
| "learning_rate": 1.8313333333333333e-05, | |
| "loss": 10.445, | |
| "mean_token_accuracy": 0.8191927138715982, | |
| "num_tokens": 50073632.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 0.6389238258823753, | |
| "epoch": 2.832, | |
| "grad_norm": 25.875, | |
| "learning_rate": 1.8291111111111114e-05, | |
| "loss": 10.2761, | |
| "mean_token_accuracy": 0.8222359851002693, | |
| "num_tokens": 50356964.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 0.6667038291692734, | |
| "epoch": 2.848, | |
| "grad_norm": 21.375, | |
| "learning_rate": 1.8268888888888888e-05, | |
| "loss": 10.586, | |
| "mean_token_accuracy": 0.818210769072175, | |
| "num_tokens": 50644535.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 0.6509249521419406, | |
| "epoch": 2.864, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.824666666666667e-05, | |
| "loss": 10.4338, | |
| "mean_token_accuracy": 0.8203214287757874, | |
| "num_tokens": 50925653.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 0.6507569069042802, | |
| "epoch": 2.88, | |
| "grad_norm": 23.625, | |
| "learning_rate": 1.8224444444444447e-05, | |
| "loss": 10.285, | |
| "mean_token_accuracy": 0.8213449958711863, | |
| "num_tokens": 51204374.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_biology_entropy": 1.1794821062088012, | |
| "eval_biology_loss": 1.4819698333740234, | |
| "eval_biology_mean_token_accuracy": 0.6517698068618775, | |
| "eval_biology_num_tokens": 51204374.0, | |
| "eval_biology_runtime": 21.9924, | |
| "eval_biology_samples_per_second": 22.735, | |
| "eval_biology_steps_per_second": 5.684, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_chemistry_entropy": 0.8722342405319213, | |
| "eval_chemistry_loss": 1.0784211158752441, | |
| "eval_chemistry_mean_token_accuracy": 0.7344747610092163, | |
| "eval_chemistry_num_tokens": 51204374.0, | |
| "eval_chemistry_runtime": 26.8909, | |
| "eval_chemistry_samples_per_second": 18.594, | |
| "eval_chemistry_steps_per_second": 4.648, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_math_entropy": 0.6604897229671478, | |
| "eval_math_loss": 0.7050039768218994, | |
| "eval_math_mean_token_accuracy": 0.8106719055175782, | |
| "eval_math_num_tokens": 51204374.0, | |
| "eval_math_runtime": 27.5351, | |
| "eval_math_samples_per_second": 18.159, | |
| "eval_math_steps_per_second": 4.54, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_cyber_entropy": 2.112373055815697, | |
| "eval_cyber_loss": 2.8215885162353516, | |
| "eval_cyber_mean_token_accuracy": 0.4802186432480812, | |
| "eval_cyber_num_tokens": 51204374.0, | |
| "eval_cyber_runtime": 26.1331, | |
| "eval_cyber_samples_per_second": 15.191, | |
| "eval_cyber_steps_per_second": 3.827, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 0.6363504879176617, | |
| "epoch": 2.896, | |
| "grad_norm": 23.75, | |
| "learning_rate": 1.8202222222222225e-05, | |
| "loss": 10.224, | |
| "mean_token_accuracy": 0.8235703807324171, | |
| "num_tokens": 51483944.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 0.6513262124732137, | |
| "epoch": 2.912, | |
| "grad_norm": 23.0, | |
| "learning_rate": 1.8180000000000002e-05, | |
| "loss": 10.3132, | |
| "mean_token_accuracy": 0.8217961758375167, | |
| "num_tokens": 51765755.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 0.6529291735962033, | |
| "epoch": 2.928, | |
| "grad_norm": 23.25, | |
| "learning_rate": 1.815777777777778e-05, | |
| "loss": 10.3766, | |
| "mean_token_accuracy": 0.8222961116582155, | |
| "num_tokens": 52056379.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 0.6346785051748156, | |
| "epoch": 2.944, | |
| "grad_norm": 23.75, | |
| "learning_rate": 1.8135555555555557e-05, | |
| "loss": 10.14, | |
| "mean_token_accuracy": 0.8229989748448133, | |
| "num_tokens": 52346232.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 0.6526656987145543, | |
| "epoch": 2.96, | |
| "grad_norm": 21.75, | |
| "learning_rate": 1.8113333333333335e-05, | |
| "loss": 10.38, | |
| "mean_token_accuracy": 0.8202067915350199, | |
| "num_tokens": 52633789.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 0.6502787992358208, | |
| "epoch": 2.976, | |
| "grad_norm": 21.625, | |
| "learning_rate": 1.8091111111111113e-05, | |
| "loss": 10.2748, | |
| "mean_token_accuracy": 0.8214363507926464, | |
| "num_tokens": 52911755.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 0.6417823160067201, | |
| "epoch": 2.992, | |
| "grad_norm": 22.875, | |
| "learning_rate": 1.806888888888889e-05, | |
| "loss": 10.2309, | |
| "mean_token_accuracy": 0.8228288643062115, | |
| "num_tokens": 53198176.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 0.6243049314245581, | |
| "epoch": 3.008, | |
| "grad_norm": 26.25, | |
| "learning_rate": 1.8046666666666668e-05, | |
| "loss": 9.7241, | |
| "mean_token_accuracy": 0.8291641604155302, | |
| "num_tokens": 53481893.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 0.5638323642313481, | |
| "epoch": 3.024, | |
| "grad_norm": 27.125, | |
| "learning_rate": 1.8024444444444445e-05, | |
| "loss": 8.9824, | |
| "mean_token_accuracy": 0.8390695653855801, | |
| "num_tokens": 53771717.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 0.558561889640987, | |
| "epoch": 3.04, | |
| "grad_norm": 24.125, | |
| "learning_rate": 1.8002222222222223e-05, | |
| "loss": 8.9038, | |
| "mean_token_accuracy": 0.8408384509384632, | |
| "num_tokens": 54058045.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_biology_entropy": 1.0243187880516051, | |
| "eval_biology_loss": 1.581600546836853, | |
| "eval_biology_mean_token_accuracy": 0.6445312175750733, | |
| "eval_biology_num_tokens": 54058045.0, | |
| "eval_biology_runtime": 21.9531, | |
| "eval_biology_samples_per_second": 22.776, | |
| "eval_biology_steps_per_second": 5.694, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_chemistry_entropy": 0.7532488117218018, | |
| "eval_chemistry_loss": 1.1571515798568726, | |
| "eval_chemistry_mean_token_accuracy": 0.7285859537124634, | |
| "eval_chemistry_num_tokens": 54058045.0, | |
| "eval_chemistry_runtime": 26.8231, | |
| "eval_chemistry_samples_per_second": 18.641, | |
| "eval_chemistry_steps_per_second": 4.66, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_math_entropy": 0.5930595288276672, | |
| "eval_math_loss": 0.7225678563117981, | |
| "eval_math_mean_token_accuracy": 0.8094966850280761, | |
| "eval_math_num_tokens": 54058045.0, | |
| "eval_math_runtime": 27.497, | |
| "eval_math_samples_per_second": 18.184, | |
| "eval_math_steps_per_second": 4.546, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_cyber_entropy": 1.8411164230108261, | |
| "eval_cyber_loss": 3.063901424407959, | |
| "eval_cyber_mean_token_accuracy": 0.47065704002976416, | |
| "eval_cyber_num_tokens": 54058045.0, | |
| "eval_cyber_runtime": 26.1343, | |
| "eval_cyber_samples_per_second": 15.191, | |
| "eval_cyber_steps_per_second": 3.826, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 0.552313212864101, | |
| "epoch": 3.056, | |
| "grad_norm": 26.625, | |
| "learning_rate": 1.798e-05, | |
| "loss": 8.7715, | |
| "mean_token_accuracy": 0.8414296887814998, | |
| "num_tokens": 54334332.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 0.5570558808743954, | |
| "epoch": 3.072, | |
| "grad_norm": 27.125, | |
| "learning_rate": 1.7957777777777778e-05, | |
| "loss": 8.7638, | |
| "mean_token_accuracy": 0.8418575689196587, | |
| "num_tokens": 54624543.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 0.5351565392687917, | |
| "epoch": 3.088, | |
| "grad_norm": 27.625, | |
| "learning_rate": 1.7935555555555556e-05, | |
| "loss": 8.5286, | |
| "mean_token_accuracy": 0.8457726195454598, | |
| "num_tokens": 54907550.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 0.5519297284074127, | |
| "epoch": 3.104, | |
| "grad_norm": 27.75, | |
| "learning_rate": 1.7913333333333337e-05, | |
| "loss": 8.7504, | |
| "mean_token_accuracy": 0.8423502463847399, | |
| "num_tokens": 55190959.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 0.544937571324408, | |
| "epoch": 3.12, | |
| "grad_norm": 27.0, | |
| "learning_rate": 1.789111111111111e-05, | |
| "loss": 8.6451, | |
| "mean_token_accuracy": 0.8444420550018549, | |
| "num_tokens": 55481635.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 0.546911165677011, | |
| "epoch": 3.136, | |
| "grad_norm": 26.75, | |
| "learning_rate": 1.7868888888888892e-05, | |
| "loss": 8.6997, | |
| "mean_token_accuracy": 0.8431835647672414, | |
| "num_tokens": 55769010.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 0.5542461348697543, | |
| "epoch": 3.152, | |
| "grad_norm": 26.5, | |
| "learning_rate": 1.7846666666666666e-05, | |
| "loss": 8.7996, | |
| "mean_token_accuracy": 0.842128399387002, | |
| "num_tokens": 56053160.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 0.5676137331873179, | |
| "epoch": 3.168, | |
| "grad_norm": 26.75, | |
| "learning_rate": 1.7824444444444447e-05, | |
| "loss": 8.9893, | |
| "mean_token_accuracy": 0.8386933848261833, | |
| "num_tokens": 56337066.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 0.5574962265789509, | |
| "epoch": 3.184, | |
| "grad_norm": 30.0, | |
| "learning_rate": 1.780222222222222e-05, | |
| "loss": 8.8126, | |
| "mean_token_accuracy": 0.8404616348445415, | |
| "num_tokens": 56618899.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 0.5407627185806632, | |
| "epoch": 3.2, | |
| "grad_norm": 26.0, | |
| "learning_rate": 1.7780000000000003e-05, | |
| "loss": 8.6062, | |
| "mean_token_accuracy": 0.8444454524666071, | |
| "num_tokens": 56910071.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_biology_entropy": 0.9433750138282776, | |
| "eval_biology_loss": 1.6609779596328735, | |
| "eval_biology_mean_token_accuracy": 0.6404439859390259, | |
| "eval_biology_num_tokens": 56910071.0, | |
| "eval_biology_runtime": 21.9804, | |
| "eval_biology_samples_per_second": 22.748, | |
| "eval_biology_steps_per_second": 5.687, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_chemistry_entropy": 0.697946096420288, | |
| "eval_chemistry_loss": 1.214890718460083, | |
| "eval_chemistry_mean_token_accuracy": 0.7252082509994506, | |
| "eval_chemistry_num_tokens": 56910071.0, | |
| "eval_chemistry_runtime": 26.8518, | |
| "eval_chemistry_samples_per_second": 18.621, | |
| "eval_chemistry_steps_per_second": 4.655, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_math_entropy": 0.5889736828804016, | |
| "eval_math_loss": 0.72825688123703, | |
| "eval_math_mean_token_accuracy": 0.809436321735382, | |
| "eval_math_num_tokens": 56910071.0, | |
| "eval_math_runtime": 27.524, | |
| "eval_math_samples_per_second": 18.166, | |
| "eval_math_steps_per_second": 4.541, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_cyber_entropy": 1.8025889378786086, | |
| "eval_cyber_loss": 3.0566282272338867, | |
| "eval_cyber_mean_token_accuracy": 0.46734614998102186, | |
| "eval_cyber_num_tokens": 56910071.0, | |
| "eval_cyber_runtime": 26.2014, | |
| "eval_cyber_samples_per_second": 15.152, | |
| "eval_cyber_steps_per_second": 3.817, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 16, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.791878293573609e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |