Instructions to use roonbug/afsck72p with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use roonbug/afsck72p with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="roonbug/afsck72p")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("roonbug/afsck72p")
model = AutoModelForImageTextToText.from_pretrained("roonbug/afsck72p")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use roonbug/afsck72p with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "roonbug/afsck72p"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "roonbug/afsck72p",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
Use Docker
docker model run hf.co/roonbug/afsck72p
- SGLang
How to use roonbug/afsck72p with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "roonbug/afsck72p" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "roonbug/afsck72p",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "roonbug/afsck72p" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "roonbug/afsck72p",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
- Docker Model Runner
How to use roonbug/afsck72p with Docker Model Runner:
docker model run hf.co/roonbug/afsck72p
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 100, | |
| "global_step": 7500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 0.3589675037190318, | |
| "epoch": 0.004, | |
| "grad_norm": 221.0, | |
| "learning_rate": 1.8e-07, | |
| "loss": 14.235, | |
| "mean_token_accuracy": 0.8023731399327516, | |
| "num_tokens": 89176.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.381451623653993, | |
| "epoch": 0.008, | |
| "grad_norm": 251.0, | |
| "learning_rate": 3.8e-07, | |
| "loss": 16.438, | |
| "mean_token_accuracy": 0.7818928115069866, | |
| "num_tokens": 173886.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.36998924349900336, | |
| "epoch": 0.012, | |
| "grad_norm": 284.0, | |
| "learning_rate": 5.800000000000001e-07, | |
| "loss": 15.3069, | |
| "mean_token_accuracy": 0.7961655277758837, | |
| "num_tokens": 258970.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.3889644297771156, | |
| "epoch": 0.016, | |
| "grad_norm": 296.0, | |
| "learning_rate": 7.8e-07, | |
| "loss": 15.6587, | |
| "mean_token_accuracy": 0.7874974697828293, | |
| "num_tokens": 338352.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.3830348197836429, | |
| "epoch": 0.02, | |
| "grad_norm": 330.0, | |
| "learning_rate": 9.800000000000001e-07, | |
| "loss": 15.0754, | |
| "mean_token_accuracy": 0.7973605334758759, | |
| "num_tokens": 424243.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.40132593517191706, | |
| "epoch": 0.024, | |
| "grad_norm": 284.0, | |
| "learning_rate": 1.1800000000000001e-06, | |
| "loss": 15.9374, | |
| "mean_token_accuracy": 0.7853362146764994, | |
| "num_tokens": 505613.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.41173397554084656, | |
| "epoch": 0.028, | |
| "grad_norm": 201.0, | |
| "learning_rate": 1.3800000000000001e-06, | |
| "loss": 15.1473, | |
| "mean_token_accuracy": 0.7896888509392739, | |
| "num_tokens": 584157.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.4223371436353773, | |
| "epoch": 0.032, | |
| "grad_norm": 164.0, | |
| "learning_rate": 1.5800000000000001e-06, | |
| "loss": 13.7069, | |
| "mean_token_accuracy": 0.8018626874312759, | |
| "num_tokens": 671193.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.4982604907825589, | |
| "epoch": 0.036, | |
| "grad_norm": 240.0, | |
| "learning_rate": 1.7800000000000001e-06, | |
| "loss": 14.6572, | |
| "mean_token_accuracy": 0.7824514407664538, | |
| "num_tokens": 753947.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.4913600580766797, | |
| "epoch": 0.04, | |
| "grad_norm": 155.0, | |
| "learning_rate": 1.98e-06, | |
| "loss": 13.0149, | |
| "mean_token_accuracy": 0.8003234412521124, | |
| "num_tokens": 835868.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_chemistry_entropy": 0.41473443768918516, | |
| "eval_chemistry_loss": 0.7713431119918823, | |
| "eval_chemistry_mean_token_accuracy": 0.8103395719528198, | |
| "eval_chemistry_num_tokens": 835868.0, | |
| "eval_chemistry_runtime": 53.7203, | |
| "eval_chemistry_samples_per_second": 9.307, | |
| "eval_chemistry_steps_per_second": 9.307, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_physics_entropy": 0.5291771245449781, | |
| "eval_physics_loss": 0.8209348917007446, | |
| "eval_physics_mean_token_accuracy": 0.7944456633925437, | |
| "eval_physics_num_tokens": 835868.0, | |
| "eval_physics_runtime": 62.4292, | |
| "eval_physics_samples_per_second": 8.009, | |
| "eval_physics_steps_per_second": 8.009, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5052730706520379, | |
| "epoch": 0.044, | |
| "grad_norm": 132.0, | |
| "learning_rate": 2.1800000000000003e-06, | |
| "loss": 11.9999, | |
| "mean_token_accuracy": 0.807476469874382, | |
| "num_tokens": 917534.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5510593980550766, | |
| "epoch": 0.048, | |
| "grad_norm": 100.5, | |
| "learning_rate": 2.38e-06, | |
| "loss": 11.8438, | |
| "mean_token_accuracy": 0.8063667319715023, | |
| "num_tokens": 1001008.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.59730707956478, | |
| "epoch": 0.052, | |
| "grad_norm": 87.5, | |
| "learning_rate": 2.5800000000000003e-06, | |
| "loss": 11.8941, | |
| "mean_token_accuracy": 0.8023553561419249, | |
| "num_tokens": 1085098.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.6581894762814045, | |
| "epoch": 0.056, | |
| "grad_norm": 102.0, | |
| "learning_rate": 2.7800000000000005e-06, | |
| "loss": 12.6245, | |
| "mean_token_accuracy": 0.7945734079927206, | |
| "num_tokens": 1166615.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.6519844191148877, | |
| "epoch": 0.06, | |
| "grad_norm": 70.5, | |
| "learning_rate": 2.9800000000000003e-06, | |
| "loss": 11.5194, | |
| "mean_token_accuracy": 0.8026637583971024, | |
| "num_tokens": 1248387.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.7085968372412026, | |
| "epoch": 0.064, | |
| "grad_norm": 72.0, | |
| "learning_rate": 3.1800000000000005e-06, | |
| "loss": 12.4073, | |
| "mean_token_accuracy": 0.7906260795891284, | |
| "num_tokens": 1331208.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.6624301395379006, | |
| "epoch": 0.068, | |
| "grad_norm": 66.5, | |
| "learning_rate": 3.3800000000000007e-06, | |
| "loss": 11.3744, | |
| "mean_token_accuracy": 0.8067036394029856, | |
| "num_tokens": 1416841.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.664617495983839, | |
| "epoch": 0.072, | |
| "grad_norm": 70.5, | |
| "learning_rate": 3.58e-06, | |
| "loss": 11.3264, | |
| "mean_token_accuracy": 0.8068108163774014, | |
| "num_tokens": 1499449.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.6721668383106589, | |
| "epoch": 0.076, | |
| "grad_norm": 70.5, | |
| "learning_rate": 3.7800000000000002e-06, | |
| "loss": 11.2874, | |
| "mean_token_accuracy": 0.8029039859771728, | |
| "num_tokens": 1585982.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.6800463449209928, | |
| "epoch": 0.08, | |
| "grad_norm": 58.5, | |
| "learning_rate": 3.980000000000001e-06, | |
| "loss": 11.4959, | |
| "mean_token_accuracy": 0.8001787267625332, | |
| "num_tokens": 1667532.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_chemistry_entropy": 0.579608644425869, | |
| "eval_chemistry_loss": 0.6572756171226501, | |
| "eval_chemistry_mean_token_accuracy": 0.8160659775733948, | |
| "eval_chemistry_num_tokens": 1667532.0, | |
| "eval_chemistry_runtime": 53.9736, | |
| "eval_chemistry_samples_per_second": 9.264, | |
| "eval_chemistry_steps_per_second": 9.264, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_physics_entropy": 0.6811966061294079, | |
| "eval_physics_loss": 0.714818000793457, | |
| "eval_physics_mean_token_accuracy": 0.8037424722313881, | |
| "eval_physics_num_tokens": 1667532.0, | |
| "eval_physics_runtime": 62.6745, | |
| "eval_physics_samples_per_second": 7.978, | |
| "eval_physics_steps_per_second": 7.978, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.6441303874365986, | |
| "epoch": 0.084, | |
| "grad_norm": 65.5, | |
| "learning_rate": 4.18e-06, | |
| "loss": 10.9679, | |
| "mean_token_accuracy": 0.8105842903256416, | |
| "num_tokens": 1750813.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.7001293174922466, | |
| "epoch": 0.088, | |
| "grad_norm": 63.5, | |
| "learning_rate": 4.38e-06, | |
| "loss": 11.3739, | |
| "mean_token_accuracy": 0.8039855100214481, | |
| "num_tokens": 1835489.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.7212686208076775, | |
| "epoch": 0.092, | |
| "grad_norm": 68.5, | |
| "learning_rate": 4.58e-06, | |
| "loss": 11.9056, | |
| "mean_token_accuracy": 0.7951413784176111, | |
| "num_tokens": 1915115.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.6619318895973265, | |
| "epoch": 0.096, | |
| "grad_norm": 66.5, | |
| "learning_rate": 4.78e-06, | |
| "loss": 10.9428, | |
| "mean_token_accuracy": 0.8108693141490221, | |
| "num_tokens": 2007176.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.6880421325564384, | |
| "epoch": 0.1, | |
| "grad_norm": 55.5, | |
| "learning_rate": 4.980000000000001e-06, | |
| "loss": 11.3231, | |
| "mean_token_accuracy": 0.8026650555431842, | |
| "num_tokens": 2088696.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.7066576741635799, | |
| "epoch": 0.104, | |
| "grad_norm": 58.25, | |
| "learning_rate": 5.18e-06, | |
| "loss": 11.6232, | |
| "mean_token_accuracy": 0.799198641628027, | |
| "num_tokens": 2171335.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.6949395652860403, | |
| "epoch": 0.108, | |
| "grad_norm": 69.5, | |
| "learning_rate": 5.380000000000001e-06, | |
| "loss": 11.2549, | |
| "mean_token_accuracy": 0.8055713165551424, | |
| "num_tokens": 2254168.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.666729858610779, | |
| "epoch": 0.112, | |
| "grad_norm": 67.0, | |
| "learning_rate": 5.580000000000001e-06, | |
| "loss": 10.8193, | |
| "mean_token_accuracy": 0.8086711473762989, | |
| "num_tokens": 2342259.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.6924487132579088, | |
| "epoch": 0.116, | |
| "grad_norm": 60.5, | |
| "learning_rate": 5.78e-06, | |
| "loss": 11.1656, | |
| "mean_token_accuracy": 0.8066085658967495, | |
| "num_tokens": 2426289.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.7209857527166605, | |
| "epoch": 0.12, | |
| "grad_norm": 53.75, | |
| "learning_rate": 5.98e-06, | |
| "loss": 11.7716, | |
| "mean_token_accuracy": 0.7946148321032525, | |
| "num_tokens": 2504279.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_chemistry_entropy": 0.6136138401329517, | |
| "eval_chemistry_loss": 0.6401469707489014, | |
| "eval_chemistry_mean_token_accuracy": 0.8167599263191223, | |
| "eval_chemistry_num_tokens": 2504279.0, | |
| "eval_chemistry_runtime": 53.6939, | |
| "eval_chemistry_samples_per_second": 9.312, | |
| "eval_chemistry_steps_per_second": 9.312, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_physics_entropy": 0.678692765057087, | |
| "eval_physics_loss": 0.6855186820030212, | |
| "eval_physics_mean_token_accuracy": 0.8083885474801064, | |
| "eval_physics_num_tokens": 2504279.0, | |
| "eval_physics_runtime": 62.4413, | |
| "eval_physics_samples_per_second": 8.008, | |
| "eval_physics_steps_per_second": 8.008, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.7016886103898287, | |
| "epoch": 0.124, | |
| "grad_norm": 51.0, | |
| "learning_rate": 6.18e-06, | |
| "loss": 11.5109, | |
| "mean_token_accuracy": 0.79740143828094, | |
| "num_tokens": 2583599.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.6659886735491455, | |
| "epoch": 0.128, | |
| "grad_norm": 50.5, | |
| "learning_rate": 6.380000000000001e-06, | |
| "loss": 10.6687, | |
| "mean_token_accuracy": 0.8114107441157102, | |
| "num_tokens": 2663987.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.6984131418168544, | |
| "epoch": 0.132, | |
| "grad_norm": 55.5, | |
| "learning_rate": 6.5800000000000005e-06, | |
| "loss": 11.5084, | |
| "mean_token_accuracy": 0.8018826052546502, | |
| "num_tokens": 2743933.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.685006461199373, | |
| "epoch": 0.136, | |
| "grad_norm": 43.25, | |
| "learning_rate": 6.780000000000001e-06, | |
| "loss": 11.0885, | |
| "mean_token_accuracy": 0.8083073671907186, | |
| "num_tokens": 2829720.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.6919135926291347, | |
| "epoch": 0.14, | |
| "grad_norm": 44.25, | |
| "learning_rate": 6.98e-06, | |
| "loss": 11.1629, | |
| "mean_token_accuracy": 0.8035723548382521, | |
| "num_tokens": 2915814.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.6693393810652196, | |
| "epoch": 0.144, | |
| "grad_norm": 68.5, | |
| "learning_rate": 7.180000000000001e-06, | |
| "loss": 10.8935, | |
| "mean_token_accuracy": 0.8098476737737655, | |
| "num_tokens": 2997824.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.6817218182608485, | |
| "epoch": 0.148, | |
| "grad_norm": 55.0, | |
| "learning_rate": 7.3800000000000005e-06, | |
| "loss": 10.9932, | |
| "mean_token_accuracy": 0.8062656305730342, | |
| "num_tokens": 3076422.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.6472045814618468, | |
| "epoch": 0.152, | |
| "grad_norm": 48.5, | |
| "learning_rate": 7.58e-06, | |
| "loss": 10.5274, | |
| "mean_token_accuracy": 0.8134200550615788, | |
| "num_tokens": 3159946.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.639185450039804, | |
| "epoch": 0.156, | |
| "grad_norm": 54.25, | |
| "learning_rate": 7.78e-06, | |
| "loss": 10.3706, | |
| "mean_token_accuracy": 0.8159802399575711, | |
| "num_tokens": 3247218.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.6741150500252843, | |
| "epoch": 0.16, | |
| "grad_norm": 53.25, | |
| "learning_rate": 7.980000000000002e-06, | |
| "loss": 10.9071, | |
| "mean_token_accuracy": 0.8088992539793253, | |
| "num_tokens": 3330597.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_chemistry_entropy": 0.6244144983887673, | |
| "eval_chemistry_loss": 0.6363974809646606, | |
| "eval_chemistry_mean_token_accuracy": 0.8181359567642212, | |
| "eval_chemistry_num_tokens": 3330597.0, | |
| "eval_chemistry_runtime": 53.6338, | |
| "eval_chemistry_samples_per_second": 9.322, | |
| "eval_chemistry_steps_per_second": 9.322, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_physics_entropy": 0.6705876287817955, | |
| "eval_physics_loss": 0.6704522371292114, | |
| "eval_physics_mean_token_accuracy": 0.8111195236444473, | |
| "eval_physics_num_tokens": 3330597.0, | |
| "eval_physics_runtime": 62.3611, | |
| "eval_physics_samples_per_second": 8.018, | |
| "eval_physics_steps_per_second": 8.018, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.6919050188735127, | |
| "epoch": 0.164, | |
| "grad_norm": 57.25, | |
| "learning_rate": 8.18e-06, | |
| "loss": 11.1447, | |
| "mean_token_accuracy": 0.8047709062695503, | |
| "num_tokens": 3414445.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.6699288227595389, | |
| "epoch": 0.168, | |
| "grad_norm": 59.75, | |
| "learning_rate": 8.380000000000001e-06, | |
| "loss": 10.7791, | |
| "mean_token_accuracy": 0.8070078935474158, | |
| "num_tokens": 3496104.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.6792626342736184, | |
| "epoch": 0.172, | |
| "grad_norm": 48.75, | |
| "learning_rate": 8.580000000000001e-06, | |
| "loss": 11.0757, | |
| "mean_token_accuracy": 0.8060608543455601, | |
| "num_tokens": 3576979.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.6533824939280748, | |
| "epoch": 0.176, | |
| "grad_norm": 61.0, | |
| "learning_rate": 8.78e-06, | |
| "loss": 10.5681, | |
| "mean_token_accuracy": 0.8110670737922192, | |
| "num_tokens": 3658264.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.6711471493355929, | |
| "epoch": 0.18, | |
| "grad_norm": 57.25, | |
| "learning_rate": 8.98e-06, | |
| "loss": 10.9005, | |
| "mean_token_accuracy": 0.8076804723590613, | |
| "num_tokens": 3739142.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.6721100191585719, | |
| "epoch": 0.184, | |
| "grad_norm": 55.0, | |
| "learning_rate": 9.180000000000002e-06, | |
| "loss": 10.792, | |
| "mean_token_accuracy": 0.8070170730352402, | |
| "num_tokens": 3821352.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.6371375021524728, | |
| "epoch": 0.188, | |
| "grad_norm": 45.75, | |
| "learning_rate": 9.38e-06, | |
| "loss": 10.3376, | |
| "mean_token_accuracy": 0.8134377617388964, | |
| "num_tokens": 3905291.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.6436822958290577, | |
| "epoch": 0.192, | |
| "grad_norm": 45.75, | |
| "learning_rate": 9.58e-06, | |
| "loss": 10.4275, | |
| "mean_token_accuracy": 0.8145703230053186, | |
| "num_tokens": 3995568.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.6615747599862516, | |
| "epoch": 0.196, | |
| "grad_norm": 55.25, | |
| "learning_rate": 9.780000000000001e-06, | |
| "loss": 10.75, | |
| "mean_token_accuracy": 0.8106145299971104, | |
| "num_tokens": 4075670.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.6414908402599394, | |
| "epoch": 0.2, | |
| "grad_norm": 55.0, | |
| "learning_rate": 9.980000000000001e-06, | |
| "loss": 10.318, | |
| "mean_token_accuracy": 0.8157790113240481, | |
| "num_tokens": 4160918.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_chemistry_entropy": 0.6180844354629517, | |
| "eval_chemistry_loss": 0.6358755230903625, | |
| "eval_chemistry_mean_token_accuracy": 0.8190241982936859, | |
| "eval_chemistry_num_tokens": 4160918.0, | |
| "eval_chemistry_runtime": 53.6, | |
| "eval_chemistry_samples_per_second": 9.328, | |
| "eval_chemistry_steps_per_second": 9.328, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_physics_entropy": 0.6517913154959679, | |
| "eval_physics_loss": 0.6597976088523865, | |
| "eval_physics_mean_token_accuracy": 0.8130922448635102, | |
| "eval_physics_num_tokens": 4160918.0, | |
| "eval_physics_runtime": 62.5405, | |
| "eval_physics_samples_per_second": 7.995, | |
| "eval_physics_steps_per_second": 7.995, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.6417077989317477, | |
| "epoch": 0.204, | |
| "grad_norm": 48.75, | |
| "learning_rate": 1.018e-05, | |
| "loss": 10.4263, | |
| "mean_token_accuracy": 0.8166976355016231, | |
| "num_tokens": 4238810.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.635231535602361, | |
| "epoch": 0.208, | |
| "grad_norm": 48.75, | |
| "learning_rate": 1.038e-05, | |
| "loss": 10.327, | |
| "mean_token_accuracy": 0.8165579732507467, | |
| "num_tokens": 4321436.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.6600574821233749, | |
| "epoch": 0.212, | |
| "grad_norm": 49.5, | |
| "learning_rate": 1.0580000000000002e-05, | |
| "loss": 10.4772, | |
| "mean_token_accuracy": 0.811732816696167, | |
| "num_tokens": 4401710.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.6544907880946994, | |
| "epoch": 0.216, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.0780000000000002e-05, | |
| "loss": 10.7566, | |
| "mean_token_accuracy": 0.808320876955986, | |
| "num_tokens": 4482225.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.6429057026281952, | |
| "epoch": 0.22, | |
| "grad_norm": 51.25, | |
| "learning_rate": 1.0980000000000002e-05, | |
| "loss": 10.4272, | |
| "mean_token_accuracy": 0.8134562011808157, | |
| "num_tokens": 4564662.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.6186996471136809, | |
| "epoch": 0.224, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.1180000000000001e-05, | |
| "loss": 9.9666, | |
| "mean_token_accuracy": 0.8213985275477171, | |
| "num_tokens": 4648491.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.6025784744881093, | |
| "epoch": 0.228, | |
| "grad_norm": 40.75, | |
| "learning_rate": 1.138e-05, | |
| "loss": 9.8638, | |
| "mean_token_accuracy": 0.8230083137750626, | |
| "num_tokens": 4731596.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.6584511337801814, | |
| "epoch": 0.232, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.1580000000000001e-05, | |
| "loss": 10.6688, | |
| "mean_token_accuracy": 0.8104485847055912, | |
| "num_tokens": 4813449.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.6477856112644076, | |
| "epoch": 0.236, | |
| "grad_norm": 40.25, | |
| "learning_rate": 1.178e-05, | |
| "loss": 10.3036, | |
| "mean_token_accuracy": 0.8149273872375489, | |
| "num_tokens": 4899144.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.632176025211811, | |
| "epoch": 0.24, | |
| "grad_norm": 44.5, | |
| "learning_rate": 1.198e-05, | |
| "loss": 10.4072, | |
| "mean_token_accuracy": 0.8141517870128154, | |
| "num_tokens": 4986175.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_chemistry_entropy": 0.6246057401299476, | |
| "eval_chemistry_loss": 0.6355127096176147, | |
| "eval_chemistry_mean_token_accuracy": 0.8170186321735382, | |
| "eval_chemistry_num_tokens": 4986175.0, | |
| "eval_chemistry_runtime": 53.7353, | |
| "eval_chemistry_samples_per_second": 9.305, | |
| "eval_chemistry_steps_per_second": 9.305, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_physics_entropy": 0.6429958860874176, | |
| "eval_physics_loss": 0.6528828740119934, | |
| "eval_physics_mean_token_accuracy": 0.8143444744944572, | |
| "eval_physics_num_tokens": 4986175.0, | |
| "eval_physics_runtime": 62.4695, | |
| "eval_physics_samples_per_second": 8.004, | |
| "eval_physics_steps_per_second": 8.004, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.6520758749917149, | |
| "epoch": 0.244, | |
| "grad_norm": 50.0, | |
| "learning_rate": 1.218e-05, | |
| "loss": 10.5332, | |
| "mean_token_accuracy": 0.8124375600367785, | |
| "num_tokens": 5071994.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.6255051350221038, | |
| "epoch": 0.248, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.2380000000000002e-05, | |
| "loss": 10.0529, | |
| "mean_token_accuracy": 0.8187242690473795, | |
| "num_tokens": 5164741.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.6554224069230259, | |
| "epoch": 0.252, | |
| "grad_norm": 47.5, | |
| "learning_rate": 1.2580000000000002e-05, | |
| "loss": 10.7383, | |
| "mean_token_accuracy": 0.8093429666012526, | |
| "num_tokens": 5245878.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.635463903658092, | |
| "epoch": 0.256, | |
| "grad_norm": 51.0, | |
| "learning_rate": 1.2780000000000001e-05, | |
| "loss": 10.3241, | |
| "mean_token_accuracy": 0.8143212880939246, | |
| "num_tokens": 5329320.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.6407582541927695, | |
| "epoch": 0.26, | |
| "grad_norm": 44.75, | |
| "learning_rate": 1.2980000000000001e-05, | |
| "loss": 10.3061, | |
| "mean_token_accuracy": 0.8153956741094589, | |
| "num_tokens": 5412557.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.6218360926955938, | |
| "epoch": 0.264, | |
| "grad_norm": 55.25, | |
| "learning_rate": 1.3180000000000001e-05, | |
| "loss": 10.1715, | |
| "mean_token_accuracy": 0.8187983844429254, | |
| "num_tokens": 5497259.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.6525244776159524, | |
| "epoch": 0.268, | |
| "grad_norm": 48.5, | |
| "learning_rate": 1.3380000000000002e-05, | |
| "loss": 10.5148, | |
| "mean_token_accuracy": 0.8133967652916908, | |
| "num_tokens": 5576721.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.6617097955197095, | |
| "epoch": 0.272, | |
| "grad_norm": 50.0, | |
| "learning_rate": 1.3580000000000002e-05, | |
| "loss": 10.7486, | |
| "mean_token_accuracy": 0.8067130610346794, | |
| "num_tokens": 5658796.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.6757121763192118, | |
| "epoch": 0.276, | |
| "grad_norm": 47.25, | |
| "learning_rate": 1.378e-05, | |
| "loss": 10.8755, | |
| "mean_token_accuracy": 0.8065906465053558, | |
| "num_tokens": 5737499.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.6453007774427533, | |
| "epoch": 0.28, | |
| "grad_norm": 62.5, | |
| "learning_rate": 1.398e-05, | |
| "loss": 10.4792, | |
| "mean_token_accuracy": 0.8154859133064747, | |
| "num_tokens": 5819027.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_chemistry_entropy": 0.6300962825417519, | |
| "eval_chemistry_loss": 0.6389291882514954, | |
| "eval_chemistry_mean_token_accuracy": 0.8176450840234757, | |
| "eval_chemistry_num_tokens": 5819027.0, | |
| "eval_chemistry_runtime": 53.7695, | |
| "eval_chemistry_samples_per_second": 9.299, | |
| "eval_chemistry_steps_per_second": 9.299, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_physics_entropy": 0.6429837597608566, | |
| "eval_physics_loss": 0.6483125686645508, | |
| "eval_physics_mean_token_accuracy": 0.8155634801387787, | |
| "eval_physics_num_tokens": 5819027.0, | |
| "eval_physics_runtime": 62.5294, | |
| "eval_physics_samples_per_second": 7.996, | |
| "eval_physics_steps_per_second": 7.996, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.6762596594169736, | |
| "epoch": 0.284, | |
| "grad_norm": 59.0, | |
| "learning_rate": 1.418e-05, | |
| "loss": 10.8583, | |
| "mean_token_accuracy": 0.8072559602558613, | |
| "num_tokens": 5898693.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.6340524691157043, | |
| "epoch": 0.288, | |
| "grad_norm": 47.75, | |
| "learning_rate": 1.4380000000000001e-05, | |
| "loss": 10.3892, | |
| "mean_token_accuracy": 0.8159883081912994, | |
| "num_tokens": 5980201.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.6257706139236688, | |
| "epoch": 0.292, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.4580000000000001e-05, | |
| "loss": 10.052, | |
| "mean_token_accuracy": 0.8204058665782213, | |
| "num_tokens": 6059507.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.6355020292103291, | |
| "epoch": 0.296, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.478e-05, | |
| "loss": 10.3665, | |
| "mean_token_accuracy": 0.8142751138657331, | |
| "num_tokens": 6139830.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.6532576438970864, | |
| "epoch": 0.3, | |
| "grad_norm": 47.25, | |
| "learning_rate": 1.498e-05, | |
| "loss": 10.3728, | |
| "mean_token_accuracy": 0.8153154000639915, | |
| "num_tokens": 6218134.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.6463365102186799, | |
| "epoch": 0.304, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.5180000000000002e-05, | |
| "loss": 10.4965, | |
| "mean_token_accuracy": 0.8147804290056229, | |
| "num_tokens": 6298858.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.6870367551222444, | |
| "epoch": 0.308, | |
| "grad_norm": 55.0, | |
| "learning_rate": 1.5380000000000002e-05, | |
| "loss": 11.2005, | |
| "mean_token_accuracy": 0.8020890522748232, | |
| "num_tokens": 6375761.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.6256772884167731, | |
| "epoch": 0.312, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.5580000000000003e-05, | |
| "loss": 10.0914, | |
| "mean_token_accuracy": 0.8196329109370708, | |
| "num_tokens": 6456502.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.6423514087684452, | |
| "epoch": 0.316, | |
| "grad_norm": 49.75, | |
| "learning_rate": 1.578e-05, | |
| "loss": 10.492, | |
| "mean_token_accuracy": 0.8146003533154726, | |
| "num_tokens": 6537521.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.6631265950389207, | |
| "epoch": 0.32, | |
| "grad_norm": 43.0, | |
| "learning_rate": 1.5980000000000003e-05, | |
| "loss": 10.7489, | |
| "mean_token_accuracy": 0.8101503010839224, | |
| "num_tokens": 6622798.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_chemistry_entropy": 0.6421979127824307, | |
| "eval_chemistry_loss": 0.6415462493896484, | |
| "eval_chemistry_mean_token_accuracy": 0.817062867641449, | |
| "eval_chemistry_num_tokens": 6622798.0, | |
| "eval_chemistry_runtime": 53.6883, | |
| "eval_chemistry_samples_per_second": 9.313, | |
| "eval_chemistry_steps_per_second": 9.313, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_physics_entropy": 0.6440860625505448, | |
| "eval_physics_loss": 0.6467884182929993, | |
| "eval_physics_mean_token_accuracy": 0.8157004522681236, | |
| "eval_physics_num_tokens": 6622798.0, | |
| "eval_physics_runtime": 62.4499, | |
| "eval_physics_samples_per_second": 8.006, | |
| "eval_physics_steps_per_second": 8.006, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.6417643938213586, | |
| "epoch": 0.324, | |
| "grad_norm": 54.0, | |
| "learning_rate": 1.618e-05, | |
| "loss": 10.54, | |
| "mean_token_accuracy": 0.8144381437450647, | |
| "num_tokens": 6699373.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 0.6635292621329427, | |
| "epoch": 0.328, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.638e-05, | |
| "loss": 10.6337, | |
| "mean_token_accuracy": 0.8078649930655957, | |
| "num_tokens": 6782985.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 0.6515420666895807, | |
| "epoch": 0.332, | |
| "grad_norm": 51.5, | |
| "learning_rate": 1.658e-05, | |
| "loss": 10.4447, | |
| "mean_token_accuracy": 0.8119669426232576, | |
| "num_tokens": 6868578.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 0.6081873361952603, | |
| "epoch": 0.336, | |
| "grad_norm": 40.0, | |
| "learning_rate": 1.6780000000000002e-05, | |
| "loss": 9.8522, | |
| "mean_token_accuracy": 0.8244299061596394, | |
| "num_tokens": 6953611.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 0.6503613390959799, | |
| "epoch": 0.34, | |
| "grad_norm": 52.0, | |
| "learning_rate": 1.698e-05, | |
| "loss": 10.6425, | |
| "mean_token_accuracy": 0.8126828409731388, | |
| "num_tokens": 7037831.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.611198160611093, | |
| "epoch": 0.344, | |
| "grad_norm": 56.75, | |
| "learning_rate": 1.718e-05, | |
| "loss": 9.9189, | |
| "mean_token_accuracy": 0.8207242820411921, | |
| "num_tokens": 7121949.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 0.6327245706692338, | |
| "epoch": 0.348, | |
| "grad_norm": 44.0, | |
| "learning_rate": 1.7380000000000003e-05, | |
| "loss": 10.2368, | |
| "mean_token_accuracy": 0.8166516173630953, | |
| "num_tokens": 7206056.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 0.6341161539778113, | |
| "epoch": 0.352, | |
| "grad_norm": 51.5, | |
| "learning_rate": 1.758e-05, | |
| "loss": 10.2941, | |
| "mean_token_accuracy": 0.816456102579832, | |
| "num_tokens": 7290378.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 0.6554373924620449, | |
| "epoch": 0.356, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.7780000000000003e-05, | |
| "loss": 10.5667, | |
| "mean_token_accuracy": 0.8104542337357998, | |
| "num_tokens": 7367952.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 0.6374449659138918, | |
| "epoch": 0.36, | |
| "grad_norm": 59.75, | |
| "learning_rate": 1.798e-05, | |
| "loss": 10.3565, | |
| "mean_token_accuracy": 0.816227027028799, | |
| "num_tokens": 7452920.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_chemistry_entropy": 0.6640482069849968, | |
| "eval_chemistry_loss": 0.6512866020202637, | |
| "eval_chemistry_mean_token_accuracy": 0.8145369750261306, | |
| "eval_chemistry_num_tokens": 7452920.0, | |
| "eval_chemistry_runtime": 53.5846, | |
| "eval_chemistry_samples_per_second": 9.331, | |
| "eval_chemistry_steps_per_second": 9.331, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_physics_entropy": 0.65689039093256, | |
| "eval_physics_loss": 0.64650559425354, | |
| "eval_physics_mean_token_accuracy": 0.8153353578448296, | |
| "eval_physics_num_tokens": 7452920.0, | |
| "eval_physics_runtime": 62.3633, | |
| "eval_physics_samples_per_second": 8.018, | |
| "eval_physics_steps_per_second": 8.018, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.6258705526590347, | |
| "epoch": 0.364, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.8180000000000002e-05, | |
| "loss": 10.013, | |
| "mean_token_accuracy": 0.8192542195320129, | |
| "num_tokens": 7535961.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 0.6131137845106422, | |
| "epoch": 0.368, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.8380000000000004e-05, | |
| "loss": 9.9858, | |
| "mean_token_accuracy": 0.8190763648599386, | |
| "num_tokens": 7621082.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 0.6138401796109975, | |
| "epoch": 0.372, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.858e-05, | |
| "loss": 10.0396, | |
| "mean_token_accuracy": 0.8174721483141184, | |
| "num_tokens": 7705651.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 0.7146476589143276, | |
| "epoch": 0.376, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.878e-05, | |
| "loss": 11.4452, | |
| "mean_token_accuracy": 0.7953129142522812, | |
| "num_tokens": 7787036.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 0.6485187427140773, | |
| "epoch": 0.38, | |
| "grad_norm": 56.5, | |
| "learning_rate": 1.898e-05, | |
| "loss": 10.5487, | |
| "mean_token_accuracy": 0.8087487578392029, | |
| "num_tokens": 7869437.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 0.633660759497434, | |
| "epoch": 0.384, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.918e-05, | |
| "loss": 10.1676, | |
| "mean_token_accuracy": 0.8168184392154216, | |
| "num_tokens": 7955570.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 0.5985365292057395, | |
| "epoch": 0.388, | |
| "grad_norm": 38.25, | |
| "learning_rate": 1.938e-05, | |
| "loss": 9.8157, | |
| "mean_token_accuracy": 0.8218576721847057, | |
| "num_tokens": 8038818.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 0.6344430455937982, | |
| "epoch": 0.392, | |
| "grad_norm": 39.75, | |
| "learning_rate": 1.9580000000000002e-05, | |
| "loss": 10.1821, | |
| "mean_token_accuracy": 0.8146390464156866, | |
| "num_tokens": 8123527.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 0.6378526957705617, | |
| "epoch": 0.396, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.978e-05, | |
| "loss": 10.2889, | |
| "mean_token_accuracy": 0.8144868839532137, | |
| "num_tokens": 8206501.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 0.6167035531252623, | |
| "epoch": 0.4, | |
| "grad_norm": 36.25, | |
| "learning_rate": 1.9980000000000002e-05, | |
| "loss": 10.0591, | |
| "mean_token_accuracy": 0.819990050420165, | |
| "num_tokens": 8291049.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_chemistry_entropy": 0.6586475971937179, | |
| "eval_chemistry_loss": 0.6543877124786377, | |
| "eval_chemistry_mean_token_accuracy": 0.8140276271104813, | |
| "eval_chemistry_num_tokens": 8291049.0, | |
| "eval_chemistry_runtime": 53.801, | |
| "eval_chemistry_samples_per_second": 9.294, | |
| "eval_chemistry_steps_per_second": 9.294, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_physics_entropy": 0.645909442961216, | |
| "eval_physics_loss": 0.6488538980484009, | |
| "eval_physics_mean_token_accuracy": 0.8152267854809762, | |
| "eval_physics_num_tokens": 8291049.0, | |
| "eval_physics_runtime": 63.4318, | |
| "eval_physics_samples_per_second": 7.882, | |
| "eval_physics_steps_per_second": 7.882, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 0.6288712982088327, | |
| "epoch": 0.404, | |
| "grad_norm": 44.75, | |
| "learning_rate": 1.9980000000000002e-05, | |
| "loss": 10.1535, | |
| "mean_token_accuracy": 0.8175500966608524, | |
| "num_tokens": 8373687.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 0.6477080984972418, | |
| "epoch": 0.408, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.995777777777778e-05, | |
| "loss": 10.4444, | |
| "mean_token_accuracy": 0.8120142377912998, | |
| "num_tokens": 8459795.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 0.6198763550259173, | |
| "epoch": 0.412, | |
| "grad_norm": 48.25, | |
| "learning_rate": 1.9935555555555557e-05, | |
| "loss": 10.145, | |
| "mean_token_accuracy": 0.8183699958026409, | |
| "num_tokens": 8544948.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 0.6268251063302159, | |
| "epoch": 0.416, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.9913333333333335e-05, | |
| "loss": 10.1171, | |
| "mean_token_accuracy": 0.8166644174605608, | |
| "num_tokens": 8627310.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 0.7168899232521653, | |
| "epoch": 0.42, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.9891111111111112e-05, | |
| "loss": 11.6477, | |
| "mean_token_accuracy": 0.7963819321244955, | |
| "num_tokens": 8707798.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 0.6254937417805195, | |
| "epoch": 0.424, | |
| "grad_norm": 39.5, | |
| "learning_rate": 1.986888888888889e-05, | |
| "loss": 10.0341, | |
| "mean_token_accuracy": 0.8202919337898493, | |
| "num_tokens": 8792574.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 0.6258060172200203, | |
| "epoch": 0.428, | |
| "grad_norm": 49.75, | |
| "learning_rate": 1.9846666666666668e-05, | |
| "loss": 10.2091, | |
| "mean_token_accuracy": 0.8151731941848993, | |
| "num_tokens": 8878637.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 0.6487020991742611, | |
| "epoch": 0.432, | |
| "grad_norm": 49.75, | |
| "learning_rate": 1.9824444444444445e-05, | |
| "loss": 10.4063, | |
| "mean_token_accuracy": 0.8136575162410736, | |
| "num_tokens": 8958371.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 0.6037185428664088, | |
| "epoch": 0.436, | |
| "grad_norm": 49.25, | |
| "learning_rate": 1.9802222222222226e-05, | |
| "loss": 9.8187, | |
| "mean_token_accuracy": 0.8217611808329821, | |
| "num_tokens": 9040649.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 0.5906071299687028, | |
| "epoch": 0.44, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.978e-05, | |
| "loss": 9.5257, | |
| "mean_token_accuracy": 0.8266360804438591, | |
| "num_tokens": 9125564.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_chemistry_entropy": 0.6296808683276176, | |
| "eval_chemistry_loss": 0.6599302291870117, | |
| "eval_chemistry_mean_token_accuracy": 0.8118611789941788, | |
| "eval_chemistry_num_tokens": 9125564.0, | |
| "eval_chemistry_runtime": 54.5116, | |
| "eval_chemistry_samples_per_second": 9.172, | |
| "eval_chemistry_steps_per_second": 9.172, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_physics_entropy": 0.6190407463014126, | |
| "eval_physics_loss": 0.6459570527076721, | |
| "eval_physics_mean_token_accuracy": 0.815562941968441, | |
| "eval_physics_num_tokens": 9125564.0, | |
| "eval_physics_runtime": 63.2267, | |
| "eval_physics_samples_per_second": 7.908, | |
| "eval_physics_steps_per_second": 7.908, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 0.6702354967594146, | |
| "epoch": 0.444, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.975777777777778e-05, | |
| "loss": 10.8062, | |
| "mean_token_accuracy": 0.810298365727067, | |
| "num_tokens": 9210443.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 0.6155520310625434, | |
| "epoch": 0.448, | |
| "grad_norm": 39.75, | |
| "learning_rate": 1.9735555555555556e-05, | |
| "loss": 10.1142, | |
| "mean_token_accuracy": 0.8205059375613928, | |
| "num_tokens": 9294388.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 0.5996189665980637, | |
| "epoch": 0.452, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.9713333333333337e-05, | |
| "loss": 9.4709, | |
| "mean_token_accuracy": 0.8281105298548936, | |
| "num_tokens": 9371203.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 0.6231001582928002, | |
| "epoch": 0.456, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.969111111111111e-05, | |
| "loss": 10.1464, | |
| "mean_token_accuracy": 0.8164454229176045, | |
| "num_tokens": 9455428.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 0.6021164663136005, | |
| "epoch": 0.46, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.9668888888888892e-05, | |
| "loss": 9.6895, | |
| "mean_token_accuracy": 0.8233658988028765, | |
| "num_tokens": 9539605.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 0.6290203684009612, | |
| "epoch": 0.464, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.9646666666666666e-05, | |
| "loss": 10.1725, | |
| "mean_token_accuracy": 0.8175587739795447, | |
| "num_tokens": 9620366.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 0.6108594150282443, | |
| "epoch": 0.468, | |
| "grad_norm": 51.75, | |
| "learning_rate": 1.9624444444444447e-05, | |
| "loss": 9.947, | |
| "mean_token_accuracy": 0.819192311167717, | |
| "num_tokens": 9706621.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 0.6204524893313647, | |
| "epoch": 0.472, | |
| "grad_norm": 39.5, | |
| "learning_rate": 1.9602222222222225e-05, | |
| "loss": 9.9455, | |
| "mean_token_accuracy": 0.8174036685377359, | |
| "num_tokens": 9788925.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 0.5935720985755324, | |
| "epoch": 0.476, | |
| "grad_norm": 36.5, | |
| "learning_rate": 1.9580000000000002e-05, | |
| "loss": 9.6394, | |
| "mean_token_accuracy": 0.8253184624016285, | |
| "num_tokens": 9869908.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 0.6630870910361409, | |
| "epoch": 0.48, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.955777777777778e-05, | |
| "loss": 10.7476, | |
| "mean_token_accuracy": 0.808968411386013, | |
| "num_tokens": 9955431.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_chemistry_entropy": 0.6792522894442081, | |
| "eval_chemistry_loss": 0.6633003354072571, | |
| "eval_chemistry_mean_token_accuracy": 0.8119940814971924, | |
| "eval_chemistry_num_tokens": 9955431.0, | |
| "eval_chemistry_runtime": 53.9215, | |
| "eval_chemistry_samples_per_second": 9.273, | |
| "eval_chemistry_steps_per_second": 9.273, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_physics_entropy": 0.650343769878149, | |
| "eval_physics_loss": 0.6420111656188965, | |
| "eval_physics_mean_token_accuracy": 0.8167362776398659, | |
| "eval_physics_num_tokens": 9955431.0, | |
| "eval_physics_runtime": 62.6062, | |
| "eval_physics_samples_per_second": 7.986, | |
| "eval_physics_steps_per_second": 7.986, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 0.612500429712236, | |
| "epoch": 0.484, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.9535555555555557e-05, | |
| "loss": 9.9347, | |
| "mean_token_accuracy": 0.8223837319761514, | |
| "num_tokens": 10038293.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 0.6377324659377337, | |
| "epoch": 0.488, | |
| "grad_norm": 40.5, | |
| "learning_rate": 1.9513333333333335e-05, | |
| "loss": 10.2831, | |
| "mean_token_accuracy": 0.817806663736701, | |
| "num_tokens": 10119599.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 0.6632728383876383, | |
| "epoch": 0.492, | |
| "grad_norm": 44.75, | |
| "learning_rate": 1.9491111111111113e-05, | |
| "loss": 10.6639, | |
| "mean_token_accuracy": 0.80819180496037, | |
| "num_tokens": 10204514.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 0.6558596555143594, | |
| "epoch": 0.496, | |
| "grad_norm": 36.0, | |
| "learning_rate": 1.946888888888889e-05, | |
| "loss": 10.5429, | |
| "mean_token_accuracy": 0.8120821505784989, | |
| "num_tokens": 10289883.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 0.6169624142348766, | |
| "epoch": 0.5, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.9446666666666668e-05, | |
| "loss": 10.0534, | |
| "mean_token_accuracy": 0.8196999322623014, | |
| "num_tokens": 10368762.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 0.6267159227281809, | |
| "epoch": 0.504, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.9424444444444446e-05, | |
| "loss": 10.096, | |
| "mean_token_accuracy": 0.8189252704381943, | |
| "num_tokens": 10454683.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 0.6419813117012382, | |
| "epoch": 0.508, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.9402222222222223e-05, | |
| "loss": 10.4847, | |
| "mean_token_accuracy": 0.8120944950729608, | |
| "num_tokens": 10539515.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 0.684658529702574, | |
| "epoch": 0.512, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.938e-05, | |
| "loss": 10.9876, | |
| "mean_token_accuracy": 0.8051567550748586, | |
| "num_tokens": 10619468.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 0.6482914222404361, | |
| "epoch": 0.516, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.935777777777778e-05, | |
| "loss": 10.3902, | |
| "mean_token_accuracy": 0.8132968720048666, | |
| "num_tokens": 10698439.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 0.6492844991385937, | |
| "epoch": 0.52, | |
| "grad_norm": 40.25, | |
| "learning_rate": 1.9335555555555556e-05, | |
| "loss": 10.5517, | |
| "mean_token_accuracy": 0.809664323553443, | |
| "num_tokens": 10781285.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "eval_chemistry_entropy": 0.6645045831203461, | |
| "eval_chemistry_loss": 0.6653202176094055, | |
| "eval_chemistry_mean_token_accuracy": 0.811331033706665, | |
| "eval_chemistry_num_tokens": 10781285.0, | |
| "eval_chemistry_runtime": 55.4768, | |
| "eval_chemistry_samples_per_second": 9.013, | |
| "eval_chemistry_steps_per_second": 9.013, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "eval_physics_entropy": 0.6388631807863713, | |
| "eval_physics_loss": 0.6363136172294617, | |
| "eval_physics_mean_token_accuracy": 0.8173955215811729, | |
| "eval_physics_num_tokens": 10781285.0, | |
| "eval_physics_runtime": 62.7946, | |
| "eval_physics_samples_per_second": 7.962, | |
| "eval_physics_steps_per_second": 7.962, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 0.6285697594285011, | |
| "epoch": 0.524, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.9313333333333334e-05, | |
| "loss": 10.221, | |
| "mean_token_accuracy": 0.8181417305022478, | |
| "num_tokens": 10864773.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 0.6136522050015628, | |
| "epoch": 0.528, | |
| "grad_norm": 38.0, | |
| "learning_rate": 1.9291111111111115e-05, | |
| "loss": 9.8866, | |
| "mean_token_accuracy": 0.8212735544890165, | |
| "num_tokens": 10944904.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 0.6457662120461464, | |
| "epoch": 0.532, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.926888888888889e-05, | |
| "loss": 10.3524, | |
| "mean_token_accuracy": 0.8151355512440205, | |
| "num_tokens": 11023958.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 0.5919691896066069, | |
| "epoch": 0.536, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.924666666666667e-05, | |
| "loss": 9.6708, | |
| "mean_token_accuracy": 0.8262011889368296, | |
| "num_tokens": 11106538.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 0.6490627369843424, | |
| "epoch": 0.54, | |
| "grad_norm": 51.75, | |
| "learning_rate": 1.9224444444444444e-05, | |
| "loss": 10.464, | |
| "mean_token_accuracy": 0.8134491160511971, | |
| "num_tokens": 11187142.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 0.6667550153099, | |
| "epoch": 0.544, | |
| "grad_norm": 54.5, | |
| "learning_rate": 1.9202222222222225e-05, | |
| "loss": 10.8278, | |
| "mean_token_accuracy": 0.8061591122299433, | |
| "num_tokens": 11266409.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 0.6294793977402151, | |
| "epoch": 0.548, | |
| "grad_norm": 48.5, | |
| "learning_rate": 1.918e-05, | |
| "loss": 10.153, | |
| "mean_token_accuracy": 0.8186534035950899, | |
| "num_tokens": 11350094.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 0.6307276577688754, | |
| "epoch": 0.552, | |
| "grad_norm": 46.5, | |
| "learning_rate": 1.915777777777778e-05, | |
| "loss": 10.2431, | |
| "mean_token_accuracy": 0.8161013640463353, | |
| "num_tokens": 11438391.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 0.6490003860555589, | |
| "epoch": 0.556, | |
| "grad_norm": 47.25, | |
| "learning_rate": 1.9135555555555555e-05, | |
| "loss": 10.292, | |
| "mean_token_accuracy": 0.8154143560677767, | |
| "num_tokens": 11521583.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 0.5914541745558382, | |
| "epoch": 0.56, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.9113333333333336e-05, | |
| "loss": 9.6568, | |
| "mean_token_accuracy": 0.8256826549768448, | |
| "num_tokens": 11605544.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_chemistry_entropy": 0.6548177749216556, | |
| "eval_chemistry_loss": 0.6667582988739014, | |
| "eval_chemistry_mean_token_accuracy": 0.8108637844324111, | |
| "eval_chemistry_num_tokens": 11605544.0, | |
| "eval_chemistry_runtime": 53.9633, | |
| "eval_chemistry_samples_per_second": 9.266, | |
| "eval_chemistry_steps_per_second": 9.266, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_physics_entropy": 0.6302824917435647, | |
| "eval_physics_loss": 0.6326262354850769, | |
| "eval_physics_mean_token_accuracy": 0.8184790652990341, | |
| "eval_physics_num_tokens": 11605544.0, | |
| "eval_physics_runtime": 62.6194, | |
| "eval_physics_samples_per_second": 7.985, | |
| "eval_physics_steps_per_second": 7.985, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 0.6050882572308183, | |
| "epoch": 0.564, | |
| "grad_norm": 39.25, | |
| "learning_rate": 1.9091111111111113e-05, | |
| "loss": 9.7547, | |
| "mean_token_accuracy": 0.8240263950079679, | |
| "num_tokens": 11686173.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 0.637587258964777, | |
| "epoch": 0.568, | |
| "grad_norm": 36.75, | |
| "learning_rate": 1.906888888888889e-05, | |
| "loss": 10.1842, | |
| "mean_token_accuracy": 0.818510303273797, | |
| "num_tokens": 11765430.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 0.6074552223086357, | |
| "epoch": 0.572, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.904666666666667e-05, | |
| "loss": 9.9644, | |
| "mean_token_accuracy": 0.8220620211213827, | |
| "num_tokens": 11850973.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 0.617229042109102, | |
| "epoch": 0.576, | |
| "grad_norm": 53.0, | |
| "learning_rate": 1.9024444444444446e-05, | |
| "loss": 9.9445, | |
| "mean_token_accuracy": 0.820784068480134, | |
| "num_tokens": 11931715.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 0.59880430418998, | |
| "epoch": 0.58, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.9002222222222224e-05, | |
| "loss": 9.5273, | |
| "mean_token_accuracy": 0.8268065106123685, | |
| "num_tokens": 12012155.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 0.6227379216812551, | |
| "epoch": 0.584, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.898e-05, | |
| "loss": 10.1233, | |
| "mean_token_accuracy": 0.8189155597239732, | |
| "num_tokens": 12093686.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 0.6201856574974954, | |
| "epoch": 0.588, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.895777777777778e-05, | |
| "loss": 9.9752, | |
| "mean_token_accuracy": 0.8193617489188909, | |
| "num_tokens": 12177581.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 0.6364695785567165, | |
| "epoch": 0.592, | |
| "grad_norm": 46.5, | |
| "learning_rate": 1.8935555555555556e-05, | |
| "loss": 10.3954, | |
| "mean_token_accuracy": 0.8134158592671156, | |
| "num_tokens": 12257243.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 0.6294575407169759, | |
| "epoch": 0.596, | |
| "grad_norm": 59.0, | |
| "learning_rate": 1.8913333333333334e-05, | |
| "loss": 10.1581, | |
| "mean_token_accuracy": 0.8173588078469038, | |
| "num_tokens": 12338544.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 0.6043597641400993, | |
| "epoch": 0.6, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.8891111111111115e-05, | |
| "loss": 9.7064, | |
| "mean_token_accuracy": 0.8232355277985335, | |
| "num_tokens": 12422008.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_chemistry_entropy": 0.6489740502238274, | |
| "eval_chemistry_loss": 0.6658231019973755, | |
| "eval_chemistry_mean_token_accuracy": 0.8121171447038651, | |
| "eval_chemistry_num_tokens": 12422008.0, | |
| "eval_chemistry_runtime": 53.9084, | |
| "eval_chemistry_samples_per_second": 9.275, | |
| "eval_chemistry_steps_per_second": 9.275, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_physics_entropy": 0.6161783193051815, | |
| "eval_physics_loss": 0.628398597240448, | |
| "eval_physics_mean_token_accuracy": 0.8198308848142624, | |
| "eval_physics_num_tokens": 12422008.0, | |
| "eval_physics_runtime": 62.5175, | |
| "eval_physics_samples_per_second": 7.998, | |
| "eval_physics_steps_per_second": 7.998, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 0.6060653101652861, | |
| "epoch": 0.604, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.886888888888889e-05, | |
| "loss": 9.894, | |
| "mean_token_accuracy": 0.8234386540949344, | |
| "num_tokens": 12505134.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 0.5880289922468365, | |
| "epoch": 0.608, | |
| "grad_norm": 50.75, | |
| "learning_rate": 1.884666666666667e-05, | |
| "loss": 9.4441, | |
| "mean_token_accuracy": 0.8280234813690186, | |
| "num_tokens": 12583154.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 0.625719802454114, | |
| "epoch": 0.612, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.8824444444444445e-05, | |
| "loss": 10.0379, | |
| "mean_token_accuracy": 0.8188928976655007, | |
| "num_tokens": 12666601.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 0.6189737745560706, | |
| "epoch": 0.616, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.8802222222222226e-05, | |
| "loss": 9.953, | |
| "mean_token_accuracy": 0.8189409743994475, | |
| "num_tokens": 12746458.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 0.6307426829822361, | |
| "epoch": 0.62, | |
| "grad_norm": 37.0, | |
| "learning_rate": 1.878e-05, | |
| "loss": 10.3275, | |
| "mean_token_accuracy": 0.8165652919560671, | |
| "num_tokens": 12831572.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 0.633724972512573, | |
| "epoch": 0.624, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.875777777777778e-05, | |
| "loss": 10.0729, | |
| "mean_token_accuracy": 0.8192374683916569, | |
| "num_tokens": 12905392.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 0.6234388993121683, | |
| "epoch": 0.628, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.873555555555556e-05, | |
| "loss": 10.1334, | |
| "mean_token_accuracy": 0.8167661242187023, | |
| "num_tokens": 12986513.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 0.6102577080950141, | |
| "epoch": 0.632, | |
| "grad_norm": 36.75, | |
| "learning_rate": 1.8713333333333336e-05, | |
| "loss": 9.8003, | |
| "mean_token_accuracy": 0.8206873726099729, | |
| "num_tokens": 13071058.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 0.6236647194251418, | |
| "epoch": 0.636, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.8691111111111114e-05, | |
| "loss": 10.1304, | |
| "mean_token_accuracy": 0.816493471711874, | |
| "num_tokens": 13151842.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 0.6845891922712326, | |
| "epoch": 0.64, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.866888888888889e-05, | |
| "loss": 11.213, | |
| "mean_token_accuracy": 0.8044512301683426, | |
| "num_tokens": 13232198.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_chemistry_entropy": 0.6778704892992974, | |
| "eval_chemistry_loss": 0.6664143204689026, | |
| "eval_chemistry_mean_token_accuracy": 0.8122610586881638, | |
| "eval_chemistry_num_tokens": 13232198.0, | |
| "eval_chemistry_runtime": 53.6359, | |
| "eval_chemistry_samples_per_second": 9.322, | |
| "eval_chemistry_steps_per_second": 9.322, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_physics_entropy": 0.6401235084533692, | |
| "eval_physics_loss": 0.6214710474014282, | |
| "eval_physics_mean_token_accuracy": 0.8208677229285241, | |
| "eval_physics_num_tokens": 13232198.0, | |
| "eval_physics_runtime": 62.3791, | |
| "eval_physics_samples_per_second": 8.016, | |
| "eval_physics_steps_per_second": 8.016, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 0.6061294285580516, | |
| "epoch": 0.644, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.864666666666667e-05, | |
| "loss": 9.6671, | |
| "mean_token_accuracy": 0.8222626011818648, | |
| "num_tokens": 13318037.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 0.6353385791182518, | |
| "epoch": 0.648, | |
| "grad_norm": 47.5, | |
| "learning_rate": 1.8624444444444446e-05, | |
| "loss": 10.2781, | |
| "mean_token_accuracy": 0.8162215132266283, | |
| "num_tokens": 13401064.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 0.6056097308173776, | |
| "epoch": 0.652, | |
| "grad_norm": 51.25, | |
| "learning_rate": 1.8602222222222224e-05, | |
| "loss": 9.9383, | |
| "mean_token_accuracy": 0.8226072300225497, | |
| "num_tokens": 13488222.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 0.6103159219957888, | |
| "epoch": 0.656, | |
| "grad_norm": 39.25, | |
| "learning_rate": 1.858e-05, | |
| "loss": 9.7041, | |
| "mean_token_accuracy": 0.8251269839704036, | |
| "num_tokens": 13575902.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 0.6257463837042451, | |
| "epoch": 0.66, | |
| "grad_norm": 38.75, | |
| "learning_rate": 1.855777777777778e-05, | |
| "loss": 10.1428, | |
| "mean_token_accuracy": 0.8161589603871107, | |
| "num_tokens": 13655826.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 0.6265889365226031, | |
| "epoch": 0.664, | |
| "grad_norm": 52.0, | |
| "learning_rate": 1.8535555555555557e-05, | |
| "loss": 10.1095, | |
| "mean_token_accuracy": 0.8177770785987377, | |
| "num_tokens": 13733192.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 0.5957706947810948, | |
| "epoch": 0.668, | |
| "grad_norm": 35.75, | |
| "learning_rate": 1.8513333333333335e-05, | |
| "loss": 9.5903, | |
| "mean_token_accuracy": 0.8270833406597375, | |
| "num_tokens": 13816716.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 0.6371551887132227, | |
| "epoch": 0.672, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.8491111111111112e-05, | |
| "loss": 10.4787, | |
| "mean_token_accuracy": 0.8113816611468792, | |
| "num_tokens": 13895997.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 0.5986419206485152, | |
| "epoch": 0.676, | |
| "grad_norm": 40.5, | |
| "learning_rate": 1.846888888888889e-05, | |
| "loss": 9.5155, | |
| "mean_token_accuracy": 0.8267723452299833, | |
| "num_tokens": 13988168.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 0.5948348139412701, | |
| "epoch": 0.68, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.8446666666666667e-05, | |
| "loss": 9.6889, | |
| "mean_token_accuracy": 0.8268945027142763, | |
| "num_tokens": 14074579.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "eval_chemistry_entropy": 0.6539524466097355, | |
| "eval_chemistry_loss": 0.6660082340240479, | |
| "eval_chemistry_mean_token_accuracy": 0.8111258904933929, | |
| "eval_chemistry_num_tokens": 14074579.0, | |
| "eval_chemistry_runtime": 53.558, | |
| "eval_chemistry_samples_per_second": 9.336, | |
| "eval_chemistry_steps_per_second": 9.336, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "eval_physics_entropy": 0.6102372957170009, | |
| "eval_physics_loss": 0.6197578310966492, | |
| "eval_physics_mean_token_accuracy": 0.8214541796445847, | |
| "eval_physics_num_tokens": 14074579.0, | |
| "eval_physics_runtime": 62.3426, | |
| "eval_physics_samples_per_second": 8.02, | |
| "eval_physics_steps_per_second": 8.02, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 0.6658952804282308, | |
| "epoch": 0.684, | |
| "grad_norm": 36.75, | |
| "learning_rate": 1.842444444444445e-05, | |
| "loss": 10.7562, | |
| "mean_token_accuracy": 0.8074348546564579, | |
| "num_tokens": 14153649.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 0.6083603931590915, | |
| "epoch": 0.688, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.8402222222222223e-05, | |
| "loss": 9.7355, | |
| "mean_token_accuracy": 0.822414780780673, | |
| "num_tokens": 14234888.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 0.6043023524805904, | |
| "epoch": 0.692, | |
| "grad_norm": 48.25, | |
| "learning_rate": 1.8380000000000004e-05, | |
| "loss": 9.8462, | |
| "mean_token_accuracy": 0.8208967238664627, | |
| "num_tokens": 14323183.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 0.613863595854491, | |
| "epoch": 0.696, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.8357777777777778e-05, | |
| "loss": 9.8414, | |
| "mean_token_accuracy": 0.821249233186245, | |
| "num_tokens": 14411150.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 0.6170497369021177, | |
| "epoch": 0.7, | |
| "grad_norm": 38.0, | |
| "learning_rate": 1.833555555555556e-05, | |
| "loss": 10.0544, | |
| "mean_token_accuracy": 0.8183467876166105, | |
| "num_tokens": 14490628.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 0.6329357607290149, | |
| "epoch": 0.704, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.8313333333333333e-05, | |
| "loss": 10.3231, | |
| "mean_token_accuracy": 0.814619118347764, | |
| "num_tokens": 14567908.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 0.6202297451905906, | |
| "epoch": 0.708, | |
| "grad_norm": 38.25, | |
| "learning_rate": 1.8291111111111114e-05, | |
| "loss": 9.8856, | |
| "mean_token_accuracy": 0.8198208883404732, | |
| "num_tokens": 14647291.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 0.5805901409126818, | |
| "epoch": 0.712, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.8268888888888888e-05, | |
| "loss": 9.4144, | |
| "mean_token_accuracy": 0.8300752732902765, | |
| "num_tokens": 14728975.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 0.6089818102307618, | |
| "epoch": 0.716, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.824666666666667e-05, | |
| "loss": 9.7038, | |
| "mean_token_accuracy": 0.825373613089323, | |
| "num_tokens": 14804586.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 0.6049615413881838, | |
| "epoch": 0.72, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.8224444444444447e-05, | |
| "loss": 9.9003, | |
| "mean_token_accuracy": 0.8198985580354929, | |
| "num_tokens": 14882927.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_chemistry_entropy": 0.6426224761903286, | |
| "eval_chemistry_loss": 0.6654659509658813, | |
| "eval_chemistry_mean_token_accuracy": 0.8117904326915741, | |
| "eval_chemistry_num_tokens": 14882927.0, | |
| "eval_chemistry_runtime": 53.7762, | |
| "eval_chemistry_samples_per_second": 9.298, | |
| "eval_chemistry_steps_per_second": 9.298, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_physics_entropy": 0.6037323722243308, | |
| "eval_physics_loss": 0.616446316242218, | |
| "eval_physics_mean_token_accuracy": 0.8222740324139595, | |
| "eval_physics_num_tokens": 14882927.0, | |
| "eval_physics_runtime": 62.498, | |
| "eval_physics_samples_per_second": 8.0, | |
| "eval_physics_steps_per_second": 8.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 0.6227391706779599, | |
| "epoch": 0.724, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.8202222222222225e-05, | |
| "loss": 9.9987, | |
| "mean_token_accuracy": 0.818676395714283, | |
| "num_tokens": 14966717.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 0.5725922109559178, | |
| "epoch": 0.728, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.8180000000000002e-05, | |
| "loss": 9.3114, | |
| "mean_token_accuracy": 0.8298240959644317, | |
| "num_tokens": 15052770.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 0.6312758250162005, | |
| "epoch": 0.732, | |
| "grad_norm": 35.0, | |
| "learning_rate": 1.815777777777778e-05, | |
| "loss": 10.2655, | |
| "mean_token_accuracy": 0.8184621930122375, | |
| "num_tokens": 15135882.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 0.6022148390300572, | |
| "epoch": 0.736, | |
| "grad_norm": 48.0, | |
| "learning_rate": 1.8135555555555557e-05, | |
| "loss": 9.6266, | |
| "mean_token_accuracy": 0.8265880558639764, | |
| "num_tokens": 15217342.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 0.6001213543117047, | |
| "epoch": 0.74, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.8113333333333335e-05, | |
| "loss": 9.7379, | |
| "mean_token_accuracy": 0.8232456650584936, | |
| "num_tokens": 15303623.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 0.5776225552894175, | |
| "epoch": 0.744, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.8091111111111113e-05, | |
| "loss": 9.2318, | |
| "mean_token_accuracy": 0.8330205090343952, | |
| "num_tokens": 15385161.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 0.6180003954097628, | |
| "epoch": 0.748, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.806888888888889e-05, | |
| "loss": 10.2082, | |
| "mean_token_accuracy": 0.8174183562397956, | |
| "num_tokens": 15466126.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 0.5681312756612897, | |
| "epoch": 0.752, | |
| "grad_norm": 44.75, | |
| "learning_rate": 1.8046666666666668e-05, | |
| "loss": 9.0644, | |
| "mean_token_accuracy": 0.8345625158399343, | |
| "num_tokens": 15547710.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 0.6220970245078206, | |
| "epoch": 0.756, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.8024444444444445e-05, | |
| "loss": 9.9613, | |
| "mean_token_accuracy": 0.8187067184597254, | |
| "num_tokens": 15625904.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 0.6236506992951035, | |
| "epoch": 0.76, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.8002222222222223e-05, | |
| "loss": 10.1366, | |
| "mean_token_accuracy": 0.8179228503257037, | |
| "num_tokens": 15704494.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "eval_chemistry_entropy": 0.653880989164114, | |
| "eval_chemistry_loss": 0.6663482189178467, | |
| "eval_chemistry_mean_token_accuracy": 0.8113617711067199, | |
| "eval_chemistry_num_tokens": 15704494.0, | |
| "eval_chemistry_runtime": 53.799, | |
| "eval_chemistry_samples_per_second": 9.294, | |
| "eval_chemistry_steps_per_second": 9.294, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "eval_physics_entropy": 0.6113817873895169, | |
| "eval_physics_loss": 0.6128407716751099, | |
| "eval_physics_mean_token_accuracy": 0.8226233792304992, | |
| "eval_physics_num_tokens": 15704494.0, | |
| "eval_physics_runtime": 62.4981, | |
| "eval_physics_samples_per_second": 8.0, | |
| "eval_physics_steps_per_second": 8.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 0.5962172176688909, | |
| "epoch": 0.764, | |
| "grad_norm": 37.0, | |
| "learning_rate": 1.798e-05, | |
| "loss": 9.5664, | |
| "mean_token_accuracy": 0.8239004660397768, | |
| "num_tokens": 15792219.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 0.5950108816847205, | |
| "epoch": 0.768, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.7957777777777778e-05, | |
| "loss": 9.6608, | |
| "mean_token_accuracy": 0.8227977603673935, | |
| "num_tokens": 15877177.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 0.5919695955701172, | |
| "epoch": 0.772, | |
| "grad_norm": 35.0, | |
| "learning_rate": 1.7935555555555556e-05, | |
| "loss": 9.5476, | |
| "mean_token_accuracy": 0.8272939085960388, | |
| "num_tokens": 15965749.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 0.6036200723610818, | |
| "epoch": 0.776, | |
| "grad_norm": 37.75, | |
| "learning_rate": 1.7913333333333337e-05, | |
| "loss": 9.7425, | |
| "mean_token_accuracy": 0.8256110638380051, | |
| "num_tokens": 16050530.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 0.6186987873166799, | |
| "epoch": 0.78, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.789111111111111e-05, | |
| "loss": 10.0185, | |
| "mean_token_accuracy": 0.8216598622500897, | |
| "num_tokens": 16136804.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 0.6003677137196064, | |
| "epoch": 0.784, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.7868888888888892e-05, | |
| "loss": 9.6501, | |
| "mean_token_accuracy": 0.8250759091228247, | |
| "num_tokens": 16219640.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 0.5993524115532637, | |
| "epoch": 0.788, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.7846666666666666e-05, | |
| "loss": 9.5775, | |
| "mean_token_accuracy": 0.8249216306954622, | |
| "num_tokens": 16299595.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 0.616146298404783, | |
| "epoch": 0.792, | |
| "grad_norm": 47.5, | |
| "learning_rate": 1.7824444444444447e-05, | |
| "loss": 10.0021, | |
| "mean_token_accuracy": 0.8181390602141618, | |
| "num_tokens": 16378820.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 0.6191745470277965, | |
| "epoch": 0.796, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.780222222222222e-05, | |
| "loss": 10.0675, | |
| "mean_token_accuracy": 0.8181126184761525, | |
| "num_tokens": 16464611.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 0.6169021054171026, | |
| "epoch": 0.8, | |
| "grad_norm": 34.5, | |
| "learning_rate": 1.7780000000000003e-05, | |
| "loss": 9.9317, | |
| "mean_token_accuracy": 0.8213718693703413, | |
| "num_tokens": 16548261.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_chemistry_entropy": 0.6557240565121174, | |
| "eval_chemistry_loss": 0.6643321514129639, | |
| "eval_chemistry_mean_token_accuracy": 0.810910725414753, | |
| "eval_chemistry_num_tokens": 16548261.0, | |
| "eval_chemistry_runtime": 53.5999, | |
| "eval_chemistry_samples_per_second": 9.328, | |
| "eval_chemistry_steps_per_second": 9.328, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_physics_entropy": 0.6022120754122734, | |
| "eval_physics_loss": 0.6109762787818909, | |
| "eval_physics_mean_token_accuracy": 0.8232671258449554, | |
| "eval_physics_num_tokens": 16548261.0, | |
| "eval_physics_runtime": 62.4211, | |
| "eval_physics_samples_per_second": 8.01, | |
| "eval_physics_steps_per_second": 8.01, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 0.5769328207708895, | |
| "epoch": 0.804, | |
| "grad_norm": 38.25, | |
| "learning_rate": 1.7757777777777777e-05, | |
| "loss": 9.3469, | |
| "mean_token_accuracy": 0.8289602052420377, | |
| "num_tokens": 16633428.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 0.639338431507349, | |
| "epoch": 0.808, | |
| "grad_norm": 38.75, | |
| "learning_rate": 1.7735555555555558e-05, | |
| "loss": 10.3607, | |
| "mean_token_accuracy": 0.8114439073950053, | |
| "num_tokens": 16711667.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 0.6080935266800225, | |
| "epoch": 0.812, | |
| "grad_norm": 37.5, | |
| "learning_rate": 1.7713333333333335e-05, | |
| "loss": 9.5741, | |
| "mean_token_accuracy": 0.8260014273226262, | |
| "num_tokens": 16789903.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 0.6141686610877514, | |
| "epoch": 0.816, | |
| "grad_norm": 34.0, | |
| "learning_rate": 1.7691111111111113e-05, | |
| "loss": 10.2542, | |
| "mean_token_accuracy": 0.8148800980299711, | |
| "num_tokens": 16870533.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 0.6198380364105105, | |
| "epoch": 0.82, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.766888888888889e-05, | |
| "loss": 9.8252, | |
| "mean_token_accuracy": 0.8211232393980026, | |
| "num_tokens": 16955146.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 0.6109458804130554, | |
| "epoch": 0.824, | |
| "grad_norm": 50.25, | |
| "learning_rate": 1.7646666666666668e-05, | |
| "loss": 9.9951, | |
| "mean_token_accuracy": 0.8190602369606494, | |
| "num_tokens": 17035600.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 0.616765193361789, | |
| "epoch": 0.828, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.7624444444444446e-05, | |
| "loss": 9.8759, | |
| "mean_token_accuracy": 0.8201179711148143, | |
| "num_tokens": 17114437.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 0.585936988517642, | |
| "epoch": 0.832, | |
| "grad_norm": 37.25, | |
| "learning_rate": 1.7602222222222223e-05, | |
| "loss": 9.4614, | |
| "mean_token_accuracy": 0.8275429684668779, | |
| "num_tokens": 17196371.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 0.6425325162708759, | |
| "epoch": 0.836, | |
| "grad_norm": 48.0, | |
| "learning_rate": 1.758e-05, | |
| "loss": 10.4242, | |
| "mean_token_accuracy": 0.8143802944570779, | |
| "num_tokens": 17277519.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 0.594513074401766, | |
| "epoch": 0.84, | |
| "grad_norm": 39.75, | |
| "learning_rate": 1.755777777777778e-05, | |
| "loss": 9.5434, | |
| "mean_token_accuracy": 0.8270172599703074, | |
| "num_tokens": 17356919.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "eval_chemistry_entropy": 0.6422495054602623, | |
| "eval_chemistry_loss": 0.6657618284225464, | |
| "eval_chemistry_mean_token_accuracy": 0.8119068529605865, | |
| "eval_chemistry_num_tokens": 17356919.0, | |
| "eval_chemistry_runtime": 53.6953, | |
| "eval_chemistry_samples_per_second": 9.312, | |
| "eval_chemistry_steps_per_second": 9.312, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "eval_physics_entropy": 0.5935313543379307, | |
| "eval_physics_loss": 0.6082729697227478, | |
| "eval_physics_mean_token_accuracy": 0.8237810920476913, | |
| "eval_physics_num_tokens": 17356919.0, | |
| "eval_physics_runtime": 62.7719, | |
| "eval_physics_samples_per_second": 7.965, | |
| "eval_physics_steps_per_second": 7.965, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 0.5766649033874274, | |
| "epoch": 0.844, | |
| "grad_norm": 36.75, | |
| "learning_rate": 1.7535555555555556e-05, | |
| "loss": 9.3985, | |
| "mean_token_accuracy": 0.8289091024547816, | |
| "num_tokens": 17438912.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 0.6027551379054785, | |
| "epoch": 0.848, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.7513333333333334e-05, | |
| "loss": 9.5153, | |
| "mean_token_accuracy": 0.8274124126881361, | |
| "num_tokens": 17527066.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 0.6090564800426364, | |
| "epoch": 0.852, | |
| "grad_norm": 36.25, | |
| "learning_rate": 1.749111111111111e-05, | |
| "loss": 9.9486, | |
| "mean_token_accuracy": 0.8203190270811319, | |
| "num_tokens": 17610559.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 0.6409425543621182, | |
| "epoch": 0.856, | |
| "grad_norm": 40.5, | |
| "learning_rate": 1.746888888888889e-05, | |
| "loss": 10.2511, | |
| "mean_token_accuracy": 0.8137188758701086, | |
| "num_tokens": 17689902.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 0.604229504428804, | |
| "epoch": 0.86, | |
| "grad_norm": 45.25, | |
| "learning_rate": 1.7446666666666667e-05, | |
| "loss": 9.8022, | |
| "mean_token_accuracy": 0.8243927512317896, | |
| "num_tokens": 17774080.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 0.608985879831016, | |
| "epoch": 0.864, | |
| "grad_norm": 39.75, | |
| "learning_rate": 1.7424444444444444e-05, | |
| "loss": 9.8919, | |
| "mean_token_accuracy": 0.8220306746661663, | |
| "num_tokens": 17859605.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 0.58911108141765, | |
| "epoch": 0.868, | |
| "grad_norm": 36.5, | |
| "learning_rate": 1.7402222222222222e-05, | |
| "loss": 9.4426, | |
| "mean_token_accuracy": 0.8251889403909445, | |
| "num_tokens": 17942040.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 0.6174473416060209, | |
| "epoch": 0.872, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.7380000000000003e-05, | |
| "loss": 9.9753, | |
| "mean_token_accuracy": 0.8203635964542627, | |
| "num_tokens": 18019924.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 0.6417605724185705, | |
| "epoch": 0.876, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.735777777777778e-05, | |
| "loss": 10.2624, | |
| "mean_token_accuracy": 0.8150792047381401, | |
| "num_tokens": 18102852.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 0.6028793597593903, | |
| "epoch": 0.88, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.7335555555555558e-05, | |
| "loss": 9.7194, | |
| "mean_token_accuracy": 0.8240158323198556, | |
| "num_tokens": 18187315.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "eval_chemistry_entropy": 0.6486237399280071, | |
| "eval_chemistry_loss": 0.6654757261276245, | |
| "eval_chemistry_mean_token_accuracy": 0.812120854973793, | |
| "eval_chemistry_num_tokens": 18187315.0, | |
| "eval_chemistry_runtime": 54.4927, | |
| "eval_chemistry_samples_per_second": 9.176, | |
| "eval_chemistry_steps_per_second": 9.176, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "eval_physics_entropy": 0.5944115283489227, | |
| "eval_physics_loss": 0.6044148206710815, | |
| "eval_physics_mean_token_accuracy": 0.8247024840712547, | |
| "eval_physics_num_tokens": 18187315.0, | |
| "eval_physics_runtime": 63.2321, | |
| "eval_physics_samples_per_second": 7.907, | |
| "eval_physics_steps_per_second": 7.907, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 0.5587692123837769, | |
| "epoch": 0.884, | |
| "grad_norm": 44.75, | |
| "learning_rate": 1.7313333333333336e-05, | |
| "loss": 9.1233, | |
| "mean_token_accuracy": 0.8343214184045792, | |
| "num_tokens": 18275135.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 0.5347248839214445, | |
| "epoch": 0.888, | |
| "grad_norm": 45.25, | |
| "learning_rate": 1.7291111111111113e-05, | |
| "loss": 8.4862, | |
| "mean_token_accuracy": 0.8402233418077231, | |
| "num_tokens": 18366329.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 0.6030035736039281, | |
| "epoch": 0.892, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.726888888888889e-05, | |
| "loss": 9.7969, | |
| "mean_token_accuracy": 0.8230024907737971, | |
| "num_tokens": 18449966.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 0.5451280565932393, | |
| "epoch": 0.896, | |
| "grad_norm": 35.75, | |
| "learning_rate": 1.724666666666667e-05, | |
| "loss": 8.8341, | |
| "mean_token_accuracy": 0.836723905801773, | |
| "num_tokens": 18536790.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 0.5979954308830202, | |
| "epoch": 0.9, | |
| "grad_norm": 35.75, | |
| "learning_rate": 1.7224444444444446e-05, | |
| "loss": 9.6096, | |
| "mean_token_accuracy": 0.8243920177221298, | |
| "num_tokens": 18619194.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 0.594355533272028, | |
| "epoch": 0.904, | |
| "grad_norm": 39.5, | |
| "learning_rate": 1.7202222222222224e-05, | |
| "loss": 9.5494, | |
| "mean_token_accuracy": 0.8267385013401508, | |
| "num_tokens": 18701809.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 0.6269824390299619, | |
| "epoch": 0.908, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.718e-05, | |
| "loss": 10.0915, | |
| "mean_token_accuracy": 0.8181608844548464, | |
| "num_tokens": 18785677.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 0.6125199305824935, | |
| "epoch": 0.912, | |
| "grad_norm": 37.25, | |
| "learning_rate": 1.715777777777778e-05, | |
| "loss": 9.9484, | |
| "mean_token_accuracy": 0.818399741128087, | |
| "num_tokens": 18869344.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 0.6096226529218256, | |
| "epoch": 0.916, | |
| "grad_norm": 35.25, | |
| "learning_rate": 1.7135555555555557e-05, | |
| "loss": 9.8473, | |
| "mean_token_accuracy": 0.8235837988555431, | |
| "num_tokens": 18954158.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 0.5883321419358254, | |
| "epoch": 0.92, | |
| "grad_norm": 51.75, | |
| "learning_rate": 1.7113333333333334e-05, | |
| "loss": 9.5237, | |
| "mean_token_accuracy": 0.8259769681841135, | |
| "num_tokens": 19033910.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "eval_chemistry_entropy": 0.6482902157604694, | |
| "eval_chemistry_loss": 0.6630954742431641, | |
| "eval_chemistry_mean_token_accuracy": 0.8121933689117432, | |
| "eval_chemistry_num_tokens": 19033910.0, | |
| "eval_chemistry_runtime": 53.6836, | |
| "eval_chemistry_samples_per_second": 9.314, | |
| "eval_chemistry_steps_per_second": 9.314, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "eval_physics_entropy": 0.5942936807274818, | |
| "eval_physics_loss": 0.5992915034294128, | |
| "eval_physics_mean_token_accuracy": 0.8255074183344842, | |
| "eval_physics_num_tokens": 19033910.0, | |
| "eval_physics_runtime": 62.5111, | |
| "eval_physics_samples_per_second": 7.999, | |
| "eval_physics_steps_per_second": 7.999, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 0.5868122247047722, | |
| "epoch": 0.924, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.7091111111111112e-05, | |
| "loss": 9.4248, | |
| "mean_token_accuracy": 0.8281576413661241, | |
| "num_tokens": 19116000.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 0.6330606296658516, | |
| "epoch": 0.928, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.706888888888889e-05, | |
| "loss": 10.0342, | |
| "mean_token_accuracy": 0.8173846624791622, | |
| "num_tokens": 19199834.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 0.5831953101791442, | |
| "epoch": 0.932, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.704666666666667e-05, | |
| "loss": 9.3531, | |
| "mean_token_accuracy": 0.8273360066115856, | |
| "num_tokens": 19277906.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "entropy": 0.5802978583611548, | |
| "epoch": 0.936, | |
| "grad_norm": 39.75, | |
| "learning_rate": 1.7024444444444445e-05, | |
| "loss": 9.5706, | |
| "mean_token_accuracy": 0.8274037476629019, | |
| "num_tokens": 19361703.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 0.6136010095477105, | |
| "epoch": 0.94, | |
| "grad_norm": 34.75, | |
| "learning_rate": 1.7002222222222226e-05, | |
| "loss": 9.7159, | |
| "mean_token_accuracy": 0.8238754648715257, | |
| "num_tokens": 19444036.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 0.5773532028310001, | |
| "epoch": 0.944, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.698e-05, | |
| "loss": 9.3904, | |
| "mean_token_accuracy": 0.8301644638180733, | |
| "num_tokens": 19525385.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 0.5946352357976139, | |
| "epoch": 0.948, | |
| "grad_norm": 39.5, | |
| "learning_rate": 1.695777777777778e-05, | |
| "loss": 9.5084, | |
| "mean_token_accuracy": 0.8257039908319712, | |
| "num_tokens": 19606660.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "entropy": 0.5877110615372658, | |
| "epoch": 0.952, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.6935555555555555e-05, | |
| "loss": 9.5004, | |
| "mean_token_accuracy": 0.8257838502526283, | |
| "num_tokens": 19690715.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 0.5759502210654318, | |
| "epoch": 0.956, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.6913333333333336e-05, | |
| "loss": 9.3654, | |
| "mean_token_accuracy": 0.8302419554442168, | |
| "num_tokens": 19773293.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "entropy": 0.6265336956828833, | |
| "epoch": 0.96, | |
| "grad_norm": 36.25, | |
| "learning_rate": 1.689111111111111e-05, | |
| "loss": 10.1027, | |
| "mean_token_accuracy": 0.8185567751526832, | |
| "num_tokens": 19856973.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_chemistry_entropy": 0.6710790105164051, | |
| "eval_chemistry_loss": 0.66240394115448, | |
| "eval_chemistry_mean_token_accuracy": 0.8114456446170807, | |
| "eval_chemistry_num_tokens": 19856973.0, | |
| "eval_chemistry_runtime": 53.5716, | |
| "eval_chemistry_samples_per_second": 9.333, | |
| "eval_chemistry_steps_per_second": 9.333, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_physics_entropy": 0.6094872635900974, | |
| "eval_physics_loss": 0.5976263284683228, | |
| "eval_physics_mean_token_accuracy": 0.826532019495964, | |
| "eval_physics_num_tokens": 19856973.0, | |
| "eval_physics_runtime": 62.3185, | |
| "eval_physics_samples_per_second": 8.023, | |
| "eval_physics_steps_per_second": 8.023, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 0.5884977007284761, | |
| "epoch": 0.964, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.686888888888889e-05, | |
| "loss": 9.3858, | |
| "mean_token_accuracy": 0.8279317501932383, | |
| "num_tokens": 19939168.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "entropy": 0.5823706248775125, | |
| "epoch": 0.968, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.684666666666667e-05, | |
| "loss": 9.5014, | |
| "mean_token_accuracy": 0.8305404994636774, | |
| "num_tokens": 20021093.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 0.574251430016011, | |
| "epoch": 0.972, | |
| "grad_norm": 39.75, | |
| "learning_rate": 1.6824444444444447e-05, | |
| "loss": 9.1394, | |
| "mean_token_accuracy": 0.8329744711518288, | |
| "num_tokens": 20109033.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "entropy": 0.6081912533380092, | |
| "epoch": 0.976, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.6802222222222224e-05, | |
| "loss": 9.9884, | |
| "mean_token_accuracy": 0.8211883638054133, | |
| "num_tokens": 20190600.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 0.5859704153612256, | |
| "epoch": 0.98, | |
| "grad_norm": 39.5, | |
| "learning_rate": 1.6780000000000002e-05, | |
| "loss": 9.3289, | |
| "mean_token_accuracy": 0.8274181935936212, | |
| "num_tokens": 20269722.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 0.5964566855691373, | |
| "epoch": 0.984, | |
| "grad_norm": 43.0, | |
| "learning_rate": 1.675777777777778e-05, | |
| "loss": 9.6344, | |
| "mean_token_accuracy": 0.8237358272075653, | |
| "num_tokens": 20352255.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 0.5837171670980752, | |
| "epoch": 0.988, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.6735555555555557e-05, | |
| "loss": 9.4718, | |
| "mean_token_accuracy": 0.8284726958721876, | |
| "num_tokens": 20433069.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "entropy": 0.5878238414414227, | |
| "epoch": 0.992, | |
| "grad_norm": 39.25, | |
| "learning_rate": 1.6713333333333335e-05, | |
| "loss": 9.5302, | |
| "mean_token_accuracy": 0.8285502199083566, | |
| "num_tokens": 20518014.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 0.5914754659868777, | |
| "epoch": 0.996, | |
| "grad_norm": 35.5, | |
| "learning_rate": 1.6691111111111112e-05, | |
| "loss": 9.4272, | |
| "mean_token_accuracy": 0.8259847860783338, | |
| "num_tokens": 20602688.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "entropy": 0.565635210648179, | |
| "epoch": 1.0, | |
| "grad_norm": 44.75, | |
| "learning_rate": 1.666888888888889e-05, | |
| "loss": 9.1451, | |
| "mean_token_accuracy": 0.8311932869255543, | |
| "num_tokens": 20686804.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_chemistry_entropy": 0.6437437009811401, | |
| "eval_chemistry_loss": 0.6624347567558289, | |
| "eval_chemistry_mean_token_accuracy": 0.8123893816471099, | |
| "eval_chemistry_num_tokens": 20686804.0, | |
| "eval_chemistry_runtime": 53.5577, | |
| "eval_chemistry_samples_per_second": 9.336, | |
| "eval_chemistry_steps_per_second": 9.336, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_physics_entropy": 0.5847585182189942, | |
| "eval_physics_loss": 0.5949397087097168, | |
| "eval_physics_mean_token_accuracy": 0.8267857027053833, | |
| "eval_physics_num_tokens": 20686804.0, | |
| "eval_physics_runtime": 62.3711, | |
| "eval_physics_samples_per_second": 8.017, | |
| "eval_physics_steps_per_second": 8.017, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 0.43183089010417464, | |
| "epoch": 1.004, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.6646666666666668e-05, | |
| "loss": 6.4646, | |
| "mean_token_accuracy": 0.8771685600280762, | |
| "num_tokens": 20773982.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "entropy": 0.3832437695004046, | |
| "epoch": 1.008, | |
| "grad_norm": 43.0, | |
| "learning_rate": 1.6624444444444445e-05, | |
| "loss": 6.4611, | |
| "mean_token_accuracy": 0.878194584697485, | |
| "num_tokens": 20859215.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 0.4238553931005299, | |
| "epoch": 1.012, | |
| "grad_norm": 71.5, | |
| "learning_rate": 1.6602222222222223e-05, | |
| "loss": 6.7028, | |
| "mean_token_accuracy": 0.8725548766553402, | |
| "num_tokens": 20939306.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "entropy": 0.39157483652234076, | |
| "epoch": 1.016, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.658e-05, | |
| "loss": 6.2925, | |
| "mean_token_accuracy": 0.8792892958968878, | |
| "num_tokens": 21019542.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 0.3955075446516275, | |
| "epoch": 1.02, | |
| "grad_norm": 51.0, | |
| "learning_rate": 1.6557777777777778e-05, | |
| "loss": 6.4485, | |
| "mean_token_accuracy": 0.8764517173171044, | |
| "num_tokens": 21104882.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 0.41332366890273986, | |
| "epoch": 1.024, | |
| "grad_norm": 38.75, | |
| "learning_rate": 1.6535555555555556e-05, | |
| "loss": 6.582, | |
| "mean_token_accuracy": 0.8762725710868835, | |
| "num_tokens": 21186658.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 0.38238424006849525, | |
| "epoch": 1.028, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.6513333333333333e-05, | |
| "loss": 6.2961, | |
| "mean_token_accuracy": 0.8797769464552403, | |
| "num_tokens": 21268030.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "entropy": 0.4287768125534058, | |
| "epoch": 1.032, | |
| "grad_norm": 44.5, | |
| "learning_rate": 1.6491111111111114e-05, | |
| "loss": 6.7601, | |
| "mean_token_accuracy": 0.8724630124866962, | |
| "num_tokens": 21351715.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 0.37041244744323193, | |
| "epoch": 1.036, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.646888888888889e-05, | |
| "loss": 6.1737, | |
| "mean_token_accuracy": 0.8821764189749957, | |
| "num_tokens": 21431494.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "entropy": 0.41900953110307454, | |
| "epoch": 1.04, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.644666666666667e-05, | |
| "loss": 6.6902, | |
| "mean_token_accuracy": 0.8732660066336393, | |
| "num_tokens": 21517237.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "eval_chemistry_entropy": 0.5526932182312012, | |
| "eval_chemistry_loss": 0.6754207015037537, | |
| "eval_chemistry_mean_token_accuracy": 0.8118693482875824, | |
| "eval_chemistry_num_tokens": 21517237.0, | |
| "eval_chemistry_runtime": 53.6728, | |
| "eval_chemistry_samples_per_second": 9.316, | |
| "eval_chemistry_steps_per_second": 9.316, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "eval_physics_entropy": 0.5128993675708771, | |
| "eval_physics_loss": 0.6065174341201782, | |
| "eval_physics_mean_token_accuracy": 0.8264631150960923, | |
| "eval_physics_num_tokens": 21517237.0, | |
| "eval_physics_runtime": 62.4303, | |
| "eval_physics_samples_per_second": 8.009, | |
| "eval_physics_steps_per_second": 8.009, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 0.3923122527077794, | |
| "epoch": 1.044, | |
| "grad_norm": 52.5, | |
| "learning_rate": 1.6424444444444444e-05, | |
| "loss": 6.23, | |
| "mean_token_accuracy": 0.882063326984644, | |
| "num_tokens": 21597915.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "entropy": 0.4064705569297075, | |
| "epoch": 1.048, | |
| "grad_norm": 36.25, | |
| "learning_rate": 1.6402222222222225e-05, | |
| "loss": 6.488, | |
| "mean_token_accuracy": 0.8772788297384977, | |
| "num_tokens": 21682780.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 0.38905340125784277, | |
| "epoch": 1.052, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.638e-05, | |
| "loss": 6.371, | |
| "mean_token_accuracy": 0.8803388915956021, | |
| "num_tokens": 21768676.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "entropy": 0.3746905536390841, | |
| "epoch": 1.056, | |
| "grad_norm": 36.75, | |
| "learning_rate": 1.635777777777778e-05, | |
| "loss": 5.9046, | |
| "mean_token_accuracy": 0.8856078766286373, | |
| "num_tokens": 21852528.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 0.3955009472556412, | |
| "epoch": 1.06, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.6335555555555558e-05, | |
| "loss": 6.4522, | |
| "mean_token_accuracy": 0.8777095880359411, | |
| "num_tokens": 21939230.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 0.4184166899416596, | |
| "epoch": 1.064, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.6313333333333335e-05, | |
| "loss": 6.6604, | |
| "mean_token_accuracy": 0.8737866301089525, | |
| "num_tokens": 22021514.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 0.4157205421477556, | |
| "epoch": 1.068, | |
| "grad_norm": 44.0, | |
| "learning_rate": 1.6291111111111113e-05, | |
| "loss": 6.8267, | |
| "mean_token_accuracy": 0.8705848950892687, | |
| "num_tokens": 22106780.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "entropy": 0.4116486982442439, | |
| "epoch": 1.072, | |
| "grad_norm": 35.75, | |
| "learning_rate": 1.626888888888889e-05, | |
| "loss": 6.48, | |
| "mean_token_accuracy": 0.8772596288472414, | |
| "num_tokens": 22192380.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 0.40143947368487715, | |
| "epoch": 1.076, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.6246666666666668e-05, | |
| "loss": 6.5291, | |
| "mean_token_accuracy": 0.87435936704278, | |
| "num_tokens": 22276772.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "entropy": 0.4018312363885343, | |
| "epoch": 1.08, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.6224444444444446e-05, | |
| "loss": 6.2389, | |
| "mean_token_accuracy": 0.879636450484395, | |
| "num_tokens": 22356835.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "eval_chemistry_entropy": 0.528339877396822, | |
| "eval_chemistry_loss": 0.6824756860733032, | |
| "eval_chemistry_mean_token_accuracy": 0.810719025850296, | |
| "eval_chemistry_num_tokens": 22356835.0, | |
| "eval_chemistry_runtime": 54.3738, | |
| "eval_chemistry_samples_per_second": 9.196, | |
| "eval_chemistry_steps_per_second": 9.196, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "eval_physics_entropy": 0.48860630974173547, | |
| "eval_physics_loss": 0.6100344061851501, | |
| "eval_physics_mean_token_accuracy": 0.8260838712453842, | |
| "eval_physics_num_tokens": 22356835.0, | |
| "eval_physics_runtime": 62.4853, | |
| "eval_physics_samples_per_second": 8.002, | |
| "eval_physics_steps_per_second": 8.002, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 0.3871776068583131, | |
| "epoch": 1.084, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.6202222222222223e-05, | |
| "loss": 6.3317, | |
| "mean_token_accuracy": 0.8779322851449252, | |
| "num_tokens": 22438638.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "entropy": 0.4119948163162917, | |
| "epoch": 1.088, | |
| "grad_norm": 59.75, | |
| "learning_rate": 1.618e-05, | |
| "loss": 6.6244, | |
| "mean_token_accuracy": 0.8736134715378284, | |
| "num_tokens": 22517971.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 0.39056964181363585, | |
| "epoch": 1.092, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.615777777777778e-05, | |
| "loss": 6.2406, | |
| "mean_token_accuracy": 0.8803118593990803, | |
| "num_tokens": 22599297.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "entropy": 0.4071198822930455, | |
| "epoch": 1.096, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.6135555555555556e-05, | |
| "loss": 6.5373, | |
| "mean_token_accuracy": 0.8748777855187655, | |
| "num_tokens": 22681680.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 0.38830908089876176, | |
| "epoch": 1.1, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.6113333333333334e-05, | |
| "loss": 6.2908, | |
| "mean_token_accuracy": 0.8802915759384632, | |
| "num_tokens": 22768521.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 0.3899666819255799, | |
| "epoch": 1.104, | |
| "grad_norm": 35.0, | |
| "learning_rate": 1.609111111111111e-05, | |
| "loss": 6.1995, | |
| "mean_token_accuracy": 0.8794200260192155, | |
| "num_tokens": 22848694.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 0.3982271538116038, | |
| "epoch": 1.108, | |
| "grad_norm": 49.5, | |
| "learning_rate": 1.606888888888889e-05, | |
| "loss": 6.5132, | |
| "mean_token_accuracy": 0.8748826995491982, | |
| "num_tokens": 22934675.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "entropy": 0.40925240265205504, | |
| "epoch": 1.112, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.6046666666666667e-05, | |
| "loss": 6.3908, | |
| "mean_token_accuracy": 0.8794612839818001, | |
| "num_tokens": 23016014.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 0.3979142210446298, | |
| "epoch": 1.116, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.6024444444444444e-05, | |
| "loss": 6.6062, | |
| "mean_token_accuracy": 0.8738579101860523, | |
| "num_tokens": 23099672.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "entropy": 0.390972594358027, | |
| "epoch": 1.12, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.6002222222222222e-05, | |
| "loss": 6.0516, | |
| "mean_token_accuracy": 0.8840138420462609, | |
| "num_tokens": 23183889.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_chemistry_entropy": 0.5277839957475662, | |
| "eval_chemistry_loss": 0.6832934617996216, | |
| "eval_chemistry_mean_token_accuracy": 0.8105767689943314, | |
| "eval_chemistry_num_tokens": 23183889.0, | |
| "eval_chemistry_runtime": 53.7065, | |
| "eval_chemistry_samples_per_second": 9.31, | |
| "eval_chemistry_steps_per_second": 9.31, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_physics_entropy": 0.49138612353801725, | |
| "eval_physics_loss": 0.6074111461639404, | |
| "eval_physics_mean_token_accuracy": 0.8264175375699997, | |
| "eval_physics_num_tokens": 23183889.0, | |
| "eval_physics_runtime": 62.3744, | |
| "eval_physics_samples_per_second": 8.016, | |
| "eval_physics_steps_per_second": 8.016, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 0.35932957073673605, | |
| "epoch": 1.124, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.5980000000000003e-05, | |
| "loss": 6.0668, | |
| "mean_token_accuracy": 0.8830338750034571, | |
| "num_tokens": 23273263.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "entropy": 0.3861358379013836, | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": 49.75, | |
| "learning_rate": 1.5957777777777777e-05, | |
| "loss": 6.0376, | |
| "mean_token_accuracy": 0.8845888160169124, | |
| "num_tokens": 23354096.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 0.3910831346176565, | |
| "epoch": 1.1320000000000001, | |
| "grad_norm": 40.5, | |
| "learning_rate": 1.5935555555555558e-05, | |
| "loss": 6.3203, | |
| "mean_token_accuracy": 0.8786633264273405, | |
| "num_tokens": 23435009.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "entropy": 0.39575566886924207, | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 39.25, | |
| "learning_rate": 1.5913333333333332e-05, | |
| "loss": 6.4965, | |
| "mean_token_accuracy": 0.8768609743565321, | |
| "num_tokens": 23523903.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 0.41750739673152565, | |
| "epoch": 1.1400000000000001, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.5891111111111113e-05, | |
| "loss": 6.597, | |
| "mean_token_accuracy": 0.8746554154902697, | |
| "num_tokens": 23609422.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 0.40066886888816955, | |
| "epoch": 1.144, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.5868888888888888e-05, | |
| "loss": 6.5186, | |
| "mean_token_accuracy": 0.8758725844323635, | |
| "num_tokens": 23684571.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 0.4339528376236558, | |
| "epoch": 1.148, | |
| "grad_norm": 40.0, | |
| "learning_rate": 1.584666666666667e-05, | |
| "loss": 6.9187, | |
| "mean_token_accuracy": 0.8712858468294143, | |
| "num_tokens": 23772422.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "entropy": 0.407416142616421, | |
| "epoch": 1.152, | |
| "grad_norm": 39.25, | |
| "learning_rate": 1.5824444444444446e-05, | |
| "loss": 6.623, | |
| "mean_token_accuracy": 0.8760526139289141, | |
| "num_tokens": 23848848.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 0.41878221230581403, | |
| "epoch": 1.156, | |
| "grad_norm": 38.0, | |
| "learning_rate": 1.5802222222222224e-05, | |
| "loss": 6.6909, | |
| "mean_token_accuracy": 0.8737323805689812, | |
| "num_tokens": 23927139.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "entropy": 0.3954612009227276, | |
| "epoch": 1.16, | |
| "grad_norm": 48.25, | |
| "learning_rate": 1.578e-05, | |
| "loss": 6.3773, | |
| "mean_token_accuracy": 0.8771356221288442, | |
| "num_tokens": 24011654.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "eval_chemistry_entropy": 0.5298384475111961, | |
| "eval_chemistry_loss": 0.6839735507965088, | |
| "eval_chemistry_mean_token_accuracy": 0.8118355718851089, | |
| "eval_chemistry_num_tokens": 24011654.0, | |
| "eval_chemistry_runtime": 53.5702, | |
| "eval_chemistry_samples_per_second": 9.334, | |
| "eval_chemistry_steps_per_second": 9.334, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "eval_physics_entropy": 0.4960176517665386, | |
| "eval_physics_loss": 0.6066020131111145, | |
| "eval_physics_mean_token_accuracy": 0.8260394994020462, | |
| "eval_physics_num_tokens": 24011654.0, | |
| "eval_physics_runtime": 62.3484, | |
| "eval_physics_samples_per_second": 8.019, | |
| "eval_physics_steps_per_second": 8.019, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 0.38181931935250757, | |
| "epoch": 1.164, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.575777777777778e-05, | |
| "loss": 6.1702, | |
| "mean_token_accuracy": 0.8803447645157576, | |
| "num_tokens": 24090340.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "entropy": 0.41627788487821815, | |
| "epoch": 1.168, | |
| "grad_norm": 54.75, | |
| "learning_rate": 1.5735555555555557e-05, | |
| "loss": 6.6549, | |
| "mean_token_accuracy": 0.8718316704034805, | |
| "num_tokens": 24174589.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 0.3925532539375126, | |
| "epoch": 1.172, | |
| "grad_norm": 49.75, | |
| "learning_rate": 1.5713333333333334e-05, | |
| "loss": 6.2928, | |
| "mean_token_accuracy": 0.8777770098298788, | |
| "num_tokens": 24256063.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "entropy": 0.43556569516658783, | |
| "epoch": 1.176, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.5691111111111112e-05, | |
| "loss": 7.023, | |
| "mean_token_accuracy": 0.8686692509800196, | |
| "num_tokens": 24336249.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 0.3898361031897366, | |
| "epoch": 1.18, | |
| "grad_norm": 36.5, | |
| "learning_rate": 1.5668888888888893e-05, | |
| "loss": 6.1418, | |
| "mean_token_accuracy": 0.8812136992812156, | |
| "num_tokens": 24415615.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 0.37849364345893266, | |
| "epoch": 1.184, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.5646666666666667e-05, | |
| "loss": 6.2013, | |
| "mean_token_accuracy": 0.8820504449307919, | |
| "num_tokens": 24495981.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 0.4079933548346162, | |
| "epoch": 1.188, | |
| "grad_norm": 49.0, | |
| "learning_rate": 1.5624444444444448e-05, | |
| "loss": 6.6523, | |
| "mean_token_accuracy": 0.8723030515015125, | |
| "num_tokens": 24575688.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "entropy": 0.3756624376401305, | |
| "epoch": 1.192, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.5602222222222222e-05, | |
| "loss": 5.8133, | |
| "mean_token_accuracy": 0.8876727018505335, | |
| "num_tokens": 24658862.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 0.3933048089966178, | |
| "epoch": 1.196, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.5580000000000003e-05, | |
| "loss": 6.577, | |
| "mean_token_accuracy": 0.8762084610760212, | |
| "num_tokens": 24742908.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "entropy": 0.41905345749109985, | |
| "epoch": 1.2, | |
| "grad_norm": 51.0, | |
| "learning_rate": 1.5557777777777778e-05, | |
| "loss": 6.5299, | |
| "mean_token_accuracy": 0.8759240590035915, | |
| "num_tokens": 24826245.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "eval_chemistry_entropy": 0.5453705522418022, | |
| "eval_chemistry_loss": 0.6793319582939148, | |
| "eval_chemistry_mean_token_accuracy": 0.8109741320610047, | |
| "eval_chemistry_num_tokens": 24826245.0, | |
| "eval_chemistry_runtime": 53.5792, | |
| "eval_chemistry_samples_per_second": 9.332, | |
| "eval_chemistry_steps_per_second": 9.332, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "eval_physics_entropy": 0.5037744672298431, | |
| "eval_physics_loss": 0.6024576425552368, | |
| "eval_physics_mean_token_accuracy": 0.8275209935307503, | |
| "eval_physics_num_tokens": 24826245.0, | |
| "eval_physics_runtime": 62.4678, | |
| "eval_physics_samples_per_second": 8.004, | |
| "eval_physics_steps_per_second": 8.004, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 0.3997825036756694, | |
| "epoch": 1.204, | |
| "grad_norm": 37.5, | |
| "learning_rate": 1.553555555555556e-05, | |
| "loss": 6.5565, | |
| "mean_token_accuracy": 0.8763858702033758, | |
| "num_tokens": 24909586.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "entropy": 0.38890892546623945, | |
| "epoch": 1.208, | |
| "grad_norm": 49.75, | |
| "learning_rate": 1.5513333333333333e-05, | |
| "loss": 6.2919, | |
| "mean_token_accuracy": 0.88029565513134, | |
| "num_tokens": 24988821.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 0.4091858469881117, | |
| "epoch": 1.212, | |
| "grad_norm": 51.25, | |
| "learning_rate": 1.5491111111111114e-05, | |
| "loss": 6.5897, | |
| "mean_token_accuracy": 0.8751243494451046, | |
| "num_tokens": 25069416.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "entropy": 0.4264195324853063, | |
| "epoch": 1.216, | |
| "grad_norm": 54.25, | |
| "learning_rate": 1.546888888888889e-05, | |
| "loss": 6.8662, | |
| "mean_token_accuracy": 0.869670495390892, | |
| "num_tokens": 25147775.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 0.3972005158662796, | |
| "epoch": 1.22, | |
| "grad_norm": 46.0, | |
| "learning_rate": 1.544666666666667e-05, | |
| "loss": 6.4659, | |
| "mean_token_accuracy": 0.8787313677370548, | |
| "num_tokens": 25228447.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 0.42326033059507606, | |
| "epoch": 1.224, | |
| "grad_norm": 43.0, | |
| "learning_rate": 1.5424444444444447e-05, | |
| "loss": 6.635, | |
| "mean_token_accuracy": 0.8740264683961868, | |
| "num_tokens": 25312652.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 0.3997484935447574, | |
| "epoch": 1.228, | |
| "grad_norm": 37.5, | |
| "learning_rate": 1.5402222222222224e-05, | |
| "loss": 6.4985, | |
| "mean_token_accuracy": 0.8779609993100166, | |
| "num_tokens": 25395706.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "entropy": 0.4184209566563368, | |
| "epoch": 1.232, | |
| "grad_norm": 46.5, | |
| "learning_rate": 1.5380000000000002e-05, | |
| "loss": 6.6546, | |
| "mean_token_accuracy": 0.8734137929975987, | |
| "num_tokens": 25480033.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 0.3735330808442086, | |
| "epoch": 1.236, | |
| "grad_norm": 38.75, | |
| "learning_rate": 1.535777777777778e-05, | |
| "loss": 6.165, | |
| "mean_token_accuracy": 0.8828396521508693, | |
| "num_tokens": 25559873.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "entropy": 0.3877786135300994, | |
| "epoch": 1.24, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.5335555555555557e-05, | |
| "loss": 6.1214, | |
| "mean_token_accuracy": 0.8815136577934026, | |
| "num_tokens": 25643821.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "eval_chemistry_entropy": 0.5628473788201809, | |
| "eval_chemistry_loss": 0.6753941774368286, | |
| "eval_chemistry_mean_token_accuracy": 0.8121462626457214, | |
| "eval_chemistry_num_tokens": 25643821.0, | |
| "eval_chemistry_runtime": 53.6798, | |
| "eval_chemistry_samples_per_second": 9.314, | |
| "eval_chemistry_steps_per_second": 9.314, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "eval_physics_entropy": 0.5132348291277885, | |
| "eval_physics_loss": 0.5996105074882507, | |
| "eval_physics_mean_token_accuracy": 0.8279496403336525, | |
| "eval_physics_num_tokens": 25643821.0, | |
| "eval_physics_runtime": 62.4176, | |
| "eval_physics_samples_per_second": 8.011, | |
| "eval_physics_steps_per_second": 8.011, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 0.3988831571303308, | |
| "epoch": 1.244, | |
| "grad_norm": 53.0, | |
| "learning_rate": 1.5313333333333335e-05, | |
| "loss": 6.3279, | |
| "mean_token_accuracy": 0.8781545918434859, | |
| "num_tokens": 25725796.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "entropy": 0.39984851367771623, | |
| "epoch": 1.248, | |
| "grad_norm": 50.0, | |
| "learning_rate": 1.5291111111111112e-05, | |
| "loss": 6.6278, | |
| "mean_token_accuracy": 0.8735591024160385, | |
| "num_tokens": 25811978.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 0.42747304858639834, | |
| "epoch": 1.252, | |
| "grad_norm": 54.25, | |
| "learning_rate": 1.526888888888889e-05, | |
| "loss": 6.744, | |
| "mean_token_accuracy": 0.873236209899187, | |
| "num_tokens": 25889082.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "entropy": 0.3893546565435827, | |
| "epoch": 1.256, | |
| "grad_norm": 45.5, | |
| "learning_rate": 1.5246666666666668e-05, | |
| "loss": 6.2849, | |
| "mean_token_accuracy": 0.8787511799484491, | |
| "num_tokens": 25980217.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 0.3870727655477822, | |
| "epoch": 1.26, | |
| "grad_norm": 49.75, | |
| "learning_rate": 1.5224444444444447e-05, | |
| "loss": 6.2027, | |
| "mean_token_accuracy": 0.8804555464535951, | |
| "num_tokens": 26064259.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 0.3931499463506043, | |
| "epoch": 1.264, | |
| "grad_norm": 47.75, | |
| "learning_rate": 1.5202222222222223e-05, | |
| "loss": 6.2611, | |
| "mean_token_accuracy": 0.8800795335322619, | |
| "num_tokens": 26146299.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 0.4023557474836707, | |
| "epoch": 1.268, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.5180000000000002e-05, | |
| "loss": 6.3743, | |
| "mean_token_accuracy": 0.8788915742188692, | |
| "num_tokens": 26224759.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "entropy": 0.39270303705707194, | |
| "epoch": 1.272, | |
| "grad_norm": 40.75, | |
| "learning_rate": 1.5157777777777778e-05, | |
| "loss": 6.423, | |
| "mean_token_accuracy": 0.8762969862669706, | |
| "num_tokens": 26315594.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 0.43091254495084286, | |
| "epoch": 1.276, | |
| "grad_norm": 49.75, | |
| "learning_rate": 1.5135555555555557e-05, | |
| "loss": 6.909, | |
| "mean_token_accuracy": 0.8699054338037968, | |
| "num_tokens": 26394622.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "entropy": 0.43948563635349275, | |
| "epoch": 1.28, | |
| "grad_norm": 54.25, | |
| "learning_rate": 1.5113333333333335e-05, | |
| "loss": 7.0251, | |
| "mean_token_accuracy": 0.8672812581062317, | |
| "num_tokens": 26478207.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_chemistry_entropy": 0.5399982813298703, | |
| "eval_chemistry_loss": 0.6762998104095459, | |
| "eval_chemistry_mean_token_accuracy": 0.8126340419054031, | |
| "eval_chemistry_num_tokens": 26478207.0, | |
| "eval_chemistry_runtime": 53.9132, | |
| "eval_chemistry_samples_per_second": 9.274, | |
| "eval_chemistry_steps_per_second": 9.274, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_physics_entropy": 0.5009064598977566, | |
| "eval_physics_loss": 0.5996403694152832, | |
| "eval_physics_mean_token_accuracy": 0.828355691075325, | |
| "eval_physics_num_tokens": 26478207.0, | |
| "eval_physics_runtime": 62.7825, | |
| "eval_physics_samples_per_second": 7.964, | |
| "eval_physics_steps_per_second": 7.964, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 0.3829960603266954, | |
| "epoch": 1.284, | |
| "grad_norm": 55.25, | |
| "learning_rate": 1.5091111111111113e-05, | |
| "loss": 6.1965, | |
| "mean_token_accuracy": 0.8819497369229794, | |
| "num_tokens": 26565510.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "entropy": 0.4178653988055885, | |
| "epoch": 1.288, | |
| "grad_norm": 51.5, | |
| "learning_rate": 1.506888888888889e-05, | |
| "loss": 6.6563, | |
| "mean_token_accuracy": 0.8736223887652159, | |
| "num_tokens": 26647567.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 0.36892555058002474, | |
| "epoch": 1.292, | |
| "grad_norm": 38.0, | |
| "learning_rate": 1.5046666666666668e-05, | |
| "loss": 5.88, | |
| "mean_token_accuracy": 0.8862396698445082, | |
| "num_tokens": 26725591.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "entropy": 0.43142272820696237, | |
| "epoch": 1.296, | |
| "grad_norm": 51.0, | |
| "learning_rate": 1.5024444444444445e-05, | |
| "loss": 7.2241, | |
| "mean_token_accuracy": 0.8642895489931106, | |
| "num_tokens": 26806477.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 0.4274237054400146, | |
| "epoch": 1.3, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.5002222222222223e-05, | |
| "loss": 6.6789, | |
| "mean_token_accuracy": 0.8733241800218821, | |
| "num_tokens": 26889441.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 0.40652468013577164, | |
| "epoch": 1.304, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.498e-05, | |
| "loss": 6.6179, | |
| "mean_token_accuracy": 0.8742008306086063, | |
| "num_tokens": 26973124.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 0.42686001770198345, | |
| "epoch": 1.308, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.495777777777778e-05, | |
| "loss": 6.7405, | |
| "mean_token_accuracy": 0.8706731535494328, | |
| "num_tokens": 27055996.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "entropy": 0.37794668129645287, | |
| "epoch": 1.312, | |
| "grad_norm": 50.5, | |
| "learning_rate": 1.4935555555555556e-05, | |
| "loss": 6.1894, | |
| "mean_token_accuracy": 0.8806595619767904, | |
| "num_tokens": 27137528.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 0.40770810479298236, | |
| "epoch": 1.316, | |
| "grad_norm": 53.5, | |
| "learning_rate": 1.4913333333333335e-05, | |
| "loss": 6.4258, | |
| "mean_token_accuracy": 0.8776110667735338, | |
| "num_tokens": 27217725.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "entropy": 0.3946953667793423, | |
| "epoch": 1.32, | |
| "grad_norm": 37.25, | |
| "learning_rate": 1.4891111111111111e-05, | |
| "loss": 6.2872, | |
| "mean_token_accuracy": 0.878388949483633, | |
| "num_tokens": 27300986.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "eval_chemistry_entropy": 0.5206425618231296, | |
| "eval_chemistry_loss": 0.6836199760437012, | |
| "eval_chemistry_mean_token_accuracy": 0.8110970590114593, | |
| "eval_chemistry_num_tokens": 27300986.0, | |
| "eval_chemistry_runtime": 53.7324, | |
| "eval_chemistry_samples_per_second": 9.305, | |
| "eval_chemistry_steps_per_second": 9.305, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "eval_physics_entropy": 0.49023931473493576, | |
| "eval_physics_loss": 0.601118266582489, | |
| "eval_physics_mean_token_accuracy": 0.8291674974560738, | |
| "eval_physics_num_tokens": 27300986.0, | |
| "eval_physics_runtime": 62.4572, | |
| "eval_physics_samples_per_second": 8.005, | |
| "eval_physics_steps_per_second": 8.005, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 0.41491028452292084, | |
| "epoch": 1.324, | |
| "grad_norm": 46.0, | |
| "learning_rate": 1.486888888888889e-05, | |
| "loss": 6.8186, | |
| "mean_token_accuracy": 0.871044621989131, | |
| "num_tokens": 27382547.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "entropy": 0.39110199781134725, | |
| "epoch": 1.328, | |
| "grad_norm": 39.75, | |
| "learning_rate": 1.4846666666666666e-05, | |
| "loss": 6.2792, | |
| "mean_token_accuracy": 0.8789916034787894, | |
| "num_tokens": 27465859.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 0.38766958490014075, | |
| "epoch": 1.332, | |
| "grad_norm": 50.25, | |
| "learning_rate": 1.4824444444444446e-05, | |
| "loss": 6.197, | |
| "mean_token_accuracy": 0.8813134826719761, | |
| "num_tokens": 27545600.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "entropy": 0.39093358917161825, | |
| "epoch": 1.336, | |
| "grad_norm": 63.75, | |
| "learning_rate": 1.4802222222222225e-05, | |
| "loss": 6.2033, | |
| "mean_token_accuracy": 0.8811442971229553, | |
| "num_tokens": 27627148.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 0.40219460977241395, | |
| "epoch": 1.34, | |
| "grad_norm": 53.25, | |
| "learning_rate": 1.478e-05, | |
| "loss": 6.6373, | |
| "mean_token_accuracy": 0.8748365368694067, | |
| "num_tokens": 27711556.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 0.41494241636246443, | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 40.75, | |
| "learning_rate": 1.475777777777778e-05, | |
| "loss": 6.6603, | |
| "mean_token_accuracy": 0.8731846924871206, | |
| "num_tokens": 27795568.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 0.4036995633505285, | |
| "epoch": 1.3479999999999999, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.4735555555555556e-05, | |
| "loss": 6.3684, | |
| "mean_token_accuracy": 0.8784338608384132, | |
| "num_tokens": 27877049.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "entropy": 0.39675605152733623, | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.4713333333333335e-05, | |
| "loss": 6.492, | |
| "mean_token_accuracy": 0.8774603210389614, | |
| "num_tokens": 27958162.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 0.40427405657246707, | |
| "epoch": 1.3559999999999999, | |
| "grad_norm": 53.0, | |
| "learning_rate": 1.4691111111111111e-05, | |
| "loss": 6.4467, | |
| "mean_token_accuracy": 0.8750461477786302, | |
| "num_tokens": 28040211.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "entropy": 0.4059937232173979, | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.466888888888889e-05, | |
| "loss": 6.5553, | |
| "mean_token_accuracy": 0.8755772355943918, | |
| "num_tokens": 28129988.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "eval_chemistry_entropy": 0.5606725987792015, | |
| "eval_chemistry_loss": 0.6744914054870605, | |
| "eval_chemistry_mean_token_accuracy": 0.8119902350902557, | |
| "eval_chemistry_num_tokens": 28129988.0, | |
| "eval_chemistry_runtime": 53.6438, | |
| "eval_chemistry_samples_per_second": 9.321, | |
| "eval_chemistry_steps_per_second": 9.321, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "eval_physics_entropy": 0.5155527636408805, | |
| "eval_physics_loss": 0.5951682925224304, | |
| "eval_physics_mean_token_accuracy": 0.8287101131677628, | |
| "eval_physics_num_tokens": 28129988.0, | |
| "eval_physics_runtime": 62.3428, | |
| "eval_physics_samples_per_second": 8.02, | |
| "eval_physics_steps_per_second": 8.02, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 0.41343814926221967, | |
| "epoch": 1.3639999999999999, | |
| "grad_norm": 47.75, | |
| "learning_rate": 1.4646666666666666e-05, | |
| "loss": 6.6457, | |
| "mean_token_accuracy": 0.8740956641733646, | |
| "num_tokens": 28213255.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "entropy": 0.40158893847838045, | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.4624444444444446e-05, | |
| "loss": 6.3413, | |
| "mean_token_accuracy": 0.8786153614521026, | |
| "num_tokens": 28299534.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 0.40043277870863675, | |
| "epoch": 1.3719999999999999, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.4602222222222225e-05, | |
| "loss": 6.4509, | |
| "mean_token_accuracy": 0.8763358425348997, | |
| "num_tokens": 28380294.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "entropy": 0.40434538712725043, | |
| "epoch": 1.376, | |
| "grad_norm": 45.5, | |
| "learning_rate": 1.4580000000000001e-05, | |
| "loss": 6.4532, | |
| "mean_token_accuracy": 0.8767665542662144, | |
| "num_tokens": 28465903.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 0.40466274144127967, | |
| "epoch": 1.38, | |
| "grad_norm": 44.75, | |
| "learning_rate": 1.455777777777778e-05, | |
| "loss": 6.6484, | |
| "mean_token_accuracy": 0.8732611689716577, | |
| "num_tokens": 28551632.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 0.3937113340944052, | |
| "epoch": 1.384, | |
| "grad_norm": 52.75, | |
| "learning_rate": 1.4535555555555556e-05, | |
| "loss": 6.3348, | |
| "mean_token_accuracy": 0.8784506808966398, | |
| "num_tokens": 28639289.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 0.37205489771440625, | |
| "epoch": 1.388, | |
| "grad_norm": 39.5, | |
| "learning_rate": 1.4513333333333336e-05, | |
| "loss": 6.0176, | |
| "mean_token_accuracy": 0.8824555825442075, | |
| "num_tokens": 28718836.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "entropy": 0.3729272528551519, | |
| "epoch": 1.392, | |
| "grad_norm": 61.75, | |
| "learning_rate": 1.4491111111111111e-05, | |
| "loss": 6.0158, | |
| "mean_token_accuracy": 0.8861456740647554, | |
| "num_tokens": 28801554.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 0.3962272494100034, | |
| "epoch": 1.396, | |
| "grad_norm": 45.25, | |
| "learning_rate": 1.446888888888889e-05, | |
| "loss": 6.415, | |
| "mean_token_accuracy": 0.8784573219716549, | |
| "num_tokens": 28880616.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "entropy": 0.3926007304340601, | |
| "epoch": 1.4, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.4446666666666668e-05, | |
| "loss": 6.2795, | |
| "mean_token_accuracy": 0.8782020095735789, | |
| "num_tokens": 28963885.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "eval_chemistry_entropy": 0.5384700215160847, | |
| "eval_chemistry_loss": 0.6786037683486938, | |
| "eval_chemistry_mean_token_accuracy": 0.8117350234985351, | |
| "eval_chemistry_num_tokens": 28963885.0, | |
| "eval_chemistry_runtime": 53.6924, | |
| "eval_chemistry_samples_per_second": 9.312, | |
| "eval_chemistry_steps_per_second": 9.312, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "eval_physics_entropy": 0.4976633660644293, | |
| "eval_physics_loss": 0.5983567833900452, | |
| "eval_physics_mean_token_accuracy": 0.8291077345609665, | |
| "eval_physics_num_tokens": 28963885.0, | |
| "eval_physics_runtime": 62.5254, | |
| "eval_physics_samples_per_second": 7.997, | |
| "eval_physics_steps_per_second": 7.997, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 0.43768466394394634, | |
| "epoch": 1.404, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.4424444444444446e-05, | |
| "loss": 7.0969, | |
| "mean_token_accuracy": 0.8666778188198805, | |
| "num_tokens": 29044917.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "entropy": 0.39439385654404757, | |
| "epoch": 1.408, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.4402222222222224e-05, | |
| "loss": 6.2961, | |
| "mean_token_accuracy": 0.8791340485215187, | |
| "num_tokens": 29125114.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 0.417308490537107, | |
| "epoch": 1.412, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.4380000000000001e-05, | |
| "loss": 6.7262, | |
| "mean_token_accuracy": 0.8723568994551897, | |
| "num_tokens": 29205761.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "entropy": 0.44801882533356546, | |
| "epoch": 1.416, | |
| "grad_norm": 51.75, | |
| "learning_rate": 1.4357777777777779e-05, | |
| "loss": 7.2488, | |
| "mean_token_accuracy": 0.8641116410493851, | |
| "num_tokens": 29285272.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 0.40163839077576996, | |
| "epoch": 1.42, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.4335555555555556e-05, | |
| "loss": 6.4598, | |
| "mean_token_accuracy": 0.8771747525781393, | |
| "num_tokens": 29370113.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 0.4159936495125294, | |
| "epoch": 1.424, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.4313333333333334e-05, | |
| "loss": 6.6871, | |
| "mean_token_accuracy": 0.8722648743540049, | |
| "num_tokens": 29454334.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 0.40116758705116806, | |
| "epoch": 1.428, | |
| "grad_norm": 50.5, | |
| "learning_rate": 1.4291111111111113e-05, | |
| "loss": 6.2968, | |
| "mean_token_accuracy": 0.87912851087749, | |
| "num_tokens": 29531227.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "entropy": 0.39880895167589187, | |
| "epoch": 1.432, | |
| "grad_norm": 37.25, | |
| "learning_rate": 1.426888888888889e-05, | |
| "loss": 6.4627, | |
| "mean_token_accuracy": 0.8770294614136219, | |
| "num_tokens": 29616865.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 0.40523217897862196, | |
| "epoch": 1.436, | |
| "grad_norm": 52.5, | |
| "learning_rate": 1.4246666666666669e-05, | |
| "loss": 6.5345, | |
| "mean_token_accuracy": 0.8740318398922682, | |
| "num_tokens": 29700797.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "entropy": 0.4052128647454083, | |
| "epoch": 1.44, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.4224444444444445e-05, | |
| "loss": 6.4769, | |
| "mean_token_accuracy": 0.8760498870164156, | |
| "num_tokens": 29789253.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_chemistry_entropy": 0.5394897739291191, | |
| "eval_chemistry_loss": 0.6791832447052002, | |
| "eval_chemistry_mean_token_accuracy": 0.8112371909618378, | |
| "eval_chemistry_num_tokens": 29789253.0, | |
| "eval_chemistry_runtime": 53.648, | |
| "eval_chemistry_samples_per_second": 9.32, | |
| "eval_chemistry_steps_per_second": 9.32, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_physics_entropy": 0.4982254881858826, | |
| "eval_physics_loss": 0.5965704321861267, | |
| "eval_physics_mean_token_accuracy": 0.8296631572246551, | |
| "eval_physics_num_tokens": 29789253.0, | |
| "eval_physics_runtime": 62.3705, | |
| "eval_physics_samples_per_second": 8.017, | |
| "eval_physics_steps_per_second": 8.017, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 0.393766185734421, | |
| "epoch": 1.444, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.4202222222222224e-05, | |
| "loss": 6.4706, | |
| "mean_token_accuracy": 0.8774504791945219, | |
| "num_tokens": 29868398.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "entropy": 0.41584292319603267, | |
| "epoch": 1.448, | |
| "grad_norm": 46.5, | |
| "learning_rate": 1.418e-05, | |
| "loss": 6.6261, | |
| "mean_token_accuracy": 0.8746451210230589, | |
| "num_tokens": 29953006.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 0.3903368269558996, | |
| "epoch": 1.452, | |
| "grad_norm": 51.75, | |
| "learning_rate": 1.4157777777777779e-05, | |
| "loss": 6.3795, | |
| "mean_token_accuracy": 0.8790011119097472, | |
| "num_tokens": 30038607.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "entropy": 0.39744718382135036, | |
| "epoch": 1.456, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.4135555555555555e-05, | |
| "loss": 6.2495, | |
| "mean_token_accuracy": 0.8795612748712301, | |
| "num_tokens": 30128576.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 0.40468403697013855, | |
| "epoch": 1.46, | |
| "grad_norm": 45.25, | |
| "learning_rate": 1.4113333333333334e-05, | |
| "loss": 6.7435, | |
| "mean_token_accuracy": 0.8736757151782513, | |
| "num_tokens": 30210823.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 0.4109737670980394, | |
| "epoch": 1.464, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.4091111111111114e-05, | |
| "loss": 6.3109, | |
| "mean_token_accuracy": 0.8789023902267218, | |
| "num_tokens": 30288894.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 0.4134160808287561, | |
| "epoch": 1.468, | |
| "grad_norm": 52.25, | |
| "learning_rate": 1.406888888888889e-05, | |
| "loss": 6.8543, | |
| "mean_token_accuracy": 0.8711716037243604, | |
| "num_tokens": 30367689.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "entropy": 0.4443310149013996, | |
| "epoch": 1.472, | |
| "grad_norm": 48.25, | |
| "learning_rate": 1.4046666666666669e-05, | |
| "loss": 6.9875, | |
| "mean_token_accuracy": 0.8682264130562544, | |
| "num_tokens": 30452719.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 0.3913025620393455, | |
| "epoch": 1.476, | |
| "grad_norm": 43.0, | |
| "learning_rate": 1.4024444444444445e-05, | |
| "loss": 6.268, | |
| "mean_token_accuracy": 0.8797603964805603, | |
| "num_tokens": 30536190.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "entropy": 0.3860092952847481, | |
| "epoch": 1.48, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.4002222222222224e-05, | |
| "loss": 6.3353, | |
| "mean_token_accuracy": 0.8799397245049476, | |
| "num_tokens": 30620659.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "eval_chemistry_entropy": 0.5516593392193317, | |
| "eval_chemistry_loss": 0.6761385202407837, | |
| "eval_chemistry_mean_token_accuracy": 0.8124918038845063, | |
| "eval_chemistry_num_tokens": 30620659.0, | |
| "eval_chemistry_runtime": 54.0653, | |
| "eval_chemistry_samples_per_second": 9.248, | |
| "eval_chemistry_steps_per_second": 9.248, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "eval_physics_entropy": 0.5061543031334877, | |
| "eval_physics_loss": 0.5945616364479065, | |
| "eval_physics_mean_token_accuracy": 0.8298771587014199, | |
| "eval_physics_num_tokens": 30620659.0, | |
| "eval_physics_runtime": 62.705, | |
| "eval_physics_samples_per_second": 7.974, | |
| "eval_physics_steps_per_second": 7.974, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 0.4068119526375085, | |
| "epoch": 1.484, | |
| "grad_norm": 39.75, | |
| "learning_rate": 1.398e-05, | |
| "loss": 6.3487, | |
| "mean_token_accuracy": 0.8779210794717074, | |
| "num_tokens": 30700777.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "entropy": 0.3898567968979478, | |
| "epoch": 1.488, | |
| "grad_norm": 90.5, | |
| "learning_rate": 1.395777777777778e-05, | |
| "loss": 6.3799, | |
| "mean_token_accuracy": 0.8781911455094814, | |
| "num_tokens": 30782343.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 0.4058146263472736, | |
| "epoch": 1.492, | |
| "grad_norm": 56.0, | |
| "learning_rate": 1.3935555555555557e-05, | |
| "loss": 6.4268, | |
| "mean_token_accuracy": 0.8762706536799669, | |
| "num_tokens": 30867783.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "entropy": 0.3867638412863016, | |
| "epoch": 1.496, | |
| "grad_norm": 50.5, | |
| "learning_rate": 1.3913333333333335e-05, | |
| "loss": 6.3126, | |
| "mean_token_accuracy": 0.8784434229135514, | |
| "num_tokens": 30947703.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 0.41046067867428065, | |
| "epoch": 1.5, | |
| "grad_norm": 49.25, | |
| "learning_rate": 1.3891111111111114e-05, | |
| "loss": 6.6858, | |
| "mean_token_accuracy": 0.8727071076631546, | |
| "num_tokens": 31029847.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 0.3890328987501562, | |
| "epoch": 1.504, | |
| "grad_norm": 48.75, | |
| "learning_rate": 1.386888888888889e-05, | |
| "loss": 6.1404, | |
| "mean_token_accuracy": 0.8820106990635395, | |
| "num_tokens": 31113238.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 0.4187105460092425, | |
| "epoch": 1.508, | |
| "grad_norm": 37.5, | |
| "learning_rate": 1.3846666666666669e-05, | |
| "loss": 6.7747, | |
| "mean_token_accuracy": 0.872544726729393, | |
| "num_tokens": 31198778.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "entropy": 0.4151353104971349, | |
| "epoch": 1.512, | |
| "grad_norm": 37.5, | |
| "learning_rate": 1.3824444444444445e-05, | |
| "loss": 6.7056, | |
| "mean_token_accuracy": 0.871716808527708, | |
| "num_tokens": 31283883.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 0.4268311742693186, | |
| "epoch": 1.516, | |
| "grad_norm": 45.25, | |
| "learning_rate": 1.3802222222222224e-05, | |
| "loss": 6.8163, | |
| "mean_token_accuracy": 0.8693723428994418, | |
| "num_tokens": 31363764.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "entropy": 0.4101722952444106, | |
| "epoch": 1.52, | |
| "grad_norm": 53.75, | |
| "learning_rate": 1.378e-05, | |
| "loss": 6.5721, | |
| "mean_token_accuracy": 0.8730484273284673, | |
| "num_tokens": 31444589.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "eval_chemistry_entropy": 0.5600586042702198, | |
| "eval_chemistry_loss": 0.6745473742485046, | |
| "eval_chemistry_mean_token_accuracy": 0.8113394799232483, | |
| "eval_chemistry_num_tokens": 31444589.0, | |
| "eval_chemistry_runtime": 53.5558, | |
| "eval_chemistry_samples_per_second": 9.336, | |
| "eval_chemistry_steps_per_second": 9.336, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "eval_physics_entropy": 0.5140104497522116, | |
| "eval_physics_loss": 0.5923190116882324, | |
| "eval_physics_mean_token_accuracy": 0.8303033008575439, | |
| "eval_physics_num_tokens": 31444589.0, | |
| "eval_physics_runtime": 64.0549, | |
| "eval_physics_samples_per_second": 7.806, | |
| "eval_physics_steps_per_second": 7.806, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 0.3989151350222528, | |
| "epoch": 1.524, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.375777777777778e-05, | |
| "loss": 6.3236, | |
| "mean_token_accuracy": 0.8772641956806183, | |
| "num_tokens": 31527312.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "entropy": 0.3823771147057414, | |
| "epoch": 1.528, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.3735555555555557e-05, | |
| "loss": 6.2511, | |
| "mean_token_accuracy": 0.880349512398243, | |
| "num_tokens": 31608952.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "entropy": 0.40338705498725175, | |
| "epoch": 1.532, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.3713333333333335e-05, | |
| "loss": 6.3916, | |
| "mean_token_accuracy": 0.8770292654633522, | |
| "num_tokens": 31689112.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "entropy": 0.4159190553240478, | |
| "epoch": 1.536, | |
| "grad_norm": 52.75, | |
| "learning_rate": 1.3691111111111112e-05, | |
| "loss": 6.7008, | |
| "mean_token_accuracy": 0.8723959777504205, | |
| "num_tokens": 31774137.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "entropy": 0.4056047718971968, | |
| "epoch": 1.54, | |
| "grad_norm": 53.5, | |
| "learning_rate": 1.366888888888889e-05, | |
| "loss": 6.5033, | |
| "mean_token_accuracy": 0.874358582124114, | |
| "num_tokens": 31855139.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 0.3725484909489751, | |
| "epoch": 1.544, | |
| "grad_norm": 50.25, | |
| "learning_rate": 1.3646666666666668e-05, | |
| "loss": 5.9503, | |
| "mean_token_accuracy": 0.8854304205626249, | |
| "num_tokens": 31937927.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "entropy": 0.3921050449833274, | |
| "epoch": 1.548, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.3624444444444445e-05, | |
| "loss": 6.4555, | |
| "mean_token_accuracy": 0.8776751708239316, | |
| "num_tokens": 32020695.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "entropy": 0.4078158959746361, | |
| "epoch": 1.552, | |
| "grad_norm": 39.25, | |
| "learning_rate": 1.3602222222222223e-05, | |
| "loss": 6.3901, | |
| "mean_token_accuracy": 0.8780841447412968, | |
| "num_tokens": 32101776.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "entropy": 0.38753908928483727, | |
| "epoch": 1.556, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.3580000000000002e-05, | |
| "loss": 6.2953, | |
| "mean_token_accuracy": 0.878034945949912, | |
| "num_tokens": 32186113.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "entropy": 0.39779740227386357, | |
| "epoch": 1.56, | |
| "grad_norm": 45.5, | |
| "learning_rate": 1.3557777777777778e-05, | |
| "loss": 6.3673, | |
| "mean_token_accuracy": 0.8784384347498417, | |
| "num_tokens": 32271213.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "eval_chemistry_entropy": 0.5504012902975083, | |
| "eval_chemistry_loss": 0.6732741594314575, | |
| "eval_chemistry_mean_token_accuracy": 0.8128898606300354, | |
| "eval_chemistry_num_tokens": 32271213.0, | |
| "eval_chemistry_runtime": 53.7613, | |
| "eval_chemistry_samples_per_second": 9.3, | |
| "eval_chemistry_steps_per_second": 9.3, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "eval_physics_entropy": 0.5060150835365057, | |
| "eval_physics_loss": 0.5907258987426758, | |
| "eval_physics_mean_token_accuracy": 0.8303489559888839, | |
| "eval_physics_num_tokens": 32271213.0, | |
| "eval_physics_runtime": 62.2781, | |
| "eval_physics_samples_per_second": 8.028, | |
| "eval_physics_steps_per_second": 8.028, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 0.39733350006863477, | |
| "epoch": 1.564, | |
| "grad_norm": 47.25, | |
| "learning_rate": 1.3535555555555557e-05, | |
| "loss": 6.4008, | |
| "mean_token_accuracy": 0.8777061901986599, | |
| "num_tokens": 32348533.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "entropy": 0.3997656940482557, | |
| "epoch": 1.568, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.3513333333333333e-05, | |
| "loss": 6.4498, | |
| "mean_token_accuracy": 0.8779531549662352, | |
| "num_tokens": 32428021.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "entropy": 0.3962811196222901, | |
| "epoch": 1.572, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.3491111111111113e-05, | |
| "loss": 6.426, | |
| "mean_token_accuracy": 0.8753586910665035, | |
| "num_tokens": 32507699.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "entropy": 0.4098018233664334, | |
| "epoch": 1.576, | |
| "grad_norm": 46.0, | |
| "learning_rate": 1.3468888888888888e-05, | |
| "loss": 6.4576, | |
| "mean_token_accuracy": 0.8761188194155693, | |
| "num_tokens": 32585407.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "entropy": 0.41058400813490153, | |
| "epoch": 1.58, | |
| "grad_norm": 44.5, | |
| "learning_rate": 1.3446666666666668e-05, | |
| "loss": 6.7808, | |
| "mean_token_accuracy": 0.8717385150492192, | |
| "num_tokens": 32670428.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 0.39778603641316296, | |
| "epoch": 1.584, | |
| "grad_norm": 52.25, | |
| "learning_rate": 1.3424444444444447e-05, | |
| "loss": 6.3636, | |
| "mean_token_accuracy": 0.8788483895361423, | |
| "num_tokens": 32752735.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "entropy": 0.38312863996252416, | |
| "epoch": 1.588, | |
| "grad_norm": 43.0, | |
| "learning_rate": 1.3402222222222223e-05, | |
| "loss": 6.086, | |
| "mean_token_accuracy": 0.8815972603857517, | |
| "num_tokens": 32831548.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "entropy": 0.39520974569022654, | |
| "epoch": 1.592, | |
| "grad_norm": 40.75, | |
| "learning_rate": 1.3380000000000002e-05, | |
| "loss": 6.3631, | |
| "mean_token_accuracy": 0.878123789280653, | |
| "num_tokens": 32912519.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "entropy": 0.3998935830779374, | |
| "epoch": 1.596, | |
| "grad_norm": 49.0, | |
| "learning_rate": 1.3357777777777778e-05, | |
| "loss": 6.4147, | |
| "mean_token_accuracy": 0.8768809117376805, | |
| "num_tokens": 32993186.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "entropy": 0.3972979296930134, | |
| "epoch": 1.6, | |
| "grad_norm": 38.5, | |
| "learning_rate": 1.3335555555555558e-05, | |
| "loss": 6.3371, | |
| "mean_token_accuracy": 0.879148568585515, | |
| "num_tokens": 33075822.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_chemistry_entropy": 0.5355219095349312, | |
| "eval_chemistry_loss": 0.6751107573509216, | |
| "eval_chemistry_mean_token_accuracy": 0.8136024585962296, | |
| "eval_chemistry_num_tokens": 33075822.0, | |
| "eval_chemistry_runtime": 53.4912, | |
| "eval_chemistry_samples_per_second": 9.347, | |
| "eval_chemistry_steps_per_second": 9.347, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_physics_entropy": 0.4959153833836317, | |
| "eval_physics_loss": 0.5912042856216431, | |
| "eval_physics_mean_token_accuracy": 0.8305088123679161, | |
| "eval_physics_num_tokens": 33075822.0, | |
| "eval_physics_runtime": 62.2699, | |
| "eval_physics_samples_per_second": 8.03, | |
| "eval_physics_steps_per_second": 8.03, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 0.38236242197453973, | |
| "epoch": 1.604, | |
| "grad_norm": 40.5, | |
| "learning_rate": 1.3313333333333333e-05, | |
| "loss": 6.115, | |
| "mean_token_accuracy": 0.8812222603708506, | |
| "num_tokens": 33160214.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "entropy": 0.3806236485484987, | |
| "epoch": 1.608, | |
| "grad_norm": 52.25, | |
| "learning_rate": 1.3291111111111113e-05, | |
| "loss": 6.1912, | |
| "mean_token_accuracy": 0.8828809015452862, | |
| "num_tokens": 33250394.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "entropy": 0.3983167938888073, | |
| "epoch": 1.612, | |
| "grad_norm": 44.5, | |
| "learning_rate": 1.3268888888888889e-05, | |
| "loss": 6.4582, | |
| "mean_token_accuracy": 0.877009741216898, | |
| "num_tokens": 33332517.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "entropy": 0.4042822897899896, | |
| "epoch": 1.616, | |
| "grad_norm": 40.0, | |
| "learning_rate": 1.3246666666666668e-05, | |
| "loss": 6.4116, | |
| "mean_token_accuracy": 0.8771112740039826, | |
| "num_tokens": 33411850.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "entropy": 0.3686965470202267, | |
| "epoch": 1.62, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.3224444444444446e-05, | |
| "loss": 5.968, | |
| "mean_token_accuracy": 0.884541454911232, | |
| "num_tokens": 33496033.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 0.38407273441553114, | |
| "epoch": 1.624, | |
| "grad_norm": 36.5, | |
| "learning_rate": 1.3202222222222223e-05, | |
| "loss": 6.3121, | |
| "mean_token_accuracy": 0.8793533518910408, | |
| "num_tokens": 33577978.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "entropy": 0.41591387512162326, | |
| "epoch": 1.6280000000000001, | |
| "grad_norm": 45.5, | |
| "learning_rate": 1.3180000000000001e-05, | |
| "loss": 6.5055, | |
| "mean_token_accuracy": 0.8755687132477761, | |
| "num_tokens": 33658837.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "entropy": 0.3909124245867133, | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 53.25, | |
| "learning_rate": 1.3157777777777778e-05, | |
| "loss": 6.4706, | |
| "mean_token_accuracy": 0.874223543331027, | |
| "num_tokens": 33739456.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "entropy": 0.3910256386734545, | |
| "epoch": 1.6360000000000001, | |
| "grad_norm": 44.5, | |
| "learning_rate": 1.3135555555555558e-05, | |
| "loss": 6.1275, | |
| "mean_token_accuracy": 0.8816055655479431, | |
| "num_tokens": 33824629.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "entropy": 0.3737372735515237, | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 40.75, | |
| "learning_rate": 1.3113333333333334e-05, | |
| "loss": 6.0599, | |
| "mean_token_accuracy": 0.8833718582987785, | |
| "num_tokens": 33904537.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "eval_chemistry_entropy": 0.5424275600910187, | |
| "eval_chemistry_loss": 0.6766899228096008, | |
| "eval_chemistry_mean_token_accuracy": 0.8130618922114372, | |
| "eval_chemistry_num_tokens": 33904537.0, | |
| "eval_chemistry_runtime": 53.8198, | |
| "eval_chemistry_samples_per_second": 9.29, | |
| "eval_chemistry_steps_per_second": 9.29, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "eval_physics_entropy": 0.49989713841676714, | |
| "eval_physics_loss": 0.5913661122322083, | |
| "eval_physics_mean_token_accuracy": 0.8308604298830032, | |
| "eval_physics_num_tokens": 33904537.0, | |
| "eval_physics_runtime": 62.549, | |
| "eval_physics_samples_per_second": 7.994, | |
| "eval_physics_steps_per_second": 7.994, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 0.3754084711894393, | |
| "epoch": 1.6440000000000001, | |
| "grad_norm": 49.0, | |
| "learning_rate": 1.3091111111111113e-05, | |
| "loss": 5.9604, | |
| "mean_token_accuracy": 0.8852428376674653, | |
| "num_tokens": 33987737.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "entropy": 0.3802527576684952, | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 46.5, | |
| "learning_rate": 1.306888888888889e-05, | |
| "loss": 6.0927, | |
| "mean_token_accuracy": 0.8817774288356304, | |
| "num_tokens": 34072022.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "entropy": 0.3913967923261225, | |
| "epoch": 1.6520000000000001, | |
| "grad_norm": 40.5, | |
| "learning_rate": 1.3046666666666668e-05, | |
| "loss": 6.4194, | |
| "mean_token_accuracy": 0.8791510999202728, | |
| "num_tokens": 34154187.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "entropy": 0.40655491799116134, | |
| "epoch": 1.6560000000000001, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.3024444444444446e-05, | |
| "loss": 6.5256, | |
| "mean_token_accuracy": 0.8768716558814049, | |
| "num_tokens": 34236954.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "entropy": 0.396803954988718, | |
| "epoch": 1.6600000000000001, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.3002222222222223e-05, | |
| "loss": 6.2763, | |
| "mean_token_accuracy": 0.8783984709531069, | |
| "num_tokens": 34323386.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 0.39772137328982354, | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 50.75, | |
| "learning_rate": 1.2980000000000001e-05, | |
| "loss": 6.5304, | |
| "mean_token_accuracy": 0.8762561745941639, | |
| "num_tokens": 34398959.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "entropy": 0.395791903976351, | |
| "epoch": 1.6680000000000001, | |
| "grad_norm": 52.0, | |
| "learning_rate": 1.2957777777777779e-05, | |
| "loss": 6.3265, | |
| "mean_token_accuracy": 0.87955605648458, | |
| "num_tokens": 34477272.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "entropy": 0.37107134936377406, | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 40.0, | |
| "learning_rate": 1.2935555555555556e-05, | |
| "loss": 5.933, | |
| "mean_token_accuracy": 0.8845330536365509, | |
| "num_tokens": 34564364.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "entropy": 0.3933924556709826, | |
| "epoch": 1.6760000000000002, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.2913333333333336e-05, | |
| "loss": 6.424, | |
| "mean_token_accuracy": 0.8781571734696627, | |
| "num_tokens": 34649149.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "entropy": 0.41130870999768376, | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 48.25, | |
| "learning_rate": 1.2891111111111112e-05, | |
| "loss": 6.5347, | |
| "mean_token_accuracy": 0.8747550565749407, | |
| "num_tokens": 34732284.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "eval_chemistry_entropy": 0.5510465578138828, | |
| "eval_chemistry_loss": 0.6722443699836731, | |
| "eval_chemistry_mean_token_accuracy": 0.8126014887094498, | |
| "eval_chemistry_num_tokens": 34732284.0, | |
| "eval_chemistry_runtime": 53.7812, | |
| "eval_chemistry_samples_per_second": 9.297, | |
| "eval_chemistry_steps_per_second": 9.297, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "eval_physics_entropy": 0.5065301542282105, | |
| "eval_physics_loss": 0.588254988193512, | |
| "eval_physics_mean_token_accuracy": 0.8313609339594841, | |
| "eval_physics_num_tokens": 34732284.0, | |
| "eval_physics_runtime": 62.4485, | |
| "eval_physics_samples_per_second": 8.007, | |
| "eval_physics_steps_per_second": 8.007, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 0.3890102437697351, | |
| "epoch": 1.6840000000000002, | |
| "grad_norm": 38.75, | |
| "learning_rate": 1.2868888888888891e-05, | |
| "loss": 6.2136, | |
| "mean_token_accuracy": 0.8815694730728865, | |
| "num_tokens": 34818032.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "entropy": 0.3943379674106836, | |
| "epoch": 1.688, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.2846666666666667e-05, | |
| "loss": 6.5283, | |
| "mean_token_accuracy": 0.8765201598405838, | |
| "num_tokens": 34895374.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "entropy": 0.4018792214803398, | |
| "epoch": 1.692, | |
| "grad_norm": 50.25, | |
| "learning_rate": 1.2824444444444446e-05, | |
| "loss": 6.4101, | |
| "mean_token_accuracy": 0.8787004798650742, | |
| "num_tokens": 34975789.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "entropy": 0.4061967826448381, | |
| "epoch": 1.696, | |
| "grad_norm": 47.75, | |
| "learning_rate": 1.2802222222222222e-05, | |
| "loss": 6.5462, | |
| "mean_token_accuracy": 0.8755400076508522, | |
| "num_tokens": 35053712.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "entropy": 0.40665144557133315, | |
| "epoch": 1.7, | |
| "grad_norm": 47.5, | |
| "learning_rate": 1.2780000000000001e-05, | |
| "loss": 6.4962, | |
| "mean_token_accuracy": 0.8734507616609335, | |
| "num_tokens": 35133339.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 0.3985592287033796, | |
| "epoch": 1.704, | |
| "grad_norm": 47.5, | |
| "learning_rate": 1.2757777777777777e-05, | |
| "loss": 6.4644, | |
| "mean_token_accuracy": 0.8761161752045155, | |
| "num_tokens": 35216941.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "entropy": 0.40054804934188726, | |
| "epoch": 1.708, | |
| "grad_norm": 46.0, | |
| "learning_rate": 1.2735555555555557e-05, | |
| "loss": 6.4884, | |
| "mean_token_accuracy": 0.8772318150848151, | |
| "num_tokens": 35305882.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "entropy": 0.41109891440719365, | |
| "epoch": 1.712, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.2713333333333336e-05, | |
| "loss": 6.4997, | |
| "mean_token_accuracy": 0.8744318414479494, | |
| "num_tokens": 35393088.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "entropy": 0.39241413585841656, | |
| "epoch": 1.716, | |
| "grad_norm": 51.0, | |
| "learning_rate": 1.2691111111111112e-05, | |
| "loss": 6.2986, | |
| "mean_token_accuracy": 0.8791064377874136, | |
| "num_tokens": 35475213.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "entropy": 0.3878518451936543, | |
| "epoch": 1.72, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.2668888888888891e-05, | |
| "loss": 6.3365, | |
| "mean_token_accuracy": 0.8789217792451381, | |
| "num_tokens": 35559385.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "eval_chemistry_entropy": 0.5547326206564903, | |
| "eval_chemistry_loss": 0.6720637679100037, | |
| "eval_chemistry_mean_token_accuracy": 0.8123885672688484, | |
| "eval_chemistry_num_tokens": 35559385.0, | |
| "eval_chemistry_runtime": 53.7838, | |
| "eval_chemistry_samples_per_second": 9.296, | |
| "eval_chemistry_steps_per_second": 9.296, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "eval_physics_entropy": 0.5108413529396058, | |
| "eval_physics_loss": 0.5859846472740173, | |
| "eval_physics_mean_token_accuracy": 0.8304098455309867, | |
| "eval_physics_num_tokens": 35559385.0, | |
| "eval_physics_runtime": 62.5915, | |
| "eval_physics_samples_per_second": 7.988, | |
| "eval_physics_steps_per_second": 7.988, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 0.40798138780519366, | |
| "epoch": 1.724, | |
| "grad_norm": 57.75, | |
| "learning_rate": 1.2646666666666667e-05, | |
| "loss": 6.4559, | |
| "mean_token_accuracy": 0.8757931537926197, | |
| "num_tokens": 35641581.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "entropy": 0.397324959281832, | |
| "epoch": 1.728, | |
| "grad_norm": 40.0, | |
| "learning_rate": 1.2624444444444446e-05, | |
| "loss": 6.4721, | |
| "mean_token_accuracy": 0.8773955579847097, | |
| "num_tokens": 35729007.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "entropy": 0.39897064603865146, | |
| "epoch": 1.732, | |
| "grad_norm": 38.25, | |
| "learning_rate": 1.2602222222222222e-05, | |
| "loss": 6.3887, | |
| "mean_token_accuracy": 0.8778614915907383, | |
| "num_tokens": 35817187.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "entropy": 0.38607349833473564, | |
| "epoch": 1.736, | |
| "grad_norm": 55.75, | |
| "learning_rate": 1.2580000000000002e-05, | |
| "loss": 6.3246, | |
| "mean_token_accuracy": 0.8786908566951752, | |
| "num_tokens": 35905199.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "entropy": 0.4305374707095325, | |
| "epoch": 1.74, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.2557777777777779e-05, | |
| "loss": 6.865, | |
| "mean_token_accuracy": 0.8699071381241084, | |
| "num_tokens": 35987415.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 0.40012558540329335, | |
| "epoch": 1.744, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.2535555555555557e-05, | |
| "loss": 6.4054, | |
| "mean_token_accuracy": 0.879029955342412, | |
| "num_tokens": 36072489.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "entropy": 0.3851448309607804, | |
| "epoch": 1.748, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.2513333333333334e-05, | |
| "loss": 6.3061, | |
| "mean_token_accuracy": 0.8798547301441431, | |
| "num_tokens": 36157560.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "entropy": 0.39314311477355657, | |
| "epoch": 1.752, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.2491111111111112e-05, | |
| "loss": 6.2799, | |
| "mean_token_accuracy": 0.8792950961738825, | |
| "num_tokens": 36240600.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "entropy": 0.3847956730984151, | |
| "epoch": 1.756, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.246888888888889e-05, | |
| "loss": 6.114, | |
| "mean_token_accuracy": 0.8813075732439757, | |
| "num_tokens": 36318165.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "entropy": 0.3810992536135018, | |
| "epoch": 1.76, | |
| "grad_norm": 43.0, | |
| "learning_rate": 1.2446666666666667e-05, | |
| "loss": 6.1448, | |
| "mean_token_accuracy": 0.8820089619606734, | |
| "num_tokens": 36398285.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_chemistry_entropy": 0.5499019548296928, | |
| "eval_chemistry_loss": 0.6719583868980408, | |
| "eval_chemistry_mean_token_accuracy": 0.8134790292978287, | |
| "eval_chemistry_num_tokens": 36398285.0, | |
| "eval_chemistry_runtime": 53.5144, | |
| "eval_chemistry_samples_per_second": 9.343, | |
| "eval_chemistry_steps_per_second": 9.343, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_physics_entropy": 0.5046378436386585, | |
| "eval_physics_loss": 0.5862331390380859, | |
| "eval_physics_mean_token_accuracy": 0.8315197025537491, | |
| "eval_physics_num_tokens": 36398285.0, | |
| "eval_physics_runtime": 66.0295, | |
| "eval_physics_samples_per_second": 7.572, | |
| "eval_physics_steps_per_second": 7.572, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 0.414799137134105, | |
| "epoch": 1.764, | |
| "grad_norm": 49.0, | |
| "learning_rate": 1.2424444444444445e-05, | |
| "loss": 6.6864, | |
| "mean_token_accuracy": 0.8711853481829166, | |
| "num_tokens": 36478465.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "entropy": 0.39798269486054777, | |
| "epoch": 1.768, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.2402222222222222e-05, | |
| "loss": 6.3449, | |
| "mean_token_accuracy": 0.8790823489427566, | |
| "num_tokens": 36559891.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "entropy": 0.39331792537122967, | |
| "epoch": 1.772, | |
| "grad_norm": 37.25, | |
| "learning_rate": 1.2380000000000002e-05, | |
| "loss": 6.3936, | |
| "mean_token_accuracy": 0.8780659079551697, | |
| "num_tokens": 36645714.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "entropy": 0.3999379160813987, | |
| "epoch": 1.776, | |
| "grad_norm": 43.0, | |
| "learning_rate": 1.235777777777778e-05, | |
| "loss": 6.4141, | |
| "mean_token_accuracy": 0.8769634023308754, | |
| "num_tokens": 36730479.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "entropy": 0.39414182072505355, | |
| "epoch": 1.78, | |
| "grad_norm": 45.5, | |
| "learning_rate": 1.2335555555555557e-05, | |
| "loss": 6.3443, | |
| "mean_token_accuracy": 0.8797257397323847, | |
| "num_tokens": 36812010.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 0.37193125137127936, | |
| "epoch": 1.784, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.2313333333333335e-05, | |
| "loss": 5.9259, | |
| "mean_token_accuracy": 0.8858068864792585, | |
| "num_tokens": 36897454.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "entropy": 0.41305548381060364, | |
| "epoch": 1.788, | |
| "grad_norm": 45.5, | |
| "learning_rate": 1.2291111111111112e-05, | |
| "loss": 6.668, | |
| "mean_token_accuracy": 0.8740019179880619, | |
| "num_tokens": 36977906.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "entropy": 0.40217553824186325, | |
| "epoch": 1.792, | |
| "grad_norm": 56.75, | |
| "learning_rate": 1.226888888888889e-05, | |
| "loss": 6.4831, | |
| "mean_token_accuracy": 0.8755135674029588, | |
| "num_tokens": 37057425.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "entropy": 0.4117017044685781, | |
| "epoch": 1.796, | |
| "grad_norm": 51.5, | |
| "learning_rate": 1.2246666666666667e-05, | |
| "loss": 6.6424, | |
| "mean_token_accuracy": 0.8731210798025131, | |
| "num_tokens": 37141687.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "entropy": 0.38369811829179523, | |
| "epoch": 1.8, | |
| "grad_norm": 36.75, | |
| "learning_rate": 1.2224444444444445e-05, | |
| "loss": 6.0253, | |
| "mean_token_accuracy": 0.883763050287962, | |
| "num_tokens": 37225440.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "eval_chemistry_entropy": 0.5401435644030571, | |
| "eval_chemistry_loss": 0.6727893352508545, | |
| "eval_chemistry_mean_token_accuracy": 0.8133891487121582, | |
| "eval_chemistry_num_tokens": 37225440.0, | |
| "eval_chemistry_runtime": 53.6064, | |
| "eval_chemistry_samples_per_second": 9.327, | |
| "eval_chemistry_steps_per_second": 9.327, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "eval_physics_entropy": 0.48808869475126265, | |
| "eval_physics_loss": 0.5868460536003113, | |
| "eval_physics_mean_token_accuracy": 0.8314950082302094, | |
| "eval_physics_num_tokens": 37225440.0, | |
| "eval_physics_runtime": 62.3476, | |
| "eval_physics_samples_per_second": 8.02, | |
| "eval_physics_steps_per_second": 8.02, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 0.3714961093850434, | |
| "epoch": 1.804, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.2202222222222224e-05, | |
| "loss": 6.1358, | |
| "mean_token_accuracy": 0.8824129909276962, | |
| "num_tokens": 37311306.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "entropy": 0.4003173102624714, | |
| "epoch": 1.808, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.218e-05, | |
| "loss": 6.3433, | |
| "mean_token_accuracy": 0.8767804615199566, | |
| "num_tokens": 37395449.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "entropy": 0.3983105253893882, | |
| "epoch": 1.812, | |
| "grad_norm": 49.0, | |
| "learning_rate": 1.215777777777778e-05, | |
| "loss": 6.332, | |
| "mean_token_accuracy": 0.8774991031736136, | |
| "num_tokens": 37478804.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "entropy": 0.3816450200974941, | |
| "epoch": 1.8159999999999998, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.2135555555555556e-05, | |
| "loss": 6.243, | |
| "mean_token_accuracy": 0.879349748045206, | |
| "num_tokens": 37565463.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "entropy": 0.41424746057018635, | |
| "epoch": 1.8199999999999998, | |
| "grad_norm": 44.0, | |
| "learning_rate": 1.2113333333333335e-05, | |
| "loss": 6.5729, | |
| "mean_token_accuracy": 0.8759691219776868, | |
| "num_tokens": 37647141.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 0.3897454238496721, | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 49.0, | |
| "learning_rate": 1.209111111111111e-05, | |
| "loss": 6.2392, | |
| "mean_token_accuracy": 0.8819328770041466, | |
| "num_tokens": 37729535.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "entropy": 0.4377773189917207, | |
| "epoch": 1.8279999999999998, | |
| "grad_norm": 51.75, | |
| "learning_rate": 1.206888888888889e-05, | |
| "loss": 7.1286, | |
| "mean_token_accuracy": 0.8668288454413414, | |
| "num_tokens": 37807940.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "entropy": 0.38501444049179556, | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 40.0, | |
| "learning_rate": 1.204666666666667e-05, | |
| "loss": 6.0622, | |
| "mean_token_accuracy": 0.8835205603390932, | |
| "num_tokens": 37888632.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "entropy": 0.38747762781567874, | |
| "epoch": 1.8359999999999999, | |
| "grad_norm": 63.25, | |
| "learning_rate": 1.2024444444444445e-05, | |
| "loss": 6.4734, | |
| "mean_token_accuracy": 0.8781172964721918, | |
| "num_tokens": 37972144.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "entropy": 0.3853580172173679, | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 39.0, | |
| "learning_rate": 1.2002222222222225e-05, | |
| "loss": 6.0641, | |
| "mean_token_accuracy": 0.8832456823438406, | |
| "num_tokens": 38057839.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "eval_chemistry_entropy": 0.5565233428180217, | |
| "eval_chemistry_loss": 0.6685209274291992, | |
| "eval_chemistry_mean_token_accuracy": 0.8136617729663849, | |
| "eval_chemistry_num_tokens": 38057839.0, | |
| "eval_chemistry_runtime": 53.8992, | |
| "eval_chemistry_samples_per_second": 9.277, | |
| "eval_chemistry_steps_per_second": 9.277, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "eval_physics_entropy": 0.5040034144669772, | |
| "eval_physics_loss": 0.5828114151954651, | |
| "eval_physics_mean_token_accuracy": 0.8316534082889557, | |
| "eval_physics_num_tokens": 38057839.0, | |
| "eval_physics_runtime": 62.7885, | |
| "eval_physics_samples_per_second": 7.963, | |
| "eval_physics_steps_per_second": 7.963, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 0.40128663945943116, | |
| "epoch": 1.8439999999999999, | |
| "grad_norm": 44.0, | |
| "learning_rate": 1.198e-05, | |
| "loss": 6.4715, | |
| "mean_token_accuracy": 0.8770762640982867, | |
| "num_tokens": 38141516.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "entropy": 0.39252530760131776, | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": 38.75, | |
| "learning_rate": 1.195777777777778e-05, | |
| "loss": 6.2283, | |
| "mean_token_accuracy": 0.8780303739011288, | |
| "num_tokens": 38228321.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "entropy": 0.39332945430651306, | |
| "epoch": 1.8519999999999999, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.1935555555555556e-05, | |
| "loss": 6.3253, | |
| "mean_token_accuracy": 0.8788463421165943, | |
| "num_tokens": 38314672.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "entropy": 0.37782758264802396, | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 48.5, | |
| "learning_rate": 1.1913333333333335e-05, | |
| "loss": 6.0889, | |
| "mean_token_accuracy": 0.8834712006151676, | |
| "num_tokens": 38398033.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "entropy": 0.3893645564094186, | |
| "epoch": 1.8599999999999999, | |
| "grad_norm": 53.25, | |
| "learning_rate": 1.1891111111111111e-05, | |
| "loss": 6.2706, | |
| "mean_token_accuracy": 0.8788041561841965, | |
| "num_tokens": 38481584.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 0.4083589227870107, | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.186888888888889e-05, | |
| "loss": 6.5162, | |
| "mean_token_accuracy": 0.8751051504164934, | |
| "num_tokens": 38568933.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "entropy": 0.38208632012829186, | |
| "epoch": 1.8679999999999999, | |
| "grad_norm": 42.5, | |
| "learning_rate": 1.1846666666666668e-05, | |
| "loss": 6.0292, | |
| "mean_token_accuracy": 0.8837982200086116, | |
| "num_tokens": 38646842.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "entropy": 0.37446660036221147, | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 42.75, | |
| "learning_rate": 1.1824444444444445e-05, | |
| "loss": 6.0901, | |
| "mean_token_accuracy": 0.8815959721803666, | |
| "num_tokens": 38729203.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "entropy": 0.39576807664707303, | |
| "epoch": 1.876, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.1802222222222223e-05, | |
| "loss": 6.2985, | |
| "mean_token_accuracy": 0.8780396945774556, | |
| "num_tokens": 38811533.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "entropy": 0.3904746563639492, | |
| "epoch": 1.88, | |
| "grad_norm": 51.75, | |
| "learning_rate": 1.178e-05, | |
| "loss": 6.3537, | |
| "mean_token_accuracy": 0.8788532923907042, | |
| "num_tokens": 38895784.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "eval_chemistry_entropy": 0.549619193226099, | |
| "eval_chemistry_loss": 0.6710554361343384, | |
| "eval_chemistry_mean_token_accuracy": 0.8132365696430206, | |
| "eval_chemistry_num_tokens": 38895784.0, | |
| "eval_chemistry_runtime": 53.5414, | |
| "eval_chemistry_samples_per_second": 9.339, | |
| "eval_chemistry_steps_per_second": 9.339, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "eval_physics_entropy": 0.49999719807505605, | |
| "eval_physics_loss": 0.583443820476532, | |
| "eval_physics_mean_token_accuracy": 0.8320970281362533, | |
| "eval_physics_num_tokens": 38895784.0, | |
| "eval_physics_runtime": 62.3798, | |
| "eval_physics_samples_per_second": 8.015, | |
| "eval_physics_steps_per_second": 8.015, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 0.40091329999268055, | |
| "epoch": 1.884, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.1757777777777778e-05, | |
| "loss": 6.461, | |
| "mean_token_accuracy": 0.8758362587541342, | |
| "num_tokens": 38980934.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "entropy": 0.3958409369457513, | |
| "epoch": 1.888, | |
| "grad_norm": 48.75, | |
| "learning_rate": 1.1735555555555556e-05, | |
| "loss": 6.3845, | |
| "mean_token_accuracy": 0.8778816595673561, | |
| "num_tokens": 39058791.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "entropy": 0.4157537971623242, | |
| "epoch": 1.892, | |
| "grad_norm": 52.0, | |
| "learning_rate": 1.1713333333333334e-05, | |
| "loss": 6.7041, | |
| "mean_token_accuracy": 0.8732577003538609, | |
| "num_tokens": 39142381.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "entropy": 0.3967838443815708, | |
| "epoch": 1.896, | |
| "grad_norm": 44.0, | |
| "learning_rate": 1.1691111111111113e-05, | |
| "loss": 6.4117, | |
| "mean_token_accuracy": 0.8777343105524779, | |
| "num_tokens": 39224201.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "entropy": 0.4056083607487381, | |
| "epoch": 1.9, | |
| "grad_norm": 47.0, | |
| "learning_rate": 1.1668888888888889e-05, | |
| "loss": 6.5486, | |
| "mean_token_accuracy": 0.874582264199853, | |
| "num_tokens": 39305803.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "entropy": 0.4043416635133326, | |
| "epoch": 1.904, | |
| "grad_norm": 55.25, | |
| "learning_rate": 1.1646666666666668e-05, | |
| "loss": 6.6009, | |
| "mean_token_accuracy": 0.8759451858699322, | |
| "num_tokens": 39386170.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "entropy": 0.443442501546815, | |
| "epoch": 1.908, | |
| "grad_norm": 44.0, | |
| "learning_rate": 1.1624444444444446e-05, | |
| "loss": 7.0102, | |
| "mean_token_accuracy": 0.8690990075469017, | |
| "num_tokens": 39462457.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "entropy": 0.4046101786196232, | |
| "epoch": 1.912, | |
| "grad_norm": 38.0, | |
| "learning_rate": 1.1602222222222223e-05, | |
| "loss": 6.5874, | |
| "mean_token_accuracy": 0.8764791205525398, | |
| "num_tokens": 39545061.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "entropy": 0.39519280968233944, | |
| "epoch": 1.916, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.1580000000000001e-05, | |
| "loss": 6.3615, | |
| "mean_token_accuracy": 0.8806503396481276, | |
| "num_tokens": 39625674.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "entropy": 0.37955069476738573, | |
| "epoch": 1.92, | |
| "grad_norm": 44.75, | |
| "learning_rate": 1.1557777777777779e-05, | |
| "loss": 5.9753, | |
| "mean_token_accuracy": 0.8847485568374396, | |
| "num_tokens": 39710381.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_chemistry_entropy": 0.5332140245437622, | |
| "eval_chemistry_loss": 0.672716498374939, | |
| "eval_chemistry_mean_token_accuracy": 0.814129298210144, | |
| "eval_chemistry_num_tokens": 39710381.0, | |
| "eval_chemistry_runtime": 53.5948, | |
| "eval_chemistry_samples_per_second": 9.329, | |
| "eval_chemistry_steps_per_second": 9.329, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_physics_entropy": 0.4877495141774416, | |
| "eval_physics_loss": 0.5833487510681152, | |
| "eval_physics_mean_token_accuracy": 0.8329446904063225, | |
| "eval_physics_num_tokens": 39710381.0, | |
| "eval_physics_runtime": 62.3184, | |
| "eval_physics_samples_per_second": 8.023, | |
| "eval_physics_steps_per_second": 8.023, | |
| "step": 4800 | |
| }, | |
| { | |
| "entropy": 0.4095441807992756, | |
| "epoch": 1.924, | |
| "grad_norm": 47.25, | |
| "learning_rate": 1.1535555555555556e-05, | |
| "loss": 6.7103, | |
| "mean_token_accuracy": 0.8711355209350586, | |
| "num_tokens": 39795382.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "entropy": 0.4090301369316876, | |
| "epoch": 1.928, | |
| "grad_norm": 42.0, | |
| "learning_rate": 1.1513333333333334e-05, | |
| "loss": 6.4862, | |
| "mean_token_accuracy": 0.8757699660956859, | |
| "num_tokens": 39878408.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "entropy": 0.40265353601425885, | |
| "epoch": 1.932, | |
| "grad_norm": 39.5, | |
| "learning_rate": 1.1491111111111113e-05, | |
| "loss": 6.5649, | |
| "mean_token_accuracy": 0.8752508915960788, | |
| "num_tokens": 39962873.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "entropy": 0.3662755880970508, | |
| "epoch": 1.936, | |
| "grad_norm": 43.25, | |
| "learning_rate": 1.1468888888888889e-05, | |
| "loss": 5.8123, | |
| "mean_token_accuracy": 0.8867810111492872, | |
| "num_tokens": 40043354.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "entropy": 0.38979168897494676, | |
| "epoch": 1.94, | |
| "grad_norm": 44.75, | |
| "learning_rate": 1.1446666666666668e-05, | |
| "loss": 6.3222, | |
| "mean_token_accuracy": 0.8783317163586617, | |
| "num_tokens": 40124736.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "entropy": 0.37943958956748247, | |
| "epoch": 1.944, | |
| "grad_norm": 39.25, | |
| "learning_rate": 1.1424444444444444e-05, | |
| "loss": 5.9994, | |
| "mean_token_accuracy": 0.8860078375786543, | |
| "num_tokens": 40206607.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "entropy": 0.3684364980086684, | |
| "epoch": 1.948, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.1402222222222224e-05, | |
| "loss": 5.9377, | |
| "mean_token_accuracy": 0.8854397624731064, | |
| "num_tokens": 40294013.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "entropy": 0.38094157055020333, | |
| "epoch": 1.952, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.138e-05, | |
| "loss": 6.0475, | |
| "mean_token_accuracy": 0.8843934826552868, | |
| "num_tokens": 40375790.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "entropy": 0.3802569825667888, | |
| "epoch": 1.956, | |
| "grad_norm": 50.25, | |
| "learning_rate": 1.1357777777777779e-05, | |
| "loss": 6.1904, | |
| "mean_token_accuracy": 0.8817527800798416, | |
| "num_tokens": 40455418.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "entropy": 0.3788024752866477, | |
| "epoch": 1.96, | |
| "grad_norm": 59.25, | |
| "learning_rate": 1.1335555555555558e-05, | |
| "loss": 5.9998, | |
| "mean_token_accuracy": 0.883975101262331, | |
| "num_tokens": 40539856.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "eval_chemistry_entropy": 0.5327692772746087, | |
| "eval_chemistry_loss": 0.6734102964401245, | |
| "eval_chemistry_mean_token_accuracy": 0.8138529658317566, | |
| "eval_chemistry_num_tokens": 40539856.0, | |
| "eval_chemistry_runtime": 53.5819, | |
| "eval_chemistry_samples_per_second": 9.332, | |
| "eval_chemistry_steps_per_second": 9.332, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "eval_physics_entropy": 0.4875757833570242, | |
| "eval_physics_loss": 0.5831025242805481, | |
| "eval_physics_mean_token_accuracy": 0.8328130157589912, | |
| "eval_physics_num_tokens": 40539856.0, | |
| "eval_physics_runtime": 62.266, | |
| "eval_physics_samples_per_second": 8.03, | |
| "eval_physics_steps_per_second": 8.03, | |
| "step": 4900 | |
| }, | |
| { | |
| "entropy": 0.41536389514803884, | |
| "epoch": 1.964, | |
| "grad_norm": 62.5, | |
| "learning_rate": 1.1313333333333334e-05, | |
| "loss": 6.9138, | |
| "mean_token_accuracy": 0.8712738990783692, | |
| "num_tokens": 40621709.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "entropy": 0.3926308457739651, | |
| "epoch": 1.968, | |
| "grad_norm": 48.75, | |
| "learning_rate": 1.1291111111111113e-05, | |
| "loss": 6.1219, | |
| "mean_token_accuracy": 0.8826039176434278, | |
| "num_tokens": 40705451.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "entropy": 0.37549833627417684, | |
| "epoch": 1.972, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.126888888888889e-05, | |
| "loss": 6.1036, | |
| "mean_token_accuracy": 0.8824028592556715, | |
| "num_tokens": 40785961.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "entropy": 0.3708323477767408, | |
| "epoch": 1.976, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.1246666666666669e-05, | |
| "loss": 5.9873, | |
| "mean_token_accuracy": 0.884157856926322, | |
| "num_tokens": 40873446.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "entropy": 0.3954867236316204, | |
| "epoch": 1.98, | |
| "grad_norm": 37.25, | |
| "learning_rate": 1.1224444444444444e-05, | |
| "loss": 6.322, | |
| "mean_token_accuracy": 0.8779752962291241, | |
| "num_tokens": 40956433.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "entropy": 0.3989587856456637, | |
| "epoch": 1.984, | |
| "grad_norm": 52.75, | |
| "learning_rate": 1.1202222222222224e-05, | |
| "loss": 6.5748, | |
| "mean_token_accuracy": 0.8759325701743365, | |
| "num_tokens": 41040313.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "entropy": 0.4050182838924229, | |
| "epoch": 1.988, | |
| "grad_norm": 59.0, | |
| "learning_rate": 1.1180000000000001e-05, | |
| "loss": 6.448, | |
| "mean_token_accuracy": 0.8787617024034262, | |
| "num_tokens": 41124137.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "entropy": 0.39617609707638624, | |
| "epoch": 1.992, | |
| "grad_norm": 44.0, | |
| "learning_rate": 1.1157777777777779e-05, | |
| "loss": 6.4284, | |
| "mean_token_accuracy": 0.8796563293784857, | |
| "num_tokens": 41205940.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "entropy": 0.3897817573044449, | |
| "epoch": 1.996, | |
| "grad_norm": 45.75, | |
| "learning_rate": 1.1135555555555557e-05, | |
| "loss": 6.2676, | |
| "mean_token_accuracy": 0.8792812936007977, | |
| "num_tokens": 41292121.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "entropy": 0.39420619974844157, | |
| "epoch": 2.0, | |
| "grad_norm": 50.0, | |
| "learning_rate": 1.1113333333333334e-05, | |
| "loss": 6.1989, | |
| "mean_token_accuracy": 0.8807359039783478, | |
| "num_tokens": 41373608.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_chemistry_entropy": 0.5424425678253174, | |
| "eval_chemistry_loss": 0.668811023235321, | |
| "eval_chemistry_mean_token_accuracy": 0.8141675420999527, | |
| "eval_chemistry_num_tokens": 41373608.0, | |
| "eval_chemistry_runtime": 53.429, | |
| "eval_chemistry_samples_per_second": 9.358, | |
| "eval_chemistry_steps_per_second": 9.358, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_physics_entropy": 0.49732374693453313, | |
| "eval_physics_loss": 0.5799261331558228, | |
| "eval_physics_mean_token_accuracy": 0.833329997241497, | |
| "eval_physics_num_tokens": 41373608.0, | |
| "eval_physics_runtime": 62.3748, | |
| "eval_physics_samples_per_second": 8.016, | |
| "eval_physics_steps_per_second": 8.016, | |
| "step": 5000 | |
| }, | |
| { | |
| "entropy": 0.23874019854702055, | |
| "epoch": 2.004, | |
| "grad_norm": 58.75, | |
| "learning_rate": 1.1091111111111112e-05, | |
| "loss": 3.3263, | |
| "mean_token_accuracy": 0.9352266754955053, | |
| "num_tokens": 41457392.0, | |
| "step": 5010 | |
| }, | |
| { | |
| "entropy": 0.17612766423262655, | |
| "epoch": 2.008, | |
| "grad_norm": 56.5, | |
| "learning_rate": 1.106888888888889e-05, | |
| "loss": 3.134, | |
| "mean_token_accuracy": 0.9380614425987005, | |
| "num_tokens": 41541054.0, | |
| "step": 5020 | |
| }, | |
| { | |
| "entropy": 0.2073903985787183, | |
| "epoch": 2.012, | |
| "grad_norm": 47.5, | |
| "learning_rate": 1.1046666666666667e-05, | |
| "loss": 3.0936, | |
| "mean_token_accuracy": 0.9398012980818748, | |
| "num_tokens": 41621648.0, | |
| "step": 5030 | |
| }, | |
| { | |
| "entropy": 0.19466686681844295, | |
| "epoch": 2.016, | |
| "grad_norm": 43.75, | |
| "learning_rate": 1.1024444444444445e-05, | |
| "loss": 3.1081, | |
| "mean_token_accuracy": 0.9395533055067062, | |
| "num_tokens": 41708226.0, | |
| "step": 5040 | |
| }, | |
| { | |
| "entropy": 0.19641057420521973, | |
| "epoch": 2.02, | |
| "grad_norm": 55.0, | |
| "learning_rate": 1.1002222222222222e-05, | |
| "loss": 3.1071, | |
| "mean_token_accuracy": 0.9384653646498918, | |
| "num_tokens": 41789805.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "entropy": 0.1920419150032103, | |
| "epoch": 2.024, | |
| "grad_norm": 46.5, | |
| "learning_rate": 1.0980000000000002e-05, | |
| "loss": 3.0469, | |
| "mean_token_accuracy": 0.9404902808368206, | |
| "num_tokens": 41874459.0, | |
| "step": 5060 | |
| }, | |
| { | |
| "entropy": 0.20032177697867154, | |
| "epoch": 2.028, | |
| "grad_norm": 41.75, | |
| "learning_rate": 1.0957777777777778e-05, | |
| "loss": 3.2179, | |
| "mean_token_accuracy": 0.9377676222473383, | |
| "num_tokens": 41962010.0, | |
| "step": 5070 | |
| }, | |
| { | |
| "entropy": 0.20085438918322324, | |
| "epoch": 2.032, | |
| "grad_norm": 47.25, | |
| "learning_rate": 1.0935555555555557e-05, | |
| "loss": 3.1985, | |
| "mean_token_accuracy": 0.9379490800201893, | |
| "num_tokens": 42045433.0, | |
| "step": 5080 | |
| }, | |
| { | |
| "entropy": 0.19083389090374112, | |
| "epoch": 2.036, | |
| "grad_norm": 45.25, | |
| "learning_rate": 1.0913333333333333e-05, | |
| "loss": 3.0181, | |
| "mean_token_accuracy": 0.9398705091327428, | |
| "num_tokens": 42131377.0, | |
| "step": 5090 | |
| }, | |
| { | |
| "entropy": 0.19753868989646434, | |
| "epoch": 2.04, | |
| "grad_norm": 44.5, | |
| "learning_rate": 1.0891111111111112e-05, | |
| "loss": 3.0686, | |
| "mean_token_accuracy": 0.9392331838607788, | |
| "num_tokens": 42208657.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "eval_chemistry_entropy": 0.41862027600407603, | |
| "eval_chemistry_loss": 0.7625600695610046, | |
| "eval_chemistry_mean_token_accuracy": 0.809203598678112, | |
| "eval_chemistry_num_tokens": 42208657.0, | |
| "eval_chemistry_runtime": 54.1132, | |
| "eval_chemistry_samples_per_second": 9.24, | |
| "eval_chemistry_steps_per_second": 9.24, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "eval_physics_entropy": 0.3989468842595816, | |
| "eval_physics_loss": 0.6450238227844238, | |
| "eval_physics_mean_token_accuracy": 0.8284254409670829, | |
| "eval_physics_num_tokens": 42208657.0, | |
| "eval_physics_runtime": 62.6425, | |
| "eval_physics_samples_per_second": 7.982, | |
| "eval_physics_steps_per_second": 7.982, | |
| "step": 5100 | |
| }, | |
| { | |
| "entropy": 0.20550553721841425, | |
| "epoch": 2.044, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.0868888888888888e-05, | |
| "loss": 3.296, | |
| "mean_token_accuracy": 0.9357010770589114, | |
| "num_tokens": 42289517.0, | |
| "step": 5110 | |
| }, | |
| { | |
| "entropy": 0.19961890419945122, | |
| "epoch": 2.048, | |
| "grad_norm": 60.75, | |
| "learning_rate": 1.0846666666666667e-05, | |
| "loss": 3.1818, | |
| "mean_token_accuracy": 0.9377218190580606, | |
| "num_tokens": 42372694.0, | |
| "step": 5120 | |
| }, | |
| { | |
| "entropy": 0.20033283510711045, | |
| "epoch": 2.052, | |
| "grad_norm": 48.5, | |
| "learning_rate": 1.0824444444444447e-05, | |
| "loss": 3.1738, | |
| "mean_token_accuracy": 0.9384920679032802, | |
| "num_tokens": 42458003.0, | |
| "step": 5130 | |
| }, | |
| { | |
| "entropy": 0.1853128487477079, | |
| "epoch": 2.056, | |
| "grad_norm": 46.5, | |
| "learning_rate": 1.0802222222222223e-05, | |
| "loss": 3.0728, | |
| "mean_token_accuracy": 0.9401571858674288, | |
| "num_tokens": 42541126.0, | |
| "step": 5140 | |
| }, | |
| { | |
| "entropy": 0.20658304239623249, | |
| "epoch": 2.06, | |
| "grad_norm": 61.25, | |
| "learning_rate": 1.0780000000000002e-05, | |
| "loss": 3.2669, | |
| "mean_token_accuracy": 0.9361684795469045, | |
| "num_tokens": 42629304.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "entropy": 0.20088807311840357, | |
| "epoch": 2.064, | |
| "grad_norm": 41.0, | |
| "learning_rate": 1.0757777777777778e-05, | |
| "loss": 3.1508, | |
| "mean_token_accuracy": 0.938087810203433, | |
| "num_tokens": 42710240.0, | |
| "step": 5160 | |
| }, | |
| { | |
| "entropy": 0.19043276484590024, | |
| "epoch": 2.068, | |
| "grad_norm": 46.75, | |
| "learning_rate": 1.0735555555555557e-05, | |
| "loss": 3.1074, | |
| "mean_token_accuracy": 0.9401311114430427, | |
| "num_tokens": 42792801.0, | |
| "step": 5170 | |
| }, | |
| { | |
| "entropy": 0.20087013996671885, | |
| "epoch": 2.072, | |
| "grad_norm": 48.25, | |
| "learning_rate": 1.0713333333333333e-05, | |
| "loss": 3.1866, | |
| "mean_token_accuracy": 0.9368706427514553, | |
| "num_tokens": 42874781.0, | |
| "step": 5180 | |
| }, | |
| { | |
| "entropy": 0.19184604636393487, | |
| "epoch": 2.076, | |
| "grad_norm": 50.0, | |
| "learning_rate": 1.0691111111111112e-05, | |
| "loss": 3.0469, | |
| "mean_token_accuracy": 0.9403921004384757, | |
| "num_tokens": 42957730.0, | |
| "step": 5190 | |
| }, | |
| { | |
| "entropy": 0.20260196740273387, | |
| "epoch": 2.08, | |
| "grad_norm": 47.25, | |
| "learning_rate": 1.0668888888888892e-05, | |
| "loss": 3.2253, | |
| "mean_token_accuracy": 0.9380486860871315, | |
| "num_tokens": 43037844.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_chemistry_entropy": 0.4037852519154549, | |
| "eval_chemistry_loss": 0.7827287912368774, | |
| "eval_chemistry_mean_token_accuracy": 0.8081873996853829, | |
| "eval_chemistry_num_tokens": 43037844.0, | |
| "eval_chemistry_runtime": 53.8385, | |
| "eval_chemistry_samples_per_second": 9.287, | |
| "eval_chemistry_steps_per_second": 9.287, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_physics_entropy": 0.3873061000108719, | |
| "eval_physics_loss": 0.6558912396430969, | |
| "eval_physics_mean_token_accuracy": 0.8283050252199173, | |
| "eval_physics_num_tokens": 43037844.0, | |
| "eval_physics_runtime": 62.4627, | |
| "eval_physics_samples_per_second": 8.005, | |
| "eval_physics_steps_per_second": 8.005, | |
| "step": 5200 | |
| }, | |
| { | |
| "entropy": 0.20824256259948015, | |
| "epoch": 2.084, | |
| "grad_norm": 48.0, | |
| "learning_rate": 1.0646666666666668e-05, | |
| "loss": 3.4052, | |
| "mean_token_accuracy": 0.9333480909466744, | |
| "num_tokens": 43123325.0, | |
| "step": 5210 | |
| }, | |
| { | |
| "entropy": 0.19789786953479052, | |
| "epoch": 2.088, | |
| "grad_norm": 56.75, | |
| "learning_rate": 1.0624444444444447e-05, | |
| "loss": 3.0033, | |
| "mean_token_accuracy": 0.9417567845433951, | |
| "num_tokens": 43202298.0, | |
| "step": 5220 | |
| }, | |
| { | |
| "entropy": 0.20151855589356274, | |
| "epoch": 2.092, | |
| "grad_norm": 48.75, | |
| "learning_rate": 1.0602222222222223e-05, | |
| "loss": 3.2207, | |
| "mean_token_accuracy": 0.938299423828721, | |
| "num_tokens": 43285598.0, | |
| "step": 5230 | |
| }, | |
| { | |
| "entropy": 0.1916871007764712, | |
| "epoch": 2.096, | |
| "grad_norm": 46.0, | |
| "learning_rate": 1.0580000000000002e-05, | |
| "loss": 3.0968, | |
| "mean_token_accuracy": 0.9383008874952793, | |
| "num_tokens": 43368675.0, | |
| "step": 5240 | |
| }, | |
| { | |
| "entropy": 0.18731462587602438, | |
| "epoch": 2.1, | |
| "grad_norm": 53.5, | |
| "learning_rate": 1.0557777777777778e-05, | |
| "loss": 2.9798, | |
| "mean_token_accuracy": 0.9430658213794232, | |
| "num_tokens": 43449954.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "entropy": 0.18388683935627342, | |
| "epoch": 2.104, | |
| "grad_norm": 54.25, | |
| "learning_rate": 1.0535555555555557e-05, | |
| "loss": 2.9247, | |
| "mean_token_accuracy": 0.942484200373292, | |
| "num_tokens": 43528580.0, | |
| "step": 5260 | |
| }, | |
| { | |
| "entropy": 0.1879286190494895, | |
| "epoch": 2.108, | |
| "grad_norm": 43.5, | |
| "learning_rate": 1.0513333333333333e-05, | |
| "loss": 3.0419, | |
| "mean_token_accuracy": 0.9407990373671055, | |
| "num_tokens": 43612977.0, | |
| "step": 5270 | |
| }, | |
| { | |
| "entropy": 0.19514538389630615, | |
| "epoch": 2.112, | |
| "grad_norm": 44.5, | |
| "learning_rate": 1.0491111111111112e-05, | |
| "loss": 3.1495, | |
| "mean_token_accuracy": 0.9373606648296118, | |
| "num_tokens": 43699544.0, | |
| "step": 5280 | |
| }, | |
| { | |
| "entropy": 0.19012615643441677, | |
| "epoch": 2.116, | |
| "grad_norm": 55.0, | |
| "learning_rate": 1.046888888888889e-05, | |
| "loss": 2.9851, | |
| "mean_token_accuracy": 0.9408463139086962, | |
| "num_tokens": 43776161.0, | |
| "step": 5290 | |
| }, | |
| { | |
| "entropy": 0.20069221341982485, | |
| "epoch": 2.12, | |
| "grad_norm": 47.25, | |
| "learning_rate": 1.0446666666666668e-05, | |
| "loss": 3.1781, | |
| "mean_token_accuracy": 0.9359448026865721, | |
| "num_tokens": 43858502.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "eval_chemistry_entropy": 0.40047966596484186, | |
| "eval_chemistry_loss": 0.7848058938980103, | |
| "eval_chemistry_mean_token_accuracy": 0.8079274354577064, | |
| "eval_chemistry_num_tokens": 43858502.0, | |
| "eval_chemistry_runtime": 53.5898, | |
| "eval_chemistry_samples_per_second": 9.33, | |
| "eval_chemistry_steps_per_second": 9.33, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "eval_physics_entropy": 0.38767903473973275, | |
| "eval_physics_loss": 0.6546861529350281, | |
| "eval_physics_mean_token_accuracy": 0.8277567273378372, | |
| "eval_physics_num_tokens": 43858502.0, | |
| "eval_physics_runtime": 62.343, | |
| "eval_physics_samples_per_second": 8.02, | |
| "eval_physics_steps_per_second": 8.02, | |
| "step": 5300 | |
| }, | |
| { | |
| "entropy": 0.20158565249294041, | |
| "epoch": 2.124, | |
| "grad_norm": 50.75, | |
| "learning_rate": 1.0424444444444445e-05, | |
| "loss": 3.1985, | |
| "mean_token_accuracy": 0.9388665832579136, | |
| "num_tokens": 43935182.0, | |
| "step": 5310 | |
| }, | |
| { | |
| "entropy": 0.1834682718385011, | |
| "epoch": 2.128, | |
| "grad_norm": 50.75, | |
| "learning_rate": 1.0402222222222223e-05, | |
| "loss": 2.9141, | |
| "mean_token_accuracy": 0.9433030698448419, | |
| "num_tokens": 44012632.0, | |
| "step": 5320 | |
| }, | |
| { | |
| "entropy": 0.20906293289735914, | |
| "epoch": 2.132, | |
| "grad_norm": 52.0, | |
| "learning_rate": 1.038e-05, | |
| "loss": 3.3272, | |
| "mean_token_accuracy": 0.935481233894825, | |
| "num_tokens": 44096016.0, | |
| "step": 5330 | |
| }, | |
| { | |
| "entropy": 0.18296956168487669, | |
| "epoch": 2.136, | |
| "grad_norm": 48.0, | |
| "learning_rate": 1.0357777777777778e-05, | |
| "loss": 2.9511, | |
| "mean_token_accuracy": 0.9421849232167006, | |
| "num_tokens": 44179301.0, | |
| "step": 5340 | |
| }, | |
| { | |
| "entropy": 0.208326399885118, | |
| "epoch": 2.14, | |
| "grad_norm": 63.0, | |
| "learning_rate": 1.0335555555555556e-05, | |
| "loss": 3.3295, | |
| "mean_token_accuracy": 0.9351174153387547, | |
| "num_tokens": 44264155.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "entropy": 0.19738043893594295, | |
| "epoch": 2.144, | |
| "grad_norm": 41.25, | |
| "learning_rate": 1.0313333333333335e-05, | |
| "loss": 3.2062, | |
| "mean_token_accuracy": 0.9371892511844635, | |
| "num_tokens": 44349732.0, | |
| "step": 5360 | |
| }, | |
| { | |
| "entropy": 0.20311750478576868, | |
| "epoch": 2.148, | |
| "grad_norm": 50.0, | |
| "learning_rate": 1.0291111111111111e-05, | |
| "loss": 3.2275, | |
| "mean_token_accuracy": 0.9368093464523554, | |
| "num_tokens": 44431991.0, | |
| "step": 5370 | |
| }, | |
| { | |
| "entropy": 0.20572135057300328, | |
| "epoch": 2.152, | |
| "grad_norm": 48.75, | |
| "learning_rate": 1.026888888888889e-05, | |
| "loss": 3.2749, | |
| "mean_token_accuracy": 0.9358561307191848, | |
| "num_tokens": 44514865.0, | |
| "step": 5380 | |
| }, | |
| { | |
| "entropy": 0.20425092987716198, | |
| "epoch": 2.156, | |
| "grad_norm": 46.0, | |
| "learning_rate": 1.0246666666666666e-05, | |
| "loss": 3.1763, | |
| "mean_token_accuracy": 0.9385301012545824, | |
| "num_tokens": 44596124.0, | |
| "step": 5390 | |
| }, | |
| { | |
| "entropy": 0.20146920066326857, | |
| "epoch": 2.16, | |
| "grad_norm": 46.0, | |
| "learning_rate": 1.0224444444444446e-05, | |
| "loss": 3.2178, | |
| "mean_token_accuracy": 0.9376044973731041, | |
| "num_tokens": 44680410.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "eval_chemistry_entropy": 0.3856977616250515, | |
| "eval_chemistry_loss": 0.798565685749054, | |
| "eval_chemistry_mean_token_accuracy": 0.8074681994318962, | |
| "eval_chemistry_num_tokens": 44680410.0, | |
| "eval_chemistry_runtime": 53.5194, | |
| "eval_chemistry_samples_per_second": 9.342, | |
| "eval_chemistry_steps_per_second": 9.342, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "eval_physics_entropy": 0.3756562244296074, | |
| "eval_physics_loss": 0.6638210415840149, | |
| "eval_physics_mean_token_accuracy": 0.827698894739151, | |
| "eval_physics_num_tokens": 44680410.0, | |
| "eval_physics_runtime": 62.2773, | |
| "eval_physics_samples_per_second": 8.029, | |
| "eval_physics_steps_per_second": 8.029, | |
| "step": 5400 | |
| }, | |
| { | |
| "entropy": 0.18812919512856752, | |
| "epoch": 2.164, | |
| "grad_norm": 54.75, | |
| "learning_rate": 1.0202222222222221e-05, | |
| "loss": 3.0064, | |
| "mean_token_accuracy": 0.9413947004824876, | |
| "num_tokens": 44762582.0, | |
| "step": 5410 | |
| }, | |
| { | |
| "entropy": 0.19338072049431504, | |
| "epoch": 2.168, | |
| "grad_norm": 50.25, | |
| "learning_rate": 1.018e-05, | |
| "loss": 3.0948, | |
| "mean_token_accuracy": 0.9408847827464342, | |
| "num_tokens": 44846515.0, | |
| "step": 5420 | |
| }, | |
| { | |
| "entropy": 0.21481295677367598, | |
| "epoch": 2.172, | |
| "grad_norm": 58.5, | |
| "learning_rate": 1.0157777777777777e-05, | |
| "loss": 3.4678, | |
| "mean_token_accuracy": 0.9332882829010487, | |
| "num_tokens": 44932293.0, | |
| "step": 5430 | |
| }, | |
| { | |
| "entropy": 0.19565081875771284, | |
| "epoch": 2.176, | |
| "grad_norm": 44.25, | |
| "learning_rate": 1.0135555555555556e-05, | |
| "loss": 3.1546, | |
| "mean_token_accuracy": 0.9388003278523683, | |
| "num_tokens": 45017922.0, | |
| "step": 5440 | |
| }, | |
| { | |
| "entropy": 0.1895020380616188, | |
| "epoch": 2.18, | |
| "grad_norm": 48.5, | |
| "learning_rate": 1.0113333333333335e-05, | |
| "loss": 2.9332, | |
| "mean_token_accuracy": 0.9421345889568329, | |
| "num_tokens": 45099844.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "entropy": 0.18103633744176478, | |
| "epoch": 2.184, | |
| "grad_norm": 50.0, | |
| "learning_rate": 1.0091111111111111e-05, | |
| "loss": 2.8778, | |
| "mean_token_accuracy": 0.9432444013655186, | |
| "num_tokens": 45181823.0, | |
| "step": 5460 | |
| }, | |
| { | |
| "entropy": 0.19471350284293293, | |
| "epoch": 2.188, | |
| "grad_norm": 51.5, | |
| "learning_rate": 1.006888888888889e-05, | |
| "loss": 3.1432, | |
| "mean_token_accuracy": 0.9383002948015928, | |
| "num_tokens": 45263197.0, | |
| "step": 5470 | |
| }, | |
| { | |
| "entropy": 0.20217982549220323, | |
| "epoch": 2.192, | |
| "grad_norm": 45.0, | |
| "learning_rate": 1.0046666666666666e-05, | |
| "loss": 3.2241, | |
| "mean_token_accuracy": 0.9373363882303238, | |
| "num_tokens": 45348179.0, | |
| "step": 5480 | |
| }, | |
| { | |
| "entropy": 0.19005100564099847, | |
| "epoch": 2.196, | |
| "grad_norm": 42.25, | |
| "learning_rate": 1.0024444444444446e-05, | |
| "loss": 2.9879, | |
| "mean_token_accuracy": 0.9420632436871529, | |
| "num_tokens": 45432770.0, | |
| "step": 5490 | |
| }, | |
| { | |
| "entropy": 0.18858733554370702, | |
| "epoch": 2.2, | |
| "grad_norm": 46.25, | |
| "learning_rate": 1.0002222222222222e-05, | |
| "loss": 3.0609, | |
| "mean_token_accuracy": 0.93999606333673, | |
| "num_tokens": 45514208.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "eval_chemistry_entropy": 0.39155059492588046, | |
| "eval_chemistry_loss": 0.7910591959953308, | |
| "eval_chemistry_mean_token_accuracy": 0.8080245018601417, | |
| "eval_chemistry_num_tokens": 45514208.0, | |
| "eval_chemistry_runtime": 53.4757, | |
| "eval_chemistry_samples_per_second": 9.35, | |
| "eval_chemistry_steps_per_second": 9.35, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "eval_physics_entropy": 0.3800457211136818, | |
| "eval_physics_loss": 0.6611268520355225, | |
| "eval_physics_mean_token_accuracy": 0.8281381183862686, | |
| "eval_physics_num_tokens": 45514208.0, | |
| "eval_physics_runtime": 62.3015, | |
| "eval_physics_samples_per_second": 8.025, | |
| "eval_physics_steps_per_second": 8.025, | |
| "step": 5500 | |
| }, | |
| { | |
| "entropy": 0.18614862819667904, | |
| "epoch": 2.204, | |
| "grad_norm": 44.0, | |
| "learning_rate": 9.980000000000001e-06, | |
| "loss": 2.9873, | |
| "mean_token_accuracy": 0.942183205112815, | |
| "num_tokens": 45596551.0, | |
| "step": 5510 | |
| }, | |
| { | |
| "entropy": 0.19601450096815826, | |
| "epoch": 2.208, | |
| "grad_norm": 53.0, | |
| "learning_rate": 9.957777777777779e-06, | |
| "loss": 3.1129, | |
| "mean_token_accuracy": 0.9396491654217243, | |
| "num_tokens": 45678397.0, | |
| "step": 5520 | |
| }, | |
| { | |
| "entropy": 0.19701245347969235, | |
| "epoch": 2.212, | |
| "grad_norm": 57.0, | |
| "learning_rate": 9.935555555555556e-06, | |
| "loss": 3.1407, | |
| "mean_token_accuracy": 0.9378465551882982, | |
| "num_tokens": 45762516.0, | |
| "step": 5530 | |
| }, | |
| { | |
| "entropy": 0.1893465863307938, | |
| "epoch": 2.216, | |
| "grad_norm": 53.5, | |
| "learning_rate": 9.913333333333334e-06, | |
| "loss": 3.0102, | |
| "mean_token_accuracy": 0.9404071982949972, | |
| "num_tokens": 45844632.0, | |
| "step": 5540 | |
| }, | |
| { | |
| "entropy": 0.19602622296661137, | |
| "epoch": 2.22, | |
| "grad_norm": 43.75, | |
| "learning_rate": 9.891111111111113e-06, | |
| "loss": 3.1011, | |
| "mean_token_accuracy": 0.9404466662555933, | |
| "num_tokens": 45925356.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "entropy": 0.2090798495337367, | |
| "epoch": 2.224, | |
| "grad_norm": 52.25, | |
| "learning_rate": 9.86888888888889e-06, | |
| "loss": 3.3086, | |
| "mean_token_accuracy": 0.9340983040630817, | |
| "num_tokens": 46010052.0, | |
| "step": 5560 | |
| }, | |
| { | |
| "entropy": 0.19277014466933906, | |
| "epoch": 2.228, | |
| "grad_norm": 48.0, | |
| "learning_rate": 9.846666666666668e-06, | |
| "loss": 3.0988, | |
| "mean_token_accuracy": 0.9395745355635882, | |
| "num_tokens": 46096605.0, | |
| "step": 5570 | |
| }, | |
| { | |
| "entropy": 0.20378333488479256, | |
| "epoch": 2.232, | |
| "grad_norm": 54.25, | |
| "learning_rate": 9.824444444444446e-06, | |
| "loss": 3.2394, | |
| "mean_token_accuracy": 0.9365138337016106, | |
| "num_tokens": 46176730.0, | |
| "step": 5580 | |
| }, | |
| { | |
| "entropy": 0.20600440469570458, | |
| "epoch": 2.2359999999999998, | |
| "grad_norm": 42.5, | |
| "learning_rate": 9.802222222222224e-06, | |
| "loss": 3.2901, | |
| "mean_token_accuracy": 0.9340278003364801, | |
| "num_tokens": 46262197.0, | |
| "step": 5590 | |
| }, | |
| { | |
| "entropy": 0.20139261563308536, | |
| "epoch": 2.24, | |
| "grad_norm": 42.5, | |
| "learning_rate": 9.780000000000001e-06, | |
| "loss": 3.1502, | |
| "mean_token_accuracy": 0.9389533422887325, | |
| "num_tokens": 46347116.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_chemistry_entropy": 0.3909877706617117, | |
| "eval_chemistry_loss": 0.7910385131835938, | |
| "eval_chemistry_mean_token_accuracy": 0.8081122170090675, | |
| "eval_chemistry_num_tokens": 46347116.0, | |
| "eval_chemistry_runtime": 53.5591, | |
| "eval_chemistry_samples_per_second": 9.335, | |
| "eval_chemistry_steps_per_second": 9.335, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_physics_entropy": 0.3792251762896776, | |
| "eval_physics_loss": 0.6604795455932617, | |
| "eval_physics_mean_token_accuracy": 0.8283513634800911, | |
| "eval_physics_num_tokens": 46347116.0, | |
| "eval_physics_runtime": 62.3061, | |
| "eval_physics_samples_per_second": 8.025, | |
| "eval_physics_steps_per_second": 8.025, | |
| "step": 5600 | |
| }, | |
| { | |
| "entropy": 0.18947711491491645, | |
| "epoch": 2.2439999999999998, | |
| "grad_norm": 47.25, | |
| "learning_rate": 9.757777777777779e-06, | |
| "loss": 3.1396, | |
| "mean_token_accuracy": 0.9386544648557902, | |
| "num_tokens": 46429703.0, | |
| "step": 5610 | |
| }, | |
| { | |
| "entropy": 0.21070674285292626, | |
| "epoch": 2.248, | |
| "grad_norm": 61.25, | |
| "learning_rate": 9.735555555555556e-06, | |
| "loss": 3.3304, | |
| "mean_token_accuracy": 0.935748977959156, | |
| "num_tokens": 46513696.0, | |
| "step": 5620 | |
| }, | |
| { | |
| "entropy": 0.19108673203736543, | |
| "epoch": 2.252, | |
| "grad_norm": 54.25, | |
| "learning_rate": 9.713333333333334e-06, | |
| "loss": 3.0523, | |
| "mean_token_accuracy": 0.9410431899130345, | |
| "num_tokens": 46595828.0, | |
| "step": 5630 | |
| }, | |
| { | |
| "entropy": 0.192821927042678, | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 48.75, | |
| "learning_rate": 9.691111111111112e-06, | |
| "loss": 3.0979, | |
| "mean_token_accuracy": 0.9399061404168606, | |
| "num_tokens": 46673074.0, | |
| "step": 5640 | |
| }, | |
| { | |
| "entropy": 0.18479047757573425, | |
| "epoch": 2.26, | |
| "grad_norm": 48.25, | |
| "learning_rate": 9.66888888888889e-06, | |
| "loss": 3.0073, | |
| "mean_token_accuracy": 0.9392396967858077, | |
| "num_tokens": 46755709.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "entropy": 0.212563668936491, | |
| "epoch": 2.2640000000000002, | |
| "grad_norm": 57.75, | |
| "learning_rate": 9.646666666666667e-06, | |
| "loss": 3.2538, | |
| "mean_token_accuracy": 0.9364705868065357, | |
| "num_tokens": 46841474.0, | |
| "step": 5660 | |
| }, | |
| { | |
| "entropy": 0.19083552088122815, | |
| "epoch": 2.268, | |
| "grad_norm": 49.0, | |
| "learning_rate": 9.624444444444445e-06, | |
| "loss": 3.0806, | |
| "mean_token_accuracy": 0.9388643242418766, | |
| "num_tokens": 46927376.0, | |
| "step": 5670 | |
| }, | |
| { | |
| "entropy": 0.19668898060917855, | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 48.0, | |
| "learning_rate": 9.602222222222222e-06, | |
| "loss": 3.1726, | |
| "mean_token_accuracy": 0.9378286074846983, | |
| "num_tokens": 47009386.0, | |
| "step": 5680 | |
| }, | |
| { | |
| "entropy": 0.1988459188491106, | |
| "epoch": 2.276, | |
| "grad_norm": 56.75, | |
| "learning_rate": 9.58e-06, | |
| "loss": 3.1352, | |
| "mean_token_accuracy": 0.9398025307804346, | |
| "num_tokens": 47091981.0, | |
| "step": 5690 | |
| }, | |
| { | |
| "entropy": 0.1950016777962446, | |
| "epoch": 2.2800000000000002, | |
| "grad_norm": 55.25, | |
| "learning_rate": 9.557777777777777e-06, | |
| "loss": 3.1477, | |
| "mean_token_accuracy": 0.9378086104989052, | |
| "num_tokens": 47175754.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.2800000000000002, | |
| "eval_chemistry_entropy": 0.3963790101259947, | |
| "eval_chemistry_loss": 0.7890461683273315, | |
| "eval_chemistry_mean_token_accuracy": 0.8081274604797364, | |
| "eval_chemistry_num_tokens": 47175754.0, | |
| "eval_chemistry_runtime": 53.5499, | |
| "eval_chemistry_samples_per_second": 9.337, | |
| "eval_chemistry_steps_per_second": 9.337, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.2800000000000002, | |
| "eval_physics_entropy": 0.3893430581241846, | |
| "eval_physics_loss": 0.6548593044281006, | |
| "eval_physics_mean_token_accuracy": 0.8284097356200218, | |
| "eval_physics_num_tokens": 47175754.0, | |
| "eval_physics_runtime": 62.376, | |
| "eval_physics_samples_per_second": 8.016, | |
| "eval_physics_steps_per_second": 8.016, | |
| "step": 5700 | |
| }, | |
| { | |
| "entropy": 0.20085035017691552, | |
| "epoch": 2.284, | |
| "grad_norm": 52.75, | |
| "learning_rate": 9.535555555555557e-06, | |
| "loss": 3.2197, | |
| "mean_token_accuracy": 0.9371700089424848, | |
| "num_tokens": 47262872.0, | |
| "step": 5710 | |
| }, | |
| { | |
| "entropy": 0.20114704947918655, | |
| "epoch": 2.288, | |
| "grad_norm": 53.75, | |
| "learning_rate": 9.513333333333334e-06, | |
| "loss": 3.2122, | |
| "mean_token_accuracy": 0.9378445014357567, | |
| "num_tokens": 47344231.0, | |
| "step": 5720 | |
| }, | |
| { | |
| "entropy": 0.1871094464790076, | |
| "epoch": 2.292, | |
| "grad_norm": 55.0, | |
| "learning_rate": 9.491111111111112e-06, | |
| "loss": 2.9673, | |
| "mean_token_accuracy": 0.9426150672137737, | |
| "num_tokens": 47425711.0, | |
| "step": 5730 | |
| }, | |
| { | |
| "entropy": 0.197967362171039, | |
| "epoch": 2.296, | |
| "grad_norm": 45.5, | |
| "learning_rate": 9.46888888888889e-06, | |
| "loss": 3.1839, | |
| "mean_token_accuracy": 0.9377172753214836, | |
| "num_tokens": 47507052.0, | |
| "step": 5740 | |
| }, | |
| { | |
| "entropy": 0.19870077301748096, | |
| "epoch": 2.3, | |
| "grad_norm": 43.25, | |
| "learning_rate": 9.446666666666667e-06, | |
| "loss": 3.1619, | |
| "mean_token_accuracy": 0.9375291150063276, | |
| "num_tokens": 47584179.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "entropy": 0.19026125948876144, | |
| "epoch": 2.304, | |
| "grad_norm": 52.5, | |
| "learning_rate": 9.424444444444445e-06, | |
| "loss": 2.9914, | |
| "mean_token_accuracy": 0.9407924722880125, | |
| "num_tokens": 47658392.0, | |
| "step": 5760 | |
| }, | |
| { | |
| "entropy": 0.19854985740967096, | |
| "epoch": 2.308, | |
| "grad_norm": 51.25, | |
| "learning_rate": 9.402222222222222e-06, | |
| "loss": 3.2477, | |
| "mean_token_accuracy": 0.9358540803194046, | |
| "num_tokens": 47742655.0, | |
| "step": 5770 | |
| }, | |
| { | |
| "entropy": 0.2093534952495247, | |
| "epoch": 2.312, | |
| "grad_norm": 43.75, | |
| "learning_rate": 9.38e-06, | |
| "loss": 3.3542, | |
| "mean_token_accuracy": 0.9344162929803133, | |
| "num_tokens": 47827802.0, | |
| "step": 5780 | |
| }, | |
| { | |
| "entropy": 0.2062529031187296, | |
| "epoch": 2.316, | |
| "grad_norm": 48.25, | |
| "learning_rate": 9.35777777777778e-06, | |
| "loss": 3.2809, | |
| "mean_token_accuracy": 0.9353579673916101, | |
| "num_tokens": 47908625.0, | |
| "step": 5790 | |
| }, | |
| { | |
| "entropy": 0.19415288937743752, | |
| "epoch": 2.32, | |
| "grad_norm": 53.25, | |
| "learning_rate": 9.335555555555557e-06, | |
| "loss": 3.0901, | |
| "mean_token_accuracy": 0.9379107497632504, | |
| "num_tokens": 47995015.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "eval_chemistry_entropy": 0.39470662355422975, | |
| "eval_chemistry_loss": 0.7899705171585083, | |
| "eval_chemistry_mean_token_accuracy": 0.8071958431005478, | |
| "eval_chemistry_num_tokens": 47995015.0, | |
| "eval_chemistry_runtime": 53.6444, | |
| "eval_chemistry_samples_per_second": 9.321, | |
| "eval_chemistry_steps_per_second": 9.321, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "eval_physics_entropy": 0.3849036909639835, | |
| "eval_physics_loss": 0.65633624792099, | |
| "eval_physics_mean_token_accuracy": 0.8284591092467308, | |
| "eval_physics_num_tokens": 47995015.0, | |
| "eval_physics_runtime": 62.3598, | |
| "eval_physics_samples_per_second": 8.018, | |
| "eval_physics_steps_per_second": 8.018, | |
| "step": 5800 | |
| }, | |
| { | |
| "entropy": 0.19207124339882284, | |
| "epoch": 2.324, | |
| "grad_norm": 59.25, | |
| "learning_rate": 9.313333333333335e-06, | |
| "loss": 3.0789, | |
| "mean_token_accuracy": 0.9400491904467344, | |
| "num_tokens": 48078584.0, | |
| "step": 5810 | |
| }, | |
| { | |
| "entropy": 0.1874732781201601, | |
| "epoch": 2.328, | |
| "grad_norm": 51.5, | |
| "learning_rate": 9.291111111111112e-06, | |
| "loss": 2.9617, | |
| "mean_token_accuracy": 0.9406363628804684, | |
| "num_tokens": 48163933.0, | |
| "step": 5820 | |
| }, | |
| { | |
| "entropy": 0.202512454520911, | |
| "epoch": 2.332, | |
| "grad_norm": 55.5, | |
| "learning_rate": 9.26888888888889e-06, | |
| "loss": 3.2603, | |
| "mean_token_accuracy": 0.9366499200463295, | |
| "num_tokens": 48251298.0, | |
| "step": 5830 | |
| }, | |
| { | |
| "entropy": 0.20060346848331392, | |
| "epoch": 2.336, | |
| "grad_norm": 71.5, | |
| "learning_rate": 9.246666666666667e-06, | |
| "loss": 3.2006, | |
| "mean_token_accuracy": 0.9366799276322126, | |
| "num_tokens": 48336708.0, | |
| "step": 5840 | |
| }, | |
| { | |
| "entropy": 0.18084419274237007, | |
| "epoch": 2.34, | |
| "grad_norm": 47.75, | |
| "learning_rate": 9.224444444444445e-06, | |
| "loss": 2.8636, | |
| "mean_token_accuracy": 0.9434659101068974, | |
| "num_tokens": 48420786.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "entropy": 0.19163171499967574, | |
| "epoch": 2.344, | |
| "grad_norm": 44.25, | |
| "learning_rate": 9.202222222222224e-06, | |
| "loss": 3.0004, | |
| "mean_token_accuracy": 0.9406759556382894, | |
| "num_tokens": 48501684.0, | |
| "step": 5860 | |
| }, | |
| { | |
| "entropy": 0.19346833657473325, | |
| "epoch": 2.348, | |
| "grad_norm": 77.0, | |
| "learning_rate": 9.180000000000002e-06, | |
| "loss": 3.1004, | |
| "mean_token_accuracy": 0.9387925416231155, | |
| "num_tokens": 48583449.0, | |
| "step": 5870 | |
| }, | |
| { | |
| "entropy": 0.18264342558104546, | |
| "epoch": 2.352, | |
| "grad_norm": 62.5, | |
| "learning_rate": 9.15777777777778e-06, | |
| "loss": 2.9199, | |
| "mean_token_accuracy": 0.9425822209566832, | |
| "num_tokens": 48665311.0, | |
| "step": 5880 | |
| }, | |
| { | |
| "entropy": 0.20894327545538544, | |
| "epoch": 2.356, | |
| "grad_norm": 45.25, | |
| "learning_rate": 9.135555555555557e-06, | |
| "loss": 3.361, | |
| "mean_token_accuracy": 0.9344129908829928, | |
| "num_tokens": 48753861.0, | |
| "step": 5890 | |
| }, | |
| { | |
| "entropy": 0.19564867338631303, | |
| "epoch": 2.36, | |
| "grad_norm": 59.5, | |
| "learning_rate": 9.113333333333335e-06, | |
| "loss": 3.1121, | |
| "mean_token_accuracy": 0.9393926940858364, | |
| "num_tokens": 48842066.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "eval_chemistry_entropy": 0.3913639240264893, | |
| "eval_chemistry_loss": 0.7933480739593506, | |
| "eval_chemistry_mean_token_accuracy": 0.8076732790470124, | |
| "eval_chemistry_num_tokens": 48842066.0, | |
| "eval_chemistry_runtime": 53.637, | |
| "eval_chemistry_samples_per_second": 9.322, | |
| "eval_chemistry_steps_per_second": 9.322, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "eval_physics_entropy": 0.38312798050045965, | |
| "eval_physics_loss": 0.6580366492271423, | |
| "eval_physics_mean_token_accuracy": 0.8283037160038949, | |
| "eval_physics_num_tokens": 48842066.0, | |
| "eval_physics_runtime": 62.3358, | |
| "eval_physics_samples_per_second": 8.021, | |
| "eval_physics_steps_per_second": 8.021, | |
| "step": 5900 | |
| }, | |
| { | |
| "entropy": 0.18991808039136232, | |
| "epoch": 2.364, | |
| "grad_norm": 48.75, | |
| "learning_rate": 9.091111111111112e-06, | |
| "loss": 2.9892, | |
| "mean_token_accuracy": 0.9419933516532183, | |
| "num_tokens": 48924724.0, | |
| "step": 5910 | |
| }, | |
| { | |
| "entropy": 0.2007227463182062, | |
| "epoch": 2.368, | |
| "grad_norm": 51.25, | |
| "learning_rate": 9.06888888888889e-06, | |
| "loss": 3.277, | |
| "mean_token_accuracy": 0.9373730711638928, | |
| "num_tokens": 49005334.0, | |
| "step": 5920 | |
| }, | |
| { | |
| "entropy": 0.19522760221734642, | |
| "epoch": 2.372, | |
| "grad_norm": 63.75, | |
| "learning_rate": 9.046666666666668e-06, | |
| "loss": 3.1662, | |
| "mean_token_accuracy": 0.9381731398403644, | |
| "num_tokens": 49086381.0, | |
| "step": 5930 | |
| }, | |
| { | |
| "entropy": 0.19688663026317954, | |
| "epoch": 2.376, | |
| "grad_norm": 43.0, | |
| "learning_rate": 9.024444444444445e-06, | |
| "loss": 3.1226, | |
| "mean_token_accuracy": 0.9393897090107203, | |
| "num_tokens": 49166287.0, | |
| "step": 5940 | |
| }, | |
| { | |
| "entropy": 0.2026588932145387, | |
| "epoch": 2.38, | |
| "grad_norm": 54.0, | |
| "learning_rate": 9.002222222222223e-06, | |
| "loss": 3.2199, | |
| "mean_token_accuracy": 0.9370492726564408, | |
| "num_tokens": 49251686.0, | |
| "step": 5950 | |
| }, | |
| { | |
| "entropy": 0.1876268893945962, | |
| "epoch": 2.384, | |
| "grad_norm": 58.0, | |
| "learning_rate": 8.98e-06, | |
| "loss": 2.9421, | |
| "mean_token_accuracy": 0.9418988361954689, | |
| "num_tokens": 49331936.0, | |
| "step": 5960 | |
| }, | |
| { | |
| "entropy": 0.196075351908803, | |
| "epoch": 2.388, | |
| "grad_norm": 46.25, | |
| "learning_rate": 8.957777777777778e-06, | |
| "loss": 3.2015, | |
| "mean_token_accuracy": 0.9369890756905079, | |
| "num_tokens": 49415465.0, | |
| "step": 5970 | |
| }, | |
| { | |
| "entropy": 0.19922029618173837, | |
| "epoch": 2.392, | |
| "grad_norm": 56.5, | |
| "learning_rate": 8.935555555555556e-06, | |
| "loss": 3.1737, | |
| "mean_token_accuracy": 0.9385156963020563, | |
| "num_tokens": 49500119.0, | |
| "step": 5980 | |
| }, | |
| { | |
| "entropy": 0.2094525044085458, | |
| "epoch": 2.396, | |
| "grad_norm": 47.25, | |
| "learning_rate": 8.913333333333333e-06, | |
| "loss": 3.2756, | |
| "mean_token_accuracy": 0.9347373116761446, | |
| "num_tokens": 49590052.0, | |
| "step": 5990 | |
| }, | |
| { | |
| "entropy": 0.19299745629541576, | |
| "epoch": 2.4, | |
| "grad_norm": 48.25, | |
| "learning_rate": 8.891111111111111e-06, | |
| "loss": 3.1099, | |
| "mean_token_accuracy": 0.9393123656511306, | |
| "num_tokens": 49671048.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_chemistry_entropy": 0.3838026507794857, | |
| "eval_chemistry_loss": 0.8016492128372192, | |
| "eval_chemistry_mean_token_accuracy": 0.8075907635688782, | |
| "eval_chemistry_num_tokens": 49671048.0, | |
| "eval_chemistry_runtime": 53.5069, | |
| "eval_chemistry_samples_per_second": 9.345, | |
| "eval_chemistry_steps_per_second": 9.345, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_physics_entropy": 0.3768892270028591, | |
| "eval_physics_loss": 0.6634106636047363, | |
| "eval_physics_mean_token_accuracy": 0.8284309452176094, | |
| "eval_physics_num_tokens": 49671048.0, | |
| "eval_physics_runtime": 62.3362, | |
| "eval_physics_samples_per_second": 8.021, | |
| "eval_physics_steps_per_second": 8.021, | |
| "step": 6000 | |
| }, | |
| { | |
| "entropy": 0.20823265984654427, | |
| "epoch": 2.404, | |
| "grad_norm": 51.25, | |
| "learning_rate": 8.868888888888888e-06, | |
| "loss": 3.3921, | |
| "mean_token_accuracy": 0.9344091583043337, | |
| "num_tokens": 49759610.0, | |
| "step": 6010 | |
| }, | |
| { | |
| "entropy": 0.20835269689559938, | |
| "epoch": 2.408, | |
| "grad_norm": 47.25, | |
| "learning_rate": 8.846666666666668e-06, | |
| "loss": 3.3104, | |
| "mean_token_accuracy": 0.9350489232689142, | |
| "num_tokens": 49845815.0, | |
| "step": 6020 | |
| }, | |
| { | |
| "entropy": 0.20666063781827687, | |
| "epoch": 2.412, | |
| "grad_norm": 47.25, | |
| "learning_rate": 8.824444444444445e-06, | |
| "loss": 3.2397, | |
| "mean_token_accuracy": 0.9365291576832533, | |
| "num_tokens": 49933188.0, | |
| "step": 6030 | |
| }, | |
| { | |
| "entropy": 0.20116109929513187, | |
| "epoch": 2.416, | |
| "grad_norm": 48.5, | |
| "learning_rate": 8.802222222222223e-06, | |
| "loss": 3.2145, | |
| "mean_token_accuracy": 0.9366682499647141, | |
| "num_tokens": 50018182.0, | |
| "step": 6040 | |
| }, | |
| { | |
| "entropy": 0.18169673986267298, | |
| "epoch": 2.42, | |
| "grad_norm": 47.75, | |
| "learning_rate": 8.78e-06, | |
| "loss": 2.9009, | |
| "mean_token_accuracy": 0.944659861177206, | |
| "num_tokens": 50094728.0, | |
| "step": 6050 | |
| }, | |
| { | |
| "entropy": 0.188365073595196, | |
| "epoch": 2.424, | |
| "grad_norm": 56.5, | |
| "learning_rate": 8.757777777777778e-06, | |
| "loss": 3.0693, | |
| "mean_token_accuracy": 0.9401367917656899, | |
| "num_tokens": 50176877.0, | |
| "step": 6060 | |
| }, | |
| { | |
| "entropy": 0.2062933636829257, | |
| "epoch": 2.428, | |
| "grad_norm": 52.0, | |
| "learning_rate": 8.735555555555556e-06, | |
| "loss": 3.3262, | |
| "mean_token_accuracy": 0.9365007903426885, | |
| "num_tokens": 50259308.0, | |
| "step": 6070 | |
| }, | |
| { | |
| "entropy": 0.19212678987532855, | |
| "epoch": 2.432, | |
| "grad_norm": 53.25, | |
| "learning_rate": 8.713333333333333e-06, | |
| "loss": 3.0282, | |
| "mean_token_accuracy": 0.9407858826220036, | |
| "num_tokens": 50341187.0, | |
| "step": 6080 | |
| }, | |
| { | |
| "entropy": 0.18838006763253362, | |
| "epoch": 2.436, | |
| "grad_norm": 46.25, | |
| "learning_rate": 8.691111111111111e-06, | |
| "loss": 3.0379, | |
| "mean_token_accuracy": 0.9402753319591284, | |
| "num_tokens": 50424533.0, | |
| "step": 6090 | |
| }, | |
| { | |
| "entropy": 0.19480088974814863, | |
| "epoch": 2.44, | |
| "grad_norm": 51.0, | |
| "learning_rate": 8.66888888888889e-06, | |
| "loss": 3.0538, | |
| "mean_token_accuracy": 0.9391263823956251, | |
| "num_tokens": 50505772.0, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "eval_chemistry_entropy": 0.38921355353295806, | |
| "eval_chemistry_loss": 0.7940726280212402, | |
| "eval_chemistry_mean_token_accuracy": 0.808529881298542, | |
| "eval_chemistry_num_tokens": 50505772.0, | |
| "eval_chemistry_runtime": 53.7085, | |
| "eval_chemistry_samples_per_second": 9.31, | |
| "eval_chemistry_steps_per_second": 9.31, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "eval_physics_entropy": 0.38084026461839676, | |
| "eval_physics_loss": 0.6578019857406616, | |
| "eval_physics_mean_token_accuracy": 0.8284438382983208, | |
| "eval_physics_num_tokens": 50505772.0, | |
| "eval_physics_runtime": 62.4493, | |
| "eval_physics_samples_per_second": 8.006, | |
| "eval_physics_steps_per_second": 8.006, | |
| "step": 6100 | |
| }, | |
| { | |
| "entropy": 0.1986117216059938, | |
| "epoch": 2.444, | |
| "grad_norm": 46.25, | |
| "learning_rate": 8.646666666666668e-06, | |
| "loss": 3.1835, | |
| "mean_token_accuracy": 0.9379383895546198, | |
| "num_tokens": 50585336.0, | |
| "step": 6110 | |
| }, | |
| { | |
| "entropy": 0.2012892787810415, | |
| "epoch": 2.448, | |
| "grad_norm": 55.25, | |
| "learning_rate": 8.624444444444446e-06, | |
| "loss": 3.1793, | |
| "mean_token_accuracy": 0.9370260410010814, | |
| "num_tokens": 50667019.0, | |
| "step": 6120 | |
| }, | |
| { | |
| "entropy": 0.1922749388962984, | |
| "epoch": 2.452, | |
| "grad_norm": 50.75, | |
| "learning_rate": 8.602222222222223e-06, | |
| "loss": 3.1234, | |
| "mean_token_accuracy": 0.9396220836788416, | |
| "num_tokens": 50746256.0, | |
| "step": 6130 | |
| }, | |
| { | |
| "entropy": 0.19711986249312757, | |
| "epoch": 2.456, | |
| "grad_norm": 46.0, | |
| "learning_rate": 8.580000000000001e-06, | |
| "loss": 3.1282, | |
| "mean_token_accuracy": 0.9387405037879943, | |
| "num_tokens": 50829652.0, | |
| "step": 6140 | |
| }, | |
| { | |
| "entropy": 0.18205175315961242, | |
| "epoch": 2.46, | |
| "grad_norm": 65.0, | |
| "learning_rate": 8.557777777777778e-06, | |
| "loss": 2.8789, | |
| "mean_token_accuracy": 0.9434380400925875, | |
| "num_tokens": 50909119.0, | |
| "step": 6150 | |
| }, | |
| { | |
| "entropy": 0.19652860513888298, | |
| "epoch": 2.464, | |
| "grad_norm": 49.0, | |
| "learning_rate": 8.535555555555556e-06, | |
| "loss": 3.1464, | |
| "mean_token_accuracy": 0.9384651053696871, | |
| "num_tokens": 50991113.0, | |
| "step": 6160 | |
| }, | |
| { | |
| "entropy": 0.2062100607668981, | |
| "epoch": 2.468, | |
| "grad_norm": 52.5, | |
| "learning_rate": 8.513333333333335e-06, | |
| "loss": 3.3208, | |
| "mean_token_accuracy": 0.9351020980626344, | |
| "num_tokens": 51077483.0, | |
| "step": 6170 | |
| }, | |
| { | |
| "entropy": 0.1914805879816413, | |
| "epoch": 2.472, | |
| "grad_norm": 54.75, | |
| "learning_rate": 8.491111111111113e-06, | |
| "loss": 3.027, | |
| "mean_token_accuracy": 0.9398773550987244, | |
| "num_tokens": 51159212.0, | |
| "step": 6180 | |
| }, | |
| { | |
| "entropy": 0.19051351412199438, | |
| "epoch": 2.476, | |
| "grad_norm": 48.25, | |
| "learning_rate": 8.46888888888889e-06, | |
| "loss": 3.0237, | |
| "mean_token_accuracy": 0.9402312427759171, | |
| "num_tokens": 51238463.0, | |
| "step": 6190 | |
| }, | |
| { | |
| "entropy": 0.19327848725952207, | |
| "epoch": 2.48, | |
| "grad_norm": 45.0, | |
| "learning_rate": 8.446666666666668e-06, | |
| "loss": 3.0853, | |
| "mean_token_accuracy": 0.9396164819598198, | |
| "num_tokens": 51319624.0, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "eval_chemistry_entropy": 0.38841446498036386, | |
| "eval_chemistry_loss": 0.7920281887054443, | |
| "eval_chemistry_mean_token_accuracy": 0.8082874090671539, | |
| "eval_chemistry_num_tokens": 51319624.0, | |
| "eval_chemistry_runtime": 53.7118, | |
| "eval_chemistry_samples_per_second": 9.309, | |
| "eval_chemistry_steps_per_second": 9.309, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "eval_physics_entropy": 0.3811262975931168, | |
| "eval_physics_loss": 0.6563181281089783, | |
| "eval_physics_mean_token_accuracy": 0.8288948413729668, | |
| "eval_physics_num_tokens": 51319624.0, | |
| "eval_physics_runtime": 62.3741, | |
| "eval_physics_samples_per_second": 8.016, | |
| "eval_physics_steps_per_second": 8.016, | |
| "step": 6200 | |
| }, | |
| { | |
| "entropy": 0.18596046869643032, | |
| "epoch": 2.484, | |
| "grad_norm": 52.0, | |
| "learning_rate": 8.424444444444446e-06, | |
| "loss": 3.0286, | |
| "mean_token_accuracy": 0.9400562565773726, | |
| "num_tokens": 51399925.0, | |
| "step": 6210 | |
| }, | |
| { | |
| "entropy": 0.20379393815528601, | |
| "epoch": 2.488, | |
| "grad_norm": 49.0, | |
| "learning_rate": 8.402222222222223e-06, | |
| "loss": 3.2888, | |
| "mean_token_accuracy": 0.9348027430474758, | |
| "num_tokens": 51488050.0, | |
| "step": 6220 | |
| }, | |
| { | |
| "entropy": 0.19651083638891578, | |
| "epoch": 2.492, | |
| "grad_norm": 52.0, | |
| "learning_rate": 8.380000000000001e-06, | |
| "loss": 3.1156, | |
| "mean_token_accuracy": 0.9391793262213468, | |
| "num_tokens": 51574712.0, | |
| "step": 6230 | |
| }, | |
| { | |
| "entropy": 0.19182051892857999, | |
| "epoch": 2.496, | |
| "grad_norm": 53.0, | |
| "learning_rate": 8.357777777777779e-06, | |
| "loss": 3.0839, | |
| "mean_token_accuracy": 0.9381668120622635, | |
| "num_tokens": 51654633.0, | |
| "step": 6240 | |
| }, | |
| { | |
| "entropy": 0.1880463571753353, | |
| "epoch": 2.5, | |
| "grad_norm": 41.0, | |
| "learning_rate": 8.335555555555556e-06, | |
| "loss": 2.9915, | |
| "mean_token_accuracy": 0.940981075540185, | |
| "num_tokens": 51733099.0, | |
| "step": 6250 | |
| }, | |
| { | |
| "entropy": 0.19248229740187525, | |
| "epoch": 2.504, | |
| "grad_norm": 51.0, | |
| "learning_rate": 8.313333333333334e-06, | |
| "loss": 3.056, | |
| "mean_token_accuracy": 0.9388160679489375, | |
| "num_tokens": 51818044.0, | |
| "step": 6260 | |
| }, | |
| { | |
| "entropy": 0.1870897078420967, | |
| "epoch": 2.508, | |
| "grad_norm": 47.0, | |
| "learning_rate": 8.291111111111112e-06, | |
| "loss": 3.0324, | |
| "mean_token_accuracy": 0.9416738871484995, | |
| "num_tokens": 51900998.0, | |
| "step": 6270 | |
| }, | |
| { | |
| "entropy": 0.19927883432246746, | |
| "epoch": 2.512, | |
| "grad_norm": 49.25, | |
| "learning_rate": 8.268888888888889e-06, | |
| "loss": 3.1251, | |
| "mean_token_accuracy": 0.9392885901033878, | |
| "num_tokens": 51980051.0, | |
| "step": 6280 | |
| }, | |
| { | |
| "entropy": 0.19368191035464405, | |
| "epoch": 2.516, | |
| "grad_norm": 51.25, | |
| "learning_rate": 8.246666666666667e-06, | |
| "loss": 3.1217, | |
| "mean_token_accuracy": 0.9377437356859446, | |
| "num_tokens": 52064164.0, | |
| "step": 6290 | |
| }, | |
| { | |
| "entropy": 0.1988163222093135, | |
| "epoch": 2.52, | |
| "grad_norm": 50.0, | |
| "learning_rate": 8.224444444444444e-06, | |
| "loss": 3.1396, | |
| "mean_token_accuracy": 0.9376461833715439, | |
| "num_tokens": 52146049.0, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "eval_chemistry_entropy": 0.39493052452802657, | |
| "eval_chemistry_loss": 0.7922888398170471, | |
| "eval_chemistry_mean_token_accuracy": 0.807987703859806, | |
| "eval_chemistry_num_tokens": 52146049.0, | |
| "eval_chemistry_runtime": 53.6383, | |
| "eval_chemistry_samples_per_second": 9.322, | |
| "eval_chemistry_steps_per_second": 9.322, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "eval_physics_entropy": 0.38753846891224386, | |
| "eval_physics_loss": 0.6539864540100098, | |
| "eval_physics_mean_token_accuracy": 0.8286657522320747, | |
| "eval_physics_num_tokens": 52146049.0, | |
| "eval_physics_runtime": 62.3501, | |
| "eval_physics_samples_per_second": 8.019, | |
| "eval_physics_steps_per_second": 8.019, | |
| "step": 6300 | |
| }, | |
| { | |
| "entropy": 0.19336708360351623, | |
| "epoch": 2.524, | |
| "grad_norm": 88.5, | |
| "learning_rate": 8.202222222222222e-06, | |
| "loss": 3.0569, | |
| "mean_token_accuracy": 0.939612278342247, | |
| "num_tokens": 52229856.0, | |
| "step": 6310 | |
| }, | |
| { | |
| "entropy": 0.18640460472088308, | |
| "epoch": 2.528, | |
| "grad_norm": 52.25, | |
| "learning_rate": 8.18e-06, | |
| "loss": 3.0449, | |
| "mean_token_accuracy": 0.9402619633823633, | |
| "num_tokens": 52316109.0, | |
| "step": 6320 | |
| }, | |
| { | |
| "entropy": 0.19002009390387684, | |
| "epoch": 2.532, | |
| "grad_norm": 44.5, | |
| "learning_rate": 8.157777777777779e-06, | |
| "loss": 2.9769, | |
| "mean_token_accuracy": 0.942006866261363, | |
| "num_tokens": 52398177.0, | |
| "step": 6330 | |
| }, | |
| { | |
| "entropy": 0.1812201444292441, | |
| "epoch": 2.536, | |
| "grad_norm": 50.25, | |
| "learning_rate": 8.135555555555557e-06, | |
| "loss": 2.8559, | |
| "mean_token_accuracy": 0.9443437688052654, | |
| "num_tokens": 52475120.0, | |
| "step": 6340 | |
| }, | |
| { | |
| "entropy": 0.17869069916196167, | |
| "epoch": 2.54, | |
| "grad_norm": 57.0, | |
| "learning_rate": 8.113333333333334e-06, | |
| "loss": 2.9435, | |
| "mean_token_accuracy": 0.9424817778170109, | |
| "num_tokens": 52551923.0, | |
| "step": 6350 | |
| }, | |
| { | |
| "entropy": 0.20136196929961442, | |
| "epoch": 2.544, | |
| "grad_norm": 47.25, | |
| "learning_rate": 8.091111111111112e-06, | |
| "loss": 3.2445, | |
| "mean_token_accuracy": 0.9366252154111863, | |
| "num_tokens": 52634528.0, | |
| "step": 6360 | |
| }, | |
| { | |
| "entropy": 0.2106772383209318, | |
| "epoch": 2.548, | |
| "grad_norm": 53.75, | |
| "learning_rate": 8.06888888888889e-06, | |
| "loss": 3.3276, | |
| "mean_token_accuracy": 0.9355383839458227, | |
| "num_tokens": 52716201.0, | |
| "step": 6370 | |
| }, | |
| { | |
| "entropy": 0.18290378227829934, | |
| "epoch": 2.552, | |
| "grad_norm": 55.75, | |
| "learning_rate": 8.046666666666667e-06, | |
| "loss": 2.94, | |
| "mean_token_accuracy": 0.9428097851574421, | |
| "num_tokens": 52797762.0, | |
| "step": 6380 | |
| }, | |
| { | |
| "entropy": 0.18960943738929928, | |
| "epoch": 2.556, | |
| "grad_norm": 52.0, | |
| "learning_rate": 8.024444444444445e-06, | |
| "loss": 2.9571, | |
| "mean_token_accuracy": 0.9417350132018327, | |
| "num_tokens": 52879764.0, | |
| "step": 6390 | |
| }, | |
| { | |
| "entropy": 0.19443998872302473, | |
| "epoch": 2.56, | |
| "grad_norm": 49.0, | |
| "learning_rate": 8.002222222222222e-06, | |
| "loss": 3.1366, | |
| "mean_token_accuracy": 0.9388776656240225, | |
| "num_tokens": 52964035.0, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_chemistry_entropy": 0.3794822678864002, | |
| "eval_chemistry_loss": 0.8048861622810364, | |
| "eval_chemistry_mean_token_accuracy": 0.8082284069657326, | |
| "eval_chemistry_num_tokens": 52964035.0, | |
| "eval_chemistry_runtime": 53.5984, | |
| "eval_chemistry_samples_per_second": 9.329, | |
| "eval_chemistry_steps_per_second": 9.329, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_physics_entropy": 0.37519480285048484, | |
| "eval_physics_loss": 0.6617055535316467, | |
| "eval_physics_mean_token_accuracy": 0.8287884512543678, | |
| "eval_physics_num_tokens": 52964035.0, | |
| "eval_physics_runtime": 62.3024, | |
| "eval_physics_samples_per_second": 8.025, | |
| "eval_physics_steps_per_second": 8.025, | |
| "step": 6400 | |
| }, | |
| { | |
| "entropy": 0.1909260055515915, | |
| "epoch": 2.564, | |
| "grad_norm": 56.5, | |
| "learning_rate": 7.980000000000002e-06, | |
| "loss": 3.0468, | |
| "mean_token_accuracy": 0.939851763471961, | |
| "num_tokens": 53045110.0, | |
| "step": 6410 | |
| }, | |
| { | |
| "entropy": 0.19128599227406085, | |
| "epoch": 2.568, | |
| "grad_norm": 49.25, | |
| "learning_rate": 7.957777777777779e-06, | |
| "loss": 2.978, | |
| "mean_token_accuracy": 0.9413053765892982, | |
| "num_tokens": 53127106.0, | |
| "step": 6420 | |
| }, | |
| { | |
| "entropy": 0.19033207101747393, | |
| "epoch": 2.572, | |
| "grad_norm": 59.75, | |
| "learning_rate": 7.935555555555557e-06, | |
| "loss": 3.0351, | |
| "mean_token_accuracy": 0.941039651632309, | |
| "num_tokens": 53206785.0, | |
| "step": 6430 | |
| }, | |
| { | |
| "entropy": 0.1824286952847615, | |
| "epoch": 2.576, | |
| "grad_norm": 49.0, | |
| "learning_rate": 7.913333333333334e-06, | |
| "loss": 2.9573, | |
| "mean_token_accuracy": 0.9419077880680561, | |
| "num_tokens": 53293345.0, | |
| "step": 6440 | |
| }, | |
| { | |
| "entropy": 0.1864988123998046, | |
| "epoch": 2.58, | |
| "grad_norm": 56.75, | |
| "learning_rate": 7.891111111111112e-06, | |
| "loss": 3.0058, | |
| "mean_token_accuracy": 0.9412127815186977, | |
| "num_tokens": 53373525.0, | |
| "step": 6450 | |
| }, | |
| { | |
| "entropy": 0.19675099512096494, | |
| "epoch": 2.584, | |
| "grad_norm": 58.5, | |
| "learning_rate": 7.86888888888889e-06, | |
| "loss": 3.1682, | |
| "mean_token_accuracy": 0.9386520054191351, | |
| "num_tokens": 53454336.0, | |
| "step": 6460 | |
| }, | |
| { | |
| "entropy": 0.19417250119149684, | |
| "epoch": 2.588, | |
| "grad_norm": 60.0, | |
| "learning_rate": 7.846666666666667e-06, | |
| "loss": 3.0557, | |
| "mean_token_accuracy": 0.9408094480633735, | |
| "num_tokens": 53536855.0, | |
| "step": 6470 | |
| }, | |
| { | |
| "entropy": 0.19675368582829833, | |
| "epoch": 2.592, | |
| "grad_norm": 41.5, | |
| "learning_rate": 7.824444444444445e-06, | |
| "loss": 3.1679, | |
| "mean_token_accuracy": 0.937222795933485, | |
| "num_tokens": 53620896.0, | |
| "step": 6480 | |
| }, | |
| { | |
| "entropy": 0.2045375799993053, | |
| "epoch": 2.596, | |
| "grad_norm": 53.5, | |
| "learning_rate": 7.802222222222222e-06, | |
| "loss": 3.2518, | |
| "mean_token_accuracy": 0.938046908006072, | |
| "num_tokens": 53701003.0, | |
| "step": 6490 | |
| }, | |
| { | |
| "entropy": 0.1999150504823774, | |
| "epoch": 2.6, | |
| "grad_norm": 52.25, | |
| "learning_rate": 7.78e-06, | |
| "loss": 3.1476, | |
| "mean_token_accuracy": 0.9388340193778276, | |
| "num_tokens": 53784544.0, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "eval_chemistry_entropy": 0.3801919347345829, | |
| "eval_chemistry_loss": 0.8062585592269897, | |
| "eval_chemistry_mean_token_accuracy": 0.8072631905674934, | |
| "eval_chemistry_num_tokens": 53784544.0, | |
| "eval_chemistry_runtime": 53.5253, | |
| "eval_chemistry_samples_per_second": 9.341, | |
| "eval_chemistry_steps_per_second": 9.341, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "eval_physics_entropy": 0.37476758985221387, | |
| "eval_physics_loss": 0.6621670126914978, | |
| "eval_physics_mean_token_accuracy": 0.8288013190627098, | |
| "eval_physics_num_tokens": 53784544.0, | |
| "eval_physics_runtime": 62.3432, | |
| "eval_physics_samples_per_second": 8.02, | |
| "eval_physics_steps_per_second": 8.02, | |
| "step": 6500 | |
| }, | |
| { | |
| "entropy": 0.1839638065546751, | |
| "epoch": 2.604, | |
| "grad_norm": 60.5, | |
| "learning_rate": 7.75777777777778e-06, | |
| "loss": 2.9651, | |
| "mean_token_accuracy": 0.9419289950281382, | |
| "num_tokens": 53865782.0, | |
| "step": 6510 | |
| }, | |
| { | |
| "entropy": 0.19291000463999808, | |
| "epoch": 2.608, | |
| "grad_norm": 57.25, | |
| "learning_rate": 7.735555555555557e-06, | |
| "loss": 3.0725, | |
| "mean_token_accuracy": 0.9392510026693344, | |
| "num_tokens": 53950989.0, | |
| "step": 6520 | |
| }, | |
| { | |
| "entropy": 0.19946922180242838, | |
| "epoch": 2.612, | |
| "grad_norm": 49.75, | |
| "learning_rate": 7.713333333333335e-06, | |
| "loss": 3.2101, | |
| "mean_token_accuracy": 0.937137245386839, | |
| "num_tokens": 54035532.0, | |
| "step": 6530 | |
| }, | |
| { | |
| "entropy": 0.18206452212762086, | |
| "epoch": 2.616, | |
| "grad_norm": 54.5, | |
| "learning_rate": 7.691111111111112e-06, | |
| "loss": 2.9032, | |
| "mean_token_accuracy": 0.9434707213193179, | |
| "num_tokens": 54116097.0, | |
| "step": 6540 | |
| }, | |
| { | |
| "entropy": 0.20347899200860411, | |
| "epoch": 2.62, | |
| "grad_norm": 53.0, | |
| "learning_rate": 7.66888888888889e-06, | |
| "loss": 3.2582, | |
| "mean_token_accuracy": 0.935738305747509, | |
| "num_tokens": 54199516.0, | |
| "step": 6550 | |
| }, | |
| { | |
| "entropy": 0.19039809501264243, | |
| "epoch": 2.624, | |
| "grad_norm": 49.5, | |
| "learning_rate": 7.646666666666667e-06, | |
| "loss": 3.0809, | |
| "mean_token_accuracy": 0.9394499566406012, | |
| "num_tokens": 54282732.0, | |
| "step": 6560 | |
| }, | |
| { | |
| "entropy": 0.18198461681604386, | |
| "epoch": 2.628, | |
| "grad_norm": 60.5, | |
| "learning_rate": 7.624444444444445e-06, | |
| "loss": 2.949, | |
| "mean_token_accuracy": 0.9429820477962494, | |
| "num_tokens": 54360843.0, | |
| "step": 6570 | |
| }, | |
| { | |
| "entropy": 0.18644747165963055, | |
| "epoch": 2.632, | |
| "grad_norm": 44.5, | |
| "learning_rate": 7.602222222222223e-06, | |
| "loss": 2.9606, | |
| "mean_token_accuracy": 0.942878284305334, | |
| "num_tokens": 54442780.0, | |
| "step": 6580 | |
| }, | |
| { | |
| "entropy": 0.20099130254238845, | |
| "epoch": 2.636, | |
| "grad_norm": 57.25, | |
| "learning_rate": 7.58e-06, | |
| "loss": 3.231, | |
| "mean_token_accuracy": 0.9373830027878285, | |
| "num_tokens": 54526330.0, | |
| "step": 6590 | |
| }, | |
| { | |
| "entropy": 0.19821790065616368, | |
| "epoch": 2.64, | |
| "grad_norm": 56.0, | |
| "learning_rate": 7.557777777777779e-06, | |
| "loss": 3.1161, | |
| "mean_token_accuracy": 0.9382164843380452, | |
| "num_tokens": 54609748.0, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "eval_chemistry_entropy": 0.38609197197854517, | |
| "eval_chemistry_loss": 0.7984758019447327, | |
| "eval_chemistry_mean_token_accuracy": 0.8080443707108498, | |
| "eval_chemistry_num_tokens": 54609748.0, | |
| "eval_chemistry_runtime": 53.6591, | |
| "eval_chemistry_samples_per_second": 9.318, | |
| "eval_chemistry_steps_per_second": 9.318, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "eval_physics_entropy": 0.3797822464555502, | |
| "eval_physics_loss": 0.6579533815383911, | |
| "eval_physics_mean_token_accuracy": 0.8287059485316276, | |
| "eval_physics_num_tokens": 54609748.0, | |
| "eval_physics_runtime": 62.4293, | |
| "eval_physics_samples_per_second": 8.009, | |
| "eval_physics_steps_per_second": 8.009, | |
| "step": 6600 | |
| }, | |
| { | |
| "entropy": 0.1917013380676508, | |
| "epoch": 2.644, | |
| "grad_norm": 43.25, | |
| "learning_rate": 7.535555555555556e-06, | |
| "loss": 3.0642, | |
| "mean_token_accuracy": 0.9397985614836216, | |
| "num_tokens": 54698629.0, | |
| "step": 6610 | |
| }, | |
| { | |
| "entropy": 0.18994684009812773, | |
| "epoch": 2.648, | |
| "grad_norm": 56.25, | |
| "learning_rate": 7.513333333333334e-06, | |
| "loss": 3.0925, | |
| "mean_token_accuracy": 0.9398026667535305, | |
| "num_tokens": 54783193.0, | |
| "step": 6620 | |
| }, | |
| { | |
| "entropy": 0.2163593316450715, | |
| "epoch": 2.652, | |
| "grad_norm": 63.5, | |
| "learning_rate": 7.4911111111111116e-06, | |
| "loss": 3.5268, | |
| "mean_token_accuracy": 0.9321135234087705, | |
| "num_tokens": 54869351.0, | |
| "step": 6630 | |
| }, | |
| { | |
| "entropy": 0.21973537290468811, | |
| "epoch": 2.656, | |
| "grad_norm": 45.75, | |
| "learning_rate": 7.46888888888889e-06, | |
| "loss": 3.4755, | |
| "mean_token_accuracy": 0.9328120846301318, | |
| "num_tokens": 54953583.0, | |
| "step": 6640 | |
| }, | |
| { | |
| "entropy": 0.19087877119891344, | |
| "epoch": 2.66, | |
| "grad_norm": 50.0, | |
| "learning_rate": 7.446666666666668e-06, | |
| "loss": 3.0319, | |
| "mean_token_accuracy": 0.9405633073300124, | |
| "num_tokens": 55033264.0, | |
| "step": 6650 | |
| }, | |
| { | |
| "entropy": 0.21176332752220334, | |
| "epoch": 2.664, | |
| "grad_norm": 82.5, | |
| "learning_rate": 7.424444444444445e-06, | |
| "loss": 3.3814, | |
| "mean_token_accuracy": 0.9343311324715614, | |
| "num_tokens": 55117144.0, | |
| "step": 6660 | |
| }, | |
| { | |
| "entropy": 0.19612957751378418, | |
| "epoch": 2.668, | |
| "grad_norm": 53.0, | |
| "learning_rate": 7.402222222222223e-06, | |
| "loss": 3.1454, | |
| "mean_token_accuracy": 0.9378169015049934, | |
| "num_tokens": 55199763.0, | |
| "step": 6670 | |
| }, | |
| { | |
| "entropy": 0.20124612057115882, | |
| "epoch": 2.672, | |
| "grad_norm": 52.5, | |
| "learning_rate": 7.3800000000000005e-06, | |
| "loss": 3.181, | |
| "mean_token_accuracy": 0.9372910235077143, | |
| "num_tokens": 55281945.0, | |
| "step": 6680 | |
| }, | |
| { | |
| "entropy": 0.19095236633438617, | |
| "epoch": 2.676, | |
| "grad_norm": 53.0, | |
| "learning_rate": 7.357777777777778e-06, | |
| "loss": 3.0591, | |
| "mean_token_accuracy": 0.9394653089344501, | |
| "num_tokens": 55365453.0, | |
| "step": 6690 | |
| }, | |
| { | |
| "entropy": 0.18533826821949334, | |
| "epoch": 2.68, | |
| "grad_norm": 52.5, | |
| "learning_rate": 7.335555555555556e-06, | |
| "loss": 3.0354, | |
| "mean_token_accuracy": 0.939960053190589, | |
| "num_tokens": 55448136.0, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "eval_chemistry_entropy": 0.3872207936048508, | |
| "eval_chemistry_loss": 0.7984997630119324, | |
| "eval_chemistry_mean_token_accuracy": 0.8077767096161842, | |
| "eval_chemistry_num_tokens": 55448136.0, | |
| "eval_chemistry_runtime": 53.7309, | |
| "eval_chemistry_samples_per_second": 9.306, | |
| "eval_chemistry_steps_per_second": 9.306, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "eval_physics_entropy": 0.3809115352332592, | |
| "eval_physics_loss": 0.6571484804153442, | |
| "eval_physics_mean_token_accuracy": 0.8289183952212333, | |
| "eval_physics_num_tokens": 55448136.0, | |
| "eval_physics_runtime": 62.4829, | |
| "eval_physics_samples_per_second": 8.002, | |
| "eval_physics_steps_per_second": 8.002, | |
| "step": 6700 | |
| }, | |
| { | |
| "entropy": 0.1991529677528888, | |
| "epoch": 2.684, | |
| "grad_norm": 53.25, | |
| "learning_rate": 7.313333333333333e-06, | |
| "loss": 3.1639, | |
| "mean_token_accuracy": 0.9379461470991373, | |
| "num_tokens": 55527618.0, | |
| "step": 6710 | |
| }, | |
| { | |
| "entropy": 0.20218406142666936, | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 44.75, | |
| "learning_rate": 7.291111111111112e-06, | |
| "loss": 3.2156, | |
| "mean_token_accuracy": 0.9372401539236307, | |
| "num_tokens": 55612391.0, | |
| "step": 6720 | |
| }, | |
| { | |
| "entropy": 0.19709567329846323, | |
| "epoch": 2.692, | |
| "grad_norm": 54.5, | |
| "learning_rate": 7.268888888888889e-06, | |
| "loss": 3.1714, | |
| "mean_token_accuracy": 0.9389133401215076, | |
| "num_tokens": 55702866.0, | |
| "step": 6730 | |
| }, | |
| { | |
| "entropy": 0.1962867565918714, | |
| "epoch": 2.6959999999999997, | |
| "grad_norm": 49.0, | |
| "learning_rate": 7.246666666666667e-06, | |
| "loss": 3.1371, | |
| "mean_token_accuracy": 0.9385776583105325, | |
| "num_tokens": 55785705.0, | |
| "step": 6740 | |
| }, | |
| { | |
| "entropy": 0.19817850217223168, | |
| "epoch": 2.7, | |
| "grad_norm": 49.25, | |
| "learning_rate": 7.224444444444445e-06, | |
| "loss": 3.2753, | |
| "mean_token_accuracy": 0.9372531458735466, | |
| "num_tokens": 55867087.0, | |
| "step": 6750 | |
| }, | |
| { | |
| "entropy": 0.1924516866914928, | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 45.0, | |
| "learning_rate": 7.202222222222222e-06, | |
| "loss": 3.0789, | |
| "mean_token_accuracy": 0.940273729339242, | |
| "num_tokens": 55949791.0, | |
| "step": 6760 | |
| }, | |
| { | |
| "entropy": 0.19664411903358997, | |
| "epoch": 2.708, | |
| "grad_norm": 55.0, | |
| "learning_rate": 7.180000000000001e-06, | |
| "loss": 3.1039, | |
| "mean_token_accuracy": 0.9393653303384781, | |
| "num_tokens": 56035086.0, | |
| "step": 6770 | |
| }, | |
| { | |
| "entropy": 0.19977637236006557, | |
| "epoch": 2.7119999999999997, | |
| "grad_norm": 43.5, | |
| "learning_rate": 7.157777777777778e-06, | |
| "loss": 3.1961, | |
| "mean_token_accuracy": 0.9382132433354855, | |
| "num_tokens": 56116561.0, | |
| "step": 6780 | |
| }, | |
| { | |
| "entropy": 0.18976120916195213, | |
| "epoch": 2.716, | |
| "grad_norm": 44.25, | |
| "learning_rate": 7.135555555555557e-06, | |
| "loss": 3.0028, | |
| "mean_token_accuracy": 0.9407155264168978, | |
| "num_tokens": 56200597.0, | |
| "step": 6790 | |
| }, | |
| { | |
| "entropy": 0.18658174788579346, | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 49.75, | |
| "learning_rate": 7.113333333333334e-06, | |
| "loss": 2.9268, | |
| "mean_token_accuracy": 0.9431479055434465, | |
| "num_tokens": 56282204.0, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_chemistry_entropy": 0.3863285926133394, | |
| "eval_chemistry_loss": 0.7942742705345154, | |
| "eval_chemistry_mean_token_accuracy": 0.8087411291003227, | |
| "eval_chemistry_num_tokens": 56282204.0, | |
| "eval_chemistry_runtime": 53.6679, | |
| "eval_chemistry_samples_per_second": 9.317, | |
| "eval_chemistry_steps_per_second": 9.317, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_physics_entropy": 0.3770558201819658, | |
| "eval_physics_loss": 0.6554569005966187, | |
| "eval_physics_mean_token_accuracy": 0.8296517934203148, | |
| "eval_physics_num_tokens": 56282204.0, | |
| "eval_physics_runtime": 63.9894, | |
| "eval_physics_samples_per_second": 7.814, | |
| "eval_physics_steps_per_second": 7.814, | |
| "step": 6800 | |
| }, | |
| { | |
| "entropy": 0.18676039190031588, | |
| "epoch": 2.724, | |
| "grad_norm": 55.5, | |
| "learning_rate": 7.091111111111112e-06, | |
| "loss": 3.0576, | |
| "mean_token_accuracy": 0.9410197082906961, | |
| "num_tokens": 56360927.0, | |
| "step": 6810 | |
| }, | |
| { | |
| "entropy": 0.19151561523322017, | |
| "epoch": 2.7279999999999998, | |
| "grad_norm": 45.5, | |
| "learning_rate": 7.06888888888889e-06, | |
| "loss": 3.0173, | |
| "mean_token_accuracy": 0.9407779421657324, | |
| "num_tokens": 56441829.0, | |
| "step": 6820 | |
| }, | |
| { | |
| "entropy": 0.2075064428150654, | |
| "epoch": 2.732, | |
| "grad_norm": 52.5, | |
| "learning_rate": 7.046666666666667e-06, | |
| "loss": 3.3576, | |
| "mean_token_accuracy": 0.9351352181285619, | |
| "num_tokens": 56525407.0, | |
| "step": 6830 | |
| }, | |
| { | |
| "entropy": 0.1923141000792384, | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 48.0, | |
| "learning_rate": 7.024444444444445e-06, | |
| "loss": 3.0718, | |
| "mean_token_accuracy": 0.9403763923794031, | |
| "num_tokens": 56605478.0, | |
| "step": 6840 | |
| }, | |
| { | |
| "entropy": 0.19709600014612078, | |
| "epoch": 2.74, | |
| "grad_norm": 62.5, | |
| "learning_rate": 7.0022222222222225e-06, | |
| "loss": 3.0554, | |
| "mean_token_accuracy": 0.9399521335959434, | |
| "num_tokens": 56686104.0, | |
| "step": 6850 | |
| }, | |
| { | |
| "entropy": 0.1793178698513657, | |
| "epoch": 2.7439999999999998, | |
| "grad_norm": 62.25, | |
| "learning_rate": 6.98e-06, | |
| "loss": 2.859, | |
| "mean_token_accuracy": 0.9437912072986364, | |
| "num_tokens": 56764430.0, | |
| "step": 6860 | |
| }, | |
| { | |
| "entropy": 0.19850189993157982, | |
| "epoch": 2.748, | |
| "grad_norm": 56.5, | |
| "learning_rate": 6.9577777777777785e-06, | |
| "loss": 3.2057, | |
| "mean_token_accuracy": 0.9366141017526388, | |
| "num_tokens": 56848553.0, | |
| "step": 6870 | |
| }, | |
| { | |
| "entropy": 0.20232036970555783, | |
| "epoch": 2.752, | |
| "grad_norm": 53.5, | |
| "learning_rate": 6.935555555555556e-06, | |
| "loss": 3.2905, | |
| "mean_token_accuracy": 0.935778671875596, | |
| "num_tokens": 56935874.0, | |
| "step": 6880 | |
| }, | |
| { | |
| "entropy": 0.20167628910858185, | |
| "epoch": 2.7560000000000002, | |
| "grad_norm": 45.0, | |
| "learning_rate": 6.913333333333334e-06, | |
| "loss": 3.184, | |
| "mean_token_accuracy": 0.9392263300716877, | |
| "num_tokens": 57019666.0, | |
| "step": 6890 | |
| }, | |
| { | |
| "entropy": 0.18065473050810396, | |
| "epoch": 2.76, | |
| "grad_norm": 45.75, | |
| "learning_rate": 6.891111111111111e-06, | |
| "loss": 2.8317, | |
| "mean_token_accuracy": 0.9440672140568495, | |
| "num_tokens": 57104270.0, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "eval_chemistry_entropy": 0.3841234390437603, | |
| "eval_chemistry_loss": 0.8016098737716675, | |
| "eval_chemistry_mean_token_accuracy": 0.8080509839653969, | |
| "eval_chemistry_num_tokens": 57104270.0, | |
| "eval_chemistry_runtime": 53.664, | |
| "eval_chemistry_samples_per_second": 9.317, | |
| "eval_chemistry_steps_per_second": 9.317, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "eval_physics_entropy": 0.37600518913567066, | |
| "eval_physics_loss": 0.6605067253112793, | |
| "eval_physics_mean_token_accuracy": 0.8288771219849587, | |
| "eval_physics_num_tokens": 57104270.0, | |
| "eval_physics_runtime": 62.3388, | |
| "eval_physics_samples_per_second": 8.021, | |
| "eval_physics_steps_per_second": 8.021, | |
| "step": 6900 | |
| }, | |
| { | |
| "entropy": 0.18842837295960635, | |
| "epoch": 2.7640000000000002, | |
| "grad_norm": 56.5, | |
| "learning_rate": 6.868888888888889e-06, | |
| "loss": 3.1173, | |
| "mean_token_accuracy": 0.9401496604084969, | |
| "num_tokens": 57187408.0, | |
| "step": 6910 | |
| }, | |
| { | |
| "entropy": 0.20339992626104503, | |
| "epoch": 2.768, | |
| "grad_norm": 53.75, | |
| "learning_rate": 6.846666666666667e-06, | |
| "loss": 3.2452, | |
| "mean_token_accuracy": 0.9378059506416321, | |
| "num_tokens": 57270552.0, | |
| "step": 6920 | |
| }, | |
| { | |
| "entropy": 0.18887163817416877, | |
| "epoch": 2.7720000000000002, | |
| "grad_norm": 62.75, | |
| "learning_rate": 6.824444444444444e-06, | |
| "loss": 3.0476, | |
| "mean_token_accuracy": 0.9414676714688539, | |
| "num_tokens": 57349898.0, | |
| "step": 6930 | |
| }, | |
| { | |
| "entropy": 0.19644138389267027, | |
| "epoch": 2.776, | |
| "grad_norm": 49.5, | |
| "learning_rate": 6.802222222222223e-06, | |
| "loss": 3.1043, | |
| "mean_token_accuracy": 0.9390871111303568, | |
| "num_tokens": 57434347.0, | |
| "step": 6940 | |
| }, | |
| { | |
| "entropy": 0.19510912159457802, | |
| "epoch": 2.7800000000000002, | |
| "grad_norm": 53.75, | |
| "learning_rate": 6.780000000000001e-06, | |
| "loss": 3.1098, | |
| "mean_token_accuracy": 0.9399356350302697, | |
| "num_tokens": 57517981.0, | |
| "step": 6950 | |
| }, | |
| { | |
| "entropy": 0.19700194797478615, | |
| "epoch": 2.784, | |
| "grad_norm": 46.5, | |
| "learning_rate": 6.757777777777779e-06, | |
| "loss": 3.1683, | |
| "mean_token_accuracy": 0.9388831451535224, | |
| "num_tokens": 57602968.0, | |
| "step": 6960 | |
| }, | |
| { | |
| "entropy": 0.19594523953273893, | |
| "epoch": 2.7880000000000003, | |
| "grad_norm": 55.0, | |
| "learning_rate": 6.735555555555556e-06, | |
| "loss": 3.1451, | |
| "mean_token_accuracy": 0.939293348044157, | |
| "num_tokens": 57682733.0, | |
| "step": 6970 | |
| }, | |
| { | |
| "entropy": 0.18715510638430716, | |
| "epoch": 2.792, | |
| "grad_norm": 53.75, | |
| "learning_rate": 6.713333333333334e-06, | |
| "loss": 3.0169, | |
| "mean_token_accuracy": 0.9412729896605014, | |
| "num_tokens": 57760844.0, | |
| "step": 6980 | |
| }, | |
| { | |
| "entropy": 0.1824359907535836, | |
| "epoch": 2.7960000000000003, | |
| "grad_norm": 47.0, | |
| "learning_rate": 6.691111111111112e-06, | |
| "loss": 2.8979, | |
| "mean_token_accuracy": 0.943104237690568, | |
| "num_tokens": 57841289.0, | |
| "step": 6990 | |
| }, | |
| { | |
| "entropy": 0.20199860502034425, | |
| "epoch": 2.8, | |
| "grad_norm": 43.75, | |
| "learning_rate": 6.668888888888889e-06, | |
| "loss": 3.2287, | |
| "mean_token_accuracy": 0.9358510050922633, | |
| "num_tokens": 57925886.0, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "eval_chemistry_entropy": 0.3854501196742058, | |
| "eval_chemistry_loss": 0.7992454767227173, | |
| "eval_chemistry_mean_token_accuracy": 0.8082220183610916, | |
| "eval_chemistry_num_tokens": 57925886.0, | |
| "eval_chemistry_runtime": 53.6486, | |
| "eval_chemistry_samples_per_second": 9.32, | |
| "eval_chemistry_steps_per_second": 9.32, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "eval_physics_entropy": 0.37856728382408616, | |
| "eval_physics_loss": 0.6578669548034668, | |
| "eval_physics_mean_token_accuracy": 0.829574048101902, | |
| "eval_physics_num_tokens": 57925886.0, | |
| "eval_physics_runtime": 62.2826, | |
| "eval_physics_samples_per_second": 8.028, | |
| "eval_physics_steps_per_second": 8.028, | |
| "step": 7000 | |
| }, | |
| { | |
| "entropy": 0.1885908193187788, | |
| "epoch": 2.8040000000000003, | |
| "grad_norm": 55.5, | |
| "learning_rate": 6.646666666666667e-06, | |
| "loss": 3.0018, | |
| "mean_token_accuracy": 0.9410428166389465, | |
| "num_tokens": 58007027.0, | |
| "step": 7010 | |
| }, | |
| { | |
| "entropy": 0.18736288100481033, | |
| "epoch": 2.808, | |
| "grad_norm": 45.75, | |
| "learning_rate": 6.6244444444444445e-06, | |
| "loss": 3.0258, | |
| "mean_token_accuracy": 0.9396544147282839, | |
| "num_tokens": 58090738.0, | |
| "step": 7020 | |
| }, | |
| { | |
| "entropy": 0.1878430583514273, | |
| "epoch": 2.8120000000000003, | |
| "grad_norm": 49.25, | |
| "learning_rate": 6.602222222222223e-06, | |
| "loss": 3.0079, | |
| "mean_token_accuracy": 0.9409733500331641, | |
| "num_tokens": 58179496.0, | |
| "step": 7030 | |
| }, | |
| { | |
| "entropy": 0.1853222672827542, | |
| "epoch": 2.816, | |
| "grad_norm": 43.75, | |
| "learning_rate": 6.5800000000000005e-06, | |
| "loss": 2.9404, | |
| "mean_token_accuracy": 0.942448028549552, | |
| "num_tokens": 58262960.0, | |
| "step": 7040 | |
| }, | |
| { | |
| "entropy": 0.18262053932994604, | |
| "epoch": 2.82, | |
| "grad_norm": 50.0, | |
| "learning_rate": 6.557777777777778e-06, | |
| "loss": 2.8997, | |
| "mean_token_accuracy": 0.9433879490941763, | |
| "num_tokens": 58338205.0, | |
| "step": 7050 | |
| }, | |
| { | |
| "entropy": 0.19106363146565855, | |
| "epoch": 2.824, | |
| "grad_norm": 45.25, | |
| "learning_rate": 6.535555555555556e-06, | |
| "loss": 3.1147, | |
| "mean_token_accuracy": 0.9396902658045292, | |
| "num_tokens": 58419404.0, | |
| "step": 7060 | |
| }, | |
| { | |
| "entropy": 0.19635051493532957, | |
| "epoch": 2.828, | |
| "grad_norm": 58.0, | |
| "learning_rate": 6.513333333333333e-06, | |
| "loss": 3.0466, | |
| "mean_token_accuracy": 0.9415500082075596, | |
| "num_tokens": 58502578.0, | |
| "step": 7070 | |
| }, | |
| { | |
| "entropy": 0.1852906398009509, | |
| "epoch": 2.832, | |
| "grad_norm": 44.75, | |
| "learning_rate": 6.491111111111111e-06, | |
| "loss": 2.9744, | |
| "mean_token_accuracy": 0.9411802385002375, | |
| "num_tokens": 58586324.0, | |
| "step": 7080 | |
| }, | |
| { | |
| "entropy": 0.18403017576783895, | |
| "epoch": 2.836, | |
| "grad_norm": 54.75, | |
| "learning_rate": 6.468888888888889e-06, | |
| "loss": 2.8748, | |
| "mean_token_accuracy": 0.9433661613613367, | |
| "num_tokens": 58670506.0, | |
| "step": 7090 | |
| }, | |
| { | |
| "entropy": 0.1956013266928494, | |
| "epoch": 2.84, | |
| "grad_norm": 55.5, | |
| "learning_rate": 6.446666666666668e-06, | |
| "loss": 3.2105, | |
| "mean_token_accuracy": 0.9360841907560825, | |
| "num_tokens": 58753501.0, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "eval_chemistry_entropy": 0.38081419436633585, | |
| "eval_chemistry_loss": 0.804741382598877, | |
| "eval_chemistry_mean_token_accuracy": 0.8079236562848091, | |
| "eval_chemistry_num_tokens": 58753501.0, | |
| "eval_chemistry_runtime": 53.6618, | |
| "eval_chemistry_samples_per_second": 9.318, | |
| "eval_chemistry_steps_per_second": 9.318, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "eval_physics_entropy": 0.37602790972590444, | |
| "eval_physics_loss": 0.6613836884498596, | |
| "eval_physics_mean_token_accuracy": 0.8293393741250038, | |
| "eval_physics_num_tokens": 58753501.0, | |
| "eval_physics_runtime": 62.3643, | |
| "eval_physics_samples_per_second": 8.017, | |
| "eval_physics_steps_per_second": 8.017, | |
| "step": 7100 | |
| }, | |
| { | |
| "entropy": 0.20206863661296665, | |
| "epoch": 2.844, | |
| "grad_norm": 56.0, | |
| "learning_rate": 6.4244444444444455e-06, | |
| "loss": 3.188, | |
| "mean_token_accuracy": 0.9375080045312643, | |
| "num_tokens": 58830587.0, | |
| "step": 7110 | |
| }, | |
| { | |
| "entropy": 0.19918231770861894, | |
| "epoch": 2.848, | |
| "grad_norm": 52.75, | |
| "learning_rate": 6.402222222222223e-06, | |
| "loss": 3.1786, | |
| "mean_token_accuracy": 0.9379303775727749, | |
| "num_tokens": 58912949.0, | |
| "step": 7120 | |
| }, | |
| { | |
| "entropy": 0.19943627554457635, | |
| "epoch": 2.852, | |
| "grad_norm": 49.5, | |
| "learning_rate": 6.380000000000001e-06, | |
| "loss": 3.1486, | |
| "mean_token_accuracy": 0.9387002568691969, | |
| "num_tokens": 58999083.0, | |
| "step": 7130 | |
| }, | |
| { | |
| "entropy": 0.20037950184196235, | |
| "epoch": 2.856, | |
| "grad_norm": 45.0, | |
| "learning_rate": 6.357777777777778e-06, | |
| "loss": 3.2555, | |
| "mean_token_accuracy": 0.9366050720214844, | |
| "num_tokens": 59082635.0, | |
| "step": 7140 | |
| }, | |
| { | |
| "entropy": 0.19903734037652612, | |
| "epoch": 2.86, | |
| "grad_norm": 50.75, | |
| "learning_rate": 6.335555555555556e-06, | |
| "loss": 3.1792, | |
| "mean_token_accuracy": 0.9386274553835392, | |
| "num_tokens": 59167551.0, | |
| "step": 7150 | |
| }, | |
| { | |
| "entropy": 0.18307497054338456, | |
| "epoch": 2.864, | |
| "grad_norm": 52.75, | |
| "learning_rate": 6.313333333333334e-06, | |
| "loss": 2.9722, | |
| "mean_token_accuracy": 0.9420153506100177, | |
| "num_tokens": 59252494.0, | |
| "step": 7160 | |
| }, | |
| { | |
| "entropy": 0.19887874133419245, | |
| "epoch": 2.868, | |
| "grad_norm": 56.25, | |
| "learning_rate": 6.291111111111111e-06, | |
| "loss": 3.169, | |
| "mean_token_accuracy": 0.9400444660335779, | |
| "num_tokens": 59331383.0, | |
| "step": 7170 | |
| }, | |
| { | |
| "entropy": 0.18757213121280075, | |
| "epoch": 2.872, | |
| "grad_norm": 44.5, | |
| "learning_rate": 6.26888888888889e-06, | |
| "loss": 2.9598, | |
| "mean_token_accuracy": 0.9421675182878971, | |
| "num_tokens": 59415783.0, | |
| "step": 7180 | |
| }, | |
| { | |
| "entropy": 0.1866942286025733, | |
| "epoch": 2.876, | |
| "grad_norm": 45.0, | |
| "learning_rate": 6.246666666666667e-06, | |
| "loss": 2.9461, | |
| "mean_token_accuracy": 0.9424825951457023, | |
| "num_tokens": 59499036.0, | |
| "step": 7190 | |
| }, | |
| { | |
| "entropy": 0.1878941633971408, | |
| "epoch": 2.88, | |
| "grad_norm": 49.75, | |
| "learning_rate": 6.224444444444445e-06, | |
| "loss": 3.0087, | |
| "mean_token_accuracy": 0.9413879293948412, | |
| "num_tokens": 59574951.0, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_chemistry_entropy": 0.3784867779165506, | |
| "eval_chemistry_loss": 0.8032795190811157, | |
| "eval_chemistry_mean_token_accuracy": 0.8082240733504296, | |
| "eval_chemistry_num_tokens": 59574951.0, | |
| "eval_chemistry_runtime": 53.6582, | |
| "eval_chemistry_samples_per_second": 9.318, | |
| "eval_chemistry_steps_per_second": 9.318, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_physics_entropy": 0.3712990110218525, | |
| "eval_physics_loss": 0.6603429913520813, | |
| "eval_physics_mean_token_accuracy": 0.8300845678448677, | |
| "eval_physics_num_tokens": 59574951.0, | |
| "eval_physics_runtime": 62.4403, | |
| "eval_physics_samples_per_second": 8.008, | |
| "eval_physics_steps_per_second": 8.008, | |
| "step": 7200 | |
| }, | |
| { | |
| "entropy": 0.18565670023672282, | |
| "epoch": 2.884, | |
| "grad_norm": 50.75, | |
| "learning_rate": 6.2022222222222225e-06, | |
| "loss": 3.0284, | |
| "mean_token_accuracy": 0.9412801876664162, | |
| "num_tokens": 59661716.0, | |
| "step": 7210 | |
| }, | |
| { | |
| "entropy": 0.1831112688407302, | |
| "epoch": 2.888, | |
| "grad_norm": 60.75, | |
| "learning_rate": 6.18e-06, | |
| "loss": 2.9374, | |
| "mean_token_accuracy": 0.9427898015826941, | |
| "num_tokens": 59742110.0, | |
| "step": 7220 | |
| }, | |
| { | |
| "entropy": 0.19294007322750986, | |
| "epoch": 2.892, | |
| "grad_norm": 55.0, | |
| "learning_rate": 6.157777777777778e-06, | |
| "loss": 3.0357, | |
| "mean_token_accuracy": 0.9397770404815674, | |
| "num_tokens": 59823256.0, | |
| "step": 7230 | |
| }, | |
| { | |
| "entropy": 0.18930605384521187, | |
| "epoch": 2.896, | |
| "grad_norm": 64.0, | |
| "learning_rate": 6.135555555555555e-06, | |
| "loss": 3.0464, | |
| "mean_token_accuracy": 0.9409488279372453, | |
| "num_tokens": 59903274.0, | |
| "step": 7240 | |
| }, | |
| { | |
| "entropy": 0.18731349245645107, | |
| "epoch": 2.9, | |
| "grad_norm": 50.0, | |
| "learning_rate": 6.113333333333333e-06, | |
| "loss": 3.0478, | |
| "mean_token_accuracy": 0.9407065026462078, | |
| "num_tokens": 59983776.0, | |
| "step": 7250 | |
| }, | |
| { | |
| "entropy": 0.18384810623247178, | |
| "epoch": 2.904, | |
| "grad_norm": 54.5, | |
| "learning_rate": 6.091111111111112e-06, | |
| "loss": 2.9864, | |
| "mean_token_accuracy": 0.9418561324477196, | |
| "num_tokens": 60064633.0, | |
| "step": 7260 | |
| }, | |
| { | |
| "entropy": 0.18944950685836376, | |
| "epoch": 2.908, | |
| "grad_norm": 51.75, | |
| "learning_rate": 6.06888888888889e-06, | |
| "loss": 2.9694, | |
| "mean_token_accuracy": 0.941279224678874, | |
| "num_tokens": 60146531.0, | |
| "step": 7270 | |
| }, | |
| { | |
| "entropy": 0.20682764388620853, | |
| "epoch": 2.912, | |
| "grad_norm": 52.5, | |
| "learning_rate": 6.0466666666666675e-06, | |
| "loss": 3.3159, | |
| "mean_token_accuracy": 0.93412882424891, | |
| "num_tokens": 60234203.0, | |
| "step": 7280 | |
| }, | |
| { | |
| "entropy": 0.1821716870646924, | |
| "epoch": 2.916, | |
| "grad_norm": 51.0, | |
| "learning_rate": 6.024444444444445e-06, | |
| "loss": 2.9618, | |
| "mean_token_accuracy": 0.9426081079989672, | |
| "num_tokens": 60314948.0, | |
| "step": 7290 | |
| }, | |
| { | |
| "entropy": 0.1883004566654563, | |
| "epoch": 2.92, | |
| "grad_norm": 57.0, | |
| "learning_rate": 6.002222222222223e-06, | |
| "loss": 2.9096, | |
| "mean_token_accuracy": 0.942603713274002, | |
| "num_tokens": 60392231.0, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "eval_chemistry_entropy": 0.381943504139781, | |
| "eval_chemistry_loss": 0.8038550615310669, | |
| "eval_chemistry_mean_token_accuracy": 0.8077712023854255, | |
| "eval_chemistry_num_tokens": 60392231.0, | |
| "eval_chemistry_runtime": 53.6802, | |
| "eval_chemistry_samples_per_second": 9.314, | |
| "eval_chemistry_steps_per_second": 9.314, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "eval_physics_entropy": 0.3759955544620752, | |
| "eval_physics_loss": 0.659770131111145, | |
| "eval_physics_mean_token_accuracy": 0.8294044479727745, | |
| "eval_physics_num_tokens": 60392231.0, | |
| "eval_physics_runtime": 62.4462, | |
| "eval_physics_samples_per_second": 8.007, | |
| "eval_physics_steps_per_second": 8.007, | |
| "step": 7300 | |
| }, | |
| { | |
| "entropy": 0.18766175201162696, | |
| "epoch": 2.924, | |
| "grad_norm": 44.75, | |
| "learning_rate": 5.98e-06, | |
| "loss": 2.9743, | |
| "mean_token_accuracy": 0.9417757015675307, | |
| "num_tokens": 60473613.0, | |
| "step": 7310 | |
| }, | |
| { | |
| "entropy": 0.19203901765868067, | |
| "epoch": 2.928, | |
| "grad_norm": 47.75, | |
| "learning_rate": 5.957777777777778e-06, | |
| "loss": 3.1601, | |
| "mean_token_accuracy": 0.9375981785356998, | |
| "num_tokens": 60559123.0, | |
| "step": 7320 | |
| }, | |
| { | |
| "entropy": 0.20894643696956336, | |
| "epoch": 2.932, | |
| "grad_norm": 47.5, | |
| "learning_rate": 5.9355555555555556e-06, | |
| "loss": 3.3378, | |
| "mean_token_accuracy": 0.9357815183699131, | |
| "num_tokens": 60643595.0, | |
| "step": 7330 | |
| }, | |
| { | |
| "entropy": 0.17657043484505267, | |
| "epoch": 2.936, | |
| "grad_norm": 48.25, | |
| "learning_rate": 5.913333333333334e-06, | |
| "loss": 2.8788, | |
| "mean_token_accuracy": 0.9434063080698252, | |
| "num_tokens": 60723827.0, | |
| "step": 7340 | |
| }, | |
| { | |
| "entropy": 0.1906730006914586, | |
| "epoch": 2.94, | |
| "grad_norm": 57.0, | |
| "learning_rate": 5.891111111111112e-06, | |
| "loss": 2.9909, | |
| "mean_token_accuracy": 0.9411515049636364, | |
| "num_tokens": 60811772.0, | |
| "step": 7350 | |
| }, | |
| { | |
| "entropy": 0.19591064886189996, | |
| "epoch": 2.944, | |
| "grad_norm": 52.5, | |
| "learning_rate": 5.868888888888889e-06, | |
| "loss": 3.1479, | |
| "mean_token_accuracy": 0.937995707988739, | |
| "num_tokens": 60893423.0, | |
| "step": 7360 | |
| }, | |
| { | |
| "entropy": 0.19990056552924215, | |
| "epoch": 2.948, | |
| "grad_norm": 62.25, | |
| "learning_rate": 5.846666666666667e-06, | |
| "loss": 3.202, | |
| "mean_token_accuracy": 0.9373685888946056, | |
| "num_tokens": 60974894.0, | |
| "step": 7370 | |
| }, | |
| { | |
| "entropy": 0.19673359054140746, | |
| "epoch": 2.952, | |
| "grad_norm": 45.75, | |
| "learning_rate": 5.8244444444444445e-06, | |
| "loss": 3.1624, | |
| "mean_token_accuracy": 0.9380766972899437, | |
| "num_tokens": 61061168.0, | |
| "step": 7380 | |
| }, | |
| { | |
| "entropy": 0.1861823730636388, | |
| "epoch": 2.956, | |
| "grad_norm": 44.75, | |
| "learning_rate": 5.802222222222222e-06, | |
| "loss": 2.9666, | |
| "mean_token_accuracy": 0.9405010983347892, | |
| "num_tokens": 61144920.0, | |
| "step": 7390 | |
| }, | |
| { | |
| "entropy": 0.19820140418596566, | |
| "epoch": 2.96, | |
| "grad_norm": 48.25, | |
| "learning_rate": 5.78e-06, | |
| "loss": 3.1092, | |
| "mean_token_accuracy": 0.9393005721271038, | |
| "num_tokens": 61228692.0, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "eval_chemistry_entropy": 0.38530797137320044, | |
| "eval_chemistry_loss": 0.8003439903259277, | |
| "eval_chemistry_mean_token_accuracy": 0.8083557995557785, | |
| "eval_chemistry_num_tokens": 61228692.0, | |
| "eval_chemistry_runtime": 53.5803, | |
| "eval_chemistry_samples_per_second": 9.332, | |
| "eval_chemistry_steps_per_second": 9.332, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "eval_physics_entropy": 0.37705324468016627, | |
| "eval_physics_loss": 0.6582639813423157, | |
| "eval_physics_mean_token_accuracy": 0.8295326589941978, | |
| "eval_physics_num_tokens": 61228692.0, | |
| "eval_physics_runtime": 62.3306, | |
| "eval_physics_samples_per_second": 8.022, | |
| "eval_physics_steps_per_second": 8.022, | |
| "step": 7400 | |
| }, | |
| { | |
| "entropy": 0.19290051145944745, | |
| "epoch": 2.964, | |
| "grad_norm": 53.75, | |
| "learning_rate": 5.757777777777779e-06, | |
| "loss": 3.1227, | |
| "mean_token_accuracy": 0.939802111685276, | |
| "num_tokens": 61308127.0, | |
| "step": 7410 | |
| }, | |
| { | |
| "entropy": 0.19154557730071248, | |
| "epoch": 2.968, | |
| "grad_norm": 45.0, | |
| "learning_rate": 5.735555555555557e-06, | |
| "loss": 3.0561, | |
| "mean_token_accuracy": 0.939547312259674, | |
| "num_tokens": 61391712.0, | |
| "step": 7420 | |
| }, | |
| { | |
| "entropy": 0.18490437644068153, | |
| "epoch": 2.972, | |
| "grad_norm": 42.25, | |
| "learning_rate": 5.713333333333334e-06, | |
| "loss": 2.9349, | |
| "mean_token_accuracy": 0.9423038199543953, | |
| "num_tokens": 61474578.0, | |
| "step": 7430 | |
| }, | |
| { | |
| "entropy": 0.19823734238743781, | |
| "epoch": 2.976, | |
| "grad_norm": 49.0, | |
| "learning_rate": 5.691111111111112e-06, | |
| "loss": 3.0802, | |
| "mean_token_accuracy": 0.939618230983615, | |
| "num_tokens": 61560852.0, | |
| "step": 7440 | |
| }, | |
| { | |
| "entropy": 0.20201547369360923, | |
| "epoch": 2.98, | |
| "grad_norm": 48.5, | |
| "learning_rate": 5.6688888888888895e-06, | |
| "loss": 3.2932, | |
| "mean_token_accuracy": 0.93549032099545, | |
| "num_tokens": 61647603.0, | |
| "step": 7450 | |
| }, | |
| { | |
| "entropy": 0.19051897027529777, | |
| "epoch": 2.984, | |
| "grad_norm": 56.5, | |
| "learning_rate": 5.646666666666667e-06, | |
| "loss": 2.9478, | |
| "mean_token_accuracy": 0.9416087452322245, | |
| "num_tokens": 61730032.0, | |
| "step": 7460 | |
| }, | |
| { | |
| "entropy": 0.1911150220548734, | |
| "epoch": 2.988, | |
| "grad_norm": 52.5, | |
| "learning_rate": 5.624444444444445e-06, | |
| "loss": 3.0294, | |
| "mean_token_accuracy": 0.9407831937074661, | |
| "num_tokens": 61813075.0, | |
| "step": 7470 | |
| }, | |
| { | |
| "entropy": 0.18567710989154876, | |
| "epoch": 2.992, | |
| "grad_norm": 61.5, | |
| "learning_rate": 5.602222222222222e-06, | |
| "loss": 3.0278, | |
| "mean_token_accuracy": 0.9410592328757048, | |
| "num_tokens": 61898951.0, | |
| "step": 7480 | |
| }, | |
| { | |
| "entropy": 0.1776598389260471, | |
| "epoch": 2.996, | |
| "grad_norm": 46.75, | |
| "learning_rate": 5.580000000000001e-06, | |
| "loss": 2.7686, | |
| "mean_token_accuracy": 0.9449117906391621, | |
| "num_tokens": 61976730.0, | |
| "step": 7490 | |
| }, | |
| { | |
| "entropy": 0.19180974457412958, | |
| "epoch": 3.0, | |
| "grad_norm": 61.25, | |
| "learning_rate": 5.557777777777778e-06, | |
| "loss": 3.1208, | |
| "mean_token_accuracy": 0.9389300424605608, | |
| "num_tokens": 62060412.0, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_chemistry_entropy": 0.37897596071660516, | |
| "eval_chemistry_loss": 0.8056021332740784, | |
| "eval_chemistry_mean_token_accuracy": 0.8078991361260414, | |
| "eval_chemistry_num_tokens": 62060412.0, | |
| "eval_chemistry_runtime": 53.5759, | |
| "eval_chemistry_samples_per_second": 9.333, | |
| "eval_chemistry_steps_per_second": 9.333, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_physics_entropy": 0.3714893025457859, | |
| "eval_physics_loss": 0.6623644828796387, | |
| "eval_physics_mean_token_accuracy": 0.8292919734120369, | |
| "eval_physics_num_tokens": 62060412.0, | |
| "eval_physics_runtime": 62.2597, | |
| "eval_physics_samples_per_second": 8.031, | |
| "eval_physics_steps_per_second": 8.031, | |
| "step": 7500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.1614216031970304e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |