Instructions to use roonbug/2b63aec8 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use roonbug/2b63aec8 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="roonbug/2b63aec8")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```
```python
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("roonbug/2b63aec8")
model = AutoModelForImageTextToText.from_pretrained("roonbug/2b63aec8")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use roonbug/2b63aec8 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "roonbug/2b63aec8"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "roonbug/2b63aec8",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
```
Use Docker
docker model run hf.co/roonbug/2b63aec8
- SGLang
How to use roonbug/2b63aec8 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "roonbug/2b63aec8" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "roonbug/2b63aec8",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "roonbug/2b63aec8" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "roonbug/2b63aec8",
        "messages": [
            {
                "role": "user",
                "content": [
                    { "type": "text", "text": "Describe this image in one sentence." },
                    { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } }
                ]
            }
        ]
    }'
```
- Docker Model Runner
How to use roonbug/2b63aec8 with Docker Model Runner:
docker model run hf.co/roonbug/2b63aec8
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.2, | |
| "eval_steps": 100, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.142920307815075, | |
| "epoch": 0.016, | |
| "grad_norm": 290.0, | |
| "learning_rate": 6.000000000000001e-07, | |
| "loss": 42.6658, | |
| "mean_token_accuracy": 0.5620782226324081, | |
| "num_tokens": 195524.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.148210159689188, | |
| "epoch": 0.032, | |
| "grad_norm": 239.0, | |
| "learning_rate": 1.2666666666666669e-06, | |
| "loss": 41.9984, | |
| "mean_token_accuracy": 0.5613080382347106, | |
| "num_tokens": 390903.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.1933260083198547, | |
| "epoch": 0.048, | |
| "grad_norm": 249.0, | |
| "learning_rate": 1.9333333333333336e-06, | |
| "loss": 40.6208, | |
| "mean_token_accuracy": 0.5657517908141017, | |
| "num_tokens": 589868.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.2957281917333603, | |
| "epoch": 0.064, | |
| "grad_norm": 139.0, | |
| "learning_rate": 2.6e-06, | |
| "loss": 37.9032, | |
| "mean_token_accuracy": 0.5714796105399728, | |
| "num_tokens": 791190.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.5075685508549213, | |
| "epoch": 0.08, | |
| "grad_norm": 94.0, | |
| "learning_rate": 3.266666666666667e-06, | |
| "loss": 35.7561, | |
| "mean_token_accuracy": 0.5766569443047047, | |
| "num_tokens": 989860.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.7984249681234359, | |
| "epoch": 0.096, | |
| "grad_norm": 50.75, | |
| "learning_rate": 3.9333333333333335e-06, | |
| "loss": 33.4379, | |
| "mean_token_accuracy": 0.5814697606489062, | |
| "num_tokens": 1181777.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.8387351341545581, | |
| "epoch": 0.112, | |
| "grad_norm": 43.0, | |
| "learning_rate": 4.600000000000001e-06, | |
| "loss": 30.4219, | |
| "mean_token_accuracy": 0.5971228444948793, | |
| "num_tokens": 1385513.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.7275233700871468, | |
| "epoch": 0.128, | |
| "grad_norm": 33.5, | |
| "learning_rate": 5.2666666666666665e-06, | |
| "loss": 28.4703, | |
| "mean_token_accuracy": 0.6095364252105355, | |
| "num_tokens": 1582368.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.7214979872107505, | |
| "epoch": 0.144, | |
| "grad_norm": 27.0, | |
| "learning_rate": 5.933333333333335e-06, | |
| "loss": 26.677, | |
| "mean_token_accuracy": 0.6243448719382286, | |
| "num_tokens": 1773764.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.6311134904623033, | |
| "epoch": 0.16, | |
| "grad_norm": 22.0, | |
| "learning_rate": 6.600000000000001e-06, | |
| "loss": 25.7683, | |
| "mean_token_accuracy": 0.6301404371857643, | |
| "num_tokens": 1970077.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_biology_entropy": 1.5580159120559693, | |
| "eval_biology_loss": 1.5081593990325928, | |
| "eval_biology_mean_token_accuracy": 0.6457349667549134, | |
| "eval_biology_num_tokens": 1970077.0, | |
| "eval_biology_runtime": 48.7413, | |
| "eval_biology_samples_per_second": 10.258, | |
| "eval_biology_steps_per_second": 2.565, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_chemistry_entropy": 1.206756212234497, | |
| "eval_chemistry_loss": 1.1218774318695068, | |
| "eval_chemistry_mean_token_accuracy": 0.7205783066749573, | |
| "eval_chemistry_num_tokens": 1970077.0, | |
| "eval_chemistry_runtime": 60.3159, | |
| "eval_chemistry_samples_per_second": 8.29, | |
| "eval_chemistry_steps_per_second": 2.072, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_math_entropy": 0.9672308325767517, | |
| "eval_math_loss": 1.159799337387085, | |
| "eval_math_mean_token_accuracy": 0.7189845342636109, | |
| "eval_math_num_tokens": 1970077.0, | |
| "eval_math_runtime": 61.8237, | |
| "eval_math_samples_per_second": 8.088, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_physics_entropy": 1.1670387201309205, | |
| "eval_physics_loss": 1.1291608810424805, | |
| "eval_physics_mean_token_accuracy": 0.7211072521209717, | |
| "eval_physics_num_tokens": 1970077.0, | |
| "eval_physics_runtime": 70.4586, | |
| "eval_physics_samples_per_second": 7.096, | |
| "eval_physics_steps_per_second": 1.774, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.5482715763151647, | |
| "epoch": 0.176, | |
| "grad_norm": 21.125, | |
| "learning_rate": 7.266666666666668e-06, | |
| "loss": 24.5868, | |
| "mean_token_accuracy": 0.6385629490017891, | |
| "num_tokens": 2168354.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.5266574397683144, | |
| "epoch": 0.192, | |
| "grad_norm": 22.875, | |
| "learning_rate": 7.933333333333334e-06, | |
| "loss": 24.2707, | |
| "mean_token_accuracy": 0.6432460084557533, | |
| "num_tokens": 2365822.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.5192069873213767, | |
| "epoch": 0.208, | |
| "grad_norm": 20.875, | |
| "learning_rate": 8.6e-06, | |
| "loss": 24.1355, | |
| "mean_token_accuracy": 0.6436416517943144, | |
| "num_tokens": 2558762.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.4698147468268872, | |
| "epoch": 0.224, | |
| "grad_norm": 20.125, | |
| "learning_rate": 9.266666666666667e-06, | |
| "loss": 23.5154, | |
| "mean_token_accuracy": 0.6499760080128908, | |
| "num_tokens": 2755347.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.4506230603903532, | |
| "epoch": 0.24, | |
| "grad_norm": 19.625, | |
| "learning_rate": 9.933333333333334e-06, | |
| "loss": 23.2013, | |
| "mean_token_accuracy": 0.6523264441639185, | |
| "num_tokens": 2947346.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.4590953961014748, | |
| "epoch": 0.256, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.0600000000000002e-05, | |
| "loss": 23.3227, | |
| "mean_token_accuracy": 0.6508617259562015, | |
| "num_tokens": 3139957.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.419396448880434, | |
| "epoch": 0.272, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.1266666666666668e-05, | |
| "loss": 22.7352, | |
| "mean_token_accuracy": 0.6572458431124687, | |
| "num_tokens": 3335951.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.4005608204752207, | |
| "epoch": 0.288, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.1933333333333335e-05, | |
| "loss": 22.3969, | |
| "mean_token_accuracy": 0.6585959013551473, | |
| "num_tokens": 3539731.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.391934547200799, | |
| "epoch": 0.304, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.2600000000000001e-05, | |
| "loss": 22.31, | |
| "mean_token_accuracy": 0.6621056370437145, | |
| "num_tokens": 3733488.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.4028674490749835, | |
| "epoch": 0.32, | |
| "grad_norm": 22.25, | |
| "learning_rate": 1.3266666666666668e-05, | |
| "loss": 22.5559, | |
| "mean_token_accuracy": 0.6576981086283922, | |
| "num_tokens": 3920545.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_biology_entropy": 1.3209806289672852, | |
| "eval_biology_loss": 1.338399887084961, | |
| "eval_biology_mean_token_accuracy": 0.6720403518676757, | |
| "eval_biology_num_tokens": 3920545.0, | |
| "eval_biology_runtime": 48.5853, | |
| "eval_biology_samples_per_second": 10.291, | |
| "eval_biology_steps_per_second": 2.573, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_chemistry_entropy": 1.0033348879814148, | |
| "eval_chemistry_loss": 0.9935092926025391, | |
| "eval_chemistry_mean_token_accuracy": 0.7448974308967591, | |
| "eval_chemistry_num_tokens": 3920545.0, | |
| "eval_chemistry_runtime": 60.24, | |
| "eval_chemistry_samples_per_second": 8.3, | |
| "eval_chemistry_steps_per_second": 2.075, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_math_entropy": 0.8341804294586181, | |
| "eval_math_loss": 1.0635857582092285, | |
| "eval_math_mean_token_accuracy": 0.7432106451988221, | |
| "eval_math_num_tokens": 3920545.0, | |
| "eval_math_runtime": 61.8174, | |
| "eval_math_samples_per_second": 8.088, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_physics_entropy": 0.9652358031272888, | |
| "eval_physics_loss": 0.9950281977653503, | |
| "eval_physics_mean_token_accuracy": 0.7510108857154846, | |
| "eval_physics_num_tokens": 3920545.0, | |
| "eval_physics_runtime": 70.411, | |
| "eval_physics_samples_per_second": 7.101, | |
| "eval_physics_steps_per_second": 1.775, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.3548175282776356, | |
| "epoch": 0.336, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.3933333333333334e-05, | |
| "loss": 21.7763, | |
| "mean_token_accuracy": 0.6656343434005976, | |
| "num_tokens": 4114077.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.3656601022928954, | |
| "epoch": 0.352, | |
| "grad_norm": 20.625, | |
| "learning_rate": 1.46e-05, | |
| "loss": 22.0972, | |
| "mean_token_accuracy": 0.6638848338276148, | |
| "num_tokens": 4306949.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.3525194190442562, | |
| "epoch": 0.368, | |
| "grad_norm": 18.125, | |
| "learning_rate": 1.5266666666666667e-05, | |
| "loss": 21.7293, | |
| "mean_token_accuracy": 0.6680811226367951, | |
| "num_tokens": 4504001.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.3454820621758699, | |
| "epoch": 0.384, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.5933333333333336e-05, | |
| "loss": 21.7032, | |
| "mean_token_accuracy": 0.6671383358538151, | |
| "num_tokens": 4693812.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.3525703553110362, | |
| "epoch": 0.4, | |
| "grad_norm": 17.5, | |
| "learning_rate": 1.66e-05, | |
| "loss": 21.7856, | |
| "mean_token_accuracy": 0.666401931643486, | |
| "num_tokens": 4887094.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.351718918606639, | |
| "epoch": 0.416, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.726666666666667e-05, | |
| "loss": 21.9058, | |
| "mean_token_accuracy": 0.6651136819273233, | |
| "num_tokens": 5085369.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.3526419658213853, | |
| "epoch": 0.432, | |
| "grad_norm": 20.875, | |
| "learning_rate": 1.7933333333333333e-05, | |
| "loss": 21.7813, | |
| "mean_token_accuracy": 0.6668458927422762, | |
| "num_tokens": 5271275.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.3480545241385697, | |
| "epoch": 0.448, | |
| "grad_norm": 22.875, | |
| "learning_rate": 1.86e-05, | |
| "loss": 21.627, | |
| "mean_token_accuracy": 0.6677324704825878, | |
| "num_tokens": 5460559.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.301166184991598, | |
| "epoch": 0.464, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.926666666666667e-05, | |
| "loss": 20.889, | |
| "mean_token_accuracy": 0.676617132872343, | |
| "num_tokens": 5653809.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.318466317281127, | |
| "epoch": 0.48, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.9933333333333334e-05, | |
| "loss": 21.2936, | |
| "mean_token_accuracy": 0.6712827417999506, | |
| "num_tokens": 5850176.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_biology_entropy": 1.2827796216011047, | |
| "eval_biology_loss": 1.275201678276062, | |
| "eval_biology_mean_token_accuracy": 0.6830832781791687, | |
| "eval_biology_num_tokens": 5850176.0, | |
| "eval_biology_runtime": 48.4915, | |
| "eval_biology_samples_per_second": 10.311, | |
| "eval_biology_steps_per_second": 2.578, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_chemistry_entropy": 0.983495129108429, | |
| "eval_chemistry_loss": 0.9488818645477295, | |
| "eval_chemistry_mean_token_accuracy": 0.7523409638404847, | |
| "eval_chemistry_num_tokens": 5850176.0, | |
| "eval_chemistry_runtime": 60.1707, | |
| "eval_chemistry_samples_per_second": 8.31, | |
| "eval_chemistry_steps_per_second": 2.077, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_math_entropy": 0.8216862387657166, | |
| "eval_math_loss": 1.0297818183898926, | |
| "eval_math_mean_token_accuracy": 0.7488151121139527, | |
| "eval_math_num_tokens": 5850176.0, | |
| "eval_math_runtime": 61.6905, | |
| "eval_math_samples_per_second": 8.105, | |
| "eval_math_steps_per_second": 2.026, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_physics_entropy": 0.9433758721351624, | |
| "eval_physics_loss": 0.9520999193191528, | |
| "eval_physics_mean_token_accuracy": 0.7585058889389038, | |
| "eval_physics_num_tokens": 5850176.0, | |
| "eval_physics_runtime": 70.301, | |
| "eval_physics_samples_per_second": 7.112, | |
| "eval_physics_steps_per_second": 1.778, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.2579400472342968, | |
| "epoch": 0.496, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.9933333333333334e-05, | |
| "loss": 20.2011, | |
| "mean_token_accuracy": 0.6842056062072516, | |
| "num_tokens": 6046503.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.3082518883049488, | |
| "epoch": 0.512, | |
| "grad_norm": 18.125, | |
| "learning_rate": 1.985925925925926e-05, | |
| "loss": 21.0658, | |
| "mean_token_accuracy": 0.6749501373618841, | |
| "num_tokens": 6240456.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.3003981616348028, | |
| "epoch": 0.528, | |
| "grad_norm": 18.125, | |
| "learning_rate": 1.9785185185185187e-05, | |
| "loss": 20.9809, | |
| "mean_token_accuracy": 0.6757604543119669, | |
| "num_tokens": 6430555.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.2986273631453513, | |
| "epoch": 0.544, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.971111111111111e-05, | |
| "loss": 20.8809, | |
| "mean_token_accuracy": 0.6782271713018417, | |
| "num_tokens": 6626006.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.284830729290843, | |
| "epoch": 0.56, | |
| "grad_norm": 17.25, | |
| "learning_rate": 1.963703703703704e-05, | |
| "loss": 20.8197, | |
| "mean_token_accuracy": 0.6767117112874985, | |
| "num_tokens": 6820754.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.2683125745505095, | |
| "epoch": 0.576, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.9562962962962964e-05, | |
| "loss": 20.4541, | |
| "mean_token_accuracy": 0.6809794403612613, | |
| "num_tokens": 7021844.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.2863252360373736, | |
| "epoch": 0.592, | |
| "grad_norm": 18.875, | |
| "learning_rate": 1.948888888888889e-05, | |
| "loss": 20.8043, | |
| "mean_token_accuracy": 0.676701345667243, | |
| "num_tokens": 7213951.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.2630502216517925, | |
| "epoch": 0.608, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.9414814814814817e-05, | |
| "loss": 20.4041, | |
| "mean_token_accuracy": 0.6803740747272968, | |
| "num_tokens": 7416773.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.2804703898727894, | |
| "epoch": 0.624, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.9340740740740743e-05, | |
| "loss": 20.6218, | |
| "mean_token_accuracy": 0.6788272958248853, | |
| "num_tokens": 7612843.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.2843346055597067, | |
| "epoch": 0.64, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.926666666666667e-05, | |
| "loss": 20.7171, | |
| "mean_token_accuracy": 0.6782444745302201, | |
| "num_tokens": 7801633.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_biology_entropy": 1.226506398677826, | |
| "eval_biology_loss": 1.2382104396820068, | |
| "eval_biology_mean_token_accuracy": 0.6894095778465271, | |
| "eval_biology_num_tokens": 7801633.0, | |
| "eval_biology_runtime": 48.5507, | |
| "eval_biology_samples_per_second": 10.299, | |
| "eval_biology_steps_per_second": 2.575, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_chemistry_entropy": 0.9317227191925049, | |
| "eval_chemistry_loss": 0.9207452535629272, | |
| "eval_chemistry_mean_token_accuracy": 0.7581370029449462, | |
| "eval_chemistry_num_tokens": 7801633.0, | |
| "eval_chemistry_runtime": 60.2113, | |
| "eval_chemistry_samples_per_second": 8.304, | |
| "eval_chemistry_steps_per_second": 2.076, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_math_entropy": 0.7863595089912414, | |
| "eval_math_loss": 1.010460376739502, | |
| "eval_math_mean_token_accuracy": 0.7535392093658447, | |
| "eval_math_num_tokens": 7801633.0, | |
| "eval_math_runtime": 61.807, | |
| "eval_math_samples_per_second": 8.09, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_physics_entropy": 0.8958085932731629, | |
| "eval_physics_loss": 0.9257401823997498, | |
| "eval_physics_mean_token_accuracy": 0.7637984156608582, | |
| "eval_physics_num_tokens": 7801633.0, | |
| "eval_physics_runtime": 70.3663, | |
| "eval_physics_samples_per_second": 7.106, | |
| "eval_physics_steps_per_second": 1.776, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.278659427165985, | |
| "epoch": 0.656, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.9192592592592593e-05, | |
| "loss": 20.6682, | |
| "mean_token_accuracy": 0.6772829819470644, | |
| "num_tokens": 7995843.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.2931427203118802, | |
| "epoch": 0.672, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1.911851851851852e-05, | |
| "loss": 20.8656, | |
| "mean_token_accuracy": 0.6753748003393412, | |
| "num_tokens": 8183103.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.2739692747592926, | |
| "epoch": 0.688, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.9044444444444446e-05, | |
| "loss": 20.5407, | |
| "mean_token_accuracy": 0.6812681049108505, | |
| "num_tokens": 8385976.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.2659825466573238, | |
| "epoch": 0.704, | |
| "grad_norm": 16.25, | |
| "learning_rate": 1.8970370370370372e-05, | |
| "loss": 20.4243, | |
| "mean_token_accuracy": 0.6820976916700602, | |
| "num_tokens": 8578431.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.220404140278697, | |
| "epoch": 0.72, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.8896296296296295e-05, | |
| "loss": 19.6546, | |
| "mean_token_accuracy": 0.6908745598047972, | |
| "num_tokens": 8781342.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.2406103231012822, | |
| "epoch": 0.736, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.8822222222222225e-05, | |
| "loss": 19.9745, | |
| "mean_token_accuracy": 0.6853331789374352, | |
| "num_tokens": 8977918.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.2618801843374967, | |
| "epoch": 0.752, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.874814814814815e-05, | |
| "loss": 20.4041, | |
| "mean_token_accuracy": 0.6825968738645315, | |
| "num_tokens": 9169322.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.2232345014810562, | |
| "epoch": 0.768, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.8674074074074075e-05, | |
| "loss": 19.7045, | |
| "mean_token_accuracy": 0.6888250291347504, | |
| "num_tokens": 9368141.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.25159954726696, | |
| "epoch": 0.784, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.86e-05, | |
| "loss": 20.2036, | |
| "mean_token_accuracy": 0.6849453710019588, | |
| "num_tokens": 9565236.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.264250884205103, | |
| "epoch": 0.8, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.8525925925925928e-05, | |
| "loss": 20.5299, | |
| "mean_token_accuracy": 0.6811827480792999, | |
| "num_tokens": 9761227.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_biology_entropy": 1.2163097896575927, | |
| "eval_biology_loss": 1.2177292108535767, | |
| "eval_biology_mean_token_accuracy": 0.6932459664344788, | |
| "eval_biology_num_tokens": 9761227.0, | |
| "eval_biology_runtime": 48.5438, | |
| "eval_biology_samples_per_second": 10.3, | |
| "eval_biology_steps_per_second": 2.575, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_chemistry_entropy": 0.9239063205718994, | |
| "eval_chemistry_loss": 0.9047155380249023, | |
| "eval_chemistry_mean_token_accuracy": 0.761792631149292, | |
| "eval_chemistry_num_tokens": 9761227.0, | |
| "eval_chemistry_runtime": 59.9546, | |
| "eval_chemistry_samples_per_second": 8.34, | |
| "eval_chemistry_steps_per_second": 2.085, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_math_entropy": 0.7864464523792267, | |
| "eval_math_loss": 0.9939978122711182, | |
| "eval_math_mean_token_accuracy": 0.7574145245552063, | |
| "eval_math_num_tokens": 9761227.0, | |
| "eval_math_runtime": 61.7812, | |
| "eval_math_samples_per_second": 8.093, | |
| "eval_math_steps_per_second": 2.023, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_physics_entropy": 0.889360978603363, | |
| "eval_physics_loss": 0.9096766710281372, | |
| "eval_physics_mean_token_accuracy": 0.7674052910804748, | |
| "eval_physics_num_tokens": 9761227.0, | |
| "eval_physics_runtime": 70.5356, | |
| "eval_physics_samples_per_second": 7.089, | |
| "eval_physics_steps_per_second": 1.772, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.2362793002277612, | |
| "epoch": 0.816, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.8451851851851855e-05, | |
| "loss": 19.8133, | |
| "mean_token_accuracy": 0.6863504596054554, | |
| "num_tokens": 9958727.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.2254926670342683, | |
| "epoch": 0.832, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.8377777777777778e-05, | |
| "loss": 19.8307, | |
| "mean_token_accuracy": 0.6866675779223442, | |
| "num_tokens": 10155771.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.2238412775099277, | |
| "epoch": 0.848, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.8303703703703704e-05, | |
| "loss": 19.687, | |
| "mean_token_accuracy": 0.6897137116640806, | |
| "num_tokens": 10357721.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.2536957442760468, | |
| "epoch": 0.864, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.822962962962963e-05, | |
| "loss": 20.1565, | |
| "mean_token_accuracy": 0.6850291140377521, | |
| "num_tokens": 10552495.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.231699001789093, | |
| "epoch": 0.88, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.8155555555555557e-05, | |
| "loss": 19.8536, | |
| "mean_token_accuracy": 0.6891282081604004, | |
| "num_tokens": 10748749.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.2470501396805047, | |
| "epoch": 0.896, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.8081481481481484e-05, | |
| "loss": 20.1706, | |
| "mean_token_accuracy": 0.6856059569865465, | |
| "num_tokens": 10943319.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.2307742841541767, | |
| "epoch": 0.912, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.800740740740741e-05, | |
| "loss": 19.9062, | |
| "mean_token_accuracy": 0.6885740786790848, | |
| "num_tokens": 11136935.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.2445739306509496, | |
| "epoch": 0.928, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.7933333333333333e-05, | |
| "loss": 20.0979, | |
| "mean_token_accuracy": 0.6851089850068093, | |
| "num_tokens": 11331098.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.2021468229591847, | |
| "epoch": 0.944, | |
| "grad_norm": 15.875, | |
| "learning_rate": 1.785925925925926e-05, | |
| "loss": 19.4077, | |
| "mean_token_accuracy": 0.6915138956159353, | |
| "num_tokens": 11530550.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.226809823140502, | |
| "epoch": 0.96, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.7785185185185186e-05, | |
| "loss": 19.8062, | |
| "mean_token_accuracy": 0.6897286407649517, | |
| "num_tokens": 11729645.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_biology_entropy": 1.1845180039405823, | |
| "eval_biology_loss": 1.203829050064087, | |
| "eval_biology_mean_token_accuracy": 0.6961685500144958, | |
| "eval_biology_num_tokens": 11729645.0, | |
| "eval_biology_runtime": 48.6169, | |
| "eval_biology_samples_per_second": 10.284, | |
| "eval_biology_steps_per_second": 2.571, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_chemistry_entropy": 0.90015394115448, | |
| "eval_chemistry_loss": 0.8946329355239868, | |
| "eval_chemistry_mean_token_accuracy": 0.7635614371299744, | |
| "eval_chemistry_num_tokens": 11729645.0, | |
| "eval_chemistry_runtime": 60.2919, | |
| "eval_chemistry_samples_per_second": 8.293, | |
| "eval_chemistry_steps_per_second": 2.073, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_math_entropy": 0.7684455904960632, | |
| "eval_math_loss": 0.9900413751602173, | |
| "eval_math_mean_token_accuracy": 0.7588200316429138, | |
| "eval_math_num_tokens": 11729645.0, | |
| "eval_math_runtime": 61.8301, | |
| "eval_math_samples_per_second": 8.087, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_physics_entropy": 0.8686938014030456, | |
| "eval_physics_loss": 0.9008635878562927, | |
| "eval_physics_mean_token_accuracy": 0.7692377109527588, | |
| "eval_physics_num_tokens": 11729645.0, | |
| "eval_physics_runtime": 70.4349, | |
| "eval_physics_samples_per_second": 7.099, | |
| "eval_physics_steps_per_second": 1.775, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.2100978799164295, | |
| "epoch": 0.976, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.7711111111111113e-05, | |
| "loss": 19.47, | |
| "mean_token_accuracy": 0.6918121088296175, | |
| "num_tokens": 11924644.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.2226450834423304, | |
| "epoch": 0.992, | |
| "grad_norm": 16.625, | |
| "learning_rate": 1.763703703703704e-05, | |
| "loss": 19.8416, | |
| "mean_token_accuracy": 0.688375661149621, | |
| "num_tokens": 12123059.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.2316548496484756, | |
| "epoch": 1.008, | |
| "grad_norm": 16.875, | |
| "learning_rate": 1.7562962962962962e-05, | |
| "loss": 19.6116, | |
| "mean_token_accuracy": 0.6919524800032377, | |
| "num_tokens": 12319366.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.1779099617153406, | |
| "epoch": 1.024, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.7488888888888892e-05, | |
| "loss": 18.9763, | |
| "mean_token_accuracy": 0.6978646669536829, | |
| "num_tokens": 12524183.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.2152834441512823, | |
| "epoch": 1.04, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.7414814814814815e-05, | |
| "loss": 19.6247, | |
| "mean_token_accuracy": 0.6903412740677595, | |
| "num_tokens": 12718593.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.1799768891185523, | |
| "epoch": 1.056, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1.7340740740740742e-05, | |
| "loss": 19.0432, | |
| "mean_token_accuracy": 0.6986244544386864, | |
| "num_tokens": 12917803.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.2108702428638936, | |
| "epoch": 1.072, | |
| "grad_norm": 19.125, | |
| "learning_rate": 1.726666666666667e-05, | |
| "loss": 19.4166, | |
| "mean_token_accuracy": 0.6927186574786901, | |
| "num_tokens": 13105826.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.1979756511747837, | |
| "epoch": 1.088, | |
| "grad_norm": 18.875, | |
| "learning_rate": 1.7192592592592595e-05, | |
| "loss": 19.2605, | |
| "mean_token_accuracy": 0.6957505799829959, | |
| "num_tokens": 13298619.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.192365935444832, | |
| "epoch": 1.104, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.711851851851852e-05, | |
| "loss": 19.2461, | |
| "mean_token_accuracy": 0.695810866355896, | |
| "num_tokens": 13491486.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.212946466356516, | |
| "epoch": 1.12, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.7044444444444445e-05, | |
| "loss": 19.5004, | |
| "mean_token_accuracy": 0.692466252297163, | |
| "num_tokens": 13674663.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_biology_entropy": 1.1571769905090332, | |
| "eval_biology_loss": 1.1946450471878052, | |
| "eval_biology_mean_token_accuracy": 0.6972691407203674, | |
| "eval_biology_num_tokens": 13674663.0, | |
| "eval_biology_runtime": 48.6729, | |
| "eval_biology_samples_per_second": 10.273, | |
| "eval_biology_steps_per_second": 2.568, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_chemistry_entropy": 0.8766862626075744, | |
| "eval_chemistry_loss": 0.8891168236732483, | |
| "eval_chemistry_mean_token_accuracy": 0.7645404329299926, | |
| "eval_chemistry_num_tokens": 13674663.0, | |
| "eval_chemistry_runtime": 60.3334, | |
| "eval_chemistry_samples_per_second": 8.287, | |
| "eval_chemistry_steps_per_second": 2.072, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_math_entropy": 0.7603865313529968, | |
| "eval_math_loss": 0.9834137558937073, | |
| "eval_math_mean_token_accuracy": 0.7596666264533997, | |
| "eval_math_num_tokens": 13674663.0, | |
| "eval_math_runtime": 61.8146, | |
| "eval_math_samples_per_second": 8.089, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_physics_entropy": 0.850571988105774, | |
| "eval_physics_loss": 0.8937918543815613, | |
| "eval_physics_mean_token_accuracy": 0.7703958468437195, | |
| "eval_physics_num_tokens": 13674663.0, | |
| "eval_physics_runtime": 70.4674, | |
| "eval_physics_samples_per_second": 7.095, | |
| "eval_physics_steps_per_second": 1.774, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.1909121543169021, | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 18.875, | |
| "learning_rate": 1.697037037037037e-05, | |
| "loss": 19.2462, | |
| "mean_token_accuracy": 0.6955781776458025, | |
| "num_tokens": 13869134.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.1682380847632885, | |
| "epoch": 1.152, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.6896296296296298e-05, | |
| "loss": 18.8229, | |
| "mean_token_accuracy": 0.6991135813295841, | |
| "num_tokens": 14078365.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.1939557407051324, | |
| "epoch": 1.168, | |
| "grad_norm": 16.875, | |
| "learning_rate": 1.6822222222222224e-05, | |
| "loss": 19.1346, | |
| "mean_token_accuracy": 0.6960698150098323, | |
| "num_tokens": 14266831.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.180869185552001, | |
| "epoch": 1.184, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.6748148148148147e-05, | |
| "loss": 19.2654, | |
| "mean_token_accuracy": 0.6941378649324179, | |
| "num_tokens": 14465660.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.1937656667083503, | |
| "epoch": 1.2, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.6674074074074077e-05, | |
| "loss": 19.0305, | |
| "mean_token_accuracy": 0.6964295905083417, | |
| "num_tokens": 14653228.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.1589823190122843, | |
| "epoch": 1.216, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.66e-05, | |
| "loss": 18.6048, | |
| "mean_token_accuracy": 0.7018654596060514, | |
| "num_tokens": 14857782.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.1703605465590954, | |
| "epoch": 1.232, | |
| "grad_norm": 17.375, | |
| "learning_rate": 1.6525925925925927e-05, | |
| "loss": 18.8831, | |
| "mean_token_accuracy": 0.7015001580119133, | |
| "num_tokens": 15047356.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.1772115517407655, | |
| "epoch": 1.248, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.6451851851851853e-05, | |
| "loss": 19.0432, | |
| "mean_token_accuracy": 0.6959997840225697, | |
| "num_tokens": 15241098.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.196473068371415, | |
| "epoch": 1.264, | |
| "grad_norm": 16.375, | |
| "learning_rate": 1.637777777777778e-05, | |
| "loss": 19.1591, | |
| "mean_token_accuracy": 0.6967897292226553, | |
| "num_tokens": 15437657.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.2014197081327438, | |
| "epoch": 1.28, | |
| "grad_norm": 19.125, | |
| "learning_rate": 1.6303703703703706e-05, | |
| "loss": 19.4409, | |
| "mean_token_accuracy": 0.6926549930125475, | |
| "num_tokens": 15630795.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_biology_entropy": 1.134603425502777, | |
| "eval_biology_loss": 1.1884372234344482, | |
| "eval_biology_mean_token_accuracy": 0.6986491298675537, | |
| "eval_biology_num_tokens": 15630795.0, | |
| "eval_biology_runtime": 48.6306, | |
| "eval_biology_samples_per_second": 10.282, | |
| "eval_biology_steps_per_second": 2.57, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_chemistry_entropy": 0.8623910093307495, | |
| "eval_chemistry_loss": 0.885444700717926, | |
| "eval_chemistry_mean_token_accuracy": 0.7653528556823731, | |
| "eval_chemistry_num_tokens": 15630795.0, | |
| "eval_chemistry_runtime": 60.3508, | |
| "eval_chemistry_samples_per_second": 8.285, | |
| "eval_chemistry_steps_per_second": 2.071, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_math_entropy": 0.7589023416042328, | |
| "eval_math_loss": 0.983073353767395, | |
| "eval_math_mean_token_accuracy": 0.7593517408370972, | |
| "eval_math_num_tokens": 15630795.0, | |
| "eval_math_runtime": 61.9026, | |
| "eval_math_samples_per_second": 8.077, | |
| "eval_math_steps_per_second": 2.019, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_physics_entropy": 0.84130739736557, | |
| "eval_physics_loss": 0.8907755613327026, | |
| "eval_physics_mean_token_accuracy": 0.771062777519226, | |
| "eval_physics_num_tokens": 15630795.0, | |
| "eval_physics_runtime": 70.5226, | |
| "eval_physics_samples_per_second": 7.09, | |
| "eval_physics_steps_per_second": 1.772, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.2085642520338298, | |
| "epoch": 1.296, | |
| "grad_norm": 17.375, | |
| "learning_rate": 1.622962962962963e-05, | |
| "loss": 19.3831, | |
| "mean_token_accuracy": 0.6933640763163567, | |
| "num_tokens": 15827105.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.1861349143087865, | |
| "epoch": 1.312, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.6155555555555556e-05, | |
| "loss": 19.3103, | |
| "mean_token_accuracy": 0.694928414747119, | |
| "num_tokens": 16019645.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.195632776618004, | |
| "epoch": 1.328, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.6081481481481482e-05, | |
| "loss": 19.3068, | |
| "mean_token_accuracy": 0.6934712298214436, | |
| "num_tokens": 16221726.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.1725192748010158, | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.600740740740741e-05, | |
| "loss": 18.7963, | |
| "mean_token_accuracy": 0.700145885720849, | |
| "num_tokens": 16427594.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.179823150858283, | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.5933333333333336e-05, | |
| "loss": 19.1154, | |
| "mean_token_accuracy": 0.6961398232728243, | |
| "num_tokens": 16621605.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.2228495314717294, | |
| "epoch": 1.376, | |
| "grad_norm": 19.5, | |
| "learning_rate": 1.5859259259259262e-05, | |
| "loss": 19.6627, | |
| "mean_token_accuracy": 0.6894211061298847, | |
| "num_tokens": 16813444.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.19021125882864, | |
| "epoch": 1.392, | |
| "grad_norm": 17.5, | |
| "learning_rate": 1.5785185185185185e-05, | |
| "loss": 19.2411, | |
| "mean_token_accuracy": 0.6957099426537752, | |
| "num_tokens": 17006509.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.184871331602335, | |
| "epoch": 1.408, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 1.571111111111111e-05, | |
| "loss": 18.9785, | |
| "mean_token_accuracy": 0.6971315786242485, | |
| "num_tokens": 17197870.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.1884775411337614, | |
| "epoch": 1.424, | |
| "grad_norm": 17.25, | |
| "learning_rate": 1.5637037037037038e-05, | |
| "loss": 19.1289, | |
| "mean_token_accuracy": 0.697301234304905, | |
| "num_tokens": 17394390.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.1825116220861673, | |
| "epoch": 1.44, | |
| "grad_norm": 20.75, | |
| "learning_rate": 1.5562962962962965e-05, | |
| "loss": 19.1266, | |
| "mean_token_accuracy": 0.6968252252787351, | |
| "num_tokens": 17587777.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_biology_entropy": 1.1672434105873108, | |
| "eval_biology_loss": 1.1815813779830933, | |
| "eval_biology_mean_token_accuracy": 0.7003293070793152, | |
| "eval_biology_num_tokens": 17587777.0, | |
| "eval_biology_runtime": 48.6205, | |
| "eval_biology_samples_per_second": 10.284, | |
| "eval_biology_steps_per_second": 2.571, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_chemistry_entropy": 0.8869817838668823, | |
| "eval_chemistry_loss": 0.8829970955848694, | |
| "eval_chemistry_mean_token_accuracy": 0.7656374487876892, | |
| "eval_chemistry_num_tokens": 17587777.0, | |
| "eval_chemistry_runtime": 60.3339, | |
| "eval_chemistry_samples_per_second": 8.287, | |
| "eval_chemistry_steps_per_second": 2.072, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_math_entropy": 0.7674483435153961, | |
| "eval_math_loss": 0.9792445302009583, | |
| "eval_math_mean_token_accuracy": 0.7597224740982056, | |
| "eval_math_num_tokens": 17587777.0, | |
| "eval_math_runtime": 61.8239, | |
| "eval_math_samples_per_second": 8.087, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_physics_entropy": 0.8621900615692139, | |
| "eval_physics_loss": 0.8884776830673218, | |
| "eval_physics_mean_token_accuracy": 0.7715646696090698, | |
| "eval_physics_num_tokens": 17587777.0, | |
| "eval_physics_runtime": 70.446, | |
| "eval_physics_samples_per_second": 7.098, | |
| "eval_physics_steps_per_second": 1.774, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.1818725422024727, | |
| "epoch": 1.456, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.548888888888889e-05, | |
| "loss": 19.032, | |
| "mean_token_accuracy": 0.698200449720025, | |
| "num_tokens": 17788456.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.1769807077944279, | |
| "epoch": 1.472, | |
| "grad_norm": 16.0, | |
| "learning_rate": 1.5414814814814814e-05, | |
| "loss": 18.8791, | |
| "mean_token_accuracy": 0.7008779179304838, | |
| "num_tokens": 17984063.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.1641013238579034, | |
| "epoch": 1.488, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.5340740740740744e-05, | |
| "loss": 18.9913, | |
| "mean_token_accuracy": 0.6998269848525525, | |
| "num_tokens": 18175640.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.1960251219570637, | |
| "epoch": 1.504, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.5266666666666667e-05, | |
| "loss": 19.2076, | |
| "mean_token_accuracy": 0.696322177350521, | |
| "num_tokens": 18367857.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.1745740845799446, | |
| "epoch": 1.52, | |
| "grad_norm": 16.875, | |
| "learning_rate": 1.5192592592592594e-05, | |
| "loss": 19.0307, | |
| "mean_token_accuracy": 0.6969408400356769, | |
| "num_tokens": 18569146.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.2008745949715376, | |
| "epoch": 1.536, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.5118518518518519e-05, | |
| "loss": 19.2895, | |
| "mean_token_accuracy": 0.6946466054767371, | |
| "num_tokens": 18755079.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.1710849691182375, | |
| "epoch": 1.552, | |
| "grad_norm": 19.375, | |
| "learning_rate": 1.5044444444444445e-05, | |
| "loss": 18.9073, | |
| "mean_token_accuracy": 0.699705482646823, | |
| "num_tokens": 18956248.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.163971472159028, | |
| "epoch": 1.568, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.497037037037037e-05, | |
| "loss": 18.7379, | |
| "mean_token_accuracy": 0.7023108277469874, | |
| "num_tokens": 19150315.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.1755164857953786, | |
| "epoch": 1.584, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.4896296296296298e-05, | |
| "loss": 18.8826, | |
| "mean_token_accuracy": 0.6997408363968134, | |
| "num_tokens": 19344260.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.2020615819841622, | |
| "epoch": 1.6, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.4822222222222225e-05, | |
| "loss": 19.3858, | |
| "mean_token_accuracy": 0.6933612376451492, | |
| "num_tokens": 19532552.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_biology_entropy": 1.1451181559562682, | |
| "eval_biology_loss": 1.1767185926437378, | |
| "eval_biology_mean_token_accuracy": 0.7013368840217591, | |
| "eval_biology_num_tokens": 19532552.0, | |
| "eval_biology_runtime": 48.6261, | |
| "eval_biology_samples_per_second": 10.283, | |
| "eval_biology_steps_per_second": 2.571, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_chemistry_entropy": 0.8642943887710571, | |
| "eval_chemistry_loss": 0.8798553347587585, | |
| "eval_chemistry_mean_token_accuracy": 0.7664505195617676, | |
| "eval_chemistry_num_tokens": 19532552.0, | |
| "eval_chemistry_runtime": 59.8839, | |
| "eval_chemistry_samples_per_second": 8.349, | |
| "eval_chemistry_steps_per_second": 2.087, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_math_entropy": 0.7490488801002503, | |
| "eval_math_loss": 0.9804874062538147, | |
| "eval_math_mean_token_accuracy": 0.7602896738052368, | |
| "eval_math_num_tokens": 19532552.0, | |
| "eval_math_runtime": 61.7317, | |
| "eval_math_samples_per_second": 8.1, | |
| "eval_math_steps_per_second": 2.025, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_physics_entropy": 0.8397411880493164, | |
| "eval_physics_loss": 0.885618269443512, | |
| "eval_physics_mean_token_accuracy": 0.7722613711357117, | |
| "eval_physics_num_tokens": 19532552.0, | |
| "eval_physics_runtime": 70.4574, | |
| "eval_physics_samples_per_second": 7.096, | |
| "eval_physics_steps_per_second": 1.774, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.1645312760025264, | |
| "epoch": 1.616, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.474814814814815e-05, | |
| "loss": 18.8318, | |
| "mean_token_accuracy": 0.7000049009919167, | |
| "num_tokens": 19732719.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 1.2018159918487072, | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.4674074074074076e-05, | |
| "loss": 19.3741, | |
| "mean_token_accuracy": 0.6942509710788727, | |
| "num_tokens": 19926830.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 1.1848741736263038, | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 16.25, | |
| "learning_rate": 1.46e-05, | |
| "loss": 19.0931, | |
| "mean_token_accuracy": 0.6962833561003208, | |
| "num_tokens": 20118800.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 1.1461675189435483, | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 16.5, | |
| "learning_rate": 1.4525925925925927e-05, | |
| "loss": 18.5384, | |
| "mean_token_accuracy": 0.7037994157522917, | |
| "num_tokens": 20320511.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 1.1853893544524907, | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 18.375, | |
| "learning_rate": 1.4451851851851852e-05, | |
| "loss": 18.9769, | |
| "mean_token_accuracy": 0.6986714884638786, | |
| "num_tokens": 20513393.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.1721675164997578, | |
| "epoch": 1.696, | |
| "grad_norm": 17.5, | |
| "learning_rate": 1.4377777777777779e-05, | |
| "loss": 18.9508, | |
| "mean_token_accuracy": 0.7003318756818772, | |
| "num_tokens": 20707237.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 1.1738170266151429, | |
| "epoch": 1.712, | |
| "grad_norm": 19.5, | |
| "learning_rate": 1.4303703703703703e-05, | |
| "loss": 18.9752, | |
| "mean_token_accuracy": 0.6993441980332136, | |
| "num_tokens": 20910419.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 1.190333865955472, | |
| "epoch": 1.728, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.4229629629629632e-05, | |
| "loss": 19.1838, | |
| "mean_token_accuracy": 0.6956166718155146, | |
| "num_tokens": 21107498.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 1.1574083410203457, | |
| "epoch": 1.744, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.4155555555555556e-05, | |
| "loss": 18.5378, | |
| "mean_token_accuracy": 0.7021385233849287, | |
| "num_tokens": 21303955.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 1.1666433937847613, | |
| "epoch": 1.76, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.4081481481481483e-05, | |
| "loss": 18.9266, | |
| "mean_token_accuracy": 0.7011374026536942, | |
| "num_tokens": 21499572.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_biology_entropy": 1.1345319437980652, | |
| "eval_biology_loss": 1.1725776195526123, | |
| "eval_biology_mean_token_accuracy": 0.7024298944473266, | |
| "eval_biology_num_tokens": 21499572.0, | |
| "eval_biology_runtime": 48.5727, | |
| "eval_biology_samples_per_second": 10.294, | |
| "eval_biology_steps_per_second": 2.573, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_chemistry_entropy": 0.8619537029266358, | |
| "eval_chemistry_loss": 0.8766760230064392, | |
| "eval_chemistry_mean_token_accuracy": 0.7667445015907287, | |
| "eval_chemistry_num_tokens": 21499572.0, | |
| "eval_chemistry_runtime": 60.4564, | |
| "eval_chemistry_samples_per_second": 8.27, | |
| "eval_chemistry_steps_per_second": 2.068, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_math_entropy": 0.7560438480377197, | |
| "eval_math_loss": 0.9768902063369751, | |
| "eval_math_mean_token_accuracy": 0.7610512175559998, | |
| "eval_math_num_tokens": 21499572.0, | |
| "eval_math_runtime": 61.7554, | |
| "eval_math_samples_per_second": 8.096, | |
| "eval_math_steps_per_second": 2.024, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_physics_entropy": 0.840686586856842, | |
| "eval_physics_loss": 0.8825888633728027, | |
| "eval_physics_mean_token_accuracy": 0.7726316556930543, | |
| "eval_physics_num_tokens": 21499572.0, | |
| "eval_physics_runtime": 70.3673, | |
| "eval_physics_samples_per_second": 7.106, | |
| "eval_physics_steps_per_second": 1.776, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.177320409566164, | |
| "epoch": 1.776, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.400740740740741e-05, | |
| "loss": 18.9676, | |
| "mean_token_accuracy": 0.6986722864210606, | |
| "num_tokens": 21692804.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 1.148191200569272, | |
| "epoch": 1.792, | |
| "grad_norm": 19.875, | |
| "learning_rate": 1.3933333333333334e-05, | |
| "loss": 18.5112, | |
| "mean_token_accuracy": 0.7040422059595585, | |
| "num_tokens": 21894218.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 1.1709640648216009, | |
| "epoch": 1.808, | |
| "grad_norm": 19.5, | |
| "learning_rate": 1.385925925925926e-05, | |
| "loss": 18.8424, | |
| "mean_token_accuracy": 0.7009096905589104, | |
| "num_tokens": 22082522.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.1934551119804382, | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 19.375, | |
| "learning_rate": 1.3785185185185186e-05, | |
| "loss": 19.3396, | |
| "mean_token_accuracy": 0.693663826212287, | |
| "num_tokens": 22278933.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 1.2042058877646924, | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 19.375, | |
| "learning_rate": 1.3711111111111112e-05, | |
| "loss": 19.393, | |
| "mean_token_accuracy": 0.6937405589967967, | |
| "num_tokens": 22473801.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.1648973379284144, | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 18.125, | |
| "learning_rate": 1.3637037037037037e-05, | |
| "loss": 18.7484, | |
| "mean_token_accuracy": 0.6991217479109764, | |
| "num_tokens": 22677853.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.1554684847593308, | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 17.375, | |
| "learning_rate": 1.3562962962962965e-05, | |
| "loss": 18.5305, | |
| "mean_token_accuracy": 0.7038368381559849, | |
| "num_tokens": 22874965.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 1.1899018451571464, | |
| "epoch": 1.888, | |
| "grad_norm": 20.75, | |
| "learning_rate": 1.3488888888888888e-05, | |
| "loss": 19.2428, | |
| "mean_token_accuracy": 0.6950660139322281, | |
| "num_tokens": 23068892.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 1.1851763129234314, | |
| "epoch": 1.904, | |
| "grad_norm": 18.375, | |
| "learning_rate": 1.3414814814814817e-05, | |
| "loss": 19.126, | |
| "mean_token_accuracy": 0.6976218212395906, | |
| "num_tokens": 23263827.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 1.1869227845221757, | |
| "epoch": 1.92, | |
| "grad_norm": 18.375, | |
| "learning_rate": 1.3340740740740741e-05, | |
| "loss": 19.135, | |
| "mean_token_accuracy": 0.6959322843700647, | |
| "num_tokens": 23463627.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_biology_entropy": 1.1337207446098327, | |
| "eval_biology_loss": 1.168716311454773, | |
| "eval_biology_mean_token_accuracy": 0.7030426645278931, | |
| "eval_biology_num_tokens": 23463627.0, | |
| "eval_biology_runtime": 48.5387, | |
| "eval_biology_samples_per_second": 10.301, | |
| "eval_biology_steps_per_second": 2.575, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_chemistry_entropy": 0.8601148505210876, | |
| "eval_chemistry_loss": 0.8744351267814636, | |
| "eval_chemistry_mean_token_accuracy": 0.767286482334137, | |
| "eval_chemistry_num_tokens": 23463627.0, | |
| "eval_chemistry_runtime": 60.2148, | |
| "eval_chemistry_samples_per_second": 8.304, | |
| "eval_chemistry_steps_per_second": 2.076, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_math_entropy": 0.7519172282218933, | |
| "eval_math_loss": 0.9753768444061279, | |
| "eval_math_mean_token_accuracy": 0.7615019774436951, | |
| "eval_math_num_tokens": 23463627.0, | |
| "eval_math_runtime": 61.7169, | |
| "eval_math_samples_per_second": 8.102, | |
| "eval_math_steps_per_second": 2.025, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_physics_entropy": 0.836491331577301, | |
| "eval_physics_loss": 0.8801184296607971, | |
| "eval_physics_mean_token_accuracy": 0.7731836423873901, | |
| "eval_physics_num_tokens": 23463627.0, | |
| "eval_physics_runtime": 70.3444, | |
| "eval_physics_samples_per_second": 7.108, | |
| "eval_physics_steps_per_second": 1.777, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.1517000958323478, | |
| "epoch": 1.936, | |
| "grad_norm": 18.875, | |
| "learning_rate": 1.3266666666666668e-05, | |
| "loss": 18.5112, | |
| "mean_token_accuracy": 0.7040315445512533, | |
| "num_tokens": 23660418.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 1.172454860061407, | |
| "epoch": 1.952, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.3192592592592594e-05, | |
| "loss": 18.8708, | |
| "mean_token_accuracy": 0.6998139064759016, | |
| "num_tokens": 23858145.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 1.18152665682137, | |
| "epoch": 1.968, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.311851851851852e-05, | |
| "loss": 19.0753, | |
| "mean_token_accuracy": 0.6981570664793253, | |
| "num_tokens": 24053364.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 1.1768125779926777, | |
| "epoch": 1.984, | |
| "grad_norm": 17.625, | |
| "learning_rate": 1.3044444444444446e-05, | |
| "loss": 18.9463, | |
| "mean_token_accuracy": 0.6988612022250891, | |
| "num_tokens": 24249465.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 1.180902672186494, | |
| "epoch": 2.0, | |
| "grad_norm": 19.375, | |
| "learning_rate": 1.297037037037037e-05, | |
| "loss": 18.999, | |
| "mean_token_accuracy": 0.7006669268012047, | |
| "num_tokens": 24442582.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.1616276282817126, | |
| "epoch": 2.016, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.2896296296296299e-05, | |
| "loss": 18.592, | |
| "mean_token_accuracy": 0.7041712146252394, | |
| "num_tokens": 24632353.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 1.1689807120710611, | |
| "epoch": 2.032, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1.2822222222222222e-05, | |
| "loss": 18.9039, | |
| "mean_token_accuracy": 0.7014766734093427, | |
| "num_tokens": 24822715.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 1.1456513587385415, | |
| "epoch": 2.048, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.274814814814815e-05, | |
| "loss": 18.4096, | |
| "mean_token_accuracy": 0.7051771484315396, | |
| "num_tokens": 25023118.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 1.1587952699512243, | |
| "epoch": 2.064, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1.2674074074074075e-05, | |
| "loss": 18.6378, | |
| "mean_token_accuracy": 0.7034870360046626, | |
| "num_tokens": 25217414.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 1.1504007514566184, | |
| "epoch": 2.08, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.2600000000000001e-05, | |
| "loss": 18.4703, | |
| "mean_token_accuracy": 0.7041630525141954, | |
| "num_tokens": 25408961.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_biology_entropy": 1.1164145894050599, | |
| "eval_biology_loss": 1.1683411598205566, | |
| "eval_biology_mean_token_accuracy": 0.7027085943222046, | |
| "eval_biology_num_tokens": 25408961.0, | |
| "eval_biology_runtime": 48.5837, | |
| "eval_biology_samples_per_second": 10.292, | |
| "eval_biology_steps_per_second": 2.573, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_chemistry_entropy": 0.8446620798110962, | |
| "eval_chemistry_loss": 0.8756071925163269, | |
| "eval_chemistry_mean_token_accuracy": 0.7672800846099853, | |
| "eval_chemistry_num_tokens": 25408961.0, | |
| "eval_chemistry_runtime": 60.224, | |
| "eval_chemistry_samples_per_second": 8.302, | |
| "eval_chemistry_steps_per_second": 2.076, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_math_entropy": 0.7416602709293365, | |
| "eval_math_loss": 0.9767736792564392, | |
| "eval_math_mean_token_accuracy": 0.7612695918083191, | |
| "eval_math_num_tokens": 25408961.0, | |
| "eval_math_runtime": 61.7354, | |
| "eval_math_samples_per_second": 8.099, | |
| "eval_math_steps_per_second": 2.025, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_physics_entropy": 0.822511604309082, | |
| "eval_physics_loss": 0.8807807564735413, | |
| "eval_physics_mean_token_accuracy": 0.7734324297904969, | |
| "eval_physics_num_tokens": 25408961.0, | |
| "eval_physics_runtime": 70.3722, | |
| "eval_physics_samples_per_second": 7.105, | |
| "eval_physics_steps_per_second": 1.776, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.145626274123788, | |
| "epoch": 2.096, | |
| "grad_norm": 20.75, | |
| "learning_rate": 1.2525925925925928e-05, | |
| "loss": 18.4669, | |
| "mean_token_accuracy": 0.7053351275622844, | |
| "num_tokens": 25600511.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 1.1261590894311666, | |
| "epoch": 2.112, | |
| "grad_norm": 19.375, | |
| "learning_rate": 1.2451851851851853e-05, | |
| "loss": 18.0421, | |
| "mean_token_accuracy": 0.7101508729159832, | |
| "num_tokens": 25796565.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 1.1319866240024568, | |
| "epoch": 2.128, | |
| "grad_norm": 18.375, | |
| "learning_rate": 1.237777777777778e-05, | |
| "loss": 18.2326, | |
| "mean_token_accuracy": 0.7063661482185125, | |
| "num_tokens": 25991156.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 1.12694109082222, | |
| "epoch": 2.144, | |
| "grad_norm": 21.625, | |
| "learning_rate": 1.2303703703703704e-05, | |
| "loss": 18.27, | |
| "mean_token_accuracy": 0.7078616585582495, | |
| "num_tokens": 26193237.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 1.1670064296573401, | |
| "epoch": 2.16, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.222962962962963e-05, | |
| "loss": 18.6321, | |
| "mean_token_accuracy": 0.704596522077918, | |
| "num_tokens": 26387993.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.1611683428287507, | |
| "epoch": 2.176, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.2155555555555555e-05, | |
| "loss": 18.8084, | |
| "mean_token_accuracy": 0.7007863517850638, | |
| "num_tokens": 26585269.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 1.1334992978721856, | |
| "epoch": 2.192, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.2081481481481484e-05, | |
| "loss": 18.1002, | |
| "mean_token_accuracy": 0.7102227192372084, | |
| "num_tokens": 26776318.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 1.142113695293665, | |
| "epoch": 2.208, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.2007407407407408e-05, | |
| "loss": 18.4288, | |
| "mean_token_accuracy": 0.7056139782071114, | |
| "num_tokens": 26974420.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 1.165258849412203, | |
| "epoch": 2.224, | |
| "grad_norm": 18.375, | |
| "learning_rate": 1.1933333333333335e-05, | |
| "loss": 18.8072, | |
| "mean_token_accuracy": 0.7021366007626056, | |
| "num_tokens": 27167577.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 1.1230817057192326, | |
| "epoch": 2.24, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.185925925925926e-05, | |
| "loss": 18.0154, | |
| "mean_token_accuracy": 0.7094326011836529, | |
| "num_tokens": 27364189.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_biology_entropy": 1.107084683418274, | |
| "eval_biology_loss": 1.166341781616211, | |
| "eval_biology_mean_token_accuracy": 0.7033038935661315, | |
| "eval_biology_num_tokens": 27364189.0, | |
| "eval_biology_runtime": 48.5794, | |
| "eval_biology_samples_per_second": 10.292, | |
| "eval_biology_steps_per_second": 2.573, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_chemistry_entropy": 0.8436218018531799, | |
| "eval_chemistry_loss": 0.8749056458473206, | |
| "eval_chemistry_mean_token_accuracy": 0.7673236474990844, | |
| "eval_chemistry_num_tokens": 27364189.0, | |
| "eval_chemistry_runtime": 60.2275, | |
| "eval_chemistry_samples_per_second": 8.302, | |
| "eval_chemistry_steps_per_second": 2.075, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_math_entropy": 0.7436552357673645, | |
| "eval_math_loss": 0.9768530130386353, | |
| "eval_math_mean_token_accuracy": 0.7615942449569703, | |
| "eval_math_num_tokens": 27364189.0, | |
| "eval_math_runtime": 61.7353, | |
| "eval_math_samples_per_second": 8.099, | |
| "eval_math_steps_per_second": 2.025, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_physics_entropy": 0.8238044924736023, | |
| "eval_physics_loss": 0.880571186542511, | |
| "eval_physics_mean_token_accuracy": 0.7732309465408325, | |
| "eval_physics_num_tokens": 27364189.0, | |
| "eval_physics_runtime": 70.3425, | |
| "eval_physics_samples_per_second": 7.108, | |
| "eval_physics_steps_per_second": 1.777, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.1227998584508896, | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.1785185185185186e-05, | |
| "loss": 17.9995, | |
| "mean_token_accuracy": 0.71092384532094, | |
| "num_tokens": 27557387.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 1.1192971892654895, | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 19.5, | |
| "learning_rate": 1.1711111111111113e-05, | |
| "loss": 18.0703, | |
| "mean_token_accuracy": 0.7088606022298336, | |
| "num_tokens": 27755725.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 1.157552171498537, | |
| "epoch": 2.288, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.1637037037037037e-05, | |
| "loss": 18.6818, | |
| "mean_token_accuracy": 0.7010403741151094, | |
| "num_tokens": 27950694.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 1.1524705573916436, | |
| "epoch": 2.304, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.1562962962962964e-05, | |
| "loss": 18.6601, | |
| "mean_token_accuracy": 0.7030924465507269, | |
| "num_tokens": 28150719.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 1.1215086288750171, | |
| "epoch": 2.32, | |
| "grad_norm": 17.5, | |
| "learning_rate": 1.1488888888888889e-05, | |
| "loss": 17.9268, | |
| "mean_token_accuracy": 0.7104179698973894, | |
| "num_tokens": 28348652.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.1350885152816772, | |
| "epoch": 2.336, | |
| "grad_norm": 18.375, | |
| "learning_rate": 1.1414814814814817e-05, | |
| "loss": 18.3565, | |
| "mean_token_accuracy": 0.7067790202796459, | |
| "num_tokens": 28542945.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 1.156265541538596, | |
| "epoch": 2.352, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.1340740740740742e-05, | |
| "loss": 18.6992, | |
| "mean_token_accuracy": 0.7013768840581178, | |
| "num_tokens": 28731927.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 1.1396565582603215, | |
| "epoch": 2.368, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.1266666666666668e-05, | |
| "loss": 18.2949, | |
| "mean_token_accuracy": 0.7059750188142061, | |
| "num_tokens": 28929298.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 1.1448044694960118, | |
| "epoch": 2.384, | |
| "grad_norm": 20.625, | |
| "learning_rate": 1.1192592592592593e-05, | |
| "loss": 18.526, | |
| "mean_token_accuracy": 0.703592960909009, | |
| "num_tokens": 29121142.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 1.140185246989131, | |
| "epoch": 2.4, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.111851851851852e-05, | |
| "loss": 18.3273, | |
| "mean_token_accuracy": 0.706566022336483, | |
| "num_tokens": 29317919.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_biology_entropy": 1.1156310276985169, | |
| "eval_biology_loss": 1.1647558212280273, | |
| "eval_biology_mean_token_accuracy": 0.7032402672767639, | |
| "eval_biology_num_tokens": 29317919.0, | |
| "eval_biology_runtime": 48.611, | |
| "eval_biology_samples_per_second": 10.286, | |
| "eval_biology_steps_per_second": 2.571, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_chemistry_entropy": 0.8463425951004029, | |
| "eval_chemistry_loss": 0.8742334246635437, | |
| "eval_chemistry_mean_token_accuracy": 0.767361388683319, | |
| "eval_chemistry_num_tokens": 29317919.0, | |
| "eval_chemistry_runtime": 59.9689, | |
| "eval_chemistry_samples_per_second": 8.338, | |
| "eval_chemistry_steps_per_second": 2.084, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_math_entropy": 0.7472673971652984, | |
| "eval_math_loss": 0.9761422872543335, | |
| "eval_math_mean_token_accuracy": 0.7615765709877014, | |
| "eval_math_num_tokens": 29317919.0, | |
| "eval_math_runtime": 61.7442, | |
| "eval_math_samples_per_second": 8.098, | |
| "eval_math_steps_per_second": 2.024, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_physics_entropy": 0.8270321841239929, | |
| "eval_physics_loss": 0.8801943063735962, | |
| "eval_physics_mean_token_accuracy": 0.7734404511451721, | |
| "eval_physics_num_tokens": 29317919.0, | |
| "eval_physics_runtime": 70.5165, | |
| "eval_physics_samples_per_second": 7.091, | |
| "eval_physics_steps_per_second": 1.773, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.1317574352025985, | |
| "epoch": 2.416, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.1044444444444444e-05, | |
| "loss": 18.3109, | |
| "mean_token_accuracy": 0.7062701795250177, | |
| "num_tokens": 29518541.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 1.130976415425539, | |
| "epoch": 2.432, | |
| "grad_norm": 20.25, | |
| "learning_rate": 1.0970370370370371e-05, | |
| "loss": 18.1642, | |
| "mean_token_accuracy": 0.7086035583168269, | |
| "num_tokens": 29720768.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 1.1605936624109745, | |
| "epoch": 2.448, | |
| "grad_norm": 20.5, | |
| "learning_rate": 1.0896296296296298e-05, | |
| "loss": 18.7972, | |
| "mean_token_accuracy": 0.7010477486997843, | |
| "num_tokens": 29916619.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 1.1125331491231918, | |
| "epoch": 2.464, | |
| "grad_norm": 20.375, | |
| "learning_rate": 1.0822222222222222e-05, | |
| "loss": 17.8199, | |
| "mean_token_accuracy": 0.7112086053937674, | |
| "num_tokens": 30121198.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 1.1271852746605873, | |
| "epoch": 2.48, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.074814814814815e-05, | |
| "loss": 18.1447, | |
| "mean_token_accuracy": 0.707981801405549, | |
| "num_tokens": 30325508.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.1350488025695085, | |
| "epoch": 2.496, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.0674074074074074e-05, | |
| "loss": 18.3331, | |
| "mean_token_accuracy": 0.706438259780407, | |
| "num_tokens": 30517445.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 1.149303700402379, | |
| "epoch": 2.512, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.0600000000000002e-05, | |
| "loss": 18.5415, | |
| "mean_token_accuracy": 0.7031121108680963, | |
| "num_tokens": 30714049.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 1.134909725189209, | |
| "epoch": 2.528, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.0525925925925927e-05, | |
| "loss": 18.3422, | |
| "mean_token_accuracy": 0.7059885617345572, | |
| "num_tokens": 30911317.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 1.1456572752445937, | |
| "epoch": 2.544, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.0451851851851853e-05, | |
| "loss": 18.3318, | |
| "mean_token_accuracy": 0.70671008490026, | |
| "num_tokens": 31108983.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 1.1408224642276763, | |
| "epoch": 2.56, | |
| "grad_norm": 18.875, | |
| "learning_rate": 1.0377777777777778e-05, | |
| "loss": 18.441, | |
| "mean_token_accuracy": 0.705613837391138, | |
| "num_tokens": 31298994.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_biology_entropy": 1.1095367937088012, | |
| "eval_biology_loss": 1.1631091833114624, | |
| "eval_biology_mean_token_accuracy": 0.7037216110229492, | |
| "eval_biology_num_tokens": 31298994.0, | |
| "eval_biology_runtime": 48.5574, | |
| "eval_biology_samples_per_second": 10.297, | |
| "eval_biology_steps_per_second": 2.574, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_chemistry_entropy": 0.842681580543518, | |
| "eval_chemistry_loss": 0.8729196786880493, | |
| "eval_chemistry_mean_token_accuracy": 0.7677588958740235, | |
| "eval_chemistry_num_tokens": 31298994.0, | |
| "eval_chemistry_runtime": 60.3125, | |
| "eval_chemistry_samples_per_second": 8.29, | |
| "eval_chemistry_steps_per_second": 2.073, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_math_entropy": 0.7447464742660522, | |
| "eval_math_loss": 0.9752342700958252, | |
| "eval_math_mean_token_accuracy": 0.7617981548309326, | |
| "eval_math_num_tokens": 31298994.0, | |
| "eval_math_runtime": 61.7884, | |
| "eval_math_samples_per_second": 8.092, | |
| "eval_math_steps_per_second": 2.023, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_physics_entropy": 0.8225784077644348, | |
| "eval_physics_loss": 0.8789661526679993, | |
| "eval_physics_mean_token_accuracy": 0.7737646398544311, | |
| "eval_physics_num_tokens": 31298994.0, | |
| "eval_physics_runtime": 70.3931, | |
| "eval_physics_samples_per_second": 7.103, | |
| "eval_physics_steps_per_second": 1.776, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.152071548998356, | |
| "epoch": 2.576, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.0303703703703705e-05, | |
| "loss": 18.5693, | |
| "mean_token_accuracy": 0.7044297493994236, | |
| "num_tokens": 31488512.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 1.1696349333971738, | |
| "epoch": 2.592, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.0229629629629631e-05, | |
| "loss": 18.8544, | |
| "mean_token_accuracy": 0.7005049493163824, | |
| "num_tokens": 31684651.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 1.1627265084534883, | |
| "epoch": 2.608, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.0155555555555556e-05, | |
| "loss": 18.6882, | |
| "mean_token_accuracy": 0.7010684039443731, | |
| "num_tokens": 31876008.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 1.130651018768549, | |
| "epoch": 2.624, | |
| "grad_norm": 19.375, | |
| "learning_rate": 1.0081481481481484e-05, | |
| "loss": 18.1415, | |
| "mean_token_accuracy": 0.7068800464272499, | |
| "num_tokens": 32069617.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 1.1576575651764869, | |
| "epoch": 2.64, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.0007407407407407e-05, | |
| "loss": 18.7397, | |
| "mean_token_accuracy": 0.700667466595769, | |
| "num_tokens": 32262932.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.1208720214664936, | |
| "epoch": 2.656, | |
| "grad_norm": 18.625, | |
| "learning_rate": 9.933333333333334e-06, | |
| "loss": 17.9602, | |
| "mean_token_accuracy": 0.7097393788397313, | |
| "num_tokens": 32465495.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 1.1169210582971574, | |
| "epoch": 2.672, | |
| "grad_norm": 18.5, | |
| "learning_rate": 9.85925925925926e-06, | |
| "loss": 18.0691, | |
| "mean_token_accuracy": 0.7105248533189297, | |
| "num_tokens": 32665458.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 1.1241317071020602, | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 18.25, | |
| "learning_rate": 9.785185185185187e-06, | |
| "loss": 18.1474, | |
| "mean_token_accuracy": 0.7082856688648462, | |
| "num_tokens": 32867135.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 1.1339986488223075, | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 18.75, | |
| "learning_rate": 9.711111111111111e-06, | |
| "loss": 18.2237, | |
| "mean_token_accuracy": 0.7075312845408916, | |
| "num_tokens": 33070366.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 1.1257793482393026, | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 21.0, | |
| "learning_rate": 9.637037037037038e-06, | |
| "loss": 18.1537, | |
| "mean_token_accuracy": 0.707587756216526, | |
| "num_tokens": 33269122.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_biology_entropy": 1.1073571362495422, | |
| "eval_biology_loss": 1.161886215209961, | |
| "eval_biology_mean_token_accuracy": 0.7041741323471069, | |
| "eval_biology_num_tokens": 33269122.0, | |
| "eval_biology_runtime": 48.635, | |
| "eval_biology_samples_per_second": 10.281, | |
| "eval_biology_steps_per_second": 2.57, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_chemistry_entropy": 0.8408446173667907, | |
| "eval_chemistry_loss": 0.8714523911476135, | |
| "eval_chemistry_mean_token_accuracy": 0.7681687192916871, | |
| "eval_chemistry_num_tokens": 33269122.0, | |
| "eval_chemistry_runtime": 60.3526, | |
| "eval_chemistry_samples_per_second": 8.285, | |
| "eval_chemistry_steps_per_second": 2.071, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_math_entropy": 0.74438600897789, | |
| "eval_math_loss": 0.9744483828544617, | |
| "eval_math_mean_token_accuracy": 0.7620029444694519, | |
| "eval_math_num_tokens": 33269122.0, | |
| "eval_math_runtime": 61.8511, | |
| "eval_math_samples_per_second": 8.084, | |
| "eval_math_steps_per_second": 2.021, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_physics_entropy": 0.8209939393997192, | |
| "eval_physics_loss": 0.8775029182434082, | |
| "eval_physics_mean_token_accuracy": 0.7741135511398315, | |
| "eval_physics_num_tokens": 33269122.0, | |
| "eval_physics_runtime": 70.4692, | |
| "eval_physics_samples_per_second": 7.095, | |
| "eval_physics_steps_per_second": 1.774, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.1649049088358878, | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 18.0, | |
| "learning_rate": 9.562962962962965e-06, | |
| "loss": 18.7293, | |
| "mean_token_accuracy": 0.7025999147444963, | |
| "num_tokens": 33456592.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 1.15311808437109, | |
| "epoch": 2.752, | |
| "grad_norm": 19.25, | |
| "learning_rate": 9.48888888888889e-06, | |
| "loss": 18.4325, | |
| "mean_token_accuracy": 0.7036401994526387, | |
| "num_tokens": 33645862.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 1.1674226205796003, | |
| "epoch": 2.768, | |
| "grad_norm": 18.75, | |
| "learning_rate": 9.414814814814816e-06, | |
| "loss": 18.8481, | |
| "mean_token_accuracy": 0.699421489983797, | |
| "num_tokens": 33835740.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 1.1250699553638697, | |
| "epoch": 2.784, | |
| "grad_norm": 18.0, | |
| "learning_rate": 9.34074074074074e-06, | |
| "loss": 18.1108, | |
| "mean_token_accuracy": 0.7099443785846233, | |
| "num_tokens": 34034702.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 1.1213359594345094, | |
| "epoch": 2.8, | |
| "grad_norm": 17.875, | |
| "learning_rate": 9.266666666666667e-06, | |
| "loss": 18.1123, | |
| "mean_token_accuracy": 0.7095074690878391, | |
| "num_tokens": 34235603.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.1317555967718362, | |
| "epoch": 2.816, | |
| "grad_norm": 19.625, | |
| "learning_rate": 9.192592592592594e-06, | |
| "loss": 18.2156, | |
| "mean_token_accuracy": 0.7085574407130479, | |
| "num_tokens": 34434833.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 1.1643431086093188, | |
| "epoch": 2.832, | |
| "grad_norm": 19.875, | |
| "learning_rate": 9.118518518518518e-06, | |
| "loss": 18.7866, | |
| "mean_token_accuracy": 0.702312757447362, | |
| "num_tokens": 34625581.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 1.1494974169880152, | |
| "epoch": 2.848, | |
| "grad_norm": 20.375, | |
| "learning_rate": 9.044444444444445e-06, | |
| "loss": 18.5235, | |
| "mean_token_accuracy": 0.705839891731739, | |
| "num_tokens": 34816521.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 1.1289043568074704, | |
| "epoch": 2.864, | |
| "grad_norm": 19.125, | |
| "learning_rate": 8.970370370370372e-06, | |
| "loss": 18.2474, | |
| "mean_token_accuracy": 0.7077045034617185, | |
| "num_tokens": 35013631.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 1.1608365170657635, | |
| "epoch": 2.88, | |
| "grad_norm": 20.75, | |
| "learning_rate": 8.896296296296298e-06, | |
| "loss": 18.651, | |
| "mean_token_accuracy": 0.7021035224199295, | |
| "num_tokens": 35207800.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_biology_entropy": 1.1227384514808654, | |
| "eval_biology_loss": 1.1606189012527466, | |
| "eval_biology_mean_token_accuracy": 0.7041417050361634, | |
| "eval_biology_num_tokens": 35207800.0, | |
| "eval_biology_runtime": 48.5521, | |
| "eval_biology_samples_per_second": 10.298, | |
| "eval_biology_steps_per_second": 2.575, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_chemistry_entropy": 0.8532550582885742, | |
| "eval_chemistry_loss": 0.8718345761299133, | |
| "eval_chemistry_mean_token_accuracy": 0.7678817505836487, | |
| "eval_chemistry_num_tokens": 35207800.0, | |
| "eval_chemistry_runtime": 60.2048, | |
| "eval_chemistry_samples_per_second": 8.305, | |
| "eval_chemistry_steps_per_second": 2.076, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_math_entropy": 0.7495931763648986, | |
| "eval_math_loss": 0.9741966724395752, | |
| "eval_math_mean_token_accuracy": 0.7619234776496887, | |
| "eval_math_num_tokens": 35207800.0, | |
| "eval_math_runtime": 61.7637, | |
| "eval_math_samples_per_second": 8.095, | |
| "eval_math_steps_per_second": 2.024, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_physics_entropy": 0.8320713305473327, | |
| "eval_physics_loss": 0.8780032992362976, | |
| "eval_physics_mean_token_accuracy": 0.7738002591133117, | |
| "eval_physics_num_tokens": 35207800.0, | |
| "eval_physics_runtime": 70.3744, | |
| "eval_physics_samples_per_second": 7.105, | |
| "eval_physics_steps_per_second": 1.776, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.1618830259889363, | |
| "epoch": 2.896, | |
| "grad_norm": 19.0, | |
| "learning_rate": 8.822222222222223e-06, | |
| "loss": 18.4927, | |
| "mean_token_accuracy": 0.7027845978736877, | |
| "num_tokens": 35402995.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 1.153843991830945, | |
| "epoch": 2.912, | |
| "grad_norm": 19.375, | |
| "learning_rate": 8.74814814814815e-06, | |
| "loss": 18.7586, | |
| "mean_token_accuracy": 0.7019136741757392, | |
| "num_tokens": 35595456.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 1.1577350933104753, | |
| "epoch": 2.928, | |
| "grad_norm": 19.5, | |
| "learning_rate": 8.674074074074074e-06, | |
| "loss": 18.625, | |
| "mean_token_accuracy": 0.703600461781025, | |
| "num_tokens": 35787309.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 1.140915045887232, | |
| "epoch": 2.944, | |
| "grad_norm": 17.875, | |
| "learning_rate": 8.6e-06, | |
| "loss": 18.3324, | |
| "mean_token_accuracy": 0.7066537465900182, | |
| "num_tokens": 35982686.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 1.1303145423531533, | |
| "epoch": 2.96, | |
| "grad_norm": 19.5, | |
| "learning_rate": 8.525925925925927e-06, | |
| "loss": 18.2798, | |
| "mean_token_accuracy": 0.7072657477110624, | |
| "num_tokens": 36179520.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.1355241533368825, | |
| "epoch": 2.976, | |
| "grad_norm": 18.625, | |
| "learning_rate": 8.451851851851852e-06, | |
| "loss": 18.3476, | |
| "mean_token_accuracy": 0.7064431738108397, | |
| "num_tokens": 36373689.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 1.1398762241005898, | |
| "epoch": 2.992, | |
| "grad_norm": 22.375, | |
| "learning_rate": 8.377777777777779e-06, | |
| "loss": 18.4292, | |
| "mean_token_accuracy": 0.7060572128742933, | |
| "num_tokens": 36569612.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 1.1687027130275964, | |
| "epoch": 3.008, | |
| "grad_norm": 19.875, | |
| "learning_rate": 8.303703703703705e-06, | |
| "loss": 18.6648, | |
| "mean_token_accuracy": 0.7008682768791914, | |
| "num_tokens": 36759194.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 1.1632583592087031, | |
| "epoch": 3.024, | |
| "grad_norm": 18.625, | |
| "learning_rate": 8.229629629629632e-06, | |
| "loss": 18.7022, | |
| "mean_token_accuracy": 0.7013984300196171, | |
| "num_tokens": 36947293.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 1.1147421635687351, | |
| "epoch": 3.04, | |
| "grad_norm": 19.75, | |
| "learning_rate": 8.155555555555556e-06, | |
| "loss": 18.0582, | |
| "mean_token_accuracy": 0.7098672483116388, | |
| "num_tokens": 37136901.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_biology_entropy": 1.1026909346580505, | |
| "eval_biology_loss": 1.1605921983718872, | |
| "eval_biology_mean_token_accuracy": 0.7041417622566223, | |
| "eval_biology_num_tokens": 37136901.0, | |
| "eval_biology_runtime": 48.5111, | |
| "eval_biology_samples_per_second": 10.307, | |
| "eval_biology_steps_per_second": 2.577, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_chemistry_entropy": 0.835658191204071, | |
| "eval_chemistry_loss": 0.8722280859947205, | |
| "eval_chemistry_mean_token_accuracy": 0.7677701048851013, | |
| "eval_chemistry_num_tokens": 37136901.0, | |
| "eval_chemistry_runtime": 60.1707, | |
| "eval_chemistry_samples_per_second": 8.31, | |
| "eval_chemistry_steps_per_second": 2.077, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_math_entropy": 0.7388259909152984, | |
| "eval_math_loss": 0.9761671423912048, | |
| "eval_math_mean_token_accuracy": 0.7617930121421814, | |
| "eval_math_num_tokens": 37136901.0, | |
| "eval_math_runtime": 61.9396, | |
| "eval_math_samples_per_second": 8.072, | |
| "eval_math_steps_per_second": 2.018, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_physics_entropy": 0.8157856950759887, | |
| "eval_physics_loss": 0.8783439993858337, | |
| "eval_physics_mean_token_accuracy": 0.7741703715324402, | |
| "eval_physics_num_tokens": 37136901.0, | |
| "eval_physics_runtime": 70.3241, | |
| "eval_physics_samples_per_second": 7.11, | |
| "eval_physics_steps_per_second": 1.777, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.1322670388966798, | |
| "epoch": 3.056, | |
| "grad_norm": 18.875, | |
| "learning_rate": 8.081481481481483e-06, | |
| "loss": 18.1169, | |
| "mean_token_accuracy": 0.7071005918085576, | |
| "num_tokens": 37331588.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 1.1061217069625855, | |
| "epoch": 3.072, | |
| "grad_norm": 19.875, | |
| "learning_rate": 8.007407407407408e-06, | |
| "loss": 17.7951, | |
| "mean_token_accuracy": 0.71182203553617, | |
| "num_tokens": 37521211.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 1.1269909385591745, | |
| "epoch": 3.088, | |
| "grad_norm": 19.125, | |
| "learning_rate": 7.933333333333334e-06, | |
| "loss": 18.2156, | |
| "mean_token_accuracy": 0.7076956331729889, | |
| "num_tokens": 37713328.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 1.1100518554449081, | |
| "epoch": 3.104, | |
| "grad_norm": 18.625, | |
| "learning_rate": 7.859259259259259e-06, | |
| "loss": 17.7995, | |
| "mean_token_accuracy": 0.7128304049372673, | |
| "num_tokens": 37913470.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 1.1166743770241738, | |
| "epoch": 3.12, | |
| "grad_norm": 18.5, | |
| "learning_rate": 7.785185185185185e-06, | |
| "loss": 18.061, | |
| "mean_token_accuracy": 0.7102392159402371, | |
| "num_tokens": 38106718.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.1333389516919852, | |
| "epoch": 3.136, | |
| "grad_norm": 20.625, | |
| "learning_rate": 7.711111111111112e-06, | |
| "loss": 18.1058, | |
| "mean_token_accuracy": 0.7095236502587795, | |
| "num_tokens": 38295701.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 1.1190204188227653, | |
| "epoch": 3.152, | |
| "grad_norm": 19.25, | |
| "learning_rate": 7.637037037037037e-06, | |
| "loss": 17.9389, | |
| "mean_token_accuracy": 0.7105425789952278, | |
| "num_tokens": 38493951.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 1.146292532607913, | |
| "epoch": 3.168, | |
| "grad_norm": 20.125, | |
| "learning_rate": 7.562962962962963e-06, | |
| "loss": 18.4401, | |
| "mean_token_accuracy": 0.7044953163713217, | |
| "num_tokens": 38688390.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 1.1336342521011828, | |
| "epoch": 3.184, | |
| "grad_norm": 17.875, | |
| "learning_rate": 7.48888888888889e-06, | |
| "loss": 18.2461, | |
| "mean_token_accuracy": 0.7075372830033302, | |
| "num_tokens": 38880157.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 1.110951554775238, | |
| "epoch": 3.2, | |
| "grad_norm": 18.5, | |
| "learning_rate": 7.4148148148148155e-06, | |
| "loss": 18.0452, | |
| "mean_token_accuracy": 0.7110834132879973, | |
| "num_tokens": 39081782.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_biology_entropy": 1.0987086420059204, | |
| "eval_biology_loss": 1.1604441404342651, | |
| "eval_biology_mean_token_accuracy": 0.7043456192016602, | |
| "eval_biology_num_tokens": 39081782.0, | |
| "eval_biology_runtime": 48.5741, | |
| "eval_biology_samples_per_second": 10.294, | |
| "eval_biology_steps_per_second": 2.573, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_chemistry_entropy": 0.8334956364631653, | |
| "eval_chemistry_loss": 0.8719582557678223, | |
| "eval_chemistry_mean_token_accuracy": 0.7679322199821472, | |
| "eval_chemistry_num_tokens": 39081782.0, | |
| "eval_chemistry_runtime": 60.6063, | |
| "eval_chemistry_samples_per_second": 8.25, | |
| "eval_chemistry_steps_per_second": 2.062, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_math_entropy": 0.7380490090847015, | |
| "eval_math_loss": 0.9763364195823669, | |
| "eval_math_mean_token_accuracy": 0.7618815884590149, | |
| "eval_math_num_tokens": 39081782.0, | |
| "eval_math_runtime": 62.6076, | |
| "eval_math_samples_per_second": 7.986, | |
| "eval_math_steps_per_second": 1.997, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_physics_entropy": 0.814041579246521, | |
| "eval_physics_loss": 0.8782714605331421, | |
| "eval_physics_mean_token_accuracy": 0.7738414001464844, | |
| "eval_physics_num_tokens": 39081782.0, | |
| "eval_physics_runtime": 71.4094, | |
| "eval_physics_samples_per_second": 7.002, | |
| "eval_physics_steps_per_second": 1.75, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.4082567723215933e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |