Instructions to use roonbug/2b63aec8 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use roonbug/2b63aec8 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="roonbug/2b63aec8")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```
```python
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("roonbug/2b63aec8")
model = AutoModelForImageTextToText.from_pretrained("roonbug/2b63aec8")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use roonbug/2b63aec8 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "roonbug/2b63aec8"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/2b63aec8",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "Describe this image in one sentence."
          },
          {
            "type": "image_url",
            "image_url": {
              "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
            }
          }
        ]
      }
    ]
  }'
```
Use Docker
docker model run hf.co/roonbug/2b63aec8
- SGLang
How to use roonbug/2b63aec8 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "roonbug/2b63aec8" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/2b63aec8",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "Describe this image in one sentence."
          },
          {
            "type": "image_url",
            "image_url": {
              "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
            }
          }
        ]
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "roonbug/2b63aec8" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/2b63aec8",
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": "Describe this image in one sentence."
          },
          {
            "type": "image_url",
            "image_url": {
              "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
            }
          }
        ]
      }
    ]
  }'
```
- Docker Model Runner
How to use roonbug/2b63aec8 with Docker Model Runner:
docker model run hf.co/roonbug/2b63aec8
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.6, | |
| "eval_steps": 100, | |
| "global_step": 1000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.142920307815075, | |
| "epoch": 0.016, | |
| "grad_norm": 290.0, | |
| "learning_rate": 6.000000000000001e-07, | |
| "loss": 42.6658, | |
| "mean_token_accuracy": 0.5620782226324081, | |
| "num_tokens": 195524.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.148210159689188, | |
| "epoch": 0.032, | |
| "grad_norm": 239.0, | |
| "learning_rate": 1.2666666666666669e-06, | |
| "loss": 41.9984, | |
| "mean_token_accuracy": 0.5613080382347106, | |
| "num_tokens": 390903.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.1933260083198547, | |
| "epoch": 0.048, | |
| "grad_norm": 249.0, | |
| "learning_rate": 1.9333333333333336e-06, | |
| "loss": 40.6208, | |
| "mean_token_accuracy": 0.5657517908141017, | |
| "num_tokens": 589868.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.2957281917333603, | |
| "epoch": 0.064, | |
| "grad_norm": 139.0, | |
| "learning_rate": 2.6e-06, | |
| "loss": 37.9032, | |
| "mean_token_accuracy": 0.5714796105399728, | |
| "num_tokens": 791190.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.5075685508549213, | |
| "epoch": 0.08, | |
| "grad_norm": 94.0, | |
| "learning_rate": 3.266666666666667e-06, | |
| "loss": 35.7561, | |
| "mean_token_accuracy": 0.5766569443047047, | |
| "num_tokens": 989860.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.7984249681234359, | |
| "epoch": 0.096, | |
| "grad_norm": 50.75, | |
| "learning_rate": 3.9333333333333335e-06, | |
| "loss": 33.4379, | |
| "mean_token_accuracy": 0.5814697606489062, | |
| "num_tokens": 1181777.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.8387351341545581, | |
| "epoch": 0.112, | |
| "grad_norm": 43.0, | |
| "learning_rate": 4.600000000000001e-06, | |
| "loss": 30.4219, | |
| "mean_token_accuracy": 0.5971228444948793, | |
| "num_tokens": 1385513.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.7275233700871468, | |
| "epoch": 0.128, | |
| "grad_norm": 33.5, | |
| "learning_rate": 5.2666666666666665e-06, | |
| "loss": 28.4703, | |
| "mean_token_accuracy": 0.6095364252105355, | |
| "num_tokens": 1582368.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.7214979872107505, | |
| "epoch": 0.144, | |
| "grad_norm": 27.0, | |
| "learning_rate": 5.933333333333335e-06, | |
| "loss": 26.677, | |
| "mean_token_accuracy": 0.6243448719382286, | |
| "num_tokens": 1773764.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.6311134904623033, | |
| "epoch": 0.16, | |
| "grad_norm": 22.0, | |
| "learning_rate": 6.600000000000001e-06, | |
| "loss": 25.7683, | |
| "mean_token_accuracy": 0.6301404371857643, | |
| "num_tokens": 1970077.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_biology_entropy": 1.5580159120559693, | |
| "eval_biology_loss": 1.5081593990325928, | |
| "eval_biology_mean_token_accuracy": 0.6457349667549134, | |
| "eval_biology_num_tokens": 1970077.0, | |
| "eval_biology_runtime": 48.7413, | |
| "eval_biology_samples_per_second": 10.258, | |
| "eval_biology_steps_per_second": 2.565, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_chemistry_entropy": 1.206756212234497, | |
| "eval_chemistry_loss": 1.1218774318695068, | |
| "eval_chemistry_mean_token_accuracy": 0.7205783066749573, | |
| "eval_chemistry_num_tokens": 1970077.0, | |
| "eval_chemistry_runtime": 60.3159, | |
| "eval_chemistry_samples_per_second": 8.29, | |
| "eval_chemistry_steps_per_second": 2.072, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_math_entropy": 0.9672308325767517, | |
| "eval_math_loss": 1.159799337387085, | |
| "eval_math_mean_token_accuracy": 0.7189845342636109, | |
| "eval_math_num_tokens": 1970077.0, | |
| "eval_math_runtime": 61.8237, | |
| "eval_math_samples_per_second": 8.088, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_physics_entropy": 1.1670387201309205, | |
| "eval_physics_loss": 1.1291608810424805, | |
| "eval_physics_mean_token_accuracy": 0.7211072521209717, | |
| "eval_physics_num_tokens": 1970077.0, | |
| "eval_physics_runtime": 70.4586, | |
| "eval_physics_samples_per_second": 7.096, | |
| "eval_physics_steps_per_second": 1.774, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.5482715763151647, | |
| "epoch": 0.176, | |
| "grad_norm": 21.125, | |
| "learning_rate": 7.266666666666668e-06, | |
| "loss": 24.5868, | |
| "mean_token_accuracy": 0.6385629490017891, | |
| "num_tokens": 2168354.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.5266574397683144, | |
| "epoch": 0.192, | |
| "grad_norm": 22.875, | |
| "learning_rate": 7.933333333333334e-06, | |
| "loss": 24.2707, | |
| "mean_token_accuracy": 0.6432460084557533, | |
| "num_tokens": 2365822.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.5192069873213767, | |
| "epoch": 0.208, | |
| "grad_norm": 20.875, | |
| "learning_rate": 8.6e-06, | |
| "loss": 24.1355, | |
| "mean_token_accuracy": 0.6436416517943144, | |
| "num_tokens": 2558762.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.4698147468268872, | |
| "epoch": 0.224, | |
| "grad_norm": 20.125, | |
| "learning_rate": 9.266666666666667e-06, | |
| "loss": 23.5154, | |
| "mean_token_accuracy": 0.6499760080128908, | |
| "num_tokens": 2755347.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.4506230603903532, | |
| "epoch": 0.24, | |
| "grad_norm": 19.625, | |
| "learning_rate": 9.933333333333334e-06, | |
| "loss": 23.2013, | |
| "mean_token_accuracy": 0.6523264441639185, | |
| "num_tokens": 2947346.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.4590953961014748, | |
| "epoch": 0.256, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.0600000000000002e-05, | |
| "loss": 23.3227, | |
| "mean_token_accuracy": 0.6508617259562015, | |
| "num_tokens": 3139957.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.419396448880434, | |
| "epoch": 0.272, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.1266666666666668e-05, | |
| "loss": 22.7352, | |
| "mean_token_accuracy": 0.6572458431124687, | |
| "num_tokens": 3335951.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.4005608204752207, | |
| "epoch": 0.288, | |
| "grad_norm": 19.75, | |
| "learning_rate": 1.1933333333333335e-05, | |
| "loss": 22.3969, | |
| "mean_token_accuracy": 0.6585959013551473, | |
| "num_tokens": 3539731.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.391934547200799, | |
| "epoch": 0.304, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.2600000000000001e-05, | |
| "loss": 22.31, | |
| "mean_token_accuracy": 0.6621056370437145, | |
| "num_tokens": 3733488.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.4028674490749835, | |
| "epoch": 0.32, | |
| "grad_norm": 22.25, | |
| "learning_rate": 1.3266666666666668e-05, | |
| "loss": 22.5559, | |
| "mean_token_accuracy": 0.6576981086283922, | |
| "num_tokens": 3920545.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_biology_entropy": 1.3209806289672852, | |
| "eval_biology_loss": 1.338399887084961, | |
| "eval_biology_mean_token_accuracy": 0.6720403518676757, | |
| "eval_biology_num_tokens": 3920545.0, | |
| "eval_biology_runtime": 48.5853, | |
| "eval_biology_samples_per_second": 10.291, | |
| "eval_biology_steps_per_second": 2.573, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_chemistry_entropy": 1.0033348879814148, | |
| "eval_chemistry_loss": 0.9935092926025391, | |
| "eval_chemistry_mean_token_accuracy": 0.7448974308967591, | |
| "eval_chemistry_num_tokens": 3920545.0, | |
| "eval_chemistry_runtime": 60.24, | |
| "eval_chemistry_samples_per_second": 8.3, | |
| "eval_chemistry_steps_per_second": 2.075, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_math_entropy": 0.8341804294586181, | |
| "eval_math_loss": 1.0635857582092285, | |
| "eval_math_mean_token_accuracy": 0.7432106451988221, | |
| "eval_math_num_tokens": 3920545.0, | |
| "eval_math_runtime": 61.8174, | |
| "eval_math_samples_per_second": 8.088, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_physics_entropy": 0.9652358031272888, | |
| "eval_physics_loss": 0.9950281977653503, | |
| "eval_physics_mean_token_accuracy": 0.7510108857154846, | |
| "eval_physics_num_tokens": 3920545.0, | |
| "eval_physics_runtime": 70.411, | |
| "eval_physics_samples_per_second": 7.101, | |
| "eval_physics_steps_per_second": 1.775, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.3548175282776356, | |
| "epoch": 0.336, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.3933333333333334e-05, | |
| "loss": 21.7763, | |
| "mean_token_accuracy": 0.6656343434005976, | |
| "num_tokens": 4114077.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.3656601022928954, | |
| "epoch": 0.352, | |
| "grad_norm": 20.625, | |
| "learning_rate": 1.46e-05, | |
| "loss": 22.0972, | |
| "mean_token_accuracy": 0.6638848338276148, | |
| "num_tokens": 4306949.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.3525194190442562, | |
| "epoch": 0.368, | |
| "grad_norm": 18.125, | |
| "learning_rate": 1.5266666666666667e-05, | |
| "loss": 21.7293, | |
| "mean_token_accuracy": 0.6680811226367951, | |
| "num_tokens": 4504001.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.3454820621758699, | |
| "epoch": 0.384, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.5933333333333336e-05, | |
| "loss": 21.7032, | |
| "mean_token_accuracy": 0.6671383358538151, | |
| "num_tokens": 4693812.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.3525703553110362, | |
| "epoch": 0.4, | |
| "grad_norm": 17.5, | |
| "learning_rate": 1.66e-05, | |
| "loss": 21.7856, | |
| "mean_token_accuracy": 0.666401931643486, | |
| "num_tokens": 4887094.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.351718918606639, | |
| "epoch": 0.416, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.726666666666667e-05, | |
| "loss": 21.9058, | |
| "mean_token_accuracy": 0.6651136819273233, | |
| "num_tokens": 5085369.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.3526419658213853, | |
| "epoch": 0.432, | |
| "grad_norm": 20.875, | |
| "learning_rate": 1.7933333333333333e-05, | |
| "loss": 21.7813, | |
| "mean_token_accuracy": 0.6668458927422762, | |
| "num_tokens": 5271275.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.3480545241385697, | |
| "epoch": 0.448, | |
| "grad_norm": 22.875, | |
| "learning_rate": 1.86e-05, | |
| "loss": 21.627, | |
| "mean_token_accuracy": 0.6677324704825878, | |
| "num_tokens": 5460559.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.301166184991598, | |
| "epoch": 0.464, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.926666666666667e-05, | |
| "loss": 20.889, | |
| "mean_token_accuracy": 0.676617132872343, | |
| "num_tokens": 5653809.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.318466317281127, | |
| "epoch": 0.48, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.9933333333333334e-05, | |
| "loss": 21.2936, | |
| "mean_token_accuracy": 0.6712827417999506, | |
| "num_tokens": 5850176.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_biology_entropy": 1.2827796216011047, | |
| "eval_biology_loss": 1.275201678276062, | |
| "eval_biology_mean_token_accuracy": 0.6830832781791687, | |
| "eval_biology_num_tokens": 5850176.0, | |
| "eval_biology_runtime": 48.4915, | |
| "eval_biology_samples_per_second": 10.311, | |
| "eval_biology_steps_per_second": 2.578, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_chemistry_entropy": 0.983495129108429, | |
| "eval_chemistry_loss": 0.9488818645477295, | |
| "eval_chemistry_mean_token_accuracy": 0.7523409638404847, | |
| "eval_chemistry_num_tokens": 5850176.0, | |
| "eval_chemistry_runtime": 60.1707, | |
| "eval_chemistry_samples_per_second": 8.31, | |
| "eval_chemistry_steps_per_second": 2.077, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_math_entropy": 0.8216862387657166, | |
| "eval_math_loss": 1.0297818183898926, | |
| "eval_math_mean_token_accuracy": 0.7488151121139527, | |
| "eval_math_num_tokens": 5850176.0, | |
| "eval_math_runtime": 61.6905, | |
| "eval_math_samples_per_second": 8.105, | |
| "eval_math_steps_per_second": 2.026, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_physics_entropy": 0.9433758721351624, | |
| "eval_physics_loss": 0.9520999193191528, | |
| "eval_physics_mean_token_accuracy": 0.7585058889389038, | |
| "eval_physics_num_tokens": 5850176.0, | |
| "eval_physics_runtime": 70.301, | |
| "eval_physics_samples_per_second": 7.112, | |
| "eval_physics_steps_per_second": 1.778, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.2579400472342968, | |
| "epoch": 0.496, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.9933333333333334e-05, | |
| "loss": 20.2011, | |
| "mean_token_accuracy": 0.6842056062072516, | |
| "num_tokens": 6046503.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.3082518883049488, | |
| "epoch": 0.512, | |
| "grad_norm": 18.125, | |
| "learning_rate": 1.985925925925926e-05, | |
| "loss": 21.0658, | |
| "mean_token_accuracy": 0.6749501373618841, | |
| "num_tokens": 6240456.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.3003981616348028, | |
| "epoch": 0.528, | |
| "grad_norm": 18.125, | |
| "learning_rate": 1.9785185185185187e-05, | |
| "loss": 20.9809, | |
| "mean_token_accuracy": 0.6757604543119669, | |
| "num_tokens": 6430555.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.2986273631453513, | |
| "epoch": 0.544, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.971111111111111e-05, | |
| "loss": 20.8809, | |
| "mean_token_accuracy": 0.6782271713018417, | |
| "num_tokens": 6626006.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.284830729290843, | |
| "epoch": 0.56, | |
| "grad_norm": 17.25, | |
| "learning_rate": 1.963703703703704e-05, | |
| "loss": 20.8197, | |
| "mean_token_accuracy": 0.6767117112874985, | |
| "num_tokens": 6820754.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.2683125745505095, | |
| "epoch": 0.576, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.9562962962962964e-05, | |
| "loss": 20.4541, | |
| "mean_token_accuracy": 0.6809794403612613, | |
| "num_tokens": 7021844.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.2863252360373736, | |
| "epoch": 0.592, | |
| "grad_norm": 18.875, | |
| "learning_rate": 1.948888888888889e-05, | |
| "loss": 20.8043, | |
| "mean_token_accuracy": 0.676701345667243, | |
| "num_tokens": 7213951.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.2630502216517925, | |
| "epoch": 0.608, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.9414814814814817e-05, | |
| "loss": 20.4041, | |
| "mean_token_accuracy": 0.6803740747272968, | |
| "num_tokens": 7416773.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.2804703898727894, | |
| "epoch": 0.624, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.9340740740740743e-05, | |
| "loss": 20.6218, | |
| "mean_token_accuracy": 0.6788272958248853, | |
| "num_tokens": 7612843.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.2843346055597067, | |
| "epoch": 0.64, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.926666666666667e-05, | |
| "loss": 20.7171, | |
| "mean_token_accuracy": 0.6782444745302201, | |
| "num_tokens": 7801633.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_biology_entropy": 1.226506398677826, | |
| "eval_biology_loss": 1.2382104396820068, | |
| "eval_biology_mean_token_accuracy": 0.6894095778465271, | |
| "eval_biology_num_tokens": 7801633.0, | |
| "eval_biology_runtime": 48.5507, | |
| "eval_biology_samples_per_second": 10.299, | |
| "eval_biology_steps_per_second": 2.575, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_chemistry_entropy": 0.9317227191925049, | |
| "eval_chemistry_loss": 0.9207452535629272, | |
| "eval_chemistry_mean_token_accuracy": 0.7581370029449462, | |
| "eval_chemistry_num_tokens": 7801633.0, | |
| "eval_chemistry_runtime": 60.2113, | |
| "eval_chemistry_samples_per_second": 8.304, | |
| "eval_chemistry_steps_per_second": 2.076, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_math_entropy": 0.7863595089912414, | |
| "eval_math_loss": 1.010460376739502, | |
| "eval_math_mean_token_accuracy": 0.7535392093658447, | |
| "eval_math_num_tokens": 7801633.0, | |
| "eval_math_runtime": 61.807, | |
| "eval_math_samples_per_second": 8.09, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_physics_entropy": 0.8958085932731629, | |
| "eval_physics_loss": 0.9257401823997498, | |
| "eval_physics_mean_token_accuracy": 0.7637984156608582, | |
| "eval_physics_num_tokens": 7801633.0, | |
| "eval_physics_runtime": 70.3663, | |
| "eval_physics_samples_per_second": 7.106, | |
| "eval_physics_steps_per_second": 1.776, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.278659427165985, | |
| "epoch": 0.656, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.9192592592592593e-05, | |
| "loss": 20.6682, | |
| "mean_token_accuracy": 0.6772829819470644, | |
| "num_tokens": 7995843.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.2931427203118802, | |
| "epoch": 0.672, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1.911851851851852e-05, | |
| "loss": 20.8656, | |
| "mean_token_accuracy": 0.6753748003393412, | |
| "num_tokens": 8183103.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.2739692747592926, | |
| "epoch": 0.688, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.9044444444444446e-05, | |
| "loss": 20.5407, | |
| "mean_token_accuracy": 0.6812681049108505, | |
| "num_tokens": 8385976.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.2659825466573238, | |
| "epoch": 0.704, | |
| "grad_norm": 16.25, | |
| "learning_rate": 1.8970370370370372e-05, | |
| "loss": 20.4243, | |
| "mean_token_accuracy": 0.6820976916700602, | |
| "num_tokens": 8578431.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.220404140278697, | |
| "epoch": 0.72, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.8896296296296295e-05, | |
| "loss": 19.6546, | |
| "mean_token_accuracy": 0.6908745598047972, | |
| "num_tokens": 8781342.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.2406103231012822, | |
| "epoch": 0.736, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.8822222222222225e-05, | |
| "loss": 19.9745, | |
| "mean_token_accuracy": 0.6853331789374352, | |
| "num_tokens": 8977918.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.2618801843374967, | |
| "epoch": 0.752, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.874814814814815e-05, | |
| "loss": 20.4041, | |
| "mean_token_accuracy": 0.6825968738645315, | |
| "num_tokens": 9169322.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.2232345014810562, | |
| "epoch": 0.768, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.8674074074074075e-05, | |
| "loss": 19.7045, | |
| "mean_token_accuracy": 0.6888250291347504, | |
| "num_tokens": 9368141.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.25159954726696, | |
| "epoch": 0.784, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.86e-05, | |
| "loss": 20.2036, | |
| "mean_token_accuracy": 0.6849453710019588, | |
| "num_tokens": 9565236.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.264250884205103, | |
| "epoch": 0.8, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.8525925925925928e-05, | |
| "loss": 20.5299, | |
| "mean_token_accuracy": 0.6811827480792999, | |
| "num_tokens": 9761227.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_biology_entropy": 1.2163097896575927, | |
| "eval_biology_loss": 1.2177292108535767, | |
| "eval_biology_mean_token_accuracy": 0.6932459664344788, | |
| "eval_biology_num_tokens": 9761227.0, | |
| "eval_biology_runtime": 48.5438, | |
| "eval_biology_samples_per_second": 10.3, | |
| "eval_biology_steps_per_second": 2.575, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_chemistry_entropy": 0.9239063205718994, | |
| "eval_chemistry_loss": 0.9047155380249023, | |
| "eval_chemistry_mean_token_accuracy": 0.761792631149292, | |
| "eval_chemistry_num_tokens": 9761227.0, | |
| "eval_chemistry_runtime": 59.9546, | |
| "eval_chemistry_samples_per_second": 8.34, | |
| "eval_chemistry_steps_per_second": 2.085, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_math_entropy": 0.7864464523792267, | |
| "eval_math_loss": 0.9939978122711182, | |
| "eval_math_mean_token_accuracy": 0.7574145245552063, | |
| "eval_math_num_tokens": 9761227.0, | |
| "eval_math_runtime": 61.7812, | |
| "eval_math_samples_per_second": 8.093, | |
| "eval_math_steps_per_second": 2.023, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_physics_entropy": 0.889360978603363, | |
| "eval_physics_loss": 0.9096766710281372, | |
| "eval_physics_mean_token_accuracy": 0.7674052910804748, | |
| "eval_physics_num_tokens": 9761227.0, | |
| "eval_physics_runtime": 70.5356, | |
| "eval_physics_samples_per_second": 7.089, | |
| "eval_physics_steps_per_second": 1.772, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.2362793002277612, | |
| "epoch": 0.816, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.8451851851851855e-05, | |
| "loss": 19.8133, | |
| "mean_token_accuracy": 0.6863504596054554, | |
| "num_tokens": 9958727.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.2254926670342683, | |
| "epoch": 0.832, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.8377777777777778e-05, | |
| "loss": 19.8307, | |
| "mean_token_accuracy": 0.6866675779223442, | |
| "num_tokens": 10155771.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.2238412775099277, | |
| "epoch": 0.848, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.8303703703703704e-05, | |
| "loss": 19.687, | |
| "mean_token_accuracy": 0.6897137116640806, | |
| "num_tokens": 10357721.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.2536957442760468, | |
| "epoch": 0.864, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.822962962962963e-05, | |
| "loss": 20.1565, | |
| "mean_token_accuracy": 0.6850291140377521, | |
| "num_tokens": 10552495.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.231699001789093, | |
| "epoch": 0.88, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.8155555555555557e-05, | |
| "loss": 19.8536, | |
| "mean_token_accuracy": 0.6891282081604004, | |
| "num_tokens": 10748749.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.2470501396805047, | |
| "epoch": 0.896, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.8081481481481484e-05, | |
| "loss": 20.1706, | |
| "mean_token_accuracy": 0.6856059569865465, | |
| "num_tokens": 10943319.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.2307742841541767, | |
| "epoch": 0.912, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.800740740740741e-05, | |
| "loss": 19.9062, | |
| "mean_token_accuracy": 0.6885740786790848, | |
| "num_tokens": 11136935.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.2445739306509496, | |
| "epoch": 0.928, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.7933333333333333e-05, | |
| "loss": 20.0979, | |
| "mean_token_accuracy": 0.6851089850068093, | |
| "num_tokens": 11331098.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.2021468229591847, | |
| "epoch": 0.944, | |
| "grad_norm": 15.875, | |
| "learning_rate": 1.785925925925926e-05, | |
| "loss": 19.4077, | |
| "mean_token_accuracy": 0.6915138956159353, | |
| "num_tokens": 11530550.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.226809823140502, | |
| "epoch": 0.96, | |
| "grad_norm": 19.625, | |
| "learning_rate": 1.7785185185185186e-05, | |
| "loss": 19.8062, | |
| "mean_token_accuracy": 0.6897286407649517, | |
| "num_tokens": 11729645.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_biology_entropy": 1.1845180039405823, | |
| "eval_biology_loss": 1.203829050064087, | |
| "eval_biology_mean_token_accuracy": 0.6961685500144958, | |
| "eval_biology_num_tokens": 11729645.0, | |
| "eval_biology_runtime": 48.6169, | |
| "eval_biology_samples_per_second": 10.284, | |
| "eval_biology_steps_per_second": 2.571, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_chemistry_entropy": 0.90015394115448, | |
| "eval_chemistry_loss": 0.8946329355239868, | |
| "eval_chemistry_mean_token_accuracy": 0.7635614371299744, | |
| "eval_chemistry_num_tokens": 11729645.0, | |
| "eval_chemistry_runtime": 60.2919, | |
| "eval_chemistry_samples_per_second": 8.293, | |
| "eval_chemistry_steps_per_second": 2.073, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_math_entropy": 0.7684455904960632, | |
| "eval_math_loss": 0.9900413751602173, | |
| "eval_math_mean_token_accuracy": 0.7588200316429138, | |
| "eval_math_num_tokens": 11729645.0, | |
| "eval_math_runtime": 61.8301, | |
| "eval_math_samples_per_second": 8.087, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_physics_entropy": 0.8686938014030456, | |
| "eval_physics_loss": 0.9008635878562927, | |
| "eval_physics_mean_token_accuracy": 0.7692377109527588, | |
| "eval_physics_num_tokens": 11729645.0, | |
| "eval_physics_runtime": 70.4349, | |
| "eval_physics_samples_per_second": 7.099, | |
| "eval_physics_steps_per_second": 1.775, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.2100978799164295, | |
| "epoch": 0.976, | |
| "grad_norm": 18.0, | |
| "learning_rate": 1.7711111111111113e-05, | |
| "loss": 19.47, | |
| "mean_token_accuracy": 0.6918121088296175, | |
| "num_tokens": 11924644.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.2226450834423304, | |
| "epoch": 0.992, | |
| "grad_norm": 16.625, | |
| "learning_rate": 1.763703703703704e-05, | |
| "loss": 19.8416, | |
| "mean_token_accuracy": 0.688375661149621, | |
| "num_tokens": 12123059.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.2316548496484756, | |
| "epoch": 1.008, | |
| "grad_norm": 16.875, | |
| "learning_rate": 1.7562962962962962e-05, | |
| "loss": 19.6116, | |
| "mean_token_accuracy": 0.6919524800032377, | |
| "num_tokens": 12319366.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.1779099617153406, | |
| "epoch": 1.024, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.7488888888888892e-05, | |
| "loss": 18.9763, | |
| "mean_token_accuracy": 0.6978646669536829, | |
| "num_tokens": 12524183.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.2152834441512823, | |
| "epoch": 1.04, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.7414814814814815e-05, | |
| "loss": 19.6247, | |
| "mean_token_accuracy": 0.6903412740677595, | |
| "num_tokens": 12718593.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.1799768891185523, | |
| "epoch": 1.056, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1.7340740740740742e-05, | |
| "loss": 19.0432, | |
| "mean_token_accuracy": 0.6986244544386864, | |
| "num_tokens": 12917803.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.2108702428638936, | |
| "epoch": 1.072, | |
| "grad_norm": 19.125, | |
| "learning_rate": 1.726666666666667e-05, | |
| "loss": 19.4166, | |
| "mean_token_accuracy": 0.6927186574786901, | |
| "num_tokens": 13105826.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.1979756511747837, | |
| "epoch": 1.088, | |
| "grad_norm": 18.875, | |
| "learning_rate": 1.7192592592592595e-05, | |
| "loss": 19.2605, | |
| "mean_token_accuracy": 0.6957505799829959, | |
| "num_tokens": 13298619.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.192365935444832, | |
| "epoch": 1.104, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.711851851851852e-05, | |
| "loss": 19.2461, | |
| "mean_token_accuracy": 0.695810866355896, | |
| "num_tokens": 13491486.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.212946466356516, | |
| "epoch": 1.12, | |
| "grad_norm": 19.25, | |
| "learning_rate": 1.7044444444444445e-05, | |
| "loss": 19.5004, | |
| "mean_token_accuracy": 0.692466252297163, | |
| "num_tokens": 13674663.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_biology_entropy": 1.1571769905090332, | |
| "eval_biology_loss": 1.1946450471878052, | |
| "eval_biology_mean_token_accuracy": 0.6972691407203674, | |
| "eval_biology_num_tokens": 13674663.0, | |
| "eval_biology_runtime": 48.6729, | |
| "eval_biology_samples_per_second": 10.273, | |
| "eval_biology_steps_per_second": 2.568, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_chemistry_entropy": 0.8766862626075744, | |
| "eval_chemistry_loss": 0.8891168236732483, | |
| "eval_chemistry_mean_token_accuracy": 0.7645404329299926, | |
| "eval_chemistry_num_tokens": 13674663.0, | |
| "eval_chemistry_runtime": 60.3334, | |
| "eval_chemistry_samples_per_second": 8.287, | |
| "eval_chemistry_steps_per_second": 2.072, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_math_entropy": 0.7603865313529968, | |
| "eval_math_loss": 0.9834137558937073, | |
| "eval_math_mean_token_accuracy": 0.7596666264533997, | |
| "eval_math_num_tokens": 13674663.0, | |
| "eval_math_runtime": 61.8146, | |
| "eval_math_samples_per_second": 8.089, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_physics_entropy": 0.850571988105774, | |
| "eval_physics_loss": 0.8937918543815613, | |
| "eval_physics_mean_token_accuracy": 0.7703958468437195, | |
| "eval_physics_num_tokens": 13674663.0, | |
| "eval_physics_runtime": 70.4674, | |
| "eval_physics_samples_per_second": 7.095, | |
| "eval_physics_steps_per_second": 1.774, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.1909121543169021, | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 18.875, | |
| "learning_rate": 1.697037037037037e-05, | |
| "loss": 19.2462, | |
| "mean_token_accuracy": 0.6955781776458025, | |
| "num_tokens": 13869134.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.1682380847632885, | |
| "epoch": 1.152, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.6896296296296298e-05, | |
| "loss": 18.8229, | |
| "mean_token_accuracy": 0.6991135813295841, | |
| "num_tokens": 14078365.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.1939557407051324, | |
| "epoch": 1.168, | |
| "grad_norm": 16.875, | |
| "learning_rate": 1.6822222222222224e-05, | |
| "loss": 19.1346, | |
| "mean_token_accuracy": 0.6960698150098323, | |
| "num_tokens": 14266831.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.180869185552001, | |
| "epoch": 1.184, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.6748148148148147e-05, | |
| "loss": 19.2654, | |
| "mean_token_accuracy": 0.6941378649324179, | |
| "num_tokens": 14465660.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.1937656667083503, | |
| "epoch": 1.2, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.6674074074074077e-05, | |
| "loss": 19.0305, | |
| "mean_token_accuracy": 0.6964295905083417, | |
| "num_tokens": 14653228.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.1589823190122843, | |
| "epoch": 1.216, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.66e-05, | |
| "loss": 18.6048, | |
| "mean_token_accuracy": 0.7018654596060514, | |
| "num_tokens": 14857782.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.1703605465590954, | |
| "epoch": 1.232, | |
| "grad_norm": 17.375, | |
| "learning_rate": 1.6525925925925927e-05, | |
| "loss": 18.8831, | |
| "mean_token_accuracy": 0.7015001580119133, | |
| "num_tokens": 15047356.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.1772115517407655, | |
| "epoch": 1.248, | |
| "grad_norm": 17.875, | |
| "learning_rate": 1.6451851851851853e-05, | |
| "loss": 19.0432, | |
| "mean_token_accuracy": 0.6959997840225697, | |
| "num_tokens": 15241098.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.196473068371415, | |
| "epoch": 1.264, | |
| "grad_norm": 16.375, | |
| "learning_rate": 1.637777777777778e-05, | |
| "loss": 19.1591, | |
| "mean_token_accuracy": 0.6967897292226553, | |
| "num_tokens": 15437657.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.2014197081327438, | |
| "epoch": 1.28, | |
| "grad_norm": 19.125, | |
| "learning_rate": 1.6303703703703706e-05, | |
| "loss": 19.4409, | |
| "mean_token_accuracy": 0.6926549930125475, | |
| "num_tokens": 15630795.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_biology_entropy": 1.134603425502777, | |
| "eval_biology_loss": 1.1884372234344482, | |
| "eval_biology_mean_token_accuracy": 0.6986491298675537, | |
| "eval_biology_num_tokens": 15630795.0, | |
| "eval_biology_runtime": 48.6306, | |
| "eval_biology_samples_per_second": 10.282, | |
| "eval_biology_steps_per_second": 2.57, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_chemistry_entropy": 0.8623910093307495, | |
| "eval_chemistry_loss": 0.885444700717926, | |
| "eval_chemistry_mean_token_accuracy": 0.7653528556823731, | |
| "eval_chemistry_num_tokens": 15630795.0, | |
| "eval_chemistry_runtime": 60.3508, | |
| "eval_chemistry_samples_per_second": 8.285, | |
| "eval_chemistry_steps_per_second": 2.071, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_math_entropy": 0.7589023416042328, | |
| "eval_math_loss": 0.983073353767395, | |
| "eval_math_mean_token_accuracy": 0.7593517408370972, | |
| "eval_math_num_tokens": 15630795.0, | |
| "eval_math_runtime": 61.9026, | |
| "eval_math_samples_per_second": 8.077, | |
| "eval_math_steps_per_second": 2.019, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_physics_entropy": 0.84130739736557, | |
| "eval_physics_loss": 0.8907755613327026, | |
| "eval_physics_mean_token_accuracy": 0.771062777519226, | |
| "eval_physics_num_tokens": 15630795.0, | |
| "eval_physics_runtime": 70.5226, | |
| "eval_physics_samples_per_second": 7.09, | |
| "eval_physics_steps_per_second": 1.772, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.2085642520338298, | |
| "epoch": 1.296, | |
| "grad_norm": 17.375, | |
| "learning_rate": 1.622962962962963e-05, | |
| "loss": 19.3831, | |
| "mean_token_accuracy": 0.6933640763163567, | |
| "num_tokens": 15827105.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.1861349143087865, | |
| "epoch": 1.312, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.6155555555555556e-05, | |
| "loss": 19.3103, | |
| "mean_token_accuracy": 0.694928414747119, | |
| "num_tokens": 16019645.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.195632776618004, | |
| "epoch": 1.328, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.6081481481481482e-05, | |
| "loss": 19.3068, | |
| "mean_token_accuracy": 0.6934712298214436, | |
| "num_tokens": 16221726.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.1725192748010158, | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.600740740740741e-05, | |
| "loss": 18.7963, | |
| "mean_token_accuracy": 0.700145885720849, | |
| "num_tokens": 16427594.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.179823150858283, | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.5933333333333336e-05, | |
| "loss": 19.1154, | |
| "mean_token_accuracy": 0.6961398232728243, | |
| "num_tokens": 16621605.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.2228495314717294, | |
| "epoch": 1.376, | |
| "grad_norm": 19.5, | |
| "learning_rate": 1.5859259259259262e-05, | |
| "loss": 19.6627, | |
| "mean_token_accuracy": 0.6894211061298847, | |
| "num_tokens": 16813444.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.19021125882864, | |
| "epoch": 1.392, | |
| "grad_norm": 17.5, | |
| "learning_rate": 1.5785185185185185e-05, | |
| "loss": 19.2411, | |
| "mean_token_accuracy": 0.6957099426537752, | |
| "num_tokens": 17006509.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.184871331602335, | |
| "epoch": 1.408, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 1.571111111111111e-05, | |
| "loss": 18.9785, | |
| "mean_token_accuracy": 0.6971315786242485, | |
| "num_tokens": 17197870.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.1884775411337614, | |
| "epoch": 1.424, | |
| "grad_norm": 17.25, | |
| "learning_rate": 1.5637037037037038e-05, | |
| "loss": 19.1289, | |
| "mean_token_accuracy": 0.697301234304905, | |
| "num_tokens": 17394390.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.1825116220861673, | |
| "epoch": 1.44, | |
| "grad_norm": 20.75, | |
| "learning_rate": 1.5562962962962965e-05, | |
| "loss": 19.1266, | |
| "mean_token_accuracy": 0.6968252252787351, | |
| "num_tokens": 17587777.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_biology_entropy": 1.1672434105873108, | |
| "eval_biology_loss": 1.1815813779830933, | |
| "eval_biology_mean_token_accuracy": 0.7003293070793152, | |
| "eval_biology_num_tokens": 17587777.0, | |
| "eval_biology_runtime": 48.6205, | |
| "eval_biology_samples_per_second": 10.284, | |
| "eval_biology_steps_per_second": 2.571, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_chemistry_entropy": 0.8869817838668823, | |
| "eval_chemistry_loss": 0.8829970955848694, | |
| "eval_chemistry_mean_token_accuracy": 0.7656374487876892, | |
| "eval_chemistry_num_tokens": 17587777.0, | |
| "eval_chemistry_runtime": 60.3339, | |
| "eval_chemistry_samples_per_second": 8.287, | |
| "eval_chemistry_steps_per_second": 2.072, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_math_entropy": 0.7674483435153961, | |
| "eval_math_loss": 0.9792445302009583, | |
| "eval_math_mean_token_accuracy": 0.7597224740982056, | |
| "eval_math_num_tokens": 17587777.0, | |
| "eval_math_runtime": 61.8239, | |
| "eval_math_samples_per_second": 8.087, | |
| "eval_math_steps_per_second": 2.022, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_physics_entropy": 0.8621900615692139, | |
| "eval_physics_loss": 0.8884776830673218, | |
| "eval_physics_mean_token_accuracy": 0.7715646696090698, | |
| "eval_physics_num_tokens": 17587777.0, | |
| "eval_physics_runtime": 70.446, | |
| "eval_physics_samples_per_second": 7.098, | |
| "eval_physics_steps_per_second": 1.774, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.1818725422024727, | |
| "epoch": 1.456, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.548888888888889e-05, | |
| "loss": 19.032, | |
| "mean_token_accuracy": 0.698200449720025, | |
| "num_tokens": 17788456.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.1769807077944279, | |
| "epoch": 1.472, | |
| "grad_norm": 16.0, | |
| "learning_rate": 1.5414814814814814e-05, | |
| "loss": 18.8791, | |
| "mean_token_accuracy": 0.7008779179304838, | |
| "num_tokens": 17984063.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.1641013238579034, | |
| "epoch": 1.488, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.5340740740740744e-05, | |
| "loss": 18.9913, | |
| "mean_token_accuracy": 0.6998269848525525, | |
| "num_tokens": 18175640.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.1960251219570637, | |
| "epoch": 1.504, | |
| "grad_norm": 17.0, | |
| "learning_rate": 1.5266666666666667e-05, | |
| "loss": 19.2076, | |
| "mean_token_accuracy": 0.696322177350521, | |
| "num_tokens": 18367857.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.1745740845799446, | |
| "epoch": 1.52, | |
| "grad_norm": 16.875, | |
| "learning_rate": 1.5192592592592594e-05, | |
| "loss": 19.0307, | |
| "mean_token_accuracy": 0.6969408400356769, | |
| "num_tokens": 18569146.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.2008745949715376, | |
| "epoch": 1.536, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.5118518518518519e-05, | |
| "loss": 19.2895, | |
| "mean_token_accuracy": 0.6946466054767371, | |
| "num_tokens": 18755079.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.1710849691182375, | |
| "epoch": 1.552, | |
| "grad_norm": 19.375, | |
| "learning_rate": 1.5044444444444445e-05, | |
| "loss": 18.9073, | |
| "mean_token_accuracy": 0.699705482646823, | |
| "num_tokens": 18956248.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.163971472159028, | |
| "epoch": 1.568, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.497037037037037e-05, | |
| "loss": 18.7379, | |
| "mean_token_accuracy": 0.7023108277469874, | |
| "num_tokens": 19150315.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.1755164857953786, | |
| "epoch": 1.584, | |
| "grad_norm": 17.75, | |
| "learning_rate": 1.4896296296296298e-05, | |
| "loss": 18.8826, | |
| "mean_token_accuracy": 0.6997408363968134, | |
| "num_tokens": 19344260.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.2020615819841622, | |
| "epoch": 1.6, | |
| "grad_norm": 17.125, | |
| "learning_rate": 1.4822222222222225e-05, | |
| "loss": 19.3858, | |
| "mean_token_accuracy": 0.6933612376451492, | |
| "num_tokens": 19532552.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_biology_entropy": 1.1451181559562682, | |
| "eval_biology_loss": 1.1767185926437378, | |
| "eval_biology_mean_token_accuracy": 0.7013368840217591, | |
| "eval_biology_num_tokens": 19532552.0, | |
| "eval_biology_runtime": 48.6261, | |
| "eval_biology_samples_per_second": 10.283, | |
| "eval_biology_steps_per_second": 2.571, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_chemistry_entropy": 0.8642943887710571, | |
| "eval_chemistry_loss": 0.8798553347587585, | |
| "eval_chemistry_mean_token_accuracy": 0.7664505195617676, | |
| "eval_chemistry_num_tokens": 19532552.0, | |
| "eval_chemistry_runtime": 59.8839, | |
| "eval_chemistry_samples_per_second": 8.349, | |
| "eval_chemistry_steps_per_second": 2.087, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_math_entropy": 0.7490488801002503, | |
| "eval_math_loss": 0.9804874062538147, | |
| "eval_math_mean_token_accuracy": 0.7602896738052368, | |
| "eval_math_num_tokens": 19532552.0, | |
| "eval_math_runtime": 61.7317, | |
| "eval_math_samples_per_second": 8.1, | |
| "eval_math_steps_per_second": 2.025, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_physics_entropy": 0.8397411880493164, | |
| "eval_physics_loss": 0.885618269443512, | |
| "eval_physics_mean_token_accuracy": 0.7722613711357117, | |
| "eval_physics_num_tokens": 19532552.0, | |
| "eval_physics_runtime": 70.4574, | |
| "eval_physics_samples_per_second": 7.096, | |
| "eval_physics_steps_per_second": 1.774, | |
| "step": 1000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.2020838985302374e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |