Text Generation
Transformers
Safetensors
qwen3
Generated from Trainer
trl-internal
trl
sft
trackio
conversational
text-generation-inference
Instructions to use edbeeching/Qwen3-4B-Base-SFT-tr5 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use edbeeching/Qwen3-4B-Base-SFT-tr5 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="edbeeching/Qwen3-4B-Base-SFT-tr5")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("edbeeching/Qwen3-4B-Base-SFT-tr5")
model = AutoModelForCausalLM.from_pretrained("edbeeching/Qwen3-4B-Base-SFT-tr5")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Inference
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use edbeeching/Qwen3-4B-Base-SFT-tr5 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "edbeeching/Qwen3-4B-Base-SFT-tr5"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "edbeeching/Qwen3-4B-Base-SFT-tr5",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker
docker model run hf.co/edbeeching/Qwen3-4B-Base-SFT-tr5
- SGLang
How to use edbeeching/Qwen3-4B-Base-SFT-tr5 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "edbeeching/Qwen3-4B-Base-SFT-tr5" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "edbeeching/Qwen3-4B-Base-SFT-tr5",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "edbeeching/Qwen3-4B-Base-SFT-tr5" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "edbeeching/Qwen3-4B-Base-SFT-tr5",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
- Docker Model Runner
How to use edbeeching/Qwen3-4B-Base-SFT-tr5 with Docker Model Runner:
docker model run hf.co/edbeeching/Qwen3-4B-Base-SFT-tr5
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.6268656716417915, | |
| "eval_steps": 500, | |
| "global_step": 620, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.007462686567164179, | |
| "grad_norm": 1.8659797972008254, | |
| "learning_rate": 0.0, | |
| "loss": 0.7986637353897095, | |
| "num_tokens": 940199.0, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.014925373134328358, | |
| "grad_norm": 2.067470583174153, | |
| "learning_rate": 5.2631578947368416e-08, | |
| "loss": 0.8278242945671082, | |
| "num_tokens": 1940958.0, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.022388059701492536, | |
| "grad_norm": 1.8137442955270493, | |
| "learning_rate": 1.0526315789473683e-07, | |
| "loss": 0.7961194515228271, | |
| "num_tokens": 2857380.0, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.029850746268656716, | |
| "grad_norm": 1.8752542747158565, | |
| "learning_rate": 1.5789473684210525e-07, | |
| "loss": 0.7988173961639404, | |
| "num_tokens": 3696403.0, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.03731343283582089, | |
| "grad_norm": 1.9383021105435863, | |
| "learning_rate": 2.1052631578947366e-07, | |
| "loss": 0.8283753395080566, | |
| "num_tokens": 4528235.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.04477611940298507, | |
| "grad_norm": 1.9512483400491862, | |
| "learning_rate": 2.631578947368421e-07, | |
| "loss": 0.8254790902137756, | |
| "num_tokens": 5554672.0, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.05223880597014925, | |
| "grad_norm": 1.8813572711890862, | |
| "learning_rate": 3.157894736842105e-07, | |
| "loss": 0.8230706453323364, | |
| "num_tokens": 6423132.0, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.05970149253731343, | |
| "grad_norm": 1.9448895375733437, | |
| "learning_rate": 3.684210526315789e-07, | |
| "loss": 0.8051227331161499, | |
| "num_tokens": 7201644.0, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.06716417910447761, | |
| "grad_norm": 1.8922387607703381, | |
| "learning_rate": 4.2105263157894733e-07, | |
| "loss": 0.7542356252670288, | |
| "num_tokens": 8128715.0, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.07462686567164178, | |
| "grad_norm": 1.9842025446380493, | |
| "learning_rate": 4.7368421052631574e-07, | |
| "loss": 0.8522481918334961, | |
| "num_tokens": 9074027.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.08208955223880597, | |
| "grad_norm": 1.7964968435378388, | |
| "learning_rate": 5.263157894736842e-07, | |
| "loss": 0.7835813760757446, | |
| "num_tokens": 9950641.0, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.08955223880597014, | |
| "grad_norm": 1.8759196623765004, | |
| "learning_rate": 5.789473684210526e-07, | |
| "loss": 0.8266638517379761, | |
| "num_tokens": 10885057.0, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.09701492537313433, | |
| "grad_norm": 1.739475404373344, | |
| "learning_rate": 6.31578947368421e-07, | |
| "loss": 0.8051838874816895, | |
| "num_tokens": 11697963.0, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.1044776119402985, | |
| "grad_norm": 1.6356821751063044, | |
| "learning_rate": 6.842105263157895e-07, | |
| "loss": 0.7847919464111328, | |
| "num_tokens": 12632602.0, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.11194029850746269, | |
| "grad_norm": 1.6314037268606378, | |
| "learning_rate": 7.368421052631578e-07, | |
| "loss": 0.775245189666748, | |
| "num_tokens": 13568889.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.11940298507462686, | |
| "grad_norm": 1.6105524213156879, | |
| "learning_rate": 7.894736842105263e-07, | |
| "loss": 0.8013657331466675, | |
| "num_tokens": 14534242.0, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.12686567164179105, | |
| "grad_norm": 1.5779316833603265, | |
| "learning_rate": 8.421052631578947e-07, | |
| "loss": 0.80766761302948, | |
| "num_tokens": 15435946.0, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.13432835820895522, | |
| "grad_norm": 1.5079351322768313, | |
| "learning_rate": 8.947368421052631e-07, | |
| "loss": 0.7658109664916992, | |
| "num_tokens": 16352267.0, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.1417910447761194, | |
| "grad_norm": 1.320308981062678, | |
| "learning_rate": 9.473684210526315e-07, | |
| "loss": 0.7778770327568054, | |
| "num_tokens": 17277422.0, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.14925373134328357, | |
| "grad_norm": 1.4101803404332138, | |
| "learning_rate": 1e-06, | |
| "loss": 0.8158027529716492, | |
| "num_tokens": 18270697.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.15671641791044777, | |
| "grad_norm": 1.3401873553364563, | |
| "learning_rate": 9.999938520216342e-07, | |
| "loss": 0.7766833305358887, | |
| "num_tokens": 19308555.0, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.16417910447761194, | |
| "grad_norm": 1.27445266639497, | |
| "learning_rate": 9.999754082545259e-07, | |
| "loss": 0.7421952486038208, | |
| "num_tokens": 20162797.0, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.17164179104477612, | |
| "grad_norm": 1.3912932427121052, | |
| "learning_rate": 9.999446692026396e-07, | |
| "loss": 0.7800503969192505, | |
| "num_tokens": 20981106.0, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.1791044776119403, | |
| "grad_norm": 1.2963982894236503, | |
| "learning_rate": 9.999016357058995e-07, | |
| "loss": 0.766775369644165, | |
| "num_tokens": 21858000.0, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.1865671641791045, | |
| "grad_norm": 1.140051976839368, | |
| "learning_rate": 9.998463089401678e-07, | |
| "loss": 0.7179380059242249, | |
| "num_tokens": 22793285.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.19402985074626866, | |
| "grad_norm": 1.1651471102131281, | |
| "learning_rate": 9.997786904172126e-07, | |
| "loss": 0.810413122177124, | |
| "num_tokens": 23723801.0, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.20149253731343283, | |
| "grad_norm": 1.0407108116745094, | |
| "learning_rate": 9.996987819846655e-07, | |
| "loss": 0.7446407079696655, | |
| "num_tokens": 24725740.0, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.208955223880597, | |
| "grad_norm": 0.9942554431409824, | |
| "learning_rate": 9.996065858259727e-07, | |
| "loss": 0.7915131449699402, | |
| "num_tokens": 25730725.0, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.21641791044776118, | |
| "grad_norm": 1.0234340889029334, | |
| "learning_rate": 9.995021044603342e-07, | |
| "loss": 0.7581333518028259, | |
| "num_tokens": 26557776.0, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.22388059701492538, | |
| "grad_norm": 1.0332669286266374, | |
| "learning_rate": 9.993853407426352e-07, | |
| "loss": 0.7365682125091553, | |
| "num_tokens": 27504251.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.23134328358208955, | |
| "grad_norm": 1.0634534407808287, | |
| "learning_rate": 9.99256297863368e-07, | |
| "loss": 0.7191506624221802, | |
| "num_tokens": 28534541.0, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.23880597014925373, | |
| "grad_norm": 1.1818384387181422, | |
| "learning_rate": 9.99114979348545e-07, | |
| "loss": 0.7689279317855835, | |
| "num_tokens": 29341502.0, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.2462686567164179, | |
| "grad_norm": 1.0143867857533606, | |
| "learning_rate": 9.989613890596033e-07, | |
| "loss": 0.7768257856369019, | |
| "num_tokens": 30211822.0, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.2537313432835821, | |
| "grad_norm": 0.9912608655279589, | |
| "learning_rate": 9.987955311932968e-07, | |
| "loss": 0.7552160024642944, | |
| "num_tokens": 31102775.0, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.26119402985074625, | |
| "grad_norm": 0.859671875219598, | |
| "learning_rate": 9.986174102815837e-07, | |
| "loss": 0.7417880892753601, | |
| "num_tokens": 31898227.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.26865671641791045, | |
| "grad_norm": 0.8467136154890645, | |
| "learning_rate": 9.984270311915018e-07, | |
| "loss": 0.7220484614372253, | |
| "num_tokens": 32541892.0, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.27611940298507465, | |
| "grad_norm": 0.968520072067917, | |
| "learning_rate": 9.982243991250357e-07, | |
| "loss": 0.7436271905899048, | |
| "num_tokens": 33543040.0, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.2835820895522388, | |
| "grad_norm": 0.7117412176092366, | |
| "learning_rate": 9.980095196189748e-07, | |
| "loss": 0.7281963229179382, | |
| "num_tokens": 34505224.0, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.291044776119403, | |
| "grad_norm": 0.7282443922942368, | |
| "learning_rate": 9.977823985447613e-07, | |
| "loss": 0.7709681987762451, | |
| "num_tokens": 35411826.0, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.29850746268656714, | |
| "grad_norm": 0.8081380042496161, | |
| "learning_rate": 9.975430421083305e-07, | |
| "loss": 0.761425256729126, | |
| "num_tokens": 36307345.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.30597014925373134, | |
| "grad_norm": 0.8423198034792783, | |
| "learning_rate": 9.972914568499411e-07, | |
| "loss": 0.7312315106391907, | |
| "num_tokens": 37196875.0, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.31343283582089554, | |
| "grad_norm": 0.8739486151254683, | |
| "learning_rate": 9.970276496439966e-07, | |
| "loss": 0.7070371508598328, | |
| "num_tokens": 38112193.0, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.3208955223880597, | |
| "grad_norm": 0.8222714239323922, | |
| "learning_rate": 9.967516276988567e-07, | |
| "loss": 0.7004337310791016, | |
| "num_tokens": 38855918.0, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.3283582089552239, | |
| "grad_norm": 0.6714287856764727, | |
| "learning_rate": 9.964633985566412e-07, | |
| "loss": 0.7193351984024048, | |
| "num_tokens": 39833215.0, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.3358208955223881, | |
| "grad_norm": 0.6336781599517638, | |
| "learning_rate": 9.961629700930235e-07, | |
| "loss": 0.7344927787780762, | |
| "num_tokens": 40760145.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.34328358208955223, | |
| "grad_norm": 0.669269067988794, | |
| "learning_rate": 9.958503505170155e-07, | |
| "loss": 0.7277801632881165, | |
| "num_tokens": 41745749.0, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.35074626865671643, | |
| "grad_norm": 0.7317041279195701, | |
| "learning_rate": 9.95525548370744e-07, | |
| "loss": 0.7005234956741333, | |
| "num_tokens": 42686630.0, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.3582089552238806, | |
| "grad_norm": 0.7721535484599115, | |
| "learning_rate": 9.95188572529215e-07, | |
| "loss": 0.7193202376365662, | |
| "num_tokens": 43510586.0, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.3656716417910448, | |
| "grad_norm": 0.7041253506550509, | |
| "learning_rate": 9.948394322000746e-07, | |
| "loss": 0.6881219744682312, | |
| "num_tokens": 44362248.0, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.373134328358209, | |
| "grad_norm": 0.636240363667009, | |
| "learning_rate": 9.944781369233543e-07, | |
| "loss": 0.6522014141082764, | |
| "num_tokens": 45216722.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3805970149253731, | |
| "grad_norm": 0.6272094853443818, | |
| "learning_rate": 9.941046965712122e-07, | |
| "loss": 0.6842180490493774, | |
| "num_tokens": 46010142.0, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.3880597014925373, | |
| "grad_norm": 0.5645071953625206, | |
| "learning_rate": 9.937191213476625e-07, | |
| "loss": 0.6692793369293213, | |
| "num_tokens": 46858670.0, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.39552238805970147, | |
| "grad_norm": 0.585837329746578, | |
| "learning_rate": 9.933214217882971e-07, | |
| "loss": 0.7204340696334839, | |
| "num_tokens": 47836905.0, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.40298507462686567, | |
| "grad_norm": 0.5938197808350745, | |
| "learning_rate": 9.929116087599972e-07, | |
| "loss": 0.7186766266822815, | |
| "num_tokens": 48836237.0, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.41044776119402987, | |
| "grad_norm": 0.5264192209085824, | |
| "learning_rate": 9.924896934606364e-07, | |
| "loss": 0.7225839495658875, | |
| "num_tokens": 49860153.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.417910447761194, | |
| "grad_norm": 0.5025994710035754, | |
| "learning_rate": 9.920556874187757e-07, | |
| "loss": 0.6711542010307312, | |
| "num_tokens": 50786110.0, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.4253731343283582, | |
| "grad_norm": 0.43381197403497257, | |
| "learning_rate": 9.91609602493347e-07, | |
| "loss": 0.6544876098632812, | |
| "num_tokens": 51790390.0, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.43283582089552236, | |
| "grad_norm": 0.43233023140164006, | |
| "learning_rate": 9.911514508733306e-07, | |
| "loss": 0.7029759883880615, | |
| "num_tokens": 52742397.0, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.44029850746268656, | |
| "grad_norm": 0.43910093495037306, | |
| "learning_rate": 9.906812450774207e-07, | |
| "loss": 0.7200834155082703, | |
| "num_tokens": 53673114.0, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.44776119402985076, | |
| "grad_norm": 0.44127209489849284, | |
| "learning_rate": 9.90198997953684e-07, | |
| "loss": 0.6370296478271484, | |
| "num_tokens": 54566889.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.4552238805970149, | |
| "grad_norm": 0.45096698239872907, | |
| "learning_rate": 9.89704722679209e-07, | |
| "loss": 0.6861921548843384, | |
| "num_tokens": 55460491.0, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.4626865671641791, | |
| "grad_norm": 0.4231809219141582, | |
| "learning_rate": 9.89198432759746e-07, | |
| "loss": 0.6846483945846558, | |
| "num_tokens": 56520990.0, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.4701492537313433, | |
| "grad_norm": 0.4911103394390683, | |
| "learning_rate": 9.886801420293363e-07, | |
| "loss": 0.6806150674819946, | |
| "num_tokens": 57422206.0, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.47761194029850745, | |
| "grad_norm": 0.4523240789905204, | |
| "learning_rate": 9.881498646499368e-07, | |
| "loss": 0.7077186107635498, | |
| "num_tokens": 58260720.0, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.48507462686567165, | |
| "grad_norm": 0.41521834156751597, | |
| "learning_rate": 9.876076151110313e-07, | |
| "loss": 0.696556806564331, | |
| "num_tokens": 59123617.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.4925373134328358, | |
| "grad_norm": 0.44090049904740325, | |
| "learning_rate": 9.870534082292349e-07, | |
| "loss": 0.6695712804794312, | |
| "num_tokens": 60033505.0, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.3986636896953578, | |
| "learning_rate": 9.864872591478893e-07, | |
| "loss": 0.6385202407836914, | |
| "num_tokens": 60974452.0, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.5074626865671642, | |
| "grad_norm": 0.36309454275310177, | |
| "learning_rate": 9.859091833366496e-07, | |
| "loss": 0.6627390384674072, | |
| "num_tokens": 61913977.0, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.5149253731343284, | |
| "grad_norm": 0.39014943112878603, | |
| "learning_rate": 9.853191965910605e-07, | |
| "loss": 0.6990819573402405, | |
| "num_tokens": 62800879.0, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.5223880597014925, | |
| "grad_norm": 0.365989388161065, | |
| "learning_rate": 9.84717315032125e-07, | |
| "loss": 0.6741257309913635, | |
| "num_tokens": 63823183.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.5298507462686567, | |
| "grad_norm": 0.367640955997548, | |
| "learning_rate": 9.841035551058648e-07, | |
| "loss": 0.657660722732544, | |
| "num_tokens": 64700087.0, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.5373134328358209, | |
| "grad_norm": 0.38259984541818554, | |
| "learning_rate": 9.834779335828697e-07, | |
| "loss": 0.6268120408058167, | |
| "num_tokens": 65533415.0, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.5447761194029851, | |
| "grad_norm": 0.4459381628745124, | |
| "learning_rate": 9.828404675578403e-07, | |
| "loss": 0.6166980266571045, | |
| "num_tokens": 66411589.0, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.5522388059701493, | |
| "grad_norm": 0.3964961096159446, | |
| "learning_rate": 9.821911744491202e-07, | |
| "loss": 0.6667238473892212, | |
| "num_tokens": 67203675.0, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.5597014925373134, | |
| "grad_norm": 0.3942585728115673, | |
| "learning_rate": 9.815300719982202e-07, | |
| "loss": 0.6620233058929443, | |
| "num_tokens": 68056574.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.5671641791044776, | |
| "grad_norm": 0.35812083783608406, | |
| "learning_rate": 9.808571782693343e-07, | |
| "loss": 0.6339540481567383, | |
| "num_tokens": 68907426.0, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.5746268656716418, | |
| "grad_norm": 0.3369082279659881, | |
| "learning_rate": 9.801725116488449e-07, | |
| "loss": 0.6345670819282532, | |
| "num_tokens": 69817179.0, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.582089552238806, | |
| "grad_norm": 0.4132547384225159, | |
| "learning_rate": 9.794760908448213e-07, | |
| "loss": 0.6722534894943237, | |
| "num_tokens": 70803003.0, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.5895522388059702, | |
| "grad_norm": 0.3627266662239884, | |
| "learning_rate": 9.78767934886508e-07, | |
| "loss": 0.6158405542373657, | |
| "num_tokens": 71708353.0, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.5970149253731343, | |
| "grad_norm": 0.3555505789377954, | |
| "learning_rate": 9.78048063123805e-07, | |
| "loss": 0.6479436159133911, | |
| "num_tokens": 72587706.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.6044776119402985, | |
| "grad_norm": 0.36801010623436975, | |
| "learning_rate": 9.773164952267392e-07, | |
| "loss": 0.6247404217720032, | |
| "num_tokens": 73605832.0, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.6119402985074627, | |
| "grad_norm": 0.4986298126613671, | |
| "learning_rate": 9.765732511849267e-07, | |
| "loss": 0.6385573744773865, | |
| "num_tokens": 74512496.0, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.6194029850746269, | |
| "grad_norm": 0.3500792677565209, | |
| "learning_rate": 9.758183513070266e-07, | |
| "loss": 0.6781474351882935, | |
| "num_tokens": 75428481.0, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.6268656716417911, | |
| "grad_norm": 0.39093330569733553, | |
| "learning_rate": 9.750518162201857e-07, | |
| "loss": 0.6386494636535645, | |
| "num_tokens": 76292706.0, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.6343283582089553, | |
| "grad_norm": 0.4070936194536431, | |
| "learning_rate": 9.742736668694758e-07, | |
| "loss": 0.6180363893508911, | |
| "num_tokens": 77209633.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.6417910447761194, | |
| "grad_norm": 0.3354352334817167, | |
| "learning_rate": 9.734839245173211e-07, | |
| "loss": 0.6163570880889893, | |
| "num_tokens": 78063420.0, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.6492537313432836, | |
| "grad_norm": 0.45211158839386173, | |
| "learning_rate": 9.726826107429168e-07, | |
| "loss": 0.6268313527107239, | |
| "num_tokens": 78870397.0, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.6567164179104478, | |
| "grad_norm": 0.31941448751910695, | |
| "learning_rate": 9.718697474416388e-07, | |
| "loss": 0.6327146291732788, | |
| "num_tokens": 79871666.0, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.664179104477612, | |
| "grad_norm": 0.3722172382477918, | |
| "learning_rate": 9.71045356824448e-07, | |
| "loss": 0.6110676527023315, | |
| "num_tokens": 80712206.0, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.6716417910447762, | |
| "grad_norm": 0.3465587426296506, | |
| "learning_rate": 9.7020946141728e-07, | |
| "loss": 0.5954413414001465, | |
| "num_tokens": 81538216.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.6791044776119403, | |
| "grad_norm": 0.30862586857277025, | |
| "learning_rate": 9.693620840604325e-07, | |
| "loss": 0.6106799840927124, | |
| "num_tokens": 82585839.0, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.6865671641791045, | |
| "grad_norm": 0.34576661664754443, | |
| "learning_rate": 9.685032479079392e-07, | |
| "loss": 0.6184056997299194, | |
| "num_tokens": 83427449.0, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.6940298507462687, | |
| "grad_norm": 0.30968028932263886, | |
| "learning_rate": 9.676329764269383e-07, | |
| "loss": 0.6404486894607544, | |
| "num_tokens": 84449388.0, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.7014925373134329, | |
| "grad_norm": 0.3132280475142172, | |
| "learning_rate": 9.667512933970313e-07, | |
| "loss": 0.6027534008026123, | |
| "num_tokens": 85256514.0, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.7089552238805971, | |
| "grad_norm": 0.37723622679137114, | |
| "learning_rate": 9.658582229096319e-07, | |
| "loss": 0.636467695236206, | |
| "num_tokens": 86165960.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.7164179104477612, | |
| "grad_norm": 0.3415033431478265, | |
| "learning_rate": 9.649537893673095e-07, | |
| "loss": 0.6198180317878723, | |
| "num_tokens": 86982659.0, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.7238805970149254, | |
| "grad_norm": 0.38663387244532404, | |
| "learning_rate": 9.640380174831209e-07, | |
| "loss": 0.6216307878494263, | |
| "num_tokens": 87931000.0, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.7313432835820896, | |
| "grad_norm": 0.32790162197293926, | |
| "learning_rate": 9.631109322799361e-07, | |
| "loss": 0.6376453638076782, | |
| "num_tokens": 88689701.0, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.7388059701492538, | |
| "grad_norm": 0.30426811432679324, | |
| "learning_rate": 9.621725590897543e-07, | |
| "loss": 0.6182718276977539, | |
| "num_tokens": 89547645.0, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.746268656716418, | |
| "grad_norm": 0.3319556928745246, | |
| "learning_rate": 9.61222923553011e-07, | |
| "loss": 0.6192991733551025, | |
| "num_tokens": 90297517.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.753731343283582, | |
| "grad_norm": 0.3063539981854288, | |
| "learning_rate": 9.602620516178788e-07, | |
| "loss": 0.6192951202392578, | |
| "num_tokens": 91186856.0, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.7611940298507462, | |
| "grad_norm": 0.3353221957958766, | |
| "learning_rate": 9.592899695395568e-07, | |
| "loss": 0.6191784739494324, | |
| "num_tokens": 91987232.0, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.7686567164179104, | |
| "grad_norm": 2.4726215379945016, | |
| "learning_rate": 9.583067038795544e-07, | |
| "loss": 0.6689252257347107, | |
| "num_tokens": 92898696.0, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.7761194029850746, | |
| "grad_norm": 0.3683317561657326, | |
| "learning_rate": 9.57312281504965e-07, | |
| "loss": 0.6164358854293823, | |
| "num_tokens": 93791120.0, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.7835820895522388, | |
| "grad_norm": 0.31142782603302166, | |
| "learning_rate": 9.563067295877318e-07, | |
| "loss": 0.5814804434776306, | |
| "num_tokens": 94639289.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.7910447761194029, | |
| "grad_norm": 0.5777785271285245, | |
| "learning_rate": 9.552900756039056e-07, | |
| "loss": 0.6628624200820923, | |
| "num_tokens": 95400207.0, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.7985074626865671, | |
| "grad_norm": 0.31405569146967405, | |
| "learning_rate": 9.54262347332894e-07, | |
| "loss": 0.647003710269928, | |
| "num_tokens": 96227104.0, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.8059701492537313, | |
| "grad_norm": 0.3000528898105912, | |
| "learning_rate": 9.532235728567022e-07, | |
| "loss": 0.6015387177467346, | |
| "num_tokens": 97056588.0, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.8134328358208955, | |
| "grad_norm": 0.3033934043892588, | |
| "learning_rate": 9.521737805591661e-07, | |
| "loss": 0.629927396774292, | |
| "num_tokens": 97944111.0, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.8208955223880597, | |
| "grad_norm": 0.29345271513653554, | |
| "learning_rate": 9.511129991251755e-07, | |
| "loss": 0.5817909836769104, | |
| "num_tokens": 98816920.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.8283582089552238, | |
| "grad_norm": 0.3148352988531031, | |
| "learning_rate": 9.500412575398922e-07, | |
| "loss": 0.6288615465164185, | |
| "num_tokens": 99773832.0, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.835820895522388, | |
| "grad_norm": 0.3225302890726275, | |
| "learning_rate": 9.489585850879564e-07, | |
| "loss": 0.6282119750976562, | |
| "num_tokens": 100805832.0, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.8432835820895522, | |
| "grad_norm": 0.3125344123235252, | |
| "learning_rate": 9.478650113526874e-07, | |
| "loss": 0.6161372661590576, | |
| "num_tokens": 101747939.0, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.8507462686567164, | |
| "grad_norm": 0.3022490350310229, | |
| "learning_rate": 9.467605662152745e-07, | |
| "loss": 0.6462452411651611, | |
| "num_tokens": 102733715.0, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.8582089552238806, | |
| "grad_norm": 0.29284676964407896, | |
| "learning_rate": 9.456452798539616e-07, | |
| "loss": 0.5786178112030029, | |
| "num_tokens": 103577969.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.8656716417910447, | |
| "grad_norm": 0.31081858681339425, | |
| "learning_rate": 9.445191827432215e-07, | |
| "loss": 0.6079792380332947, | |
| "num_tokens": 104507837.0, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.8731343283582089, | |
| "grad_norm": 0.305140531974358, | |
| "learning_rate": 9.433823056529241e-07, | |
| "loss": 0.6422327160835266, | |
| "num_tokens": 105482901.0, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.8805970149253731, | |
| "grad_norm": 0.8470132754236691, | |
| "learning_rate": 9.422346796474949e-07, | |
| "loss": 0.6116156578063965, | |
| "num_tokens": 106441176.0, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.8880597014925373, | |
| "grad_norm": 0.3062865057157625, | |
| "learning_rate": 9.410763360850665e-07, | |
| "loss": 0.6365537643432617, | |
| "num_tokens": 107265870.0, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.8955223880597015, | |
| "grad_norm": 0.3112284745186021, | |
| "learning_rate": 9.399073066166217e-07, | |
| "loss": 0.6294253468513489, | |
| "num_tokens": 108146690.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.9029850746268657, | |
| "grad_norm": 0.3489293940518192, | |
| "learning_rate": 9.38727623185129e-07, | |
| "loss": 0.6304266452789307, | |
| "num_tokens": 109034402.0, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.9104477611940298, | |
| "grad_norm": 0.30735356689099647, | |
| "learning_rate": 9.375373180246696e-07, | |
| "loss": 0.6445657014846802, | |
| "num_tokens": 109989572.0, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.917910447761194, | |
| "grad_norm": 0.30036163756207646, | |
| "learning_rate": 9.36336423659556e-07, | |
| "loss": 0.6575721502304077, | |
| "num_tokens": 110969334.0, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.9253731343283582, | |
| "grad_norm": 0.3150563557302255, | |
| "learning_rate": 9.351249729034441e-07, | |
| "loss": 0.6350916624069214, | |
| "num_tokens": 111844990.0, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.9328358208955224, | |
| "grad_norm": 0.42959935293984475, | |
| "learning_rate": 9.339029988584364e-07, | |
| "loss": 0.6106451153755188, | |
| "num_tokens": 112800888.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.9402985074626866, | |
| "grad_norm": 0.295731574777322, | |
| "learning_rate": 9.326705349141772e-07, | |
| "loss": 0.6363998651504517, | |
| "num_tokens": 113857610.0, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.9477611940298507, | |
| "grad_norm": 0.2886185648925166, | |
| "learning_rate": 9.314276147469408e-07, | |
| "loss": 0.6078730225563049, | |
| "num_tokens": 114800904.0, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.9552238805970149, | |
| "grad_norm": 0.2920263616232179, | |
| "learning_rate": 9.301742723187104e-07, | |
| "loss": 0.6083230972290039, | |
| "num_tokens": 115759913.0, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.9626865671641791, | |
| "grad_norm": 0.44738555848421335, | |
| "learning_rate": 9.289105418762512e-07, | |
| "loss": 0.6401327848434448, | |
| "num_tokens": 116624191.0, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.9701492537313433, | |
| "grad_norm": 0.31658939252581325, | |
| "learning_rate": 9.276364579501741e-07, | |
| "loss": 0.6194320321083069, | |
| "num_tokens": 117499418.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.9776119402985075, | |
| "grad_norm": 0.3550447231230164, | |
| "learning_rate": 9.263520553539919e-07, | |
| "loss": 0.5973168611526489, | |
| "num_tokens": 118329517.0, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.9850746268656716, | |
| "grad_norm": 0.3461631737220481, | |
| "learning_rate": 9.250573691831686e-07, | |
| "loss": 0.6246321201324463, | |
| "num_tokens": 119221343.0, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.9925373134328358, | |
| "grad_norm": 0.29903733473091415, | |
| "learning_rate": 9.237524348141599e-07, | |
| "loss": 0.6080079078674316, | |
| "num_tokens": 120123659.0, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.30082599081337075, | |
| "learning_rate": 9.224372879034471e-07, | |
| "loss": 0.610882043838501, | |
| "num_tokens": 121054976.0, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.007462686567164, | |
| "grad_norm": 0.3306994076014225, | |
| "learning_rate": 9.211119643865625e-07, | |
| "loss": 0.617473840713501, | |
| "num_tokens": 121995409.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.0149253731343284, | |
| "grad_norm": 0.4118286344591086, | |
| "learning_rate": 9.197765004771074e-07, | |
| "loss": 0.6029432415962219, | |
| "num_tokens": 122823226.0, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.0223880597014925, | |
| "grad_norm": 0.33422438069467975, | |
| "learning_rate": 9.184309326657625e-07, | |
| "loss": 0.5911135077476501, | |
| "num_tokens": 123660597.0, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.0298507462686568, | |
| "grad_norm": 0.3051145695551814, | |
| "learning_rate": 9.17075297719292e-07, | |
| "loss": 0.5806124806404114, | |
| "num_tokens": 124554146.0, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.037313432835821, | |
| "grad_norm": 0.32826641207537816, | |
| "learning_rate": 9.157096326795367e-07, | |
| "loss": 0.6078518629074097, | |
| "num_tokens": 125332236.0, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.044776119402985, | |
| "grad_norm": 0.2711859183967077, | |
| "learning_rate": 9.143339748624042e-07, | |
| "loss": 0.5872972011566162, | |
| "num_tokens": 126310236.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.0522388059701493, | |
| "grad_norm": 0.29219827505795287, | |
| "learning_rate": 9.129483618568477e-07, | |
| "loss": 0.5903403759002686, | |
| "num_tokens": 127218706.0, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.0597014925373134, | |
| "grad_norm": 0.31682373789781537, | |
| "learning_rate": 9.115528315238396e-07, | |
| "loss": 0.6067441701889038, | |
| "num_tokens": 128057825.0, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.0671641791044777, | |
| "grad_norm": 0.290526593989316, | |
| "learning_rate": 9.101474219953366e-07, | |
| "loss": 0.5909883975982666, | |
| "num_tokens": 128955736.0, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.0746268656716418, | |
| "grad_norm": 0.33126907763025953, | |
| "learning_rate": 9.087321716732382e-07, | |
| "loss": 0.6024787425994873, | |
| "num_tokens": 129777788.0, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.0820895522388059, | |
| "grad_norm": 0.2848827551783459, | |
| "learning_rate": 9.073071192283374e-07, | |
| "loss": 0.6014402508735657, | |
| "num_tokens": 130659960.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.0895522388059702, | |
| "grad_norm": 0.32145826962059754, | |
| "learning_rate": 9.058723035992631e-07, | |
| "loss": 0.5986078977584839, | |
| "num_tokens": 131550221.0, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.0970149253731343, | |
| "grad_norm": 0.3517188747706852, | |
| "learning_rate": 9.044277639914176e-07, | |
| "loss": 0.6086349487304688, | |
| "num_tokens": 132486469.0, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.1044776119402986, | |
| "grad_norm": 0.2956143436262153, | |
| "learning_rate": 9.029735398759043e-07, | |
| "loss": 0.5634535551071167, | |
| "num_tokens": 133366950.0, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.1119402985074627, | |
| "grad_norm": 0.3478882615131154, | |
| "learning_rate": 9.015096709884492e-07, | |
| "loss": 0.5996171236038208, | |
| "num_tokens": 134285043.0, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.1194029850746268, | |
| "grad_norm": 0.2966927318472822, | |
| "learning_rate": 9.000361973283158e-07, | |
| "loss": 0.5650948882102966, | |
| "num_tokens": 135136228.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.126865671641791, | |
| "grad_norm": 0.29045922223177134, | |
| "learning_rate": 8.985531591572115e-07, | |
| "loss": 0.583465576171875, | |
| "num_tokens": 136013129.0, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.1343283582089552, | |
| "grad_norm": 0.29420507308387045, | |
| "learning_rate": 8.970605969981879e-07, | |
| "loss": 0.6276301741600037, | |
| "num_tokens": 136978716.0, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.1417910447761195, | |
| "grad_norm": 0.28206954373320775, | |
| "learning_rate": 8.955585516345332e-07, | |
| "loss": 0.5884029865264893, | |
| "num_tokens": 137957110.0, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.1492537313432836, | |
| "grad_norm": 0.2893761158835339, | |
| "learning_rate": 8.940470641086581e-07, | |
| "loss": 0.58906090259552, | |
| "num_tokens": 138894209.0, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.1567164179104479, | |
| "grad_norm": 0.3060099897084387, | |
| "learning_rate": 8.925261757209743e-07, | |
| "loss": 0.6283571720123291, | |
| "num_tokens": 139925878.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.164179104477612, | |
| "grad_norm": 0.3245985255415534, | |
| "learning_rate": 8.909959280287655e-07, | |
| "loss": 0.5938559770584106, | |
| "num_tokens": 140844266.0, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.171641791044776, | |
| "grad_norm": 0.30881864003106135, | |
| "learning_rate": 8.894563628450532e-07, | |
| "loss": 0.5916883945465088, | |
| "num_tokens": 141685264.0, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.1791044776119404, | |
| "grad_norm": 0.3596602053150199, | |
| "learning_rate": 8.879075222374521e-07, | |
| "loss": 0.563378095626831, | |
| "num_tokens": 142607439.0, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.1865671641791045, | |
| "grad_norm": 0.29872513574720144, | |
| "learning_rate": 8.863494485270226e-07, | |
| "loss": 0.5588960647583008, | |
| "num_tokens": 143442522.0, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.1940298507462686, | |
| "grad_norm": 0.35553422720166944, | |
| "learning_rate": 8.847821842871136e-07, | |
| "loss": 0.6027117967605591, | |
| "num_tokens": 144356683.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.2014925373134329, | |
| "grad_norm": 0.35962639938666846, | |
| "learning_rate": 8.832057723421988e-07, | |
| "loss": 0.5953375101089478, | |
| "num_tokens": 145164747.0, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.208955223880597, | |
| "grad_norm": 0.3320369471002194, | |
| "learning_rate": 8.816202557667075e-07, | |
| "loss": 0.5746063590049744, | |
| "num_tokens": 145974438.0, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.2164179104477613, | |
| "grad_norm": 0.28069224857904806, | |
| "learning_rate": 8.800256778838467e-07, | |
| "loss": 0.5617422461509705, | |
| "num_tokens": 146897553.0, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.2238805970149254, | |
| "grad_norm": 0.3248983801962701, | |
| "learning_rate": 8.784220822644178e-07, | |
| "loss": 0.6217033267021179, | |
| "num_tokens": 147706235.0, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.2313432835820897, | |
| "grad_norm": 0.2887208674047822, | |
| "learning_rate": 8.768095127256261e-07, | |
| "loss": 0.5414159297943115, | |
| "num_tokens": 148638477.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.2388059701492538, | |
| "grad_norm": 0.32438535835021254, | |
| "learning_rate": 8.751880133298834e-07, | |
| "loss": 0.5938442945480347, | |
| "num_tokens": 149598765.0, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.2462686567164178, | |
| "grad_norm": 0.2894650573081879, | |
| "learning_rate": 8.735576283836037e-07, | |
| "loss": 0.6117956638336182, | |
| "num_tokens": 150499813.0, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.2537313432835822, | |
| "grad_norm": 0.3397567950477019, | |
| "learning_rate": 8.719184024359934e-07, | |
| "loss": 0.5914928317070007, | |
| "num_tokens": 151406909.0, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.2611940298507462, | |
| "grad_norm": 0.290993077313297, | |
| "learning_rate": 8.70270380277833e-07, | |
| "loss": 0.6332953572273254, | |
| "num_tokens": 152358615.0, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.2686567164179103, | |
| "grad_norm": 0.30309197520313697, | |
| "learning_rate": 8.686136069402541e-07, | |
| "loss": 0.5448141694068909, | |
| "num_tokens": 153140245.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.2761194029850746, | |
| "grad_norm": 0.2861452466873225, | |
| "learning_rate": 8.669481276935083e-07, | |
| "loss": 0.6137048602104187, | |
| "num_tokens": 154065403.0, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.2835820895522387, | |
| "grad_norm": 0.2818788428038251, | |
| "learning_rate": 8.652739880457308e-07, | |
| "loss": 0.606778621673584, | |
| "num_tokens": 155004060.0, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.291044776119403, | |
| "grad_norm": 0.28295852710705355, | |
| "learning_rate": 8.635912337416962e-07, | |
| "loss": 0.599794864654541, | |
| "num_tokens": 155894044.0, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.2985074626865671, | |
| "grad_norm": 0.2842356600778382, | |
| "learning_rate": 8.618999107615693e-07, | |
| "loss": 0.5917081832885742, | |
| "num_tokens": 156891753.0, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.3059701492537314, | |
| "grad_norm": 0.6073059901583661, | |
| "learning_rate": 8.602000653196483e-07, | |
| "loss": 0.5762104988098145, | |
| "num_tokens": 157781264.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.3134328358208955, | |
| "grad_norm": 0.38500166455619816, | |
| "learning_rate": 8.58491743863102e-07, | |
| "loss": 0.5871777534484863, | |
| "num_tokens": 158729371.0, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.3208955223880596, | |
| "grad_norm": 0.27823289078417784, | |
| "learning_rate": 8.567749930707011e-07, | |
| "loss": 0.605941653251648, | |
| "num_tokens": 159723929.0, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 1.328358208955224, | |
| "grad_norm": 0.35213890439138806, | |
| "learning_rate": 8.55049859851542e-07, | |
| "loss": 0.6232315301895142, | |
| "num_tokens": 160655042.0, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 1.335820895522388, | |
| "grad_norm": 0.3835692532749333, | |
| "learning_rate": 8.533163913437657e-07, | |
| "loss": 0.5706331729888916, | |
| "num_tokens": 161689806.0, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 1.3432835820895521, | |
| "grad_norm": 0.30950866157021506, | |
| "learning_rate": 8.515746349132691e-07, | |
| "loss": 0.5692603588104248, | |
| "num_tokens": 162672971.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.3507462686567164, | |
| "grad_norm": 0.2670593807735062, | |
| "learning_rate": 8.498246381524123e-07, | |
| "loss": 0.55814528465271, | |
| "num_tokens": 163606727.0, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 1.3582089552238805, | |
| "grad_norm": 0.30949695351670486, | |
| "learning_rate": 8.480664488787156e-07, | |
| "loss": 0.5762124061584473, | |
| "num_tokens": 164379724.0, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.3656716417910448, | |
| "grad_norm": 0.5924544887792298, | |
| "learning_rate": 8.463001151335554e-07, | |
| "loss": 0.588869035243988, | |
| "num_tokens": 165282114.0, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.373134328358209, | |
| "grad_norm": 0.28762285578208174, | |
| "learning_rate": 8.445256851808503e-07, | |
| "loss": 0.5752467513084412, | |
| "num_tokens": 166184652.0, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.3805970149253732, | |
| "grad_norm": 0.3298383363971737, | |
| "learning_rate": 8.427432075057421e-07, | |
| "loss": 0.5592284798622131, | |
| "num_tokens": 167131883.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.3880597014925373, | |
| "grad_norm": 0.29907045507102953, | |
| "learning_rate": 8.409527308132717e-07, | |
| "loss": 0.6292506456375122, | |
| "num_tokens": 168105786.0, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.3955223880597014, | |
| "grad_norm": 0.5372519710802263, | |
| "learning_rate": 8.391543040270477e-07, | |
| "loss": 0.5994750261306763, | |
| "num_tokens": 168981965.0, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.4029850746268657, | |
| "grad_norm": 0.3108497777882688, | |
| "learning_rate": 8.373479762879102e-07, | |
| "loss": 0.5894546508789062, | |
| "num_tokens": 169813930.0, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.4104477611940298, | |
| "grad_norm": 0.263215877705637, | |
| "learning_rate": 8.355337969525874e-07, | |
| "loss": 0.5457190871238708, | |
| "num_tokens": 170803921.0, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.417910447761194, | |
| "grad_norm": 0.3065822581882927, | |
| "learning_rate": 8.337118155923472e-07, | |
| "loss": 0.5782487988471985, | |
| "num_tokens": 171568584.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.4253731343283582, | |
| "grad_norm": 0.33076776216010273, | |
| "learning_rate": 8.318820819916432e-07, | |
| "loss": 0.5753518342971802, | |
| "num_tokens": 172302686.0, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.4328358208955223, | |
| "grad_norm": 0.28171276781641486, | |
| "learning_rate": 8.300446461467532e-07, | |
| "loss": 0.6102815270423889, | |
| "num_tokens": 173251435.0, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.4402985074626866, | |
| "grad_norm": 0.2941656823177544, | |
| "learning_rate": 8.281995582644144e-07, | |
| "loss": 0.5915931463241577, | |
| "num_tokens": 174154926.0, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.4477611940298507, | |
| "grad_norm": 0.3065460235572143, | |
| "learning_rate": 8.263468687604508e-07, | |
| "loss": 0.6099899411201477, | |
| "num_tokens": 174968736.0, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.455223880597015, | |
| "grad_norm": 0.27914137530301547, | |
| "learning_rate": 8.244866282583955e-07, | |
| "loss": 0.6181570291519165, | |
| "num_tokens": 175993671.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.462686567164179, | |
| "grad_norm": 0.3083419332308097, | |
| "learning_rate": 8.226188875881081e-07, | |
| "loss": 0.5710784196853638, | |
| "num_tokens": 176965410.0, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.4701492537313432, | |
| "grad_norm": 0.5452138442343578, | |
| "learning_rate": 8.20743697784385e-07, | |
| "loss": 0.6409458518028259, | |
| "num_tokens": 177894815.0, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.4776119402985075, | |
| "grad_norm": 0.3312904222346608, | |
| "learning_rate": 8.188611100855654e-07, | |
| "loss": 0.5432331562042236, | |
| "num_tokens": 178840660.0, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.4850746268656716, | |
| "grad_norm": 0.2774128783983067, | |
| "learning_rate": 8.169711759321317e-07, | |
| "loss": 0.5913591384887695, | |
| "num_tokens": 179786009.0, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.4925373134328357, | |
| "grad_norm": 0.27538698518702026, | |
| "learning_rate": 8.150739469653026e-07, | |
| "loss": 0.5327359437942505, | |
| "num_tokens": 180680467.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.2839849649378776, | |
| "learning_rate": 8.131694750256233e-07, | |
| "loss": 0.59873366355896, | |
| "num_tokens": 181598316.0, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.5074626865671643, | |
| "grad_norm": 0.3083583005906764, | |
| "learning_rate": 8.112578121515484e-07, | |
| "loss": 0.5907875299453735, | |
| "num_tokens": 182458909.0, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.5149253731343284, | |
| "grad_norm": 0.29048547641188055, | |
| "learning_rate": 8.0933901057802e-07, | |
| "loss": 0.581605076789856, | |
| "num_tokens": 183258199.0, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.5223880597014925, | |
| "grad_norm": 0.33631459084645754, | |
| "learning_rate": 8.074131227350408e-07, | |
| "loss": 0.5947036743164062, | |
| "num_tokens": 184223376.0, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.5298507462686568, | |
| "grad_norm": 0.9803280537963448, | |
| "learning_rate": 8.054802012462409e-07, | |
| "loss": 0.6088910102844238, | |
| "num_tokens": 185129043.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.537313432835821, | |
| "grad_norm": 0.35614179487581604, | |
| "learning_rate": 8.035402989274402e-07, | |
| "loss": 0.595119833946228, | |
| "num_tokens": 186025421.0, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.544776119402985, | |
| "grad_norm": 0.28518439705384824, | |
| "learning_rate": 8.015934687852052e-07, | |
| "loss": 0.5574674606323242, | |
| "num_tokens": 186963319.0, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.5522388059701493, | |
| "grad_norm": 0.29270174201680466, | |
| "learning_rate": 7.99639764015401e-07, | |
| "loss": 0.6108373999595642, | |
| "num_tokens": 187973354.0, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.5597014925373134, | |
| "grad_norm": 0.3323311285244367, | |
| "learning_rate": 7.976792380017372e-07, | |
| "loss": 0.5401036143302917, | |
| "num_tokens": 188705328.0, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.5671641791044775, | |
| "grad_norm": 0.3456603920554097, | |
| "learning_rate": 7.957119443143093e-07, | |
| "loss": 0.6063162088394165, | |
| "num_tokens": 189538934.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.5746268656716418, | |
| "grad_norm": 0.3082330500125609, | |
| "learning_rate": 7.937379367081354e-07, | |
| "loss": 0.5718963146209717, | |
| "num_tokens": 190336903.0, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.582089552238806, | |
| "grad_norm": 0.30744552461466196, | |
| "learning_rate": 7.917572691216866e-07, | |
| "loss": 0.62088942527771, | |
| "num_tokens": 191168843.0, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.5895522388059702, | |
| "grad_norm": 0.4103702199848832, | |
| "learning_rate": 7.897699956754142e-07, | |
| "loss": 0.5833892822265625, | |
| "num_tokens": 192111363.0, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.5970149253731343, | |
| "grad_norm": 0.2908559364406994, | |
| "learning_rate": 7.877761706702697e-07, | |
| "loss": 0.5971975922584534, | |
| "num_tokens": 193103746.0, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.6044776119402986, | |
| "grad_norm": 0.3328614806641497, | |
| "learning_rate": 7.857758485862219e-07, | |
| "loss": 0.5907278656959534, | |
| "num_tokens": 193981563.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.6119402985074627, | |
| "grad_norm": 0.37832890065438735, | |
| "learning_rate": 7.837690840807686e-07, | |
| "loss": 0.6060609817504883, | |
| "num_tokens": 194834592.0, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.6194029850746268, | |
| "grad_norm": 0.2677852995292723, | |
| "learning_rate": 7.817559319874417e-07, | |
| "loss": 0.5535368323326111, | |
| "num_tokens": 195762991.0, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.626865671641791, | |
| "grad_norm": 0.2741102900904772, | |
| "learning_rate": 7.797364473143103e-07, | |
| "loss": 0.5808444619178772, | |
| "num_tokens": 196736861.0, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.6343283582089554, | |
| "grad_norm": 0.2912625129099285, | |
| "learning_rate": 7.777106852424768e-07, | |
| "loss": 0.5897442102432251, | |
| "num_tokens": 197626724.0, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.6417910447761193, | |
| "grad_norm": 0.591147632193599, | |
| "learning_rate": 7.756787011245699e-07, | |
| "loss": 0.6097444295883179, | |
| "num_tokens": 198637589.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.6492537313432836, | |
| "grad_norm": 0.3296433282919186, | |
| "learning_rate": 7.736405504832313e-07, | |
| "loss": 0.6026604175567627, | |
| "num_tokens": 199563255.0, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.6567164179104479, | |
| "grad_norm": 0.27997593841499196, | |
| "learning_rate": 7.715962890095987e-07, | |
| "loss": 0.5822043418884277, | |
| "num_tokens": 200461303.0, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.664179104477612, | |
| "grad_norm": 0.3067564437403442, | |
| "learning_rate": 7.69545972561785e-07, | |
| "loss": 0.6166250705718994, | |
| "num_tokens": 201369977.0, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.671641791044776, | |
| "grad_norm": 0.2688906931144112, | |
| "learning_rate": 7.674896571633506e-07, | |
| "loss": 0.5445988178253174, | |
| "num_tokens": 202278503.0, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.6791044776119404, | |
| "grad_norm": 0.2691568834958204, | |
| "learning_rate": 7.65427399001774e-07, | |
| "loss": 0.5422626733779907, | |
| "num_tokens": 203242720.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.6865671641791045, | |
| "grad_norm": 0.2906038986233365, | |
| "learning_rate": 7.633592544269152e-07, | |
| "loss": 0.5783904790878296, | |
| "num_tokens": 204150301.0, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.6940298507462686, | |
| "grad_norm": 0.2887554770352252, | |
| "learning_rate": 7.612852799494769e-07, | |
| "loss": 0.588298499584198, | |
| "num_tokens": 205093558.0, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.7014925373134329, | |
| "grad_norm": 0.2993654753244292, | |
| "learning_rate": 7.592055322394602e-07, | |
| "loss": 0.6025734543800354, | |
| "num_tokens": 205970210.0, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.7089552238805972, | |
| "grad_norm": 0.284435640176723, | |
| "learning_rate": 7.571200681246158e-07, | |
| "loss": 0.6054296493530273, | |
| "num_tokens": 206859291.0, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.716417910447761, | |
| "grad_norm": 0.3195285881506451, | |
| "learning_rate": 7.550289445888914e-07, | |
| "loss": 0.5874844789505005, | |
| "num_tokens": 207786446.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.7238805970149254, | |
| "grad_norm": 0.28553073165686976, | |
| "learning_rate": 7.529322187708751e-07, | |
| "loss": 0.6177946925163269, | |
| "num_tokens": 208698287.0, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.7313432835820897, | |
| "grad_norm": 0.49617871796694113, | |
| "learning_rate": 7.508299479622334e-07, | |
| "loss": 0.5590040683746338, | |
| "num_tokens": 209548343.0, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.7388059701492538, | |
| "grad_norm": 0.28377509079902785, | |
| "learning_rate": 7.487221896061457e-07, | |
| "loss": 0.5984382629394531, | |
| "num_tokens": 210468969.0, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.7462686567164178, | |
| "grad_norm": 0.27598753591931857, | |
| "learning_rate": 7.46609001295736e-07, | |
| "loss": 0.614782452583313, | |
| "num_tokens": 211457470.0, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.7537313432835822, | |
| "grad_norm": 0.3112684553028677, | |
| "learning_rate": 7.444904407724972e-07, | |
| "loss": 0.5674484372138977, | |
| "num_tokens": 212347451.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.7611940298507462, | |
| "grad_norm": 0.6461909553210673, | |
| "learning_rate": 7.423665659247152e-07, | |
| "loss": 0.5716361999511719, | |
| "num_tokens": 213190706.0, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.7686567164179103, | |
| "grad_norm": 0.28012296588796226, | |
| "learning_rate": 7.40237434785886e-07, | |
| "loss": 0.5874301195144653, | |
| "num_tokens": 214169074.0, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.7761194029850746, | |
| "grad_norm": 0.279647753002793, | |
| "learning_rate": 7.381031055331305e-07, | |
| "loss": 0.6019556522369385, | |
| "num_tokens": 215188427.0, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.783582089552239, | |
| "grad_norm": 0.39177638463203673, | |
| "learning_rate": 7.359636364856043e-07, | |
| "loss": 0.6031475067138672, | |
| "num_tokens": 216080767.0, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.7910447761194028, | |
| "grad_norm": 0.3360890439886117, | |
| "learning_rate": 7.338190861029051e-07, | |
| "loss": 0.5989038944244385, | |
| "num_tokens": 216995394.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.7985074626865671, | |
| "grad_norm": 0.2739266991645573, | |
| "learning_rate": 7.316695129834744e-07, | |
| "loss": 0.5613197684288025, | |
| "num_tokens": 217866020.0, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.8059701492537314, | |
| "grad_norm": 0.5120657338819569, | |
| "learning_rate": 7.295149758629966e-07, | |
| "loss": 0.5808136463165283, | |
| "num_tokens": 218680341.0, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.8134328358208955, | |
| "grad_norm": 0.294200195693698, | |
| "learning_rate": 7.273555336127946e-07, | |
| "loss": 0.5945237874984741, | |
| "num_tokens": 219550948.0, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.8208955223880596, | |
| "grad_norm": 0.2909609394380647, | |
| "learning_rate": 7.251912452382205e-07, | |
| "loss": 0.5686002373695374, | |
| "num_tokens": 220517125.0, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.828358208955224, | |
| "grad_norm": 0.29553389453752155, | |
| "learning_rate": 7.230221698770439e-07, | |
| "loss": 0.5637418031692505, | |
| "num_tokens": 221365026.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.835820895522388, | |
| "grad_norm": 0.331158945273446, | |
| "learning_rate": 7.20848366797835e-07, | |
| "loss": 0.5235867500305176, | |
| "num_tokens": 222233736.0, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.8432835820895521, | |
| "grad_norm": 0.29651149137672, | |
| "learning_rate": 7.186698953983465e-07, | |
| "loss": 0.6124955415725708, | |
| "num_tokens": 223222809.0, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.8507462686567164, | |
| "grad_norm": 0.2967229855382439, | |
| "learning_rate": 7.164868152038898e-07, | |
| "loss": 0.5900925993919373, | |
| "num_tokens": 224116326.0, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.8582089552238807, | |
| "grad_norm": 0.2770420932471773, | |
| "learning_rate": 7.14299185865708e-07, | |
| "loss": 0.5970636606216431, | |
| "num_tokens": 991360.0, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.8656716417910446, | |
| "grad_norm": 0.2858538902789167, | |
| "learning_rate": 7.121070671593477e-07, | |
| "loss": 0.5819560289382935, | |
| "num_tokens": 2010167.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.873134328358209, | |
| "grad_norm": 0.2972326089151943, | |
| "learning_rate": 7.099105189830235e-07, | |
| "loss": 0.5888773202896118, | |
| "num_tokens": 2885939.0, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.8805970149253732, | |
| "grad_norm": 0.3057535952426567, | |
| "learning_rate": 7.07709601355983e-07, | |
| "loss": 0.5811155438423157, | |
| "num_tokens": 3762868.0, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.8880597014925373, | |
| "grad_norm": 0.31258132271786665, | |
| "learning_rate": 7.055043744168657e-07, | |
| "loss": 0.6176049709320068, | |
| "num_tokens": 4733514.0, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.8955223880597014, | |
| "grad_norm": 0.3030852287619505, | |
| "learning_rate": 7.03294898422061e-07, | |
| "loss": 0.5565370917320251, | |
| "num_tokens": 5639515.0, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.9029850746268657, | |
| "grad_norm": 0.3012477759016326, | |
| "learning_rate": 7.010812337440604e-07, | |
| "loss": 0.568949818611145, | |
| "num_tokens": 6415052.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.9104477611940298, | |
| "grad_norm": 0.27565638559926237, | |
| "learning_rate": 6.988634408698082e-07, | |
| "loss": 0.5407424569129944, | |
| "num_tokens": 7346190.0, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.917910447761194, | |
| "grad_norm": 0.32156589204612385, | |
| "learning_rate": 6.9664158039905e-07, | |
| "loss": 0.609969973564148, | |
| "num_tokens": 8239599.0, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.9253731343283582, | |
| "grad_norm": 0.3188048041018452, | |
| "learning_rate": 6.944157130426745e-07, | |
| "loss": 0.5987858176231384, | |
| "num_tokens": 9077707.0, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.9328358208955225, | |
| "grad_norm": 0.2961821437519157, | |
| "learning_rate": 6.921858996210568e-07, | |
| "loss": 0.568209171295166, | |
| "num_tokens": 9982372.0, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.9402985074626866, | |
| "grad_norm": 0.27392736616109464, | |
| "learning_rate": 6.899522010623958e-07, | |
| "loss": 0.5922641158103943, | |
| "num_tokens": 11023445.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.9477611940298507, | |
| "grad_norm": 0.2933002440532853, | |
| "learning_rate": 6.877146784010486e-07, | |
| "loss": 0.5736743211746216, | |
| "num_tokens": 11857709.0, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.955223880597015, | |
| "grad_norm": 0.27846912836784765, | |
| "learning_rate": 6.854733927758636e-07, | |
| "loss": 0.5894352197647095, | |
| "num_tokens": 12766496.0, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.962686567164179, | |
| "grad_norm": 0.27777120339996386, | |
| "learning_rate": 6.8322840542851e-07, | |
| "loss": 0.601696789264679, | |
| "num_tokens": 13767472.0, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.9701492537313432, | |
| "grad_norm": 0.46660244180149263, | |
| "learning_rate": 6.80979777701804e-07, | |
| "loss": 0.5974367260932922, | |
| "num_tokens": 14594712.0, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.9776119402985075, | |
| "grad_norm": 0.2956712799688728, | |
| "learning_rate": 6.787275710380329e-07, | |
| "loss": 0.5965464115142822, | |
| "num_tokens": 15486445.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.9850746268656716, | |
| "grad_norm": 0.31107227320954434, | |
| "learning_rate": 6.764718469772757e-07, | |
| "loss": 0.576676607131958, | |
| "num_tokens": 16227990.0, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.9925373134328357, | |
| "grad_norm": 0.28037710557720436, | |
| "learning_rate": 6.742126671557227e-07, | |
| "loss": 0.556594729423523, | |
| "num_tokens": 17105978.0, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.31658535211949906, | |
| "learning_rate": 6.719500933039897e-07, | |
| "loss": 0.5741510391235352, | |
| "num_tokens": 18011768.0, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 2.0074626865671643, | |
| "grad_norm": 0.2959704877731208, | |
| "learning_rate": 6.69684187245433e-07, | |
| "loss": 0.596227765083313, | |
| "num_tokens": 18916003.0, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 2.014925373134328, | |
| "grad_norm": 0.29107376493477805, | |
| "learning_rate": 6.674150108944592e-07, | |
| "loss": 0.5445001125335693, | |
| "num_tokens": 19684628.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.0223880597014925, | |
| "grad_norm": 0.2777883261006439, | |
| "learning_rate": 6.651426262548325e-07, | |
| "loss": 0.5889461636543274, | |
| "num_tokens": 20690086.0, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 2.029850746268657, | |
| "grad_norm": 0.2701961134075662, | |
| "learning_rate": 6.628670954179829e-07, | |
| "loss": 0.5695216655731201, | |
| "num_tokens": 21686072.0, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 2.0373134328358207, | |
| "grad_norm": 0.26760015658938013, | |
| "learning_rate": 6.605884805613072e-07, | |
| "loss": 0.5295987129211426, | |
| "num_tokens": 22622971.0, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 2.044776119402985, | |
| "grad_norm": 0.323703653894069, | |
| "learning_rate": 6.583068439464715e-07, | |
| "loss": 0.5844870209693909, | |
| "num_tokens": 23496905.0, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 2.0522388059701493, | |
| "grad_norm": 0.30828913191565205, | |
| "learning_rate": 6.560222479177094e-07, | |
| "loss": 0.5690542459487915, | |
| "num_tokens": 24365149.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.0597014925373136, | |
| "grad_norm": 0.3084417281876953, | |
| "learning_rate": 6.537347549001184e-07, | |
| "loss": 0.5742576122283936, | |
| "num_tokens": 25184612.0, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 2.0671641791044775, | |
| "grad_norm": 0.3080393843745802, | |
| "learning_rate": 6.514444273979543e-07, | |
| "loss": 0.5722700357437134, | |
| "num_tokens": 26054937.0, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 2.074626865671642, | |
| "grad_norm": 0.26775870499252885, | |
| "learning_rate": 6.491513279929237e-07, | |
| "loss": 0.5365396738052368, | |
| "num_tokens": 26954789.0, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 2.082089552238806, | |
| "grad_norm": 0.35802085892606694, | |
| "learning_rate": 6.468555193424735e-07, | |
| "loss": 0.5596331357955933, | |
| "num_tokens": 27845072.0, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 2.08955223880597, | |
| "grad_norm": 0.2719625157722236, | |
| "learning_rate": 6.445570641780786e-07, | |
| "loss": 0.5419675707817078, | |
| "num_tokens": 28658754.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.0970149253731343, | |
| "grad_norm": 0.27709779905677195, | |
| "learning_rate": 6.422560253035287e-07, | |
| "loss": 0.5775716304779053, | |
| "num_tokens": 29562576.0, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 2.1044776119402986, | |
| "grad_norm": 0.3348867650726701, | |
| "learning_rate": 6.39952465593211e-07, | |
| "loss": 0.585283637046814, | |
| "num_tokens": 30431256.0, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 2.111940298507463, | |
| "grad_norm": 0.29694286893571703, | |
| "learning_rate": 6.376464479903937e-07, | |
| "loss": 0.5197538733482361, | |
| "num_tokens": 31183415.0, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 2.1194029850746268, | |
| "grad_norm": 0.29840728825899304, | |
| "learning_rate": 6.35338035505505e-07, | |
| "loss": 0.5599273443222046, | |
| "num_tokens": 32067616.0, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 2.126865671641791, | |
| "grad_norm": 0.33678060078261246, | |
| "learning_rate": 6.330272912144116e-07, | |
| "loss": 0.6192691326141357, | |
| "num_tokens": 32981757.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.1343283582089554, | |
| "grad_norm": 0.2824243012069984, | |
| "learning_rate": 6.307142782566951e-07, | |
| "loss": 0.5863723754882812, | |
| "num_tokens": 34022251.0, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 2.1417910447761193, | |
| "grad_norm": 0.299007428336042, | |
| "learning_rate": 6.283990598339274e-07, | |
| "loss": 0.5666537284851074, | |
| "num_tokens": 34771710.0, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 2.1492537313432836, | |
| "grad_norm": 0.32004542775898487, | |
| "learning_rate": 6.260816992079431e-07, | |
| "loss": 0.5231757760047913, | |
| "num_tokens": 35650183.0, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 2.156716417910448, | |
| "grad_norm": 0.3045721000640588, | |
| "learning_rate": 6.237622596991106e-07, | |
| "loss": 0.5760424137115479, | |
| "num_tokens": 36493771.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 2.1641791044776117, | |
| "grad_norm": 0.2937692676912212, | |
| "learning_rate": 6.214408046846034e-07, | |
| "loss": 0.568109393119812, | |
| "num_tokens": 37330886.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.171641791044776, | |
| "grad_norm": 0.27011730349579827, | |
| "learning_rate": 6.191173975966668e-07, | |
| "loss": 0.5667808055877686, | |
| "num_tokens": 38365287.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 2.1791044776119404, | |
| "grad_norm": 0.28692265505240294, | |
| "learning_rate": 6.16792101920885e-07, | |
| "loss": 0.6112924814224243, | |
| "num_tokens": 39420111.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 2.1865671641791047, | |
| "grad_norm": 0.28526825922243654, | |
| "learning_rate": 6.144649811944473e-07, | |
| "loss": 0.5639245510101318, | |
| "num_tokens": 40263636.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 2.1940298507462686, | |
| "grad_norm": 0.2908552938644807, | |
| "learning_rate": 6.121360990044106e-07, | |
| "loss": 0.5848294496536255, | |
| "num_tokens": 41080304.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 2.201492537313433, | |
| "grad_norm": 0.2821005110679085, | |
| "learning_rate": 6.098055189859634e-07, | |
| "loss": 0.5666854381561279, | |
| "num_tokens": 42076069.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.208955223880597, | |
| "grad_norm": 0.2609076970175634, | |
| "learning_rate": 6.074733048206852e-07, | |
| "loss": 0.5690361857414246, | |
| "num_tokens": 43082365.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 2.216417910447761, | |
| "grad_norm": 0.2836464990705104, | |
| "learning_rate": 6.051395202348089e-07, | |
| "loss": 0.5679644346237183, | |
| "num_tokens": 44012700.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 2.2238805970149254, | |
| "grad_norm": 0.25600506754304947, | |
| "learning_rate": 6.028042289974768e-07, | |
| "loss": 0.5512281656265259, | |
| "num_tokens": 44917778.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 2.2313432835820897, | |
| "grad_norm": 0.3257772292478769, | |
| "learning_rate": 6.004674949190003e-07, | |
| "loss": 0.5415934324264526, | |
| "num_tokens": 45740145.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 2.2388059701492535, | |
| "grad_norm": 0.3286166291890802, | |
| "learning_rate": 5.981293818491152e-07, | |
| "loss": 0.5995659828186035, | |
| "num_tokens": 46620715.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.246268656716418, | |
| "grad_norm": 0.2894099742683797, | |
| "learning_rate": 5.957899536752373e-07, | |
| "loss": 0.608267605304718, | |
| "num_tokens": 47539124.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 2.253731343283582, | |
| "grad_norm": 0.2895508079853879, | |
| "learning_rate": 5.934492743207168e-07, | |
| "loss": 0.5291934013366699, | |
| "num_tokens": 48336408.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 2.2611940298507465, | |
| "grad_norm": 0.3174861796364666, | |
| "learning_rate": 5.911074077430916e-07, | |
| "loss": 0.5688158273696899, | |
| "num_tokens": 49205406.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 2.2686567164179103, | |
| "grad_norm": 0.3232273733079749, | |
| "learning_rate": 5.887644179323403e-07, | |
| "loss": 0.5540226697921753, | |
| "num_tokens": 50043421.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 2.2761194029850746, | |
| "grad_norm": 0.28558608567310023, | |
| "learning_rate": 5.864203689091315e-07, | |
| "loss": 0.5832343697547913, | |
| "num_tokens": 50915233.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.283582089552239, | |
| "grad_norm": 0.3069099454369789, | |
| "learning_rate": 5.84075324723078e-07, | |
| "loss": 0.5831292867660522, | |
| "num_tokens": 51814606.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 2.291044776119403, | |
| "grad_norm": 0.2697329326673818, | |
| "learning_rate": 5.817293494509836e-07, | |
| "loss": 0.5265708565711975, | |
| "num_tokens": 52624758.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 2.298507462686567, | |
| "grad_norm": 0.27527418320187297, | |
| "learning_rate": 5.793825071950935e-07, | |
| "loss": 0.5518659353256226, | |
| "num_tokens": 53591262.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 2.3059701492537314, | |
| "grad_norm": 0.31199334247822685, | |
| "learning_rate": 5.770348620813432e-07, | |
| "loss": 0.5563576221466064, | |
| "num_tokens": 54586868.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 2.3134328358208958, | |
| "grad_norm": 0.33111603191568395, | |
| "learning_rate": 5.746864782576053e-07, | |
| "loss": 0.5557553768157959, | |
| "num_tokens": 55375213.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.3208955223880596, | |
| "grad_norm": 0.2955172821022238, | |
| "learning_rate": 5.723374198919376e-07, | |
| "loss": 0.5784043073654175, | |
| "num_tokens": 56208304.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 2.328358208955224, | |
| "grad_norm": 0.27282274560683967, | |
| "learning_rate": 5.699877511708284e-07, | |
| "loss": 0.5383070111274719, | |
| "num_tokens": 57191922.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 2.3358208955223883, | |
| "grad_norm": 0.258279345220606, | |
| "learning_rate": 5.676375362974449e-07, | |
| "loss": 0.5381882786750793, | |
| "num_tokens": 58105389.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 2.343283582089552, | |
| "grad_norm": 0.33949214023855984, | |
| "learning_rate": 5.652868394898766e-07, | |
| "loss": 0.5437734723091125, | |
| "num_tokens": 58961497.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 2.3507462686567164, | |
| "grad_norm": 0.2833579384746108, | |
| "learning_rate": 5.629357249793816e-07, | |
| "loss": 0.592788815498352, | |
| "num_tokens": 59947795.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.3582089552238807, | |
| "grad_norm": 0.30062364724661333, | |
| "learning_rate": 5.605842570086319e-07, | |
| "loss": 0.5617958307266235, | |
| "num_tokens": 60964098.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 2.3656716417910446, | |
| "grad_norm": 0.2800076043555411, | |
| "learning_rate": 5.582324998299572e-07, | |
| "loss": 0.5720120072364807, | |
| "num_tokens": 61889873.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 2.373134328358209, | |
| "grad_norm": 0.27101523720971354, | |
| "learning_rate": 5.558805177035901e-07, | |
| "loss": 0.5741963386535645, | |
| "num_tokens": 62849188.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 2.3805970149253732, | |
| "grad_norm": 0.285471903870457, | |
| "learning_rate": 5.53528374895909e-07, | |
| "loss": 0.5414842963218689, | |
| "num_tokens": 63726113.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 2.388059701492537, | |
| "grad_norm": 0.27878281549210715, | |
| "learning_rate": 5.511761356776833e-07, | |
| "loss": 0.5728551745414734, | |
| "num_tokens": 64647597.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.3955223880597014, | |
| "grad_norm": 0.28622078113655536, | |
| "learning_rate": 5.488238643223167e-07, | |
| "loss": 0.5815838575363159, | |
| "num_tokens": 65508928.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 2.4029850746268657, | |
| "grad_norm": 0.27899596544094324, | |
| "learning_rate": 5.464716251040911e-07, | |
| "loss": 0.5664654970169067, | |
| "num_tokens": 66358099.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 2.41044776119403, | |
| "grad_norm": 0.2825063898870399, | |
| "learning_rate": 5.441194822964099e-07, | |
| "loss": 0.5764633417129517, | |
| "num_tokens": 67219625.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 2.417910447761194, | |
| "grad_norm": 0.2889902676953764, | |
| "learning_rate": 5.417675001700427e-07, | |
| "loss": 0.5656483173370361, | |
| "num_tokens": 68141332.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 2.425373134328358, | |
| "grad_norm": 0.27885504864685967, | |
| "learning_rate": 5.39415742991368e-07, | |
| "loss": 0.6192145943641663, | |
| "num_tokens": 69046407.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.4328358208955225, | |
| "grad_norm": 0.286670961202952, | |
| "learning_rate": 5.370642750206184e-07, | |
| "loss": 0.6090319156646729, | |
| "num_tokens": 70083093.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 2.4402985074626864, | |
| "grad_norm": 0.275072660826794, | |
| "learning_rate": 5.347131605101236e-07, | |
| "loss": 0.6045145988464355, | |
| "num_tokens": 71047395.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 2.4477611940298507, | |
| "grad_norm": 0.2864324709863456, | |
| "learning_rate": 5.323624637025551e-07, | |
| "loss": 0.5572278499603271, | |
| "num_tokens": 71932159.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 2.455223880597015, | |
| "grad_norm": 0.2973709054279998, | |
| "learning_rate": 5.300122488291716e-07, | |
| "loss": 0.5611422061920166, | |
| "num_tokens": 72789371.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 2.4626865671641793, | |
| "grad_norm": 0.3318220155418688, | |
| "learning_rate": 5.276625801080625e-07, | |
| "loss": 0.5865360498428345, | |
| "num_tokens": 73721478.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.470149253731343, | |
| "grad_norm": 0.3436341281789925, | |
| "learning_rate": 5.253135217423947e-07, | |
| "loss": 0.5705252885818481, | |
| "num_tokens": 74706274.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 2.4776119402985075, | |
| "grad_norm": 0.37986006551326945, | |
| "learning_rate": 5.229651379186569e-07, | |
| "loss": 0.5907820463180542, | |
| "num_tokens": 75647716.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 2.485074626865672, | |
| "grad_norm": 0.3122329879913656, | |
| "learning_rate": 5.206174928049065e-07, | |
| "loss": 0.5766445994377136, | |
| "num_tokens": 76637809.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 2.4925373134328357, | |
| "grad_norm": 0.2998854461296799, | |
| "learning_rate": 5.182706505490165e-07, | |
| "loss": 0.5649234652519226, | |
| "num_tokens": 77528162.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.25745394101844005, | |
| "learning_rate": 5.15924675276922e-07, | |
| "loss": 0.5354350805282593, | |
| "num_tokens": 78421720.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.5074626865671643, | |
| "grad_norm": 0.2829213028380422, | |
| "learning_rate": 5.135796310908685e-07, | |
| "loss": 0.5751874446868896, | |
| "num_tokens": 79217572.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 2.5149253731343286, | |
| "grad_norm": 0.3139108030566433, | |
| "learning_rate": 5.112355820676599e-07, | |
| "loss": 0.5624819993972778, | |
| "num_tokens": 80184157.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 2.5223880597014925, | |
| "grad_norm": 0.28913655087516726, | |
| "learning_rate": 5.088925922569083e-07, | |
| "loss": 0.568986713886261, | |
| "num_tokens": 81059812.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 2.529850746268657, | |
| "grad_norm": 0.2960772237491209, | |
| "learning_rate": 5.065507256792833e-07, | |
| "loss": 0.5797086954116821, | |
| "num_tokens": 81975922.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 2.5373134328358207, | |
| "grad_norm": 0.2882326553054164, | |
| "learning_rate": 5.042100463247629e-07, | |
| "loss": 0.5706868171691895, | |
| "num_tokens": 82823460.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.544776119402985, | |
| "grad_norm": 0.307599479684034, | |
| "learning_rate": 5.018706181508851e-07, | |
| "loss": 0.5756710767745972, | |
| "num_tokens": 83559785.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 2.5522388059701493, | |
| "grad_norm": 0.308017074889723, | |
| "learning_rate": 4.995325050809999e-07, | |
| "loss": 0.6031478643417358, | |
| "num_tokens": 84441431.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 2.5597014925373136, | |
| "grad_norm": 0.29529571185477965, | |
| "learning_rate": 4.971957710025234e-07, | |
| "loss": 0.5946158170700073, | |
| "num_tokens": 85349485.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 2.5671641791044775, | |
| "grad_norm": 0.2819610880293821, | |
| "learning_rate": 4.948604797651913e-07, | |
| "loss": 0.5992064476013184, | |
| "num_tokens": 86267065.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 2.574626865671642, | |
| "grad_norm": 0.2972660099307388, | |
| "learning_rate": 4.925266951793149e-07, | |
| "loss": 0.573174774646759, | |
| "num_tokens": 87077996.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.582089552238806, | |
| "grad_norm": 0.301461452019035, | |
| "learning_rate": 4.901944810140369e-07, | |
| "loss": 0.589251697063446, | |
| "num_tokens": 88180031.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 2.58955223880597, | |
| "grad_norm": 0.2904479295025236, | |
| "learning_rate": 4.878639009955895e-07, | |
| "loss": 0.54721999168396, | |
| "num_tokens": 89096524.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 2.5970149253731343, | |
| "grad_norm": 0.2633320073378902, | |
| "learning_rate": 4.855350188055528e-07, | |
| "loss": 0.5418224334716797, | |
| "num_tokens": 90020467.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 2.6044776119402986, | |
| "grad_norm": 0.27176928239419323, | |
| "learning_rate": 4.83207898079115e-07, | |
| "loss": 0.565537691116333, | |
| "num_tokens": 90987416.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 2.611940298507463, | |
| "grad_norm": 0.32160604849841345, | |
| "learning_rate": 4.808826024033334e-07, | |
| "loss": 0.5598034262657166, | |
| "num_tokens": 91795663.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.6194029850746268, | |
| "grad_norm": 0.30348956227704144, | |
| "learning_rate": 4.785591953153966e-07, | |
| "loss": 0.5576733946800232, | |
| "num_tokens": 92702072.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 2.626865671641791, | |
| "grad_norm": 0.2744155218003863, | |
| "learning_rate": 4.762377403008895e-07, | |
| "loss": 0.5912754535675049, | |
| "num_tokens": 93699627.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 2.6343283582089554, | |
| "grad_norm": 0.2666509179401252, | |
| "learning_rate": 4.739183007920571e-07, | |
| "loss": 0.5752925276756287, | |
| "num_tokens": 94666002.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 2.6417910447761193, | |
| "grad_norm": 0.261062380795103, | |
| "learning_rate": 4.7160094016607276e-07, | |
| "loss": 0.5275688767433167, | |
| "num_tokens": 95605433.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 2.6492537313432836, | |
| "grad_norm": 0.2811617365131969, | |
| "learning_rate": 4.6928572174330495e-07, | |
| "loss": 0.5722550749778748, | |
| "num_tokens": 96516441.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.656716417910448, | |
| "grad_norm": 0.2735603112854696, | |
| "learning_rate": 4.669727087855886e-07, | |
| "loss": 0.5699527859687805, | |
| "num_tokens": 97450967.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 2.664179104477612, | |
| "grad_norm": 0.3395844478312138, | |
| "learning_rate": 4.6466196449449504e-07, | |
| "loss": 0.5282535552978516, | |
| "num_tokens": 98350106.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 2.671641791044776, | |
| "grad_norm": 0.2830556116676336, | |
| "learning_rate": 4.6235355200960623e-07, | |
| "loss": 0.5501178503036499, | |
| "num_tokens": 99299833.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 2.6791044776119404, | |
| "grad_norm": 0.2720175319739195, | |
| "learning_rate": 4.600475344067889e-07, | |
| "loss": 0.5554410219192505, | |
| "num_tokens": 100163789.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 2.6865671641791042, | |
| "grad_norm": 0.39231713406243224, | |
| "learning_rate": 4.577439746964714e-07, | |
| "loss": 0.5649659633636475, | |
| "num_tokens": 101065769.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.6940298507462686, | |
| "grad_norm": 0.3840423060468296, | |
| "learning_rate": 4.554429358219213e-07, | |
| "loss": 0.5463579893112183, | |
| "num_tokens": 102054742.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 2.701492537313433, | |
| "grad_norm": 0.2823797561004669, | |
| "learning_rate": 4.531444806575265e-07, | |
| "loss": 0.5806522369384766, | |
| "num_tokens": 102999309.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 2.708955223880597, | |
| "grad_norm": 0.2847114226753591, | |
| "learning_rate": 4.508486720070761e-07, | |
| "loss": 0.5655279159545898, | |
| "num_tokens": 103943807.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 2.716417910447761, | |
| "grad_norm": 0.28205090767545954, | |
| "learning_rate": 4.4855557260204547e-07, | |
| "loss": 0.5465511083602905, | |
| "num_tokens": 104752259.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 2.7238805970149254, | |
| "grad_norm": 0.31707271181231406, | |
| "learning_rate": 4.462652450998815e-07, | |
| "loss": 0.56863933801651, | |
| "num_tokens": 105560152.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.7313432835820897, | |
| "grad_norm": 0.28484578348583783, | |
| "learning_rate": 4.439777520822905e-07, | |
| "loss": 0.5578351020812988, | |
| "num_tokens": 106370949.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 2.7388059701492535, | |
| "grad_norm": 0.2911877277567285, | |
| "learning_rate": 4.416931560535284e-07, | |
| "loss": 0.560371994972229, | |
| "num_tokens": 107223604.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 2.746268656716418, | |
| "grad_norm": 0.27157022459261115, | |
| "learning_rate": 4.394115194386927e-07, | |
| "loss": 0.5460314750671387, | |
| "num_tokens": 108240912.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 2.753731343283582, | |
| "grad_norm": 0.2866648017188484, | |
| "learning_rate": 4.3713290458201714e-07, | |
| "loss": 0.567893922328949, | |
| "num_tokens": 109178166.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 2.7611940298507465, | |
| "grad_norm": 0.29527473129759935, | |
| "learning_rate": 4.348573737451674e-07, | |
| "loss": 0.6049559116363525, | |
| "num_tokens": 109991427.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.7686567164179103, | |
| "grad_norm": 0.31366814462249815, | |
| "learning_rate": 4.3258498910554085e-07, | |
| "loss": 0.5512971878051758, | |
| "num_tokens": 110872756.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 2.7761194029850746, | |
| "grad_norm": 0.3023387372899213, | |
| "learning_rate": 4.3031581275456687e-07, | |
| "loss": 0.5625054836273193, | |
| "num_tokens": 111784756.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 2.783582089552239, | |
| "grad_norm": 0.3327888958164682, | |
| "learning_rate": 4.2804990669601015e-07, | |
| "loss": 0.559493899345398, | |
| "num_tokens": 112842525.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 2.791044776119403, | |
| "grad_norm": 0.2864205118032827, | |
| "learning_rate": 4.2578733284427735e-07, | |
| "loss": 0.541454553604126, | |
| "num_tokens": 113670083.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 2.798507462686567, | |
| "grad_norm": 0.2787810223726183, | |
| "learning_rate": 4.2352815302272415e-07, | |
| "loss": 0.5479576587677002, | |
| "num_tokens": 114578655.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 2.8059701492537314, | |
| "grad_norm": 0.2870728358422289, | |
| "learning_rate": 4.2127242896196715e-07, | |
| "loss": 0.5296257138252258, | |
| "num_tokens": 115385601.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 2.8134328358208958, | |
| "grad_norm": 0.2787997170860601, | |
| "learning_rate": 4.190202222981959e-07, | |
| "loss": 0.5955355763435364, | |
| "num_tokens": 116307550.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 2.8208955223880596, | |
| "grad_norm": 0.26436536089418516, | |
| "learning_rate": 4.1677159457149e-07, | |
| "loss": 0.5246421098709106, | |
| "num_tokens": 117169272.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 2.828358208955224, | |
| "grad_norm": 0.29264065144374635, | |
| "learning_rate": 4.145266072241365e-07, | |
| "loss": 0.5522100329399109, | |
| "num_tokens": 118096842.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 2.835820895522388, | |
| "grad_norm": 0.28815451535722664, | |
| "learning_rate": 4.1228532159895146e-07, | |
| "loss": 0.5797725915908813, | |
| "num_tokens": 119100115.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.843283582089552, | |
| "grad_norm": 0.3003290343487832, | |
| "learning_rate": 4.100477989376042e-07, | |
| "loss": 0.5710124969482422, | |
| "num_tokens": 120047947.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 2.8507462686567164, | |
| "grad_norm": 0.3124248817525316, | |
| "learning_rate": 4.0781410037894305e-07, | |
| "loss": 0.5675666332244873, | |
| "num_tokens": 120918603.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 2.8582089552238807, | |
| "grad_norm": 0.29192127163210346, | |
| "learning_rate": 4.0558428695732563e-07, | |
| "loss": 0.5678380727767944, | |
| "num_tokens": 121705889.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 2.8656716417910446, | |
| "grad_norm": 0.45590175900628427, | |
| "learning_rate": 4.033584196009502e-07, | |
| "loss": 0.5677410960197449, | |
| "num_tokens": 122662818.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 2.873134328358209, | |
| "grad_norm": 0.2630633298189877, | |
| "learning_rate": 4.0113655913019173e-07, | |
| "loss": 0.5765926837921143, | |
| "num_tokens": 123634255.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 2.8805970149253732, | |
| "grad_norm": 0.4068807334874914, | |
| "learning_rate": 3.989187662559397e-07, | |
| "loss": 0.5568211078643799, | |
| "num_tokens": 124693287.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 2.888059701492537, | |
| "grad_norm": 0.2899519882101068, | |
| "learning_rate": 3.967051015779389e-07, | |
| "loss": 0.5638155937194824, | |
| "num_tokens": 125727015.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 2.8955223880597014, | |
| "grad_norm": 0.27969326214227774, | |
| "learning_rate": 3.944956255831342e-07, | |
| "loss": 0.5610464215278625, | |
| "num_tokens": 126569685.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 2.9029850746268657, | |
| "grad_norm": 0.2771191212457944, | |
| "learning_rate": 3.9229039864401703e-07, | |
| "loss": 0.5670617818832397, | |
| "num_tokens": 127486971.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 2.91044776119403, | |
| "grad_norm": 0.28740458029106764, | |
| "learning_rate": 3.900894810169766e-07, | |
| "loss": 0.573495626449585, | |
| "num_tokens": 128449869.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.917910447761194, | |
| "grad_norm": 0.3229624542793302, | |
| "learning_rate": 3.8789293284065236e-07, | |
| "loss": 0.5427689552307129, | |
| "num_tokens": 129068910.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 2.925373134328358, | |
| "grad_norm": 0.28158062590946553, | |
| "learning_rate": 3.85700814134292e-07, | |
| "loss": 0.5718903541564941, | |
| "num_tokens": 129934302.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 2.9328358208955225, | |
| "grad_norm": 0.29563596535877035, | |
| "learning_rate": 3.8351318479611037e-07, | |
| "loss": 0.5753850340843201, | |
| "num_tokens": 130851893.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 2.9402985074626864, | |
| "grad_norm": 0.2766878486514577, | |
| "learning_rate": 3.813301046016536e-07, | |
| "loss": 0.5622212886810303, | |
| "num_tokens": 131790942.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 2.9477611940298507, | |
| "grad_norm": 0.31331114946966404, | |
| "learning_rate": 3.7915163320216506e-07, | |
| "loss": 0.5439543724060059, | |
| "num_tokens": 132669917.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 2.955223880597015, | |
| "grad_norm": 0.3302390373570611, | |
| "learning_rate": 3.7697783012295614e-07, | |
| "loss": 0.560044527053833, | |
| "num_tokens": 133626565.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 2.9626865671641793, | |
| "grad_norm": 0.2829029075854862, | |
| "learning_rate": 3.7480875476177944e-07, | |
| "loss": 0.5379583835601807, | |
| "num_tokens": 134404690.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 2.970149253731343, | |
| "grad_norm": 0.27234325967681716, | |
| "learning_rate": 3.7264446638720537e-07, | |
| "loss": 0.5365550518035889, | |
| "num_tokens": 135338794.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 2.9776119402985075, | |
| "grad_norm": 0.2767187314816525, | |
| "learning_rate": 3.7048502413700343e-07, | |
| "loss": 0.5605146288871765, | |
| "num_tokens": 136245478.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 2.9850746268656714, | |
| "grad_norm": 0.2660191705819811, | |
| "learning_rate": 3.683304870165257e-07, | |
| "loss": 0.5613399744033813, | |
| "num_tokens": 137308357.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.9925373134328357, | |
| "grad_norm": 0.33641507483104477, | |
| "learning_rate": 3.66180913897095e-07, | |
| "loss": 0.5563279390335083, | |
| "num_tokens": 138207663.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.2972265795339739, | |
| "learning_rate": 3.640363635143957e-07, | |
| "loss": 0.5664753913879395, | |
| "num_tokens": 139009002.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.0074626865671643, | |
| "grad_norm": 0.2870907018242013, | |
| "learning_rate": 3.6189689446686957e-07, | |
| "loss": 0.5246941447257996, | |
| "num_tokens": 139777882.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.014925373134328, | |
| "grad_norm": 0.2916504470337103, | |
| "learning_rate": 3.5976256521411397e-07, | |
| "loss": 0.5544458627700806, | |
| "num_tokens": 140621165.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.0223880597014925, | |
| "grad_norm": 0.6265255165946803, | |
| "learning_rate": 3.576334340752847e-07, | |
| "loss": 0.5519254207611084, | |
| "num_tokens": 141606435.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.029850746268657, | |
| "grad_norm": 0.3145949002923175, | |
| "learning_rate": 3.555095592275027e-07, | |
| "loss": 0.5923848152160645, | |
| "num_tokens": 142396631.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.0373134328358207, | |
| "grad_norm": 0.2812921044098366, | |
| "learning_rate": 3.5339099870426415e-07, | |
| "loss": 0.586621105670929, | |
| "num_tokens": 143299300.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 3.044776119402985, | |
| "grad_norm": 0.3253513362223779, | |
| "learning_rate": 3.512778103938542e-07, | |
| "loss": 0.5898826122283936, | |
| "num_tokens": 144081283.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 3.0522388059701493, | |
| "grad_norm": 0.2829701335583593, | |
| "learning_rate": 3.491700520377667e-07, | |
| "loss": 0.5714683532714844, | |
| "num_tokens": 144948721.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 3.0597014925373136, | |
| "grad_norm": 0.28049591422033593, | |
| "learning_rate": 3.470677812291248e-07, | |
| "loss": 0.5455187559127808, | |
| "num_tokens": 145915502.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.0671641791044775, | |
| "grad_norm": 0.3105939716549867, | |
| "learning_rate": 3.4497105541110846e-07, | |
| "loss": 0.5836495161056519, | |
| "num_tokens": 146848524.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 3.074626865671642, | |
| "grad_norm": 0.32302139932598506, | |
| "learning_rate": 3.428799318753844e-07, | |
| "loss": 0.5365943908691406, | |
| "num_tokens": 147673557.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 3.082089552238806, | |
| "grad_norm": 0.2868589436031599, | |
| "learning_rate": 3.407944677605399e-07, | |
| "loss": 0.6071346998214722, | |
| "num_tokens": 148546880.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 3.08955223880597, | |
| "grad_norm": 1.1120530008183112, | |
| "learning_rate": 3.3871472005052315e-07, | |
| "loss": 0.5794011354446411, | |
| "num_tokens": 149372372.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 3.0970149253731343, | |
| "grad_norm": 0.29942808195408777, | |
| "learning_rate": 3.3664074557308484e-07, | |
| "loss": 0.609196662902832, | |
| "num_tokens": 150192682.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.1044776119402986, | |
| "grad_norm": 0.26999532517075925, | |
| "learning_rate": 3.345726009982262e-07, | |
| "loss": 0.5523053407669067, | |
| "num_tokens": 151127243.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 3.111940298507463, | |
| "grad_norm": 0.27391131798248525, | |
| "learning_rate": 3.325103428366494e-07, | |
| "loss": 0.5864978432655334, | |
| "num_tokens": 152087089.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 3.1194029850746268, | |
| "grad_norm": 0.4729472521720116, | |
| "learning_rate": 3.3045402743821503e-07, | |
| "loss": 0.5435307025909424, | |
| "num_tokens": 153037095.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 3.126865671641791, | |
| "grad_norm": 0.2613169380583225, | |
| "learning_rate": 3.284037109904013e-07, | |
| "loss": 0.5703420042991638, | |
| "num_tokens": 153970950.0, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 3.1343283582089554, | |
| "grad_norm": 0.26928897911274874, | |
| "learning_rate": 3.2635944951676874e-07, | |
| "loss": 0.5258716344833374, | |
| "num_tokens": 154860955.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.1417910447761193, | |
| "grad_norm": 0.2763903661377402, | |
| "learning_rate": 3.243212988754302e-07, | |
| "loss": 0.5877372026443481, | |
| "num_tokens": 155783554.0, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 3.1492537313432836, | |
| "grad_norm": 0.29336310605505883, | |
| "learning_rate": 3.2228931475752317e-07, | |
| "loss": 0.5202987790107727, | |
| "num_tokens": 156633643.0, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 3.156716417910448, | |
| "grad_norm": 0.25416164853887924, | |
| "learning_rate": 3.2026355268568985e-07, | |
| "loss": 0.5262839794158936, | |
| "num_tokens": 157509233.0, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 3.1641791044776117, | |
| "grad_norm": 0.2833930281713182, | |
| "learning_rate": 3.1824406801255833e-07, | |
| "loss": 0.541146993637085, | |
| "num_tokens": 158408189.0, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 3.171641791044776, | |
| "grad_norm": 0.27826344635794753, | |
| "learning_rate": 3.1623091591923155e-07, | |
| "loss": 0.5324054956436157, | |
| "num_tokens": 159344619.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.1791044776119404, | |
| "grad_norm": 0.2641432062082384, | |
| "learning_rate": 3.142241514137781e-07, | |
| "loss": 0.512749969959259, | |
| "num_tokens": 160147804.0, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 3.1865671641791047, | |
| "grad_norm": 0.29499294962840417, | |
| "learning_rate": 3.1222382932973044e-07, | |
| "loss": 0.5644066333770752, | |
| "num_tokens": 161152253.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 3.1940298507462686, | |
| "grad_norm": 0.40120876254377613, | |
| "learning_rate": 3.1023000432458594e-07, | |
| "loss": 0.5188844203948975, | |
| "num_tokens": 161912590.0, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 3.201492537313433, | |
| "grad_norm": 0.2760447831486433, | |
| "learning_rate": 3.082427308783133e-07, | |
| "loss": 0.581289529800415, | |
| "num_tokens": 162873772.0, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 3.208955223880597, | |
| "grad_norm": 0.38218690237559466, | |
| "learning_rate": 3.0626206329186475e-07, | |
| "loss": 0.5367913246154785, | |
| "num_tokens": 163747353.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.216417910447761, | |
| "grad_norm": 0.2964101267001395, | |
| "learning_rate": 3.042880556856907e-07, | |
| "loss": 0.5629439353942871, | |
| "num_tokens": 164553836.0, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 3.2238805970149254, | |
| "grad_norm": 0.2759084567994263, | |
| "learning_rate": 3.023207619982628e-07, | |
| "loss": 0.5370494723320007, | |
| "num_tokens": 165403798.0, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 3.2313432835820897, | |
| "grad_norm": 0.5359782382978606, | |
| "learning_rate": 3.003602359845989e-07, | |
| "loss": 0.5838747620582581, | |
| "num_tokens": 166345805.0, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 3.2388059701492535, | |
| "grad_norm": 0.3148655670627395, | |
| "learning_rate": 2.9840653121479474e-07, | |
| "loss": 0.5563722848892212, | |
| "num_tokens": 167178202.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 3.246268656716418, | |
| "grad_norm": 0.25689828139696275, | |
| "learning_rate": 2.964597010725599e-07, | |
| "loss": 0.5305824875831604, | |
| "num_tokens": 168180314.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 3.253731343283582, | |
| "grad_norm": 0.2613039059579068, | |
| "learning_rate": 2.945197987537591e-07, | |
| "loss": 0.5461628437042236, | |
| "num_tokens": 169040092.0, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 3.2611940298507465, | |
| "grad_norm": 0.2974430760623621, | |
| "learning_rate": 2.9258687726495905e-07, | |
| "loss": 0.5644657611846924, | |
| "num_tokens": 169917341.0, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 3.2686567164179103, | |
| "grad_norm": 0.2609767450471703, | |
| "learning_rate": 2.9066098942197993e-07, | |
| "loss": 0.5402700901031494, | |
| "num_tokens": 170787209.0, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 3.2761194029850746, | |
| "grad_norm": 0.3010762400531778, | |
| "learning_rate": 2.8874218784845154e-07, | |
| "loss": 0.560728907585144, | |
| "num_tokens": 171730223.0, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 3.283582089552239, | |
| "grad_norm": 0.2799906205843846, | |
| "learning_rate": 2.868305249743766e-07, | |
| "loss": 0.5792785882949829, | |
| "num_tokens": 172620879.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.291044776119403, | |
| "grad_norm": 0.27973702443764925, | |
| "learning_rate": 2.849260530346973e-07, | |
| "loss": 0.5594302415847778, | |
| "num_tokens": 173513731.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 3.298507462686567, | |
| "grad_norm": 0.2604858147066468, | |
| "learning_rate": 2.830288240678682e-07, | |
| "loss": 0.5618335008621216, | |
| "num_tokens": 174466652.0, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 3.3059701492537314, | |
| "grad_norm": 0.28140102117529975, | |
| "learning_rate": 2.8113888991443446e-07, | |
| "loss": 0.5599676370620728, | |
| "num_tokens": 175305008.0, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 3.3134328358208958, | |
| "grad_norm": 0.26261993494203545, | |
| "learning_rate": 2.7925630221561505e-07, | |
| "loss": 0.5589733719825745, | |
| "num_tokens": 176287960.0, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 3.3208955223880596, | |
| "grad_norm": 0.2691459794281276, | |
| "learning_rate": 2.773811124118918e-07, | |
| "loss": 0.5410393476486206, | |
| "num_tokens": 177240918.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 3.328358208955224, | |
| "grad_norm": 0.27073917117802515, | |
| "learning_rate": 2.7551337174160425e-07, | |
| "loss": 0.550033688545227, | |
| "num_tokens": 178155824.0, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 3.3358208955223883, | |
| "grad_norm": 0.27889915727953724, | |
| "learning_rate": 2.736531312395491e-07, | |
| "loss": 0.5926166772842407, | |
| "num_tokens": 179172034.0, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 3.343283582089552, | |
| "grad_norm": 0.28321328409431284, | |
| "learning_rate": 2.718004417355855e-07, | |
| "loss": 0.5419124960899353, | |
| "num_tokens": 180085508.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 3.3507462686567164, | |
| "grad_norm": 0.25769815974818394, | |
| "learning_rate": 2.6995535385324667e-07, | |
| "loss": 0.5644470453262329, | |
| "num_tokens": 181111200.0, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 3.3582089552238807, | |
| "grad_norm": 0.2645694828416433, | |
| "learning_rate": 2.6811791800835684e-07, | |
| "loss": 0.5500813722610474, | |
| "num_tokens": 182084448.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.3656716417910446, | |
| "grad_norm": 0.3398596794365398, | |
| "learning_rate": 2.6628818440765267e-07, | |
| "loss": 0.5711795687675476, | |
| "num_tokens": 183008409.0, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 3.373134328358209, | |
| "grad_norm": 0.28312078619309694, | |
| "learning_rate": 2.6446620304741265e-07, | |
| "loss": 0.49891045689582825, | |
| "num_tokens": 183851194.0, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 3.3805970149253732, | |
| "grad_norm": 0.30224336146290764, | |
| "learning_rate": 2.626520237120898e-07, | |
| "loss": 0.5533944368362427, | |
| "num_tokens": 184757031.0, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 3.388059701492537, | |
| "grad_norm": 0.2635231520858661, | |
| "learning_rate": 2.6084569597295224e-07, | |
| "loss": 0.5472126007080078, | |
| "num_tokens": 185664557.0, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 3.3955223880597014, | |
| "grad_norm": 0.38599423556926155, | |
| "learning_rate": 2.590472691867284e-07, | |
| "loss": 0.5409133434295654, | |
| "num_tokens": 186629979.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 3.4029850746268657, | |
| "grad_norm": 0.2607919939858689, | |
| "learning_rate": 2.57256792494258e-07, | |
| "loss": 0.5315978527069092, | |
| "num_tokens": 187564334.0, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 3.41044776119403, | |
| "grad_norm": 0.2805243275573726, | |
| "learning_rate": 2.554743148191497e-07, | |
| "loss": 0.5706053376197815, | |
| "num_tokens": 188489207.0, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 3.417910447761194, | |
| "grad_norm": 0.27087210896579755, | |
| "learning_rate": 2.5369988486644446e-07, | |
| "loss": 0.5453130006790161, | |
| "num_tokens": 189462195.0, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 3.425373134328358, | |
| "grad_norm": 0.2701409088054617, | |
| "learning_rate": 2.5193355112128434e-07, | |
| "loss": 0.5617469549179077, | |
| "num_tokens": 190385848.0, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 3.4328358208955225, | |
| "grad_norm": 0.27610398300578565, | |
| "learning_rate": 2.501753618475877e-07, | |
| "loss": 0.5490225553512573, | |
| "num_tokens": 191288170.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.4402985074626864, | |
| "grad_norm": 0.2791596724267742, | |
| "learning_rate": 2.4842536508673086e-07, | |
| "loss": 0.5552560091018677, | |
| "num_tokens": 192212892.0, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 3.4477611940298507, | |
| "grad_norm": 0.2818729998392585, | |
| "learning_rate": 2.4668360865623443e-07, | |
| "loss": 0.5438352227210999, | |
| "num_tokens": 193052710.0, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 3.455223880597015, | |
| "grad_norm": 0.29076648116800047, | |
| "learning_rate": 2.4495014014845805e-07, | |
| "loss": 0.5421488285064697, | |
| "num_tokens": 193971934.0, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 3.4626865671641793, | |
| "grad_norm": 0.35528872999469885, | |
| "learning_rate": 2.432250069292989e-07, | |
| "loss": 0.5626663565635681, | |
| "num_tokens": 194797236.0, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 3.470149253731343, | |
| "grad_norm": 0.2931920860035253, | |
| "learning_rate": 2.4150825613689786e-07, | |
| "loss": 0.575283944606781, | |
| "num_tokens": 195700091.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 3.4776119402985075, | |
| "grad_norm": 0.2696233856000557, | |
| "learning_rate": 2.397999346803518e-07, | |
| "loss": 0.5804455280303955, | |
| "num_tokens": 196607890.0, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 3.485074626865672, | |
| "grad_norm": 0.2612143539512627, | |
| "learning_rate": 2.3810008923843075e-07, | |
| "loss": 0.5534828901290894, | |
| "num_tokens": 197595641.0, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 3.4925373134328357, | |
| "grad_norm": 0.3003385075218394, | |
| "learning_rate": 2.3640876625830382e-07, | |
| "loss": 0.5445773601531982, | |
| "num_tokens": 198539047.0, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 0.27091005264565227, | |
| "learning_rate": 2.347260119542692e-07, | |
| "loss": 0.5666298866271973, | |
| "num_tokens": 199529062.0, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 3.5074626865671643, | |
| "grad_norm": 0.2856115330105266, | |
| "learning_rate": 2.3305187230649173e-07, | |
| "loss": 0.5649522542953491, | |
| "num_tokens": 200452743.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.5149253731343286, | |
| "grad_norm": 0.26678768606458664, | |
| "learning_rate": 2.3138639305974592e-07, | |
| "loss": 0.5633753538131714, | |
| "num_tokens": 201375863.0, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 3.5223880597014925, | |
| "grad_norm": 0.24743082299382807, | |
| "learning_rate": 2.29729619722167e-07, | |
| "loss": 0.5463535785675049, | |
| "num_tokens": 202342633.0, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 3.529850746268657, | |
| "grad_norm": 0.2849265779918233, | |
| "learning_rate": 2.2808159756400664e-07, | |
| "loss": 0.5450330376625061, | |
| "num_tokens": 203206779.0, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 3.5373134328358207, | |
| "grad_norm": 0.2959172618289051, | |
| "learning_rate": 2.264423716163962e-07, | |
| "loss": 0.5645024180412292, | |
| "num_tokens": 204166390.0, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 3.544776119402985, | |
| "grad_norm": 0.30652775818807565, | |
| "learning_rate": 2.248119866701167e-07, | |
| "loss": 0.6000641584396362, | |
| "num_tokens": 205136083.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 3.5522388059701493, | |
| "grad_norm": 0.2649068122185818, | |
| "learning_rate": 2.231904872743739e-07, | |
| "loss": 0.563923180103302, | |
| "num_tokens": 206063663.0, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 3.5597014925373136, | |
| "grad_norm": 0.30105804677037196, | |
| "learning_rate": 2.2157791773558222e-07, | |
| "loss": 0.5499534606933594, | |
| "num_tokens": 207014752.0, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 3.5671641791044775, | |
| "grad_norm": 0.2719779878868544, | |
| "learning_rate": 2.1997432211615324e-07, | |
| "loss": 0.5947707891464233, | |
| "num_tokens": 208002031.0, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 3.574626865671642, | |
| "grad_norm": 0.27217392575188515, | |
| "learning_rate": 2.1837974423329254e-07, | |
| "loss": 0.5516700744628906, | |
| "num_tokens": 208865142.0, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 3.582089552238806, | |
| "grad_norm": 0.37099057189831747, | |
| "learning_rate": 2.1679422765780113e-07, | |
| "loss": 0.557658851146698, | |
| "num_tokens": 209775786.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.58955223880597, | |
| "grad_norm": 0.27880227675709274, | |
| "learning_rate": 2.1521781571288644e-07, | |
| "loss": 0.5569248199462891, | |
| "num_tokens": 210690185.0, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 3.5970149253731343, | |
| "grad_norm": 0.27705864225434784, | |
| "learning_rate": 2.136505514729774e-07, | |
| "loss": 0.5474062561988831, | |
| "num_tokens": 211593514.0, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 3.6044776119402986, | |
| "grad_norm": 0.5224112991843156, | |
| "learning_rate": 2.120924777625479e-07, | |
| "loss": 0.5869604349136353, | |
| "num_tokens": 212523789.0, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 3.611940298507463, | |
| "grad_norm": 0.2577684602730275, | |
| "learning_rate": 2.1054363715494693e-07, | |
| "loss": 0.5051690340042114, | |
| "num_tokens": 213378219.0, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 3.6194029850746268, | |
| "grad_norm": 0.28941698042641906, | |
| "learning_rate": 2.090040719712344e-07, | |
| "loss": 0.5571575164794922, | |
| "num_tokens": 214172580.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 3.626865671641791, | |
| "grad_norm": 0.4260866776138794, | |
| "learning_rate": 2.0747382427902572e-07, | |
| "loss": 0.5927166938781738, | |
| "num_tokens": 214985813.0, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 3.6343283582089554, | |
| "grad_norm": 0.3024307973363642, | |
| "learning_rate": 2.0595293589134176e-07, | |
| "loss": 0.5418879985809326, | |
| "num_tokens": 215879921.0, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 3.6417910447761193, | |
| "grad_norm": 0.2780348633827344, | |
| "learning_rate": 2.044414483654668e-07, | |
| "loss": 0.5637257695198059, | |
| "num_tokens": 216746710.0, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 3.6492537313432836, | |
| "grad_norm": 0.36018093543345575, | |
| "learning_rate": 2.0293940300181212e-07, | |
| "loss": 0.5574115514755249, | |
| "num_tokens": 217502005.0, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 3.656716417910448, | |
| "grad_norm": 0.36563238506727497, | |
| "learning_rate": 2.0144684084278846e-07, | |
| "loss": 0.5422406792640686, | |
| "num_tokens": 218245391.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.664179104477612, | |
| "grad_norm": 0.26076893929904454, | |
| "learning_rate": 1.9996380267168416e-07, | |
| "loss": 0.5316330194473267, | |
| "num_tokens": 219197443.0, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 3.671641791044776, | |
| "grad_norm": 0.2756117047272477, | |
| "learning_rate": 1.9849032901155073e-07, | |
| "loss": 0.576492965221405, | |
| "num_tokens": 220198270.0, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 3.6791044776119404, | |
| "grad_norm": 0.3104844021800582, | |
| "learning_rate": 1.9702646012409576e-07, | |
| "loss": 0.5465894937515259, | |
| "num_tokens": 221120937.0, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 3.6865671641791042, | |
| "grad_norm": 0.28862702731917284, | |
| "learning_rate": 1.9557223600858236e-07, | |
| "loss": 0.562412679195404, | |
| "num_tokens": 222035264.0, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 3.6940298507462686, | |
| "grad_norm": 0.2990711536721463, | |
| "learning_rate": 1.9412769640073686e-07, | |
| "loss": 0.6177443265914917, | |
| "num_tokens": 222924164.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 3.701492537313433, | |
| "grad_norm": 0.29123869800526553, | |
| "learning_rate": 1.9269288077166264e-07, | |
| "loss": 0.6057195067405701, | |
| "num_tokens": 223814612.0, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 3.708955223880597, | |
| "grad_norm": 0.30962841555286785, | |
| "learning_rate": 1.9126782832676173e-07, | |
| "loss": 0.5551049709320068, | |
| "num_tokens": 224678747.0, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 3.716417910447761, | |
| "grad_norm": 0.31789120255947023, | |
| "learning_rate": 1.8985257800466348e-07, | |
| "loss": 0.5476455092430115, | |
| "num_tokens": 225631946.0, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 3.7238805970149254, | |
| "grad_norm": 0.2781242088040202, | |
| "learning_rate": 1.8844716847616052e-07, | |
| "loss": 0.5808273553848267, | |
| "num_tokens": 226600195.0, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 3.7313432835820897, | |
| "grad_norm": 0.2954115885416602, | |
| "learning_rate": 1.8705163814315228e-07, | |
| "loss": 0.5603234767913818, | |
| "num_tokens": 227420424.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 3.7388059701492535, | |
| "grad_norm": 0.26394812594414746, | |
| "learning_rate": 1.856660251375957e-07, | |
| "loss": 0.5475826263427734, | |
| "num_tokens": 228393641.0, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 3.746268656716418, | |
| "grad_norm": 0.25638710494334194, | |
| "learning_rate": 1.8429036732046327e-07, | |
| "loss": 0.5315807461738586, | |
| "num_tokens": 229383627.0, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 3.753731343283582, | |
| "grad_norm": 0.2820486917822845, | |
| "learning_rate": 1.8292470228070805e-07, | |
| "loss": 0.555698037147522, | |
| "num_tokens": 230312293.0, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 3.7611940298507465, | |
| "grad_norm": 0.26963402638209555, | |
| "learning_rate": 1.8156906733423738e-07, | |
| "loss": 0.5559597611427307, | |
| "num_tokens": 231227207.0, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 3.7686567164179103, | |
| "grad_norm": 0.2635960871590189, | |
| "learning_rate": 1.8022349952289273e-07, | |
| "loss": 0.5315006971359253, | |
| "num_tokens": 232129690.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 3.7761194029850746, | |
| "grad_norm": 0.28835042727096805, | |
| "learning_rate": 1.7888803561343751e-07, | |
| "loss": 0.5724339485168457, | |
| "num_tokens": 232988180.0, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 3.783582089552239, | |
| "grad_norm": 0.2880295787825169, | |
| "learning_rate": 1.7756271209655294e-07, | |
| "loss": 0.6089663505554199, | |
| "num_tokens": 233887459.0, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 3.791044776119403, | |
| "grad_norm": 0.2923917838459817, | |
| "learning_rate": 1.7624756518584013e-07, | |
| "loss": 0.5508089065551758, | |
| "num_tokens": 234724288.0, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 3.798507462686567, | |
| "grad_norm": 0.2663262112656659, | |
| "learning_rate": 1.7494263081683131e-07, | |
| "loss": 0.5383226871490479, | |
| "num_tokens": 235591261.0, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 3.8059701492537314, | |
| "grad_norm": 0.2810986277113732, | |
| "learning_rate": 1.7364794464600808e-07, | |
| "loss": 0.5323266983032227, | |
| "num_tokens": 236513360.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 3.8134328358208958, | |
| "grad_norm": 0.3206761651223088, | |
| "learning_rate": 1.7236354204982587e-07, | |
| "loss": 0.5368841290473938, | |
| "num_tokens": 237427821.0, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 3.8208955223880596, | |
| "grad_norm": 0.2831284138508691, | |
| "learning_rate": 1.7108945812374873e-07, | |
| "loss": 0.5697877407073975, | |
| "num_tokens": 238361524.0, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 3.828358208955224, | |
| "grad_norm": 0.4678969236120579, | |
| "learning_rate": 1.698257276812896e-07, | |
| "loss": 0.567964494228363, | |
| "num_tokens": 239295734.0, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 3.835820895522388, | |
| "grad_norm": 0.3454163016526369, | |
| "learning_rate": 1.6857238525305922e-07, | |
| "loss": 0.5614358186721802, | |
| "num_tokens": 240192414.0, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 3.843283582089552, | |
| "grad_norm": 0.26229554588086373, | |
| "learning_rate": 1.6732946508582286e-07, | |
| "loss": 0.5396016836166382, | |
| "num_tokens": 241149058.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 3.8507462686567164, | |
| "grad_norm": 0.25830144071410005, | |
| "learning_rate": 1.6609700114156368e-07, | |
| "loss": 0.548250675201416, | |
| "num_tokens": 242110168.0, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 3.8582089552238807, | |
| "grad_norm": 0.2577863607708949, | |
| "learning_rate": 1.648750270965559e-07, | |
| "loss": 0.5675839185714722, | |
| "num_tokens": 243142913.0, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 3.8656716417910446, | |
| "grad_norm": 1.2576352530776749, | |
| "learning_rate": 1.6366357634044403e-07, | |
| "loss": 0.5479030609130859, | |
| "num_tokens": 244026260.0, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 3.873134328358209, | |
| "grad_norm": 0.43195789720816985, | |
| "learning_rate": 1.6246268197533046e-07, | |
| "loss": 0.5657459497451782, | |
| "num_tokens": 244835255.0, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 3.8805970149253732, | |
| "grad_norm": 0.2850328915996735, | |
| "learning_rate": 1.6127237681487092e-07, | |
| "loss": 0.5788131952285767, | |
| "num_tokens": 245744839.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 3.888059701492537, | |
| "grad_norm": 0.2734777728630755, | |
| "learning_rate": 1.600926933833783e-07, | |
| "loss": 0.5809911489486694, | |
| "num_tokens": 246688392.0, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 3.8955223880597014, | |
| "grad_norm": 0.2933774394372255, | |
| "learning_rate": 1.5892366391493362e-07, | |
| "loss": 0.5803858637809753, | |
| "num_tokens": 247632902.0, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 3.9029850746268657, | |
| "grad_norm": 0.2903603425087314, | |
| "learning_rate": 1.5776532035250513e-07, | |
| "loss": 0.5569208860397339, | |
| "num_tokens": 248582604.0, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 3.91044776119403, | |
| "grad_norm": 0.2633779070798848, | |
| "learning_rate": 1.5661769434707583e-07, | |
| "loss": 0.5375438928604126, | |
| "num_tokens": 249449227.0, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 3.917910447761194, | |
| "grad_norm": 0.31705662168623416, | |
| "learning_rate": 1.5548081725677842e-07, | |
| "loss": 0.5713478326797485, | |
| "num_tokens": 250309108.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 3.925373134328358, | |
| "grad_norm": 0.2743174912786303, | |
| "learning_rate": 1.5435472014603838e-07, | |
| "loss": 0.5781571865081787, | |
| "num_tokens": 251284224.0, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 3.9328358208955225, | |
| "grad_norm": 0.2781053851258556, | |
| "learning_rate": 1.5323943378472546e-07, | |
| "loss": 0.5639330148696899, | |
| "num_tokens": 252205748.0, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 3.9402985074626864, | |
| "grad_norm": 0.27546182816312653, | |
| "learning_rate": 1.5213498864731265e-07, | |
| "loss": 0.5076487064361572, | |
| "num_tokens": 253011360.0, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 3.9477611940298507, | |
| "grad_norm": 0.2724349176732762, | |
| "learning_rate": 1.5104141491204357e-07, | |
| "loss": 0.5303751230239868, | |
| "num_tokens": 253876442.0, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 3.955223880597015, | |
| "grad_norm": 0.29045934843508436, | |
| "learning_rate": 1.4995874246010776e-07, | |
| "loss": 0.5791366100311279, | |
| "num_tokens": 254702285.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 3.9626865671641793, | |
| "grad_norm": 0.25781806828726467, | |
| "learning_rate": 1.4888700087482444e-07, | |
| "loss": 0.5378929376602173, | |
| "num_tokens": 255641666.0, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 3.970149253731343, | |
| "grad_norm": 0.2657141262373794, | |
| "learning_rate": 1.4782621944083392e-07, | |
| "loss": 0.5480854511260986, | |
| "num_tokens": 256570444.0, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 3.9776119402985075, | |
| "grad_norm": 0.27332662946272, | |
| "learning_rate": 1.467764271432977e-07, | |
| "loss": 0.5349365472793579, | |
| "num_tokens": 257440148.0, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 3.9850746268656714, | |
| "grad_norm": 0.27927002916412313, | |
| "learning_rate": 1.4573765266710598e-07, | |
| "loss": 0.5557724237442017, | |
| "num_tokens": 258286072.0, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 3.9925373134328357, | |
| "grad_norm": 0.30540619594506574, | |
| "learning_rate": 1.4470992439609444e-07, | |
| "loss": 0.5325461626052856, | |
| "num_tokens": 259027900.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.2554852841852584, | |
| "learning_rate": 1.4369327041226831e-07, | |
| "loss": 0.5564035177230835, | |
| "num_tokens": 260054853.0, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 4.007462686567164, | |
| "grad_norm": 0.3774222258568171, | |
| "learning_rate": 1.4268771849503506e-07, | |
| "loss": 0.5198606252670288, | |
| "num_tokens": 260848096.0, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 4.014925373134329, | |
| "grad_norm": 0.42926748854255425, | |
| "learning_rate": 1.4169329612044566e-07, | |
| "loss": 0.5263375043869019, | |
| "num_tokens": 261795072.0, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 4.022388059701493, | |
| "grad_norm": 0.28442769544889995, | |
| "learning_rate": 1.4071003046044322e-07, | |
| "loss": 0.5481403470039368, | |
| "num_tokens": 262649190.0, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 4.029850746268656, | |
| "grad_norm": 0.33957785588251854, | |
| "learning_rate": 1.397379483821212e-07, | |
| "loss": 0.5446444749832153, | |
| "num_tokens": 263639124.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.037313432835821, | |
| "grad_norm": 0.31285290824877576, | |
| "learning_rate": 1.3877707644698893e-07, | |
| "loss": 0.5856173038482666, | |
| "num_tokens": 264480175.0, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 4.044776119402985, | |
| "grad_norm": 0.3146654769143484, | |
| "learning_rate": 1.3782744091024584e-07, | |
| "loss": 0.5640919208526611, | |
| "num_tokens": 265217661.0, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 4.052238805970149, | |
| "grad_norm": 0.2543877235448555, | |
| "learning_rate": 1.3688906772006393e-07, | |
| "loss": 0.545689582824707, | |
| "num_tokens": 266155540.0, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 4.059701492537314, | |
| "grad_norm": 0.2831883473552513, | |
| "learning_rate": 1.3596198251687917e-07, | |
| "loss": 0.5562140941619873, | |
| "num_tokens": 267051346.0, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 4.067164179104478, | |
| "grad_norm": 0.2634821254048522, | |
| "learning_rate": 1.3504621063269057e-07, | |
| "loss": 0.5558310747146606, | |
| "num_tokens": 268001048.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 4.074626865671641, | |
| "grad_norm": 0.2740473313761773, | |
| "learning_rate": 1.34141777090368e-07, | |
| "loss": 0.5498157739639282, | |
| "num_tokens": 268948251.0, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 4.082089552238806, | |
| "grad_norm": 0.3034010911009059, | |
| "learning_rate": 1.3324870660296866e-07, | |
| "loss": 0.5079299211502075, | |
| "num_tokens": 269891870.0, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 4.08955223880597, | |
| "grad_norm": 0.25296398139696874, | |
| "learning_rate": 1.3236702357306156e-07, | |
| "loss": 0.557180643081665, | |
| "num_tokens": 270893706.0, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 4.097014925373134, | |
| "grad_norm": 0.5617876126540791, | |
| "learning_rate": 1.3149675209206084e-07, | |
| "loss": 0.5518041253089905, | |
| "num_tokens": 271655159.0, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 4.104477611940299, | |
| "grad_norm": 0.26496316816661675, | |
| "learning_rate": 1.3063791593956756e-07, | |
| "loss": 0.5603747367858887, | |
| "num_tokens": 272587675.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.111940298507463, | |
| "grad_norm": 0.2601339382276699, | |
| "learning_rate": 1.2979053858271993e-07, | |
| "loss": 0.5405164957046509, | |
| "num_tokens": 273463891.0, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 4.119402985074627, | |
| "grad_norm": 0.2795698909044753, | |
| "learning_rate": 1.2895464317555206e-07, | |
| "loss": 0.5839468240737915, | |
| "num_tokens": 274283621.0, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 4.126865671641791, | |
| "grad_norm": 0.24753013274997315, | |
| "learning_rate": 1.28130252558361e-07, | |
| "loss": 0.5279031991958618, | |
| "num_tokens": 275221184.0, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 4.134328358208955, | |
| "grad_norm": 0.28266540188156414, | |
| "learning_rate": 1.2731738925708327e-07, | |
| "loss": 0.5553559064865112, | |
| "num_tokens": 276094732.0, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 4.141791044776119, | |
| "grad_norm": 0.2690824174144065, | |
| "learning_rate": 1.265160754826787e-07, | |
| "loss": 0.572119951248169, | |
| "num_tokens": 277122845.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 4.149253731343284, | |
| "grad_norm": 0.2472737773051283, | |
| "learning_rate": 1.2572633313052409e-07, | |
| "loss": 0.569811999797821, | |
| "num_tokens": 278203814.0, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 4.156716417910448, | |
| "grad_norm": 0.2742488775651172, | |
| "learning_rate": 1.249481837798144e-07, | |
| "loss": 0.5402873754501343, | |
| "num_tokens": 279084275.0, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 4.164179104477612, | |
| "grad_norm": 0.45576858034969514, | |
| "learning_rate": 1.2418164869297352e-07, | |
| "loss": 0.5487810373306274, | |
| "num_tokens": 279995589.0, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 4.1716417910447765, | |
| "grad_norm": 0.33886009051736005, | |
| "learning_rate": 1.2342674881507325e-07, | |
| "loss": 0.5493899583816528, | |
| "num_tokens": 280947164.0, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 4.17910447761194, | |
| "grad_norm": 0.2672882959311712, | |
| "learning_rate": 1.226835047732607e-07, | |
| "loss": 0.5846470594406128, | |
| "num_tokens": 281865957.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 4.186567164179104, | |
| "grad_norm": 0.26682211241453646, | |
| "learning_rate": 1.2195193687619503e-07, | |
| "loss": 0.5684331655502319, | |
| "num_tokens": 282822635.0, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 4.1940298507462686, | |
| "grad_norm": 0.5844175934203363, | |
| "learning_rate": 1.212320651134921e-07, | |
| "loss": 0.5448155403137207, | |
| "num_tokens": 283735562.0, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 4.201492537313433, | |
| "grad_norm": 0.28667238801807454, | |
| "learning_rate": 1.2052390915517878e-07, | |
| "loss": 0.552519679069519, | |
| "num_tokens": 284514293.0, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 4.208955223880597, | |
| "grad_norm": 0.27209099234946366, | |
| "learning_rate": 1.198274883511551e-07, | |
| "loss": 0.5868443250656128, | |
| "num_tokens": 285496842.0, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 4.2164179104477615, | |
| "grad_norm": 0.282926588733111, | |
| "learning_rate": 1.1914282173066572e-07, | |
| "loss": 0.5723504424095154, | |
| "num_tokens": 286397891.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 4.223880597014926, | |
| "grad_norm": 0.2770197201382834, | |
| "learning_rate": 1.1846992800177977e-07, | |
| "loss": 0.5528011918067932, | |
| "num_tokens": 287291637.0, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 4.231343283582089, | |
| "grad_norm": 0.28361549703489075, | |
| "learning_rate": 1.1780882555087987e-07, | |
| "loss": 0.5853151082992554, | |
| "num_tokens": 288241806.0, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 4.2388059701492535, | |
| "grad_norm": 0.2939822179720846, | |
| "learning_rate": 1.1715953244215962e-07, | |
| "loss": 0.5159034729003906, | |
| "num_tokens": 289026082.0, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 4.246268656716418, | |
| "grad_norm": 0.27406466943349356, | |
| "learning_rate": 1.1652206641713017e-07, | |
| "loss": 0.5613383054733276, | |
| "num_tokens": 289932433.0, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 4.253731343283582, | |
| "grad_norm": 0.26361825748402795, | |
| "learning_rate": 1.1589644489413516e-07, | |
| "loss": 0.5283357501029968, | |
| "num_tokens": 290832565.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 4.2611940298507465, | |
| "grad_norm": 0.2760993964502923, | |
| "learning_rate": 1.1528268496787496e-07, | |
| "loss": 0.5750157833099365, | |
| "num_tokens": 291807812.0, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 4.268656716417911, | |
| "grad_norm": 0.28258368565429337, | |
| "learning_rate": 1.1468080340893957e-07, | |
| "loss": 0.5445358157157898, | |
| "num_tokens": 292611203.0, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 4.276119402985074, | |
| "grad_norm": 0.2730636420985558, | |
| "learning_rate": 1.1409081666335033e-07, | |
| "loss": 0.6081241369247437, | |
| "num_tokens": 293563137.0, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 4.2835820895522385, | |
| "grad_norm": 0.2887060017580731, | |
| "learning_rate": 1.1351274085211066e-07, | |
| "loss": 0.5485525131225586, | |
| "num_tokens": 294390720.0, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 4.291044776119403, | |
| "grad_norm": 0.3012747873135312, | |
| "learning_rate": 1.1294659177076522e-07, | |
| "loss": 0.5155702829360962, | |
| "num_tokens": 295331187.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 4.298507462686567, | |
| "grad_norm": 0.2920426716987418, | |
| "learning_rate": 1.1239238488896874e-07, | |
| "loss": 0.5878146886825562, | |
| "num_tokens": 296174930.0, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 4.3059701492537314, | |
| "grad_norm": 0.2717219604372653, | |
| "learning_rate": 1.118501353500631e-07, | |
| "loss": 0.5488337278366089, | |
| "num_tokens": 296984260.0, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 4.313432835820896, | |
| "grad_norm": 0.2881268506141246, | |
| "learning_rate": 1.1131985797066362e-07, | |
| "loss": 0.5962164402008057, | |
| "num_tokens": 297814492.0, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 4.32089552238806, | |
| "grad_norm": 0.3429404437909788, | |
| "learning_rate": 1.1080156724025409e-07, | |
| "loss": 0.5432817935943604, | |
| "num_tokens": 298682103.0, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 4.3283582089552235, | |
| "grad_norm": 0.26831503316886746, | |
| "learning_rate": 1.1029527732079083e-07, | |
| "loss": 0.5613952875137329, | |
| "num_tokens": 299706050.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 4.335820895522388, | |
| "grad_norm": 0.30116453227315987, | |
| "learning_rate": 1.0980100204631603e-07, | |
| "loss": 0.5974493026733398, | |
| "num_tokens": 300500751.0, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 4.343283582089552, | |
| "grad_norm": 0.2574290282127498, | |
| "learning_rate": 1.0931875492257944e-07, | |
| "loss": 0.5080505609512329, | |
| "num_tokens": 301436049.0, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 4.350746268656716, | |
| "grad_norm": 0.2768297952700454, | |
| "learning_rate": 1.088485491266694e-07, | |
| "loss": 0.5769013166427612, | |
| "num_tokens": 302245987.0, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 4.358208955223881, | |
| "grad_norm": 0.2614099752067767, | |
| "learning_rate": 1.0839039750665291e-07, | |
| "loss": 0.5327722430229187, | |
| "num_tokens": 303180329.0, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 4.365671641791045, | |
| "grad_norm": 0.2545576461755387, | |
| "learning_rate": 1.0794431258122429e-07, | |
| "loss": 0.5465179085731506, | |
| "num_tokens": 304106987.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 4.373134328358209, | |
| "grad_norm": 0.3028807540174107, | |
| "learning_rate": 1.0751030653936354e-07, | |
| "loss": 0.5673878192901611, | |
| "num_tokens": 304931031.0, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 4.380597014925373, | |
| "grad_norm": 0.2623967772914622, | |
| "learning_rate": 1.0708839124000287e-07, | |
| "loss": 0.5716835260391235, | |
| "num_tokens": 305846255.0, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 4.388059701492537, | |
| "grad_norm": 0.2805523792578899, | |
| "learning_rate": 1.066785782117028e-07, | |
| "loss": 0.5245805978775024, | |
| "num_tokens": 306627892.0, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 4.395522388059701, | |
| "grad_norm": 0.3121504154269003, | |
| "learning_rate": 1.0628087865233737e-07, | |
| "loss": 0.5411394238471985, | |
| "num_tokens": 307519113.0, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 4.402985074626866, | |
| "grad_norm": 0.2913376914541736, | |
| "learning_rate": 1.0589530342878769e-07, | |
| "loss": 0.5592665672302246, | |
| "num_tokens": 308359627.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 4.41044776119403, | |
| "grad_norm": 0.26729488095826903, | |
| "learning_rate": 1.0552186307664565e-07, | |
| "loss": 0.5448157787322998, | |
| "num_tokens": 309250463.0, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 4.417910447761194, | |
| "grad_norm": 0.30314603500542503, | |
| "learning_rate": 1.0516056779992541e-07, | |
| "loss": 0.5698049664497375, | |
| "num_tokens": 310094707.0, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 4.425373134328359, | |
| "grad_norm": 0.2786233963992514, | |
| "learning_rate": 1.0481142747078492e-07, | |
| "loss": 0.5542622804641724, | |
| "num_tokens": 310932669.0, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 4.432835820895522, | |
| "grad_norm": 0.28934032270495663, | |
| "learning_rate": 1.0447445162925613e-07, | |
| "loss": 0.5697283744812012, | |
| "num_tokens": 311864048.0, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 4.440298507462686, | |
| "grad_norm": 0.2500391659266081, | |
| "learning_rate": 1.0414964948298435e-07, | |
| "loss": 0.5528576374053955, | |
| "num_tokens": 312840365.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 4.447761194029851, | |
| "grad_norm": 0.2916622377572014, | |
| "learning_rate": 1.0383702990697656e-07, | |
| "loss": 0.5366314649581909, | |
| "num_tokens": 313795804.0, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 4.455223880597015, | |
| "grad_norm": 0.271646171110688, | |
| "learning_rate": 1.035366014433589e-07, | |
| "loss": 0.5479257106781006, | |
| "num_tokens": 314648164.0, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 4.462686567164179, | |
| "grad_norm": 0.28220408400221697, | |
| "learning_rate": 1.032483723011433e-07, | |
| "loss": 0.5544242858886719, | |
| "num_tokens": 315521665.0, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 4.470149253731344, | |
| "grad_norm": 0.32415992407130506, | |
| "learning_rate": 1.0297235035600334e-07, | |
| "loss": 0.5346230268478394, | |
| "num_tokens": 316460972.0, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 4.477611940298507, | |
| "grad_norm": 0.29902595518304415, | |
| "learning_rate": 1.0270854315005874e-07, | |
| "loss": 0.5251238346099854, | |
| "num_tokens": 317398198.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 4.485074626865671, | |
| "grad_norm": 0.27529822550906163, | |
| "learning_rate": 1.0245695789166948e-07, | |
| "loss": 0.550391674041748, | |
| "num_tokens": 318368138.0, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 4.492537313432836, | |
| "grad_norm": 0.25678842717501, | |
| "learning_rate": 1.0221760145523875e-07, | |
| "loss": 0.5486523509025574, | |
| "num_tokens": 319254359.0, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 0.30422330388419494, | |
| "learning_rate": 1.0199048038102526e-07, | |
| "loss": 0.5667173266410828, | |
| "num_tokens": 320163581.0, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 4.507462686567164, | |
| "grad_norm": 0.2590403748368139, | |
| "learning_rate": 1.0177560087496423e-07, | |
| "loss": 0.5528400540351868, | |
| "num_tokens": 321099182.0, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 4.514925373134329, | |
| "grad_norm": 0.2779131559748047, | |
| "learning_rate": 1.0157296880849824e-07, | |
| "loss": 0.5901874303817749, | |
| "num_tokens": 321960509.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 4.522388059701493, | |
| "grad_norm": 0.29485819884103376, | |
| "learning_rate": 1.0138258971841641e-07, | |
| "loss": 0.5388875007629395, | |
| "num_tokens": 322828287.0, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 4.529850746268656, | |
| "grad_norm": 0.2672600011322682, | |
| "learning_rate": 1.0120446880670325e-07, | |
| "loss": 0.5676090121269226, | |
| "num_tokens": 323747430.0, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 4.537313432835821, | |
| "grad_norm": 0.2612827675126176, | |
| "learning_rate": 1.0103861094039667e-07, | |
| "loss": 0.5471125245094299, | |
| "num_tokens": 324666159.0, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 4.544776119402985, | |
| "grad_norm": 0.27778633049164236, | |
| "learning_rate": 1.008850206514547e-07, | |
| "loss": 0.5418146848678589, | |
| "num_tokens": 325560938.0, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 4.552238805970149, | |
| "grad_norm": 0.2969440332344535, | |
| "learning_rate": 1.0074370213663201e-07, | |
| "loss": 0.5470881462097168, | |
| "num_tokens": 326330466.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 4.559701492537314, | |
| "grad_norm": 0.31944728844902104, | |
| "learning_rate": 1.0061465925736478e-07, | |
| "loss": 0.5502467155456543, | |
| "num_tokens": 327193522.0, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 4.567164179104478, | |
| "grad_norm": 0.2519446230589244, | |
| "learning_rate": 1.0049789553966569e-07, | |
| "loss": 0.5561034679412842, | |
| "num_tokens": 328181555.0, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 4.574626865671641, | |
| "grad_norm": 0.36293634063352515, | |
| "learning_rate": 1.0039341417402715e-07, | |
| "loss": 0.5579421520233154, | |
| "num_tokens": 329048630.0, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 4.582089552238806, | |
| "grad_norm": 0.26255387893449383, | |
| "learning_rate": 1.0030121801533441e-07, | |
| "loss": 0.5714669823646545, | |
| "num_tokens": 329968258.0, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 4.58955223880597, | |
| "grad_norm": 0.28393932823044754, | |
| "learning_rate": 1.002213095827875e-07, | |
| "loss": 0.542944610118866, | |
| "num_tokens": 330754847.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 4.597014925373134, | |
| "grad_norm": 0.25741865731733854, | |
| "learning_rate": 1.0015369105983216e-07, | |
| "loss": 0.5193674564361572, | |
| "num_tokens": 331683385.0, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 4.604477611940299, | |
| "grad_norm": 0.28009977442278555, | |
| "learning_rate": 1.0009836429410053e-07, | |
| "loss": 0.5400401949882507, | |
| "num_tokens": 332655820.0, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 4.611940298507463, | |
| "grad_norm": 0.2840147519658574, | |
| "learning_rate": 1.0005533079736037e-07, | |
| "loss": 0.5117232203483582, | |
| "num_tokens": 333524783.0, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 4.619402985074627, | |
| "grad_norm": 0.25572802620429147, | |
| "learning_rate": 1.0002459174547398e-07, | |
| "loss": 0.5419676303863525, | |
| "num_tokens": 334451276.0, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 4.6268656716417915, | |
| "grad_norm": 0.2837927983840193, | |
| "learning_rate": 1.0000614797836585e-07, | |
| "loss": 0.5926541090011597, | |
| "num_tokens": 335471551.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 4.6268656716417915, | |
| "step": 620, | |
| "total_flos": 829202911068160.0, | |
| "train_loss": 0.33651591361530364, | |
| "train_runtime": 10914.7384, | |
| "train_samples_per_second": 1.818, | |
| "train_steps_per_second": 0.057 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 620, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 62, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 829202911068160.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |