Text Generation
Transformers
Safetensors
qwen2
llama-factory
full
Generated from Trainer
conversational
text-generation-inference
Instructions to use lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml")
model = AutoModelForCausalLM.from_pretrained("lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml
- SGLang
How to use lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml with Docker Model Runner:
docker model run hf.co/lemonhat/Qwen2.5-Coder-7B-Instruct-swe-xml
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.3110419906687403, | |
| "eval_steps": 100, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00015552099533437013, | |
| "grad_norm": 4.662841910859059, | |
| "learning_rate": 9.999999403215137e-06, | |
| "loss": 0.5705, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00031104199066874026, | |
| "grad_norm": 3.2101601568771754, | |
| "learning_rate": 9.999997612860688e-06, | |
| "loss": 0.5483, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.00046656298600311044, | |
| "grad_norm": 4.156927485367742, | |
| "learning_rate": 9.999994628937082e-06, | |
| "loss": 0.5274, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0006220839813374805, | |
| "grad_norm": 3.3680995992825533, | |
| "learning_rate": 9.99999045144503e-06, | |
| "loss": 0.5289, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0007776049766718507, | |
| "grad_norm": 3.3625564866317608, | |
| "learning_rate": 9.99998508038553e-06, | |
| "loss": 0.379, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0009331259720062209, | |
| "grad_norm": 2.7876669390361055, | |
| "learning_rate": 9.999978515759865e-06, | |
| "loss": 0.4019, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.001088646967340591, | |
| "grad_norm": 2.8774460632164955, | |
| "learning_rate": 9.999970757569602e-06, | |
| "loss": 0.4947, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.001244167962674961, | |
| "grad_norm": 2.126555417939935, | |
| "learning_rate": 9.999961805816589e-06, | |
| "loss": 0.3555, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0013996889580093312, | |
| "grad_norm": 1.9764001175562893, | |
| "learning_rate": 9.999951660502969e-06, | |
| "loss": 0.3102, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0015552099533437014, | |
| "grad_norm": 1.122915497849003, | |
| "learning_rate": 9.999940321631158e-06, | |
| "loss": 0.2802, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0017107309486780716, | |
| "grad_norm": 2.6849893357934436, | |
| "learning_rate": 9.99992778920387e-06, | |
| "loss": 0.3883, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.0018662519440124418, | |
| "grad_norm": 1.5146624591903277, | |
| "learning_rate": 9.999914063224088e-06, | |
| "loss": 0.2749, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.002021772939346812, | |
| "grad_norm": 1.4042567809467454, | |
| "learning_rate": 9.999899143695095e-06, | |
| "loss": 0.296, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.002177293934681182, | |
| "grad_norm": 1.817010561320059, | |
| "learning_rate": 9.99988303062045e-06, | |
| "loss": 0.3278, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.0023328149300155523, | |
| "grad_norm": 1.795433870197738, | |
| "learning_rate": 9.999865724003998e-06, | |
| "loss": 0.3146, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.002488335925349922, | |
| "grad_norm": 1.7826807436565577, | |
| "learning_rate": 9.999847223849875e-06, | |
| "loss": 0.3233, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0026438569206842922, | |
| "grad_norm": 1.7206284351475924, | |
| "learning_rate": 9.999827530162493e-06, | |
| "loss": 0.3246, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.0027993779160186624, | |
| "grad_norm": 3.8760819362380867, | |
| "learning_rate": 9.999806642946554e-06, | |
| "loss": 0.2648, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0029548989113530326, | |
| "grad_norm": 1.5644293516985985, | |
| "learning_rate": 9.999784562207046e-06, | |
| "loss": 0.3096, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.003110419906687403, | |
| "grad_norm": 2.0190853150190877, | |
| "learning_rate": 9.999761287949237e-06, | |
| "loss": 0.307, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.003265940902021773, | |
| "grad_norm": 2.763319388592032, | |
| "learning_rate": 9.999736820178686e-06, | |
| "loss": 0.4327, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.003421461897356143, | |
| "grad_norm": 1.6605096033172442, | |
| "learning_rate": 9.999711158901231e-06, | |
| "loss": 0.3918, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.0035769828926905133, | |
| "grad_norm": 1.7508401571856476, | |
| "learning_rate": 9.999684304123e-06, | |
| "loss": 0.3852, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.0037325038880248835, | |
| "grad_norm": 2.0163360210179335, | |
| "learning_rate": 9.999656255850401e-06, | |
| "loss": 0.3567, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.0038880248833592537, | |
| "grad_norm": 1.5224484473221345, | |
| "learning_rate": 9.999627014090133e-06, | |
| "loss": 0.3185, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.004043545878693624, | |
| "grad_norm": 1.5651644136387708, | |
| "learning_rate": 9.999596578849173e-06, | |
| "loss": 0.2548, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.004199066874027994, | |
| "grad_norm": 1.506984699015577, | |
| "learning_rate": 9.999564950134788e-06, | |
| "loss": 0.2719, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.004354587869362364, | |
| "grad_norm": 1.5707101400798584, | |
| "learning_rate": 9.99953212795453e-06, | |
| "loss": 0.2585, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.004510108864696734, | |
| "grad_norm": 1.6678601949561362, | |
| "learning_rate": 9.999498112316231e-06, | |
| "loss": 0.2642, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.004665629860031105, | |
| "grad_norm": 1.1937228959267376, | |
| "learning_rate": 9.99946290322801e-06, | |
| "loss": 0.3348, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.004821150855365474, | |
| "grad_norm": 1.474398491556367, | |
| "learning_rate": 9.999426500698277e-06, | |
| "loss": 0.2936, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.004976671850699844, | |
| "grad_norm": 1.4230321858584387, | |
| "learning_rate": 9.999388904735718e-06, | |
| "loss": 0.316, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.005132192846034215, | |
| "grad_norm": 2.5118600752998645, | |
| "learning_rate": 9.999350115349309e-06, | |
| "loss": 0.3135, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.0052877138413685845, | |
| "grad_norm": 1.7910755988881728, | |
| "learning_rate": 9.999310132548308e-06, | |
| "loss": 0.249, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.005443234836702955, | |
| "grad_norm": 1.4981333944055653, | |
| "learning_rate": 9.999268956342261e-06, | |
| "loss": 0.2594, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.005598755832037325, | |
| "grad_norm": 0.9261919071743852, | |
| "learning_rate": 9.999226586740995e-06, | |
| "loss": 0.2333, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.0057542768273716955, | |
| "grad_norm": 1.26246346078558, | |
| "learning_rate": 9.999183023754628e-06, | |
| "loss": 0.1787, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.005909797822706065, | |
| "grad_norm": 1.9545697787374448, | |
| "learning_rate": 9.999138267393557e-06, | |
| "loss": 0.3246, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.006065318818040436, | |
| "grad_norm": 1.4285410822616305, | |
| "learning_rate": 9.999092317668467e-06, | |
| "loss": 0.223, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.006220839813374806, | |
| "grad_norm": 1.4526856529113084, | |
| "learning_rate": 9.999045174590324e-06, | |
| "loss": 0.182, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.006376360808709175, | |
| "grad_norm": 2.4846217662340995, | |
| "learning_rate": 9.998996838170387e-06, | |
| "loss": 0.36, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.006531881804043546, | |
| "grad_norm": 1.2772759621800358, | |
| "learning_rate": 9.998947308420189e-06, | |
| "loss": 0.241, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.006687402799377916, | |
| "grad_norm": 2.7720889102611945, | |
| "learning_rate": 9.998896585351557e-06, | |
| "loss": 0.3213, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.006842923794712286, | |
| "grad_norm": 1.7490095603308047, | |
| "learning_rate": 9.998844668976595e-06, | |
| "loss": 0.3155, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.006998444790046656, | |
| "grad_norm": 1.3823301922226903, | |
| "learning_rate": 9.998791559307702e-06, | |
| "loss": 0.2149, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.007153965785381027, | |
| "grad_norm": 1.288871141891326, | |
| "learning_rate": 9.998737256357551e-06, | |
| "loss": 0.2887, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.007309486780715396, | |
| "grad_norm": 3.483009451782568, | |
| "learning_rate": 9.99868176013911e-06, | |
| "loss": 0.263, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.007465007776049767, | |
| "grad_norm": 1.652490483156804, | |
| "learning_rate": 9.998625070665622e-06, | |
| "loss": 0.2664, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.007620528771384137, | |
| "grad_norm": 1.8206039592741312, | |
| "learning_rate": 9.99856718795062e-06, | |
| "loss": 0.224, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.007776049766718507, | |
| "grad_norm": 3.2471818448644743, | |
| "learning_rate": 9.998508112007925e-06, | |
| "loss": 0.293, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.007931570762052876, | |
| "grad_norm": 2.4630640416023, | |
| "learning_rate": 9.998447842851638e-06, | |
| "loss": 0.2958, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.008087091757387248, | |
| "grad_norm": 2.1952255314920817, | |
| "learning_rate": 9.998386380496144e-06, | |
| "loss": 0.2841, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.008242612752721618, | |
| "grad_norm": 1.7440998263562653, | |
| "learning_rate": 9.998323724956114e-06, | |
| "loss": 0.2392, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.008398133748055987, | |
| "grad_norm": 1.7713538170023606, | |
| "learning_rate": 9.998259876246509e-06, | |
| "loss": 0.2148, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.008553654743390357, | |
| "grad_norm": 2.196248357816803, | |
| "learning_rate": 9.998194834382567e-06, | |
| "loss": 0.2314, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.008709175738724729, | |
| "grad_norm": 1.5241920091736059, | |
| "learning_rate": 9.998128599379817e-06, | |
| "loss": 0.3538, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.008864696734059098, | |
| "grad_norm": 1.084932443566165, | |
| "learning_rate": 9.998061171254068e-06, | |
| "loss": 0.2061, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.009020217729393468, | |
| "grad_norm": 1.7028355052947441, | |
| "learning_rate": 9.997992550021418e-06, | |
| "loss": 0.2286, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.009175738724727838, | |
| "grad_norm": 1.7850241306158636, | |
| "learning_rate": 9.997922735698247e-06, | |
| "loss": 0.1935, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.00933125972006221, | |
| "grad_norm": 2.7780720350287287, | |
| "learning_rate": 9.997851728301219e-06, | |
| "loss": 0.2658, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.009486780715396579, | |
| "grad_norm": 1.8811033125325856, | |
| "learning_rate": 9.997779527847287e-06, | |
| "loss": 0.1963, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.009642301710730949, | |
| "grad_norm": 1.3758579938738247, | |
| "learning_rate": 9.997706134353687e-06, | |
| "loss": 0.2529, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.009797822706065318, | |
| "grad_norm": 1.9634000227706385, | |
| "learning_rate": 9.997631547837934e-06, | |
| "loss": 0.2544, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.009953343701399688, | |
| "grad_norm": 1.594710372018227, | |
| "learning_rate": 9.997555768317838e-06, | |
| "loss": 0.3528, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.01010886469673406, | |
| "grad_norm": 1.8005547220704254, | |
| "learning_rate": 9.997478795811486e-06, | |
| "loss": 0.2165, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.01026438569206843, | |
| "grad_norm": 2.290269323202059, | |
| "learning_rate": 9.997400630337254e-06, | |
| "loss": 0.2786, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.0104199066874028, | |
| "grad_norm": 1.5486051696063095, | |
| "learning_rate": 9.997321271913801e-06, | |
| "loss": 0.2188, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.010575427682737169, | |
| "grad_norm": 0.9684733219759649, | |
| "learning_rate": 9.997240720560068e-06, | |
| "loss": 0.2043, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.01073094867807154, | |
| "grad_norm": 2.1081587478577437, | |
| "learning_rate": 9.997158976295288e-06, | |
| "loss": 0.2908, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.01088646967340591, | |
| "grad_norm": 3.6233628477076736, | |
| "learning_rate": 9.99707603913897e-06, | |
| "loss": 0.2579, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.01104199066874028, | |
| "grad_norm": 1.090209411261846, | |
| "learning_rate": 9.996991909110918e-06, | |
| "loss": 0.2864, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.01119751166407465, | |
| "grad_norm": 1.3430452010098815, | |
| "learning_rate": 9.99690658623121e-06, | |
| "loss": 0.2217, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.01135303265940902, | |
| "grad_norm": 2.3549515267664005, | |
| "learning_rate": 9.996820070520216e-06, | |
| "loss": 0.2822, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.011508553654743391, | |
| "grad_norm": 1.5602820881890913, | |
| "learning_rate": 9.996732361998588e-06, | |
| "loss": 0.2456, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.01166407465007776, | |
| "grad_norm": 1.5856862134183374, | |
| "learning_rate": 9.996643460687264e-06, | |
| "loss": 0.3056, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.01181959564541213, | |
| "grad_norm": 1.6134033436501471, | |
| "learning_rate": 9.996553366607464e-06, | |
| "loss": 0.2141, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.0119751166407465, | |
| "grad_norm": 1.3597955630988308, | |
| "learning_rate": 9.996462079780696e-06, | |
| "loss": 0.2295, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.012130637636080872, | |
| "grad_norm": 1.1374281802105086, | |
| "learning_rate": 9.996369600228753e-06, | |
| "loss": 0.2487, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.012286158631415241, | |
| "grad_norm": 1.4298077500438133, | |
| "learning_rate": 9.99627592797371e-06, | |
| "loss": 0.2446, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.012441679626749611, | |
| "grad_norm": 1.3975983522660094, | |
| "learning_rate": 9.996181063037924e-06, | |
| "loss": 0.2611, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.012597200622083981, | |
| "grad_norm": 1.5544782250742402, | |
| "learning_rate": 9.996085005444046e-06, | |
| "loss": 0.2311, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.01275272161741835, | |
| "grad_norm": 1.3603452791878323, | |
| "learning_rate": 9.995987755215006e-06, | |
| "loss": 0.2003, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.012908242612752722, | |
| "grad_norm": 1.3071118505273163, | |
| "learning_rate": 9.995889312374016e-06, | |
| "loss": 0.2338, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.013063763608087092, | |
| "grad_norm": 1.7380116089178919, | |
| "learning_rate": 9.995789676944576e-06, | |
| "loss": 0.2645, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.013219284603421462, | |
| "grad_norm": 1.263086313797395, | |
| "learning_rate": 9.995688848950473e-06, | |
| "loss": 0.2215, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.013374805598755831, | |
| "grad_norm": 1.5100086739523095, | |
| "learning_rate": 9.995586828415774e-06, | |
| "loss": 0.2444, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.013530326594090203, | |
| "grad_norm": 1.0847005275250092, | |
| "learning_rate": 9.995483615364833e-06, | |
| "loss": 0.2129, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.013685847589424573, | |
| "grad_norm": 1.3155329082198164, | |
| "learning_rate": 9.995379209822289e-06, | |
| "loss": 0.2788, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.013841368584758942, | |
| "grad_norm": 1.8214452427995387, | |
| "learning_rate": 9.995273611813065e-06, | |
| "loss": 0.3027, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.013996889580093312, | |
| "grad_norm": 0.8312908694387112, | |
| "learning_rate": 9.995166821362368e-06, | |
| "loss": 0.226, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.014152410575427682, | |
| "grad_norm": 1.6627232520442479, | |
| "learning_rate": 9.995058838495689e-06, | |
| "loss": 0.2742, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.014307931570762053, | |
| "grad_norm": 0.9378761990044046, | |
| "learning_rate": 9.994949663238809e-06, | |
| "loss": 0.267, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.014463452566096423, | |
| "grad_norm": 2.122534441012584, | |
| "learning_rate": 9.994839295617786e-06, | |
| "loss": 0.2438, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.014618973561430793, | |
| "grad_norm": 1.577456726662404, | |
| "learning_rate": 9.994727735658968e-06, | |
| "loss": 0.2659, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.014774494556765163, | |
| "grad_norm": 1.6054087269070507, | |
| "learning_rate": 9.994614983388986e-06, | |
| "loss": 0.2404, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.014930015552099534, | |
| "grad_norm": 1.5558443339273214, | |
| "learning_rate": 9.994501038834755e-06, | |
| "loss": 0.2703, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.015085536547433904, | |
| "grad_norm": 1.7101494645663162, | |
| "learning_rate": 9.994385902023474e-06, | |
| "loss": 0.2148, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.015241057542768274, | |
| "grad_norm": 1.5422168422798725, | |
| "learning_rate": 9.99426957298263e-06, | |
| "loss": 0.2045, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.015396578538102643, | |
| "grad_norm": 1.3177834617215995, | |
| "learning_rate": 9.994152051739991e-06, | |
| "loss": 0.2097, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.015552099533437015, | |
| "grad_norm": 1.7250496500116883, | |
| "learning_rate": 9.994033338323612e-06, | |
| "loss": 0.2309, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.015552099533437015, | |
| "eval_loss": 0.255514532327652, | |
| "eval_runtime": 9.4404, | |
| "eval_samples_per_second": 2.754, | |
| "eval_steps_per_second": 0.741, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.015707620528771383, | |
| "grad_norm": 1.2242797466034228, | |
| "learning_rate": 9.993913432761831e-06, | |
| "loss": 0.2309, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.015863141524105753, | |
| "grad_norm": 1.2091727169379591, | |
| "learning_rate": 9.993792335083272e-06, | |
| "loss": 0.215, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.016018662519440126, | |
| "grad_norm": 1.6991288183534923, | |
| "learning_rate": 9.99367004531684e-06, | |
| "loss": 0.2716, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.016174183514774496, | |
| "grad_norm": 1.8626540300463013, | |
| "learning_rate": 9.99354656349173e-06, | |
| "loss": 0.287, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.016329704510108865, | |
| "grad_norm": 1.2626220604624867, | |
| "learning_rate": 9.993421889637418e-06, | |
| "loss": 0.1737, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.016485225505443235, | |
| "grad_norm": 0.8807151838598477, | |
| "learning_rate": 9.993296023783664e-06, | |
| "loss": 0.227, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.016640746500777605, | |
| "grad_norm": 1.4662006360846318, | |
| "learning_rate": 9.993168965960515e-06, | |
| "loss": 0.2698, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.016796267496111975, | |
| "grad_norm": 2.5676508719496383, | |
| "learning_rate": 9.993040716198304e-06, | |
| "loss": 0.2231, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.016951788491446344, | |
| "grad_norm": 2.144008184181988, | |
| "learning_rate": 9.992911274527641e-06, | |
| "loss": 0.2729, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.017107309486780714, | |
| "grad_norm": 1.3871826576036752, | |
| "learning_rate": 9.99278064097943e-06, | |
| "loss": 0.2078, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.017262830482115084, | |
| "grad_norm": 1.9299054218636398, | |
| "learning_rate": 9.992648815584853e-06, | |
| "loss": 0.2543, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.017418351477449457, | |
| "grad_norm": 6.182669074382352, | |
| "learning_rate": 9.992515798375379e-06, | |
| "loss": 0.2442, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.017573872472783827, | |
| "grad_norm": 1.9218049477099652, | |
| "learning_rate": 9.992381589382761e-06, | |
| "loss": 0.2909, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.017729393468118197, | |
| "grad_norm": 1.7558505152868706, | |
| "learning_rate": 9.992246188639035e-06, | |
| "loss": 0.2182, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.017884914463452566, | |
| "grad_norm": 1.3145893008937046, | |
| "learning_rate": 9.992109596176525e-06, | |
| "loss": 0.2445, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.018040435458786936, | |
| "grad_norm": 2.3756692802265094, | |
| "learning_rate": 9.991971812027836e-06, | |
| "loss": 0.2961, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.018195956454121306, | |
| "grad_norm": 2.027933705938777, | |
| "learning_rate": 9.991832836225863e-06, | |
| "loss": 0.2459, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.018351477449455676, | |
| "grad_norm": 1.9997556478308784, | |
| "learning_rate": 9.991692668803775e-06, | |
| "loss": 0.2108, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.018506998444790045, | |
| "grad_norm": 1.39831187226532, | |
| "learning_rate": 9.991551309795038e-06, | |
| "loss": 0.1902, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.01866251944012442, | |
| "grad_norm": 1.6377700259822823, | |
| "learning_rate": 9.991408759233394e-06, | |
| "loss": 0.2491, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.018818040435458788, | |
| "grad_norm": 2.09576564356888, | |
| "learning_rate": 9.991265017152869e-06, | |
| "loss": 0.2526, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.018973561430793158, | |
| "grad_norm": 2.031216743667695, | |
| "learning_rate": 9.991120083587779e-06, | |
| "loss": 0.2418, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.019129082426127528, | |
| "grad_norm": 1.9897151692182136, | |
| "learning_rate": 9.990973958572723e-06, | |
| "loss": 0.2786, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.019284603421461897, | |
| "grad_norm": 1.7503968375792016, | |
| "learning_rate": 9.990826642142581e-06, | |
| "loss": 0.3231, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.019440124416796267, | |
| "grad_norm": 0.8307156104752434, | |
| "learning_rate": 9.990678134332521e-06, | |
| "loss": 0.2058, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.019595645412130637, | |
| "grad_norm": 2.105265419067902, | |
| "learning_rate": 9.990528435177992e-06, | |
| "loss": 0.2665, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.019751166407465007, | |
| "grad_norm": 0.845573052530141, | |
| "learning_rate": 9.99037754471473e-06, | |
| "loss": 0.1706, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.019906687402799376, | |
| "grad_norm": 1.3561288374051286, | |
| "learning_rate": 9.990225462978756e-06, | |
| "loss": 0.2834, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.02006220839813375, | |
| "grad_norm": 1.4639985615099256, | |
| "learning_rate": 9.990072190006371e-06, | |
| "loss": 0.2775, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.02021772939346812, | |
| "grad_norm": 1.424715750901468, | |
| "learning_rate": 9.989917725834166e-06, | |
| "loss": 0.2331, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.02037325038880249, | |
| "grad_norm": 1.4908712495988423, | |
| "learning_rate": 9.989762070499015e-06, | |
| "loss": 0.2326, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.02052877138413686, | |
| "grad_norm": 1.9371986234951772, | |
| "learning_rate": 9.98960522403807e-06, | |
| "loss": 0.248, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.02068429237947123, | |
| "grad_norm": 1.7802271420639102, | |
| "learning_rate": 9.989447186488777e-06, | |
| "loss": 0.2881, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.0208398133748056, | |
| "grad_norm": 1.1250396512690675, | |
| "learning_rate": 9.98928795788886e-06, | |
| "loss": 0.2309, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.020995334370139968, | |
| "grad_norm": 1.6801724252117862, | |
| "learning_rate": 9.989127538276329e-06, | |
| "loss": 0.2292, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.021150855365474338, | |
| "grad_norm": 1.1771299351260398, | |
| "learning_rate": 9.98896592768948e-06, | |
| "loss": 0.1553, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.021306376360808708, | |
| "grad_norm": 2.1842202518230645, | |
| "learning_rate": 9.988803126166889e-06, | |
| "loss": 0.3029, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.02146189735614308, | |
| "grad_norm": 1.3745547142156036, | |
| "learning_rate": 9.988639133747422e-06, | |
| "loss": 0.1702, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.02161741835147745, | |
| "grad_norm": 1.8504088238591443, | |
| "learning_rate": 9.988473950470223e-06, | |
| "loss": 0.2318, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.02177293934681182, | |
| "grad_norm": 1.7870069125473158, | |
| "learning_rate": 9.988307576374727e-06, | |
| "loss": 0.2008, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.02192846034214619, | |
| "grad_norm": 2.3953898044564883, | |
| "learning_rate": 9.988140011500647e-06, | |
| "loss": 0.2007, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.02208398133748056, | |
| "grad_norm": 1.1845465157973594, | |
| "learning_rate": 9.987971255887985e-06, | |
| "loss": 0.2334, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.02223950233281493, | |
| "grad_norm": 1.747163885973197, | |
| "learning_rate": 9.987801309577026e-06, | |
| "loss": 0.2559, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.0223950233281493, | |
| "grad_norm": 1.6909380164686145, | |
| "learning_rate": 9.987630172608333e-06, | |
| "loss": 0.2819, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.02255054432348367, | |
| "grad_norm": 1.6459040836915735, | |
| "learning_rate": 9.987457845022767e-06, | |
| "loss": 0.2283, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.02270606531881804, | |
| "grad_norm": 1.0639213494130906, | |
| "learning_rate": 9.987284326861459e-06, | |
| "loss": 0.2947, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.022861586314152412, | |
| "grad_norm": 1.423659630662775, | |
| "learning_rate": 9.987109618165832e-06, | |
| "loss": 0.1895, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.023017107309486782, | |
| "grad_norm": 2.1171729246911966, | |
| "learning_rate": 9.986933718977591e-06, | |
| "loss": 0.1967, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.02317262830482115, | |
| "grad_norm": 1.4659656443481106, | |
| "learning_rate": 9.986756629338728e-06, | |
| "loss": 0.1553, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.02332814930015552, | |
| "grad_norm": 3.3524464413937762, | |
| "learning_rate": 9.986578349291514e-06, | |
| "loss": 0.2472, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.02348367029548989, | |
| "grad_norm": 1.4421209559287633, | |
| "learning_rate": 9.986398878878507e-06, | |
| "loss": 0.1791, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.02363919129082426, | |
| "grad_norm": 1.7313564339261944, | |
| "learning_rate": 9.98621821814255e-06, | |
| "loss": 0.2238, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.02379471228615863, | |
| "grad_norm": 1.7017996756379121, | |
| "learning_rate": 9.986036367126769e-06, | |
| "loss": 0.2007, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.023950233281493, | |
| "grad_norm": 1.515471002124247, | |
| "learning_rate": 9.985853325874575e-06, | |
| "loss": 0.2688, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.02410575427682737, | |
| "grad_norm": 0.8049651881516254, | |
| "learning_rate": 9.985669094429662e-06, | |
| "loss": 0.1865, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.024261275272161743, | |
| "grad_norm": 1.2861650933813724, | |
| "learning_rate": 9.985483672836007e-06, | |
| "loss": 0.2403, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.024416796267496113, | |
| "grad_norm": 2.173379700965189, | |
| "learning_rate": 9.985297061137877e-06, | |
| "loss": 0.2045, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.024572317262830483, | |
| "grad_norm": 1.5915407935889336, | |
| "learning_rate": 9.985109259379813e-06, | |
| "loss": 0.2063, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.024727838258164853, | |
| "grad_norm": 1.877271886192633, | |
| "learning_rate": 9.98492026760665e-06, | |
| "loss": 0.226, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.024883359253499222, | |
| "grad_norm": 1.590999803444347, | |
| "learning_rate": 9.984730085863504e-06, | |
| "loss": 0.2243, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.025038880248833592, | |
| "grad_norm": 2.2602490405621016, | |
| "learning_rate": 9.98453871419577e-06, | |
| "loss": 0.2599, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.025194401244167962, | |
| "grad_norm": 1.8247502790432317, | |
| "learning_rate": 9.984346152649135e-06, | |
| "loss": 0.2575, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.02534992223950233, | |
| "grad_norm": 1.6317702406563646, | |
| "learning_rate": 9.984152401269562e-06, | |
| "loss": 0.2513, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.0255054432348367, | |
| "grad_norm": 1.479820350518653, | |
| "learning_rate": 9.983957460103307e-06, | |
| "loss": 0.2134, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.025660964230171075, | |
| "grad_norm": 2.2204278110409716, | |
| "learning_rate": 9.9837613291969e-06, | |
| "loss": 0.2288, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.025816485225505444, | |
| "grad_norm": 1.8249773963334357, | |
| "learning_rate": 9.983564008597164e-06, | |
| "loss": 0.2342, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.025972006220839814, | |
| "grad_norm": 1.892476010698033, | |
| "learning_rate": 9.9833654983512e-06, | |
| "loss": 0.2263, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.026127527216174184, | |
| "grad_norm": 1.593847254715758, | |
| "learning_rate": 9.983165798506398e-06, | |
| "loss": 0.2163, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.026283048211508554, | |
| "grad_norm": 1.7653992228114257, | |
| "learning_rate": 9.982964909110426e-06, | |
| "loss": 0.2938, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.026438569206842923, | |
| "grad_norm": 1.3352350617943483, | |
| "learning_rate": 9.982762830211239e-06, | |
| "loss": 0.2069, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.026594090202177293, | |
| "grad_norm": 1.6623662216358996, | |
| "learning_rate": 9.982559561857079e-06, | |
| "loss": 0.213, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.026749611197511663, | |
| "grad_norm": 1.1923151136153478, | |
| "learning_rate": 9.982355104096468e-06, | |
| "loss": 0.2068, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.026905132192846033, | |
| "grad_norm": 1.5009321240819553, | |
| "learning_rate": 9.98214945697821e-06, | |
| "loss": 0.3292, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.027060653188180406, | |
| "grad_norm": 1.6168504596283289, | |
| "learning_rate": 9.981942620551399e-06, | |
| "loss": 0.2001, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.027216174183514776, | |
| "grad_norm": 1.0410735731938325, | |
| "learning_rate": 9.98173459486541e-06, | |
| "loss": 0.2697, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.027371695178849145, | |
| "grad_norm": 1.477725722611291, | |
| "learning_rate": 9.9815253799699e-06, | |
| "loss": 0.1796, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.027527216174183515, | |
| "grad_norm": 1.5159741098115525, | |
| "learning_rate": 9.981314975914811e-06, | |
| "loss": 0.2203, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.027682737169517885, | |
| "grad_norm": 0.8954975243967727, | |
| "learning_rate": 9.981103382750372e-06, | |
| "loss": 0.2662, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.027838258164852255, | |
| "grad_norm": 1.418625218985406, | |
| "learning_rate": 9.980890600527092e-06, | |
| "loss": 0.2484, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.027993779160186624, | |
| "grad_norm": 1.4411516373436362, | |
| "learning_rate": 9.980676629295763e-06, | |
| "loss": 0.302, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.028149300155520994, | |
| "grad_norm": 0.9480510792156464, | |
| "learning_rate": 9.980461469107463e-06, | |
| "loss": 0.2075, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.028304821150855364, | |
| "grad_norm": 2.081864475923441, | |
| "learning_rate": 9.980245120013558e-06, | |
| "loss": 0.2942, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.028460342146189737, | |
| "grad_norm": 1.2615838373847896, | |
| "learning_rate": 9.980027582065691e-06, | |
| "loss": 0.2018, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.028615863141524107, | |
| "grad_norm": 1.2086223544731691, | |
| "learning_rate": 9.979808855315792e-06, | |
| "loss": 0.2743, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.028771384136858476, | |
| "grad_norm": 0.9412206342605678, | |
| "learning_rate": 9.979588939816071e-06, | |
| "loss": 0.2318, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.028926905132192846, | |
| "grad_norm": 1.365479987499767, | |
| "learning_rate": 9.979367835619029e-06, | |
| "loss": 0.2813, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.029082426127527216, | |
| "grad_norm": 1.1385427599520912, | |
| "learning_rate": 9.979145542777444e-06, | |
| "loss": 0.2627, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.029237947122861586, | |
| "grad_norm": 1.560448582637042, | |
| "learning_rate": 9.97892206134438e-06, | |
| "loss": 0.2042, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.029393468118195955, | |
| "grad_norm": 1.9585068672638826, | |
| "learning_rate": 9.97869739137319e-06, | |
| "loss": 0.2647, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.029548989113530325, | |
| "grad_norm": 1.612253014357388, | |
| "learning_rate": 9.9784715329175e-06, | |
| "loss": 0.2573, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.0297045101088647, | |
| "grad_norm": 1.2782552177366555, | |
| "learning_rate": 9.978244486031228e-06, | |
| "loss": 0.1914, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.029860031104199068, | |
| "grad_norm": 2.1188620010348163, | |
| "learning_rate": 9.978016250768573e-06, | |
| "loss": 0.245, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.030015552099533438, | |
| "grad_norm": 1.9777647488169638, | |
| "learning_rate": 9.977786827184019e-06, | |
| "loss": 0.2774, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.030171073094867808, | |
| "grad_norm": 2.0157801185629407, | |
| "learning_rate": 9.977556215332332e-06, | |
| "loss": 0.297, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.030326594090202177, | |
| "grad_norm": 0.8845310906810993, | |
| "learning_rate": 9.97732441526856e-06, | |
| "loss": 0.1756, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.030482115085536547, | |
| "grad_norm": 1.2647941053184737, | |
| "learning_rate": 9.97709142704804e-06, | |
| "loss": 0.1773, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.030637636080870917, | |
| "grad_norm": 1.1823797462719756, | |
| "learning_rate": 9.976857250726389e-06, | |
| "loss": 0.2501, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.030793157076205287, | |
| "grad_norm": 1.643272741263538, | |
| "learning_rate": 9.976621886359506e-06, | |
| "loss": 0.2794, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.030948678071539656, | |
| "grad_norm": 1.6415813649465196, | |
| "learning_rate": 9.976385334003577e-06, | |
| "loss": 0.2562, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.03110419906687403, | |
| "grad_norm": 1.4019518238717095, | |
| "learning_rate": 9.976147593715074e-06, | |
| "loss": 0.2066, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.03110419906687403, | |
| "eval_loss": 0.2431146204471588, | |
| "eval_runtime": 9.4441, | |
| "eval_samples_per_second": 2.753, | |
| "eval_steps_per_second": 0.741, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.031259720062208396, | |
| "grad_norm": 1.5261705881041825, | |
| "learning_rate": 9.975908665550742e-06, | |
| "loss": 0.168, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.031415241057542766, | |
| "grad_norm": 1.3552305394454693, | |
| "learning_rate": 9.975668549567623e-06, | |
| "loss": 0.2513, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.031570762052877135, | |
| "grad_norm": 1.09704983539552, | |
| "learning_rate": 9.97542724582303e-06, | |
| "loss": 0.1877, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.031726283048211505, | |
| "grad_norm": 1.8452203060592092, | |
| "learning_rate": 9.975184754374572e-06, | |
| "loss": 0.3442, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.03188180404354588, | |
| "grad_norm": 1.4512649025391702, | |
| "learning_rate": 9.974941075280128e-06, | |
| "loss": 0.2172, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.03203732503888025, | |
| "grad_norm": 1.5376722263850107, | |
| "learning_rate": 9.974696208597874e-06, | |
| "loss": 0.2206, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.03219284603421462, | |
| "grad_norm": 1.6097488768932668, | |
| "learning_rate": 9.97445015438626e-06, | |
| "loss": 0.2134, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.03234836702954899, | |
| "grad_norm": 1.2381378734127797, | |
| "learning_rate": 9.974202912704022e-06, | |
| "loss": 0.2026, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.03250388802488336, | |
| "grad_norm": 2.0110329862327663, | |
| "learning_rate": 9.973954483610184e-06, | |
| "loss": 0.2117, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.03265940902021773, | |
| "grad_norm": 4.938465463538487, | |
| "learning_rate": 9.973704867164044e-06, | |
| "loss": 0.2787, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0328149300155521, | |
| "grad_norm": 1.9318587506840115, | |
| "learning_rate": 9.973454063425191e-06, | |
| "loss": 0.2901, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.03297045101088647, | |
| "grad_norm": 1.5730776773238022, | |
| "learning_rate": 9.973202072453498e-06, | |
| "loss": 0.3557, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.03312597200622084, | |
| "grad_norm": 2.333406801079277, | |
| "learning_rate": 9.972948894309116e-06, | |
| "loss": 0.2553, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.03328149300155521, | |
| "grad_norm": 1.2613725609366824, | |
| "learning_rate": 9.972694529052482e-06, | |
| "loss": 0.2721, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.03343701399688958, | |
| "grad_norm": 1.233807021429561, | |
| "learning_rate": 9.972438976744317e-06, | |
| "loss": 0.194, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.03359253499222395, | |
| "grad_norm": 1.0922019141763, | |
| "learning_rate": 9.972182237445624e-06, | |
| "loss": 0.2625, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.03374805598755832, | |
| "grad_norm": 1.5332376003824164, | |
| "learning_rate": 9.971924311217693e-06, | |
| "loss": 0.2369, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.03390357698289269, | |
| "grad_norm": 2.1386234582292856, | |
| "learning_rate": 9.971665198122093e-06, | |
| "loss": 0.2691, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.03405909797822706, | |
| "grad_norm": 1.4374027394103162, | |
| "learning_rate": 9.97140489822068e-06, | |
| "loss": 0.2217, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.03421461897356143, | |
| "grad_norm": 1.7261766376116665, | |
| "learning_rate": 9.971143411575585e-06, | |
| "loss": 0.3063, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0343701399688958, | |
| "grad_norm": 1.5632670578977363, | |
| "learning_rate": 9.970880738249236e-06, | |
| "loss": 0.2333, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.03452566096423017, | |
| "grad_norm": 1.6709935682257062, | |
| "learning_rate": 9.97061687830433e-06, | |
| "loss": 0.2808, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.034681181959564544, | |
| "grad_norm": 1.7747486994884278, | |
| "learning_rate": 9.970351831803862e-06, | |
| "loss": 0.3182, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.034836702954898914, | |
| "grad_norm": 1.2079739996818415, | |
| "learning_rate": 9.970085598811094e-06, | |
| "loss": 0.2426, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.034992223950233284, | |
| "grad_norm": 2.269795435480081, | |
| "learning_rate": 9.969818179389586e-06, | |
| "loss": 0.1933, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.035147744945567654, | |
| "grad_norm": 1.28324330975912, | |
| "learning_rate": 9.96954957360317e-06, | |
| "loss": 0.2078, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.03530326594090202, | |
| "grad_norm": 3.0240429569891147, | |
| "learning_rate": 9.969279781515967e-06, | |
| "loss": 0.2865, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.03545878693623639, | |
| "grad_norm": 1.4022531253860526, | |
| "learning_rate": 9.969008803192385e-06, | |
| "loss": 0.189, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.03561430793157076, | |
| "grad_norm": 1.4481645110880101, | |
| "learning_rate": 9.968736638697105e-06, | |
| "loss": 0.2038, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.03576982892690513, | |
| "grad_norm": 1.2439638440320844, | |
| "learning_rate": 9.968463288095096e-06, | |
| "loss": 0.1962, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0359253499222395, | |
| "grad_norm": 1.550618674775446, | |
| "learning_rate": 9.968188751451613e-06, | |
| "loss": 0.2461, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.03608087091757387, | |
| "grad_norm": 1.2590441656933422, | |
| "learning_rate": 9.967913028832192e-06, | |
| "loss": 0.28, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.03623639191290824, | |
| "grad_norm": 15.743047596573488, | |
| "learning_rate": 9.96763612030265e-06, | |
| "loss": 0.2272, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.03639191290824261, | |
| "grad_norm": 1.0832646660805165, | |
| "learning_rate": 9.967358025929092e-06, | |
| "loss": 0.2766, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.03654743390357698, | |
| "grad_norm": 1.496152606461021, | |
| "learning_rate": 9.9670787457779e-06, | |
| "loss": 0.1928, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.03670295489891135, | |
| "grad_norm": 1.5049076518304147, | |
| "learning_rate": 9.966798279915744e-06, | |
| "loss": 0.2023, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.03685847589424572, | |
| "grad_norm": 0.9377725167524534, | |
| "learning_rate": 9.966516628409573e-06, | |
| "loss": 0.1657, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.03701399688958009, | |
| "grad_norm": 1.5646202349920761, | |
| "learning_rate": 9.96623379132662e-06, | |
| "loss": 0.2157, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.03716951788491446, | |
| "grad_norm": 1.14277577769819, | |
| "learning_rate": 9.965949768734409e-06, | |
| "loss": 0.2163, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.03732503888024884, | |
| "grad_norm": 2.158716016882222, | |
| "learning_rate": 9.965664560700734e-06, | |
| "loss": 0.2041, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.03748055987558321, | |
| "grad_norm": 1.8568349342429766, | |
| "learning_rate": 9.965378167293679e-06, | |
| "loss": 0.2266, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.037636080870917576, | |
| "grad_norm": 2.035673543431871, | |
| "learning_rate": 9.965090588581609e-06, | |
| "loss": 0.2893, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.037791601866251946, | |
| "grad_norm": 1.2421527558787024, | |
| "learning_rate": 9.964801824633177e-06, | |
| "loss": 0.166, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.037947122861586316, | |
| "grad_norm": 1.7368625294642988, | |
| "learning_rate": 9.964511875517313e-06, | |
| "loss": 0.2593, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.038102643856920686, | |
| "grad_norm": 1.274064232837515, | |
| "learning_rate": 9.964220741303232e-06, | |
| "loss": 0.1676, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.038258164852255055, | |
| "grad_norm": 1.3271094523398685, | |
| "learning_rate": 9.963928422060432e-06, | |
| "loss": 0.2048, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.038413685847589425, | |
| "grad_norm": 1.441894820882409, | |
| "learning_rate": 9.963634917858692e-06, | |
| "loss": 0.2102, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.038569206842923795, | |
| "grad_norm": 1.3882607946902543, | |
| "learning_rate": 9.963340228768077e-06, | |
| "loss": 0.1862, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.038724727838258165, | |
| "grad_norm": 1.1529068772443192, | |
| "learning_rate": 9.963044354858934e-06, | |
| "loss": 0.2519, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.038880248833592534, | |
| "grad_norm": 2.236043321099024, | |
| "learning_rate": 9.962747296201891e-06, | |
| "loss": 0.1635, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.039035769828926904, | |
| "grad_norm": 1.8503487939836718, | |
| "learning_rate": 9.96244905286786e-06, | |
| "loss": 0.181, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.039191290824261274, | |
| "grad_norm": 1.4083157880171735, | |
| "learning_rate": 9.962149624928037e-06, | |
| "loss": 0.1781, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.039346811819595644, | |
| "grad_norm": 1.6536407646222175, | |
| "learning_rate": 9.961849012453899e-06, | |
| "loss": 0.2699, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.039502332814930013, | |
| "grad_norm": 1.3154495432198843, | |
| "learning_rate": 9.961547215517206e-06, | |
| "loss": 0.2096, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.03965785381026438, | |
| "grad_norm": 1.222944730470649, | |
| "learning_rate": 9.961244234190001e-06, | |
| "loss": 0.209, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.03981337480559875, | |
| "grad_norm": 1.3903861430735245, | |
| "learning_rate": 9.96094006854461e-06, | |
| "loss": 0.177, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.03996889580093312, | |
| "grad_norm": 1.8733569984170189, | |
| "learning_rate": 9.960634718653644e-06, | |
| "loss": 0.4051, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.0401244167962675, | |
| "grad_norm": 1.3013086938531622, | |
| "learning_rate": 9.96032818458999e-06, | |
| "loss": 0.2215, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.04027993779160187, | |
| "grad_norm": 1.9062067810307814, | |
| "learning_rate": 9.960020466426825e-06, | |
| "loss": 0.2131, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.04043545878693624, | |
| "grad_norm": 1.240725461727028, | |
| "learning_rate": 9.959711564237603e-06, | |
| "loss": 0.2376, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.04059097978227061, | |
| "grad_norm": 1.504578258989953, | |
| "learning_rate": 9.95940147809607e-06, | |
| "loss": 0.2238, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.04074650077760498, | |
| "grad_norm": 1.112441665378311, | |
| "learning_rate": 9.959090208076239e-06, | |
| "loss": 0.175, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.04090202177293935, | |
| "grad_norm": 1.492328645699945, | |
| "learning_rate": 9.958777754252418e-06, | |
| "loss": 0.2332, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.04105754276827372, | |
| "grad_norm": 1.4626777112927891, | |
| "learning_rate": 9.958464116699196e-06, | |
| "loss": 0.2093, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.04121306376360809, | |
| "grad_norm": 2.4304182018626266, | |
| "learning_rate": 9.958149295491441e-06, | |
| "loss": 0.2495, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.04136858475894246, | |
| "grad_norm": 2.1830670676642256, | |
| "learning_rate": 9.957833290704305e-06, | |
| "loss": 0.2151, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.04152410575427683, | |
| "grad_norm": 0.9776646131405466, | |
| "learning_rate": 9.957516102413223e-06, | |
| "loss": 0.2215, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.0416796267496112, | |
| "grad_norm": 0.9811824757237497, | |
| "learning_rate": 9.957197730693912e-06, | |
| "loss": 0.2671, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.04183514774494557, | |
| "grad_norm": 1.025030756788744, | |
| "learning_rate": 9.956878175622372e-06, | |
| "loss": 0.1935, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.041990668740279936, | |
| "grad_norm": 1.715248799705313, | |
| "learning_rate": 9.956557437274887e-06, | |
| "loss": 0.2639, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.042146189735614306, | |
| "grad_norm": 1.4715136542514509, | |
| "learning_rate": 9.95623551572802e-06, | |
| "loss": 0.1863, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.042301710730948676, | |
| "grad_norm": 2.0941396313348766, | |
| "learning_rate": 9.955912411058616e-06, | |
| "loss": 0.1764, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.042457231726283046, | |
| "grad_norm": 1.4113410003708207, | |
| "learning_rate": 9.955588123343808e-06, | |
| "loss": 0.2635, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.042612752721617415, | |
| "grad_norm": 1.0999635349018924, | |
| "learning_rate": 9.955262652661009e-06, | |
| "loss": 0.2424, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.042768273716951785, | |
| "grad_norm": 1.0847541480257452, | |
| "learning_rate": 9.954935999087908e-06, | |
| "loss": 0.276, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.04292379471228616, | |
| "grad_norm": 1.695906274664277, | |
| "learning_rate": 9.954608162702488e-06, | |
| "loss": 0.2316, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.04307931570762053, | |
| "grad_norm": 1.428650374776818, | |
| "learning_rate": 9.954279143583003e-06, | |
| "loss": 0.234, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.0432348367029549, | |
| "grad_norm": 1.261831528775643, | |
| "learning_rate": 9.953948941807998e-06, | |
| "loss": 0.2331, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.04339035769828927, | |
| "grad_norm": 1.1389240235405695, | |
| "learning_rate": 9.953617557456295e-06, | |
| "loss": 0.1813, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.04354587869362364, | |
| "grad_norm": 2.1356821017337264, | |
| "learning_rate": 9.953284990607e-06, | |
| "loss": 0.2716, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.04370139968895801, | |
| "grad_norm": 1.256196669200449, | |
| "learning_rate": 9.952951241339501e-06, | |
| "loss": 0.2586, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.04385692068429238, | |
| "grad_norm": 1.6264279435141102, | |
| "learning_rate": 9.952616309733471e-06, | |
| "loss": 0.2138, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.04401244167962675, | |
| "grad_norm": 1.0771562874552736, | |
| "learning_rate": 9.952280195868859e-06, | |
| "loss": 0.2798, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.04416796267496112, | |
| "grad_norm": 1.6634031368562676, | |
| "learning_rate": 9.951942899825906e-06, | |
| "loss": 0.3159, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.04432348367029549, | |
| "grad_norm": 1.5379741925800816, | |
| "learning_rate": 9.951604421685121e-06, | |
| "loss": 0.3275, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.04447900466562986, | |
| "grad_norm": 1.4489954817264272, | |
| "learning_rate": 9.951264761527311e-06, | |
| "loss": 0.1989, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.04463452566096423, | |
| "grad_norm": 1.6369744606712289, | |
| "learning_rate": 9.950923919433555e-06, | |
| "loss": 0.2068, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.0447900466562986, | |
| "grad_norm": 1.8400125131547473, | |
| "learning_rate": 9.950581895485214e-06, | |
| "loss": 0.1977, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.04494556765163297, | |
| "grad_norm": 2.1448208174547743, | |
| "learning_rate": 9.950238689763937e-06, | |
| "loss": 0.1882, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.04510108864696734, | |
| "grad_norm": 1.1002755110550755, | |
| "learning_rate": 9.949894302351653e-06, | |
| "loss": 0.2422, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.04525660964230171, | |
| "grad_norm": 0.8557887132764603, | |
| "learning_rate": 9.94954873333057e-06, | |
| "loss": 0.2249, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.04541213063763608, | |
| "grad_norm": 1.800548229871832, | |
| "learning_rate": 9.94920198278318e-06, | |
| "loss": 0.2462, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.04556765163297045, | |
| "grad_norm": 1.077848623865367, | |
| "learning_rate": 9.948854050792256e-06, | |
| "loss": 0.1693, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.045723172628304824, | |
| "grad_norm": 1.3420617788641933, | |
| "learning_rate": 9.948504937440857e-06, | |
| "loss": 0.2632, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.045878693623639194, | |
| "grad_norm": 1.786889545891979, | |
| "learning_rate": 9.948154642812321e-06, | |
| "loss": 0.1812, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.046034214618973564, | |
| "grad_norm": 1.6608331504976344, | |
| "learning_rate": 9.947803166990267e-06, | |
| "loss": 0.2781, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.046189735614307934, | |
| "grad_norm": 1.479079510539959, | |
| "learning_rate": 9.947450510058596e-06, | |
| "loss": 0.2176, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.0463452566096423, | |
| "grad_norm": 1.1205653962227666, | |
| "learning_rate": 9.947096672101496e-06, | |
| "loss": 0.2189, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.04650077760497667, | |
| "grad_norm": 1.6903970393534788, | |
| "learning_rate": 9.94674165320343e-06, | |
| "loss": 0.1715, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.04665629860031104, | |
| "grad_norm": 3.020535469766265, | |
| "learning_rate": 9.946385453449145e-06, | |
| "loss": 0.2334, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.04665629860031104, | |
| "eval_loss": 0.23520340025424957, | |
| "eval_runtime": 9.4655, | |
| "eval_samples_per_second": 2.747, | |
| "eval_steps_per_second": 0.74, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.04681181959564541, | |
| "grad_norm": 1.2625213750296742, | |
| "learning_rate": 9.946028072923675e-06, | |
| "loss": 0.2153, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.04696734059097978, | |
| "grad_norm": 1.326552639234392, | |
| "learning_rate": 9.945669511712328e-06, | |
| "loss": 0.1378, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.04712286158631415, | |
| "grad_norm": 1.1353660480206176, | |
| "learning_rate": 9.945309769900698e-06, | |
| "loss": 0.2505, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.04727838258164852, | |
| "grad_norm": 1.2591178630665596, | |
| "learning_rate": 9.944948847574662e-06, | |
| "loss": 0.1704, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.04743390357698289, | |
| "grad_norm": 1.3520689396483014, | |
| "learning_rate": 9.944586744820377e-06, | |
| "loss": 0.2324, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.04758942457231726, | |
| "grad_norm": 1.0116417439713241, | |
| "learning_rate": 9.94422346172428e-06, | |
| "loss": 0.1512, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.04774494556765163, | |
| "grad_norm": 1.479626380132595, | |
| "learning_rate": 9.943858998373093e-06, | |
| "loss": 0.2121, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.047900466562986, | |
| "grad_norm": 1.4227055232441543, | |
| "learning_rate": 9.94349335485382e-06, | |
| "loss": 0.2667, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.04805598755832037, | |
| "grad_norm": 1.583200032514501, | |
| "learning_rate": 9.943126531253744e-06, | |
| "loss": 0.289, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.04821150855365474, | |
| "grad_norm": 1.8189938486203978, | |
| "learning_rate": 9.942758527660429e-06, | |
| "loss": 0.3084, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.04836702954898912, | |
| "grad_norm": 1.146189412882889, | |
| "learning_rate": 9.942389344161724e-06, | |
| "loss": 0.1669, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.04852255054432349, | |
| "grad_norm": 1.547896984860253, | |
| "learning_rate": 9.94201898084576e-06, | |
| "loss": 0.2064, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.048678071539657856, | |
| "grad_norm": 1.5949794296702688, | |
| "learning_rate": 9.941647437800946e-06, | |
| "loss": 0.1929, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.048833592534992226, | |
| "grad_norm": 1.803377063241175, | |
| "learning_rate": 9.941274715115976e-06, | |
| "loss": 0.2791, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.048989113530326596, | |
| "grad_norm": 1.3837921692775779, | |
| "learning_rate": 9.940900812879822e-06, | |
| "loss": 0.1767, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.049144634525660966, | |
| "grad_norm": 1.3433932609509933, | |
| "learning_rate": 9.940525731181741e-06, | |
| "loss": 0.2084, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.049300155520995335, | |
| "grad_norm": 1.357062528683942, | |
| "learning_rate": 9.940149470111269e-06, | |
| "loss": 0.2047, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.049455676516329705, | |
| "grad_norm": 1.6539883727473814, | |
| "learning_rate": 9.939772029758225e-06, | |
| "loss": 0.2925, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.049611197511664075, | |
| "grad_norm": 1.2278880982790155, | |
| "learning_rate": 9.939393410212713e-06, | |
| "loss": 0.2649, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.049766718506998445, | |
| "grad_norm": 1.6247947056783312, | |
| "learning_rate": 9.93901361156511e-06, | |
| "loss": 0.3355, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.049922239502332814, | |
| "grad_norm": 1.1732603342184649, | |
| "learning_rate": 9.93863263390608e-06, | |
| "loss": 0.2603, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.050077760497667184, | |
| "grad_norm": 1.4022468720638315, | |
| "learning_rate": 9.93825047732657e-06, | |
| "loss": 0.3171, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.050233281493001554, | |
| "grad_norm": 1.3668475608164796, | |
| "learning_rate": 9.937867141917804e-06, | |
| "loss": 0.2952, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.050388802488335924, | |
| "grad_norm": 1.4553813573539522, | |
| "learning_rate": 9.93748262777129e-06, | |
| "loss": 0.1581, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.05054432348367029, | |
| "grad_norm": 1.9871080316775154, | |
| "learning_rate": 9.937096934978819e-06, | |
| "loss": 0.2368, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.05069984447900466, | |
| "grad_norm": 1.2900065629907207, | |
| "learning_rate": 9.936710063632457e-06, | |
| "loss": 0.2831, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.05085536547433903, | |
| "grad_norm": 0.9263549089146618, | |
| "learning_rate": 9.93632201382456e-06, | |
| "loss": 0.2086, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.0510108864696734, | |
| "grad_norm": 1.9892589335821493, | |
| "learning_rate": 9.935932785647756e-06, | |
| "loss": 0.2717, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.05116640746500778, | |
| "grad_norm": 1.1155547773179386, | |
| "learning_rate": 9.935542379194965e-06, | |
| "loss": 0.2731, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.05132192846034215, | |
| "grad_norm": 1.0330106857849222, | |
| "learning_rate": 9.935150794559379e-06, | |
| "loss": 0.1841, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.05147744945567652, | |
| "grad_norm": 1.52093348670823, | |
| "learning_rate": 9.934758031834475e-06, | |
| "loss": 0.2061, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.05163297045101089, | |
| "grad_norm": 1.1824055834479263, | |
| "learning_rate": 9.93436409111401e-06, | |
| "loss": 0.2613, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.05178849144634526, | |
| "grad_norm": 1.5329142188470473, | |
| "learning_rate": 9.933968972492026e-06, | |
| "loss": 0.2541, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.05194401244167963, | |
| "grad_norm": 1.0304282737168275, | |
| "learning_rate": 9.933572676062841e-06, | |
| "loss": 0.2024, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.052099533437014, | |
| "grad_norm": 1.1252175849664872, | |
| "learning_rate": 9.933175201921057e-06, | |
| "loss": 0.201, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.05225505443234837, | |
| "grad_norm": 1.6828294804696526, | |
| "learning_rate": 9.932776550161559e-06, | |
| "loss": 0.2298, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.05241057542768274, | |
| "grad_norm": 1.2831001226274117, | |
| "learning_rate": 9.932376720879503e-06, | |
| "loss": 0.2352, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.05256609642301711, | |
| "grad_norm": 2.152789286567263, | |
| "learning_rate": 9.931975714170345e-06, | |
| "loss": 0.3382, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.05272161741835148, | |
| "grad_norm": 1.702657664273862, | |
| "learning_rate": 9.931573530129803e-06, | |
| "loss": 0.2368, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.05287713841368585, | |
| "grad_norm": 2.05056832602719, | |
| "learning_rate": 9.931170168853886e-06, | |
| "loss": 0.2992, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.053032659409020216, | |
| "grad_norm": 1.5775290622934088, | |
| "learning_rate": 9.930765630438882e-06, | |
| "loss": 0.212, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.053188180404354586, | |
| "grad_norm": 1.166034186090071, | |
| "learning_rate": 9.93035991498136e-06, | |
| "loss": 0.2081, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.053343701399688956, | |
| "grad_norm": 1.4555896083998001, | |
| "learning_rate": 9.929953022578171e-06, | |
| "loss": 0.1857, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.053499222395023326, | |
| "grad_norm": 1.343927833342108, | |
| "learning_rate": 9.929544953326445e-06, | |
| "loss": 0.2691, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.053654743390357695, | |
| "grad_norm": 1.8890642830307378, | |
| "learning_rate": 9.929135707323592e-06, | |
| "loss": 0.1967, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.053810264385692065, | |
| "grad_norm": 1.4990308791372666, | |
| "learning_rate": 9.928725284667308e-06, | |
| "loss": 0.1774, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.05396578538102644, | |
| "grad_norm": 1.615806257387967, | |
| "learning_rate": 9.928313685455565e-06, | |
| "loss": 0.2234, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.05412130637636081, | |
| "grad_norm": 1.3758078431089233, | |
| "learning_rate": 9.927900909786617e-06, | |
| "loss": 0.259, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.05427682737169518, | |
| "grad_norm": 0.855435278326685, | |
| "learning_rate": 9.927486957759001e-06, | |
| "loss": 0.2068, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.05443234836702955, | |
| "grad_norm": 1.5217482862634222, | |
| "learning_rate": 9.927071829471531e-06, | |
| "loss": 0.1551, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.05458786936236392, | |
| "grad_norm": 1.5111503264835533, | |
| "learning_rate": 9.926655525023304e-06, | |
| "loss": 0.2599, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.05474339035769829, | |
| "grad_norm": 0.8967930843733002, | |
| "learning_rate": 9.9262380445137e-06, | |
| "loss": 0.169, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.05489891135303266, | |
| "grad_norm": 1.9464375941159884, | |
| "learning_rate": 9.925819388042374e-06, | |
| "loss": 0.2983, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.05505443234836703, | |
| "grad_norm": 1.574189824318599, | |
| "learning_rate": 9.925399555709269e-06, | |
| "loss": 0.1937, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.0552099533437014, | |
| "grad_norm": 3.1438752373638232, | |
| "learning_rate": 9.924978547614604e-06, | |
| "loss": 0.2181, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.05536547433903577, | |
| "grad_norm": 1.6348127637741856, | |
| "learning_rate": 9.924556363858877e-06, | |
| "loss": 0.1847, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.05552099533437014, | |
| "grad_norm": 1.724455721347507, | |
| "learning_rate": 9.92413300454287e-06, | |
| "loss": 0.1924, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.05567651632970451, | |
| "grad_norm": 0.9215074637606898, | |
| "learning_rate": 9.923708469767645e-06, | |
| "loss": 0.1484, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.05583203732503888, | |
| "grad_norm": 1.0048144642733263, | |
| "learning_rate": 9.923282759634547e-06, | |
| "loss": 0.139, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.05598755832037325, | |
| "grad_norm": 1.6563473574979655, | |
| "learning_rate": 9.922855874245197e-06, | |
| "loss": 0.2462, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.05614307931570762, | |
| "grad_norm": 1.0753481257964308, | |
| "learning_rate": 9.922427813701495e-06, | |
| "loss": 0.2543, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.05629860031104199, | |
| "grad_norm": 1.1607722120362791, | |
| "learning_rate": 9.92199857810563e-06, | |
| "loss": 0.1919, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.05645412130637636, | |
| "grad_norm": 1.0235707105593828, | |
| "learning_rate": 9.921568167560065e-06, | |
| "loss": 0.1851, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.05660964230171073, | |
| "grad_norm": 1.443489161948352, | |
| "learning_rate": 9.921136582167545e-06, | |
| "loss": 0.2566, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.056765163297045104, | |
| "grad_norm": 1.1047251832726421, | |
| "learning_rate": 9.920703822031094e-06, | |
| "loss": 0.2268, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.056920684292379474, | |
| "grad_norm": 1.8071891113724519, | |
| "learning_rate": 9.92026988725402e-06, | |
| "loss": 0.286, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.057076205287713844, | |
| "grad_norm": 1.127534519608966, | |
| "learning_rate": 9.919834777939908e-06, | |
| "loss": 0.2078, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.05723172628304821, | |
| "grad_norm": 1.3537981754957027, | |
| "learning_rate": 9.919398494192625e-06, | |
| "loss": 0.2574, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.05738724727838258, | |
| "grad_norm": 1.5740289483284484, | |
| "learning_rate": 9.918961036116317e-06, | |
| "loss": 0.2168, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.05754276827371695, | |
| "grad_norm": 2.1521943324617854, | |
| "learning_rate": 9.918522403815414e-06, | |
| "loss": 0.5388, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.05769828926905132, | |
| "grad_norm": 0.9621156840694527, | |
| "learning_rate": 9.918082597394621e-06, | |
| "loss": 0.2206, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.05785381026438569, | |
| "grad_norm": 0.8374473543740336, | |
| "learning_rate": 9.91764161695893e-06, | |
| "loss": 0.1931, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.05800933125972006, | |
| "grad_norm": 1.594565893913882, | |
| "learning_rate": 9.917199462613601e-06, | |
| "loss": 0.2664, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.05816485225505443, | |
| "grad_norm": 2.539276249800021, | |
| "learning_rate": 9.916756134464191e-06, | |
| "loss": 0.3158, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.0583203732503888, | |
| "grad_norm": 1.0461962066836652, | |
| "learning_rate": 9.916311632616525e-06, | |
| "loss": 0.2489, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.05847589424572317, | |
| "grad_norm": 1.1340444520472663, | |
| "learning_rate": 9.915865957176709e-06, | |
| "loss": 0.2718, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.05863141524105754, | |
| "grad_norm": 1.467480205738983, | |
| "learning_rate": 9.915419108251138e-06, | |
| "loss": 0.1753, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.05878693623639191, | |
| "grad_norm": 1.4394725259816188, | |
| "learning_rate": 9.914971085946476e-06, | |
| "loss": 0.1973, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.05894245723172628, | |
| "grad_norm": 1.2534669496284443, | |
| "learning_rate": 9.914521890369676e-06, | |
| "loss": 0.2127, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.05909797822706065, | |
| "grad_norm": 1.282361137311585, | |
| "learning_rate": 9.914071521627964e-06, | |
| "loss": 0.1881, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.05925349922239502, | |
| "grad_norm": 1.7744186005576332, | |
| "learning_rate": 9.913619979828851e-06, | |
| "loss": 0.1875, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.0594090202177294, | |
| "grad_norm": 1.5020250209663002, | |
| "learning_rate": 9.913167265080126e-06, | |
| "loss": 0.1684, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.05956454121306377, | |
| "grad_norm": 1.259074929221576, | |
| "learning_rate": 9.912713377489858e-06, | |
| "loss": 0.2268, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.059720062208398136, | |
| "grad_norm": 1.7761373693512776, | |
| "learning_rate": 9.912258317166398e-06, | |
| "loss": 0.223, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.059875583203732506, | |
| "grad_norm": 2.38865888975245, | |
| "learning_rate": 9.911802084218374e-06, | |
| "loss": 0.2401, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.060031104199066876, | |
| "grad_norm": 0.8949382740792282, | |
| "learning_rate": 9.911344678754694e-06, | |
| "loss": 0.1922, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.060186625194401246, | |
| "grad_norm": 1.5889982876131292, | |
| "learning_rate": 9.910886100884547e-06, | |
| "loss": 0.1943, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.060342146189735615, | |
| "grad_norm": 1.4147870380604834, | |
| "learning_rate": 9.910426350717404e-06, | |
| "loss": 0.1812, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.060497667185069985, | |
| "grad_norm": 1.8231195124047115, | |
| "learning_rate": 9.909965428363012e-06, | |
| "loss": 0.2312, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.060653188180404355, | |
| "grad_norm": 1.8874621933930384, | |
| "learning_rate": 9.909503333931402e-06, | |
| "loss": 0.287, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.060808709175738725, | |
| "grad_norm": 1.7665216636429069, | |
| "learning_rate": 9.90904006753288e-06, | |
| "loss": 0.2185, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.060964230171073094, | |
| "grad_norm": 1.256357590139898, | |
| "learning_rate": 9.908575629278034e-06, | |
| "loss": 0.1919, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.061119751166407464, | |
| "grad_norm": 4.375967721306914, | |
| "learning_rate": 9.908110019277735e-06, | |
| "loss": 0.1781, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.061275272161741834, | |
| "grad_norm": 1.4286735960699084, | |
| "learning_rate": 9.907643237643127e-06, | |
| "loss": 0.253, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.061430793157076204, | |
| "grad_norm": 1.6229980414007696, | |
| "learning_rate": 9.90717528448564e-06, | |
| "loss": 0.2598, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.06158631415241057, | |
| "grad_norm": 1.654127403226531, | |
| "learning_rate": 9.906706159916977e-06, | |
| "loss": 0.2677, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.06174183514774494, | |
| "grad_norm": 0.7489317566220969, | |
| "learning_rate": 9.90623586404913e-06, | |
| "loss": 0.1595, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.06189735614307931, | |
| "grad_norm": 1.0243584995437751, | |
| "learning_rate": 9.90576439699436e-06, | |
| "loss": 0.2089, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.06205287713841368, | |
| "grad_norm": 1.2843274122650117, | |
| "learning_rate": 9.905291758865217e-06, | |
| "loss": 0.2458, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.06220839813374806, | |
| "grad_norm": 1.482986812845832, | |
| "learning_rate": 9.904817949774524e-06, | |
| "loss": 0.2611, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06220839813374806, | |
| "eval_loss": 0.23184187710285187, | |
| "eval_runtime": 9.4466, | |
| "eval_samples_per_second": 2.752, | |
| "eval_steps_per_second": 0.741, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06236391912908243, | |
| "grad_norm": 2.01899839511783, | |
| "learning_rate": 9.904342969835385e-06, | |
| "loss": 0.2178, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.06251944012441679, | |
| "grad_norm": 1.4244669635257896, | |
| "learning_rate": 9.903866819161188e-06, | |
| "loss": 0.2321, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.06267496111975117, | |
| "grad_norm": 1.7090867256976423, | |
| "learning_rate": 9.903389497865593e-06, | |
| "loss": 0.2071, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.06283048211508553, | |
| "grad_norm": 1.305136754505658, | |
| "learning_rate": 9.902911006062543e-06, | |
| "loss": 0.1899, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.06298600311041991, | |
| "grad_norm": 1.0188677304744835, | |
| "learning_rate": 9.902431343866266e-06, | |
| "loss": 0.2457, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.06314152410575427, | |
| "grad_norm": 1.6042710170666996, | |
| "learning_rate": 9.901950511391259e-06, | |
| "loss": 0.1894, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.06329704510108865, | |
| "grad_norm": 1.3017493690494788, | |
| "learning_rate": 9.901468508752304e-06, | |
| "loss": 0.2908, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.06345256609642301, | |
| "grad_norm": 1.3230633029674432, | |
| "learning_rate": 9.900985336064463e-06, | |
| "loss": 0.2786, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.06360808709175739, | |
| "grad_norm": 1.5120257860737862, | |
| "learning_rate": 9.900500993443076e-06, | |
| "loss": 0.2516, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.06376360808709176, | |
| "grad_norm": 1.004582433223966, | |
| "learning_rate": 9.900015481003762e-06, | |
| "loss": 0.2232, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.06391912908242613, | |
| "grad_norm": 1.399115724283105, | |
| "learning_rate": 9.89952879886242e-06, | |
| "loss": 0.2763, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.0640746500777605, | |
| "grad_norm": 1.816764777159624, | |
| "learning_rate": 9.899040947135225e-06, | |
| "loss": 0.2913, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.06423017107309487, | |
| "grad_norm": 1.1949304261760583, | |
| "learning_rate": 9.898551925938638e-06, | |
| "loss": 0.191, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.06438569206842924, | |
| "grad_norm": 1.6899096837752585, | |
| "learning_rate": 9.898061735389395e-06, | |
| "loss": 0.2314, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.0645412130637636, | |
| "grad_norm": 1.6400875402483213, | |
| "learning_rate": 9.897570375604508e-06, | |
| "loss": 0.1985, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.06469673405909798, | |
| "grad_norm": 1.1700291435704913, | |
| "learning_rate": 9.897077846701274e-06, | |
| "loss": 0.2178, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.06485225505443235, | |
| "grad_norm": 1.6396026705753728, | |
| "learning_rate": 9.896584148797265e-06, | |
| "loss": 0.2443, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.06500777604976672, | |
| "grad_norm": 0.8511496035113331, | |
| "learning_rate": 9.896089282010338e-06, | |
| "loss": 0.1619, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.06516329704510108, | |
| "grad_norm": 1.3924064844406538, | |
| "learning_rate": 9.895593246458617e-06, | |
| "loss": 0.2021, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.06531881804043546, | |
| "grad_norm": 0.8605197503722029, | |
| "learning_rate": 9.895096042260517e-06, | |
| "loss": 0.1628, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.06547433903576982, | |
| "grad_norm": 1.3908417494412908, | |
| "learning_rate": 9.894597669534729e-06, | |
| "loss": 0.2054, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.0656298600311042, | |
| "grad_norm": 1.445540354985538, | |
| "learning_rate": 9.894098128400219e-06, | |
| "loss": 0.2197, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.06578538102643856, | |
| "grad_norm": 1.3103752658839474, | |
| "learning_rate": 9.893597418976234e-06, | |
| "loss": 0.2297, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.06594090202177294, | |
| "grad_norm": 1.0497805770986521, | |
| "learning_rate": 9.893095541382304e-06, | |
| "loss": 0.1747, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.0660964230171073, | |
| "grad_norm": 1.513640843523071, | |
| "learning_rate": 9.892592495738229e-06, | |
| "loss": 0.1754, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.06625194401244168, | |
| "grad_norm": 1.0493517604475748, | |
| "learning_rate": 9.892088282164098e-06, | |
| "loss": 0.2586, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.06640746500777606, | |
| "grad_norm": 1.4678962231044086, | |
| "learning_rate": 9.89158290078027e-06, | |
| "loss": 0.2932, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.06656298600311042, | |
| "grad_norm": 1.6765991678498569, | |
| "learning_rate": 9.891076351707389e-06, | |
| "loss": 0.2116, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.0667185069984448, | |
| "grad_norm": 1.4655721822686016, | |
| "learning_rate": 9.890568635066373e-06, | |
| "loss": 0.1543, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.06687402799377916, | |
| "grad_norm": 1.6313534003780414, | |
| "learning_rate": 9.890059750978425e-06, | |
| "loss": 0.1571, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.06702954898911354, | |
| "grad_norm": 1.0261848775525118, | |
| "learning_rate": 9.889549699565017e-06, | |
| "loss": 0.2865, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.0671850699844479, | |
| "grad_norm": 1.5225780156038968, | |
| "learning_rate": 9.88903848094791e-06, | |
| "loss": 0.1914, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.06734059097978227, | |
| "grad_norm": 1.3350387169313882, | |
| "learning_rate": 9.888526095249138e-06, | |
| "loss": 0.2754, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.06749611197511664, | |
| "grad_norm": 1.192180411270206, | |
| "learning_rate": 9.888012542591014e-06, | |
| "loss": 0.1974, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.06765163297045101, | |
| "grad_norm": 1.3005497242232493, | |
| "learning_rate": 9.88749782309613e-06, | |
| "loss": 0.1903, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.06780715396578538, | |
| "grad_norm": 1.1288456938448086, | |
| "learning_rate": 9.88698193688736e-06, | |
| "loss": 0.2333, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.06796267496111975, | |
| "grad_norm": 1.130396483559975, | |
| "learning_rate": 9.886464884087846e-06, | |
| "loss": 0.2674, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.06811819595645412, | |
| "grad_norm": 0.9035948769600225, | |
| "learning_rate": 9.885946664821021e-06, | |
| "loss": 0.1864, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.0682737169517885, | |
| "grad_norm": 1.1233476167867031, | |
| "learning_rate": 9.885427279210592e-06, | |
| "loss": 0.1787, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.06842923794712286, | |
| "grad_norm": 1.2410015017602511, | |
| "learning_rate": 9.88490672738054e-06, | |
| "loss": 0.2509, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.06858475894245723, | |
| "grad_norm": 1.3429869818046247, | |
| "learning_rate": 9.884385009455131e-06, | |
| "loss": 0.2811, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.0687402799377916, | |
| "grad_norm": 0.7587532198438675, | |
| "learning_rate": 9.883862125558904e-06, | |
| "loss": 0.1781, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.06889580093312597, | |
| "grad_norm": 0.9782244567957874, | |
| "learning_rate": 9.88333807581668e-06, | |
| "loss": 0.1891, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.06905132192846034, | |
| "grad_norm": 1.8354472673215871, | |
| "learning_rate": 9.882812860353558e-06, | |
| "loss": 0.2372, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.06920684292379471, | |
| "grad_norm": 1.0210293095436775, | |
| "learning_rate": 9.882286479294911e-06, | |
| "loss": 0.1988, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.06936236391912909, | |
| "grad_norm": 2.117567357062213, | |
| "learning_rate": 9.881758932766398e-06, | |
| "loss": 0.1992, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.06951788491446345, | |
| "grad_norm": 1.1644685693150085, | |
| "learning_rate": 9.881230220893948e-06, | |
| "loss": 0.18, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.06967340590979783, | |
| "grad_norm": 1.1209275418337545, | |
| "learning_rate": 9.880700343803773e-06, | |
| "loss": 0.3069, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.06982892690513219, | |
| "grad_norm": 1.155686416296927, | |
| "learning_rate": 9.880169301622362e-06, | |
| "loss": 0.1744, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.06998444790046657, | |
| "grad_norm": 0.9709514091501408, | |
| "learning_rate": 9.879637094476482e-06, | |
| "loss": 0.1871, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.07013996889580093, | |
| "grad_norm": 1.1219093494884402, | |
| "learning_rate": 9.87910372249318e-06, | |
| "loss": 0.1932, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.07029548989113531, | |
| "grad_norm": 1.9094748023939434, | |
| "learning_rate": 9.878569185799778e-06, | |
| "loss": 0.2339, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.07045101088646967, | |
| "grad_norm": 1.3264334862739553, | |
| "learning_rate": 9.878033484523876e-06, | |
| "loss": 0.1407, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.07060653188180405, | |
| "grad_norm": 1.667180383137504, | |
| "learning_rate": 9.877496618793356e-06, | |
| "loss": 0.1867, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.07076205287713841, | |
| "grad_norm": 1.0486860196671894, | |
| "learning_rate": 9.876958588736371e-06, | |
| "loss": 0.1683, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.07091757387247279, | |
| "grad_norm": 1.2507603637095628, | |
| "learning_rate": 9.876419394481363e-06, | |
| "loss": 0.1958, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.07107309486780715, | |
| "grad_norm": 1.7806763122908775, | |
| "learning_rate": 9.87587903615704e-06, | |
| "loss": 0.2466, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.07122861586314153, | |
| "grad_norm": 1.0570385231053188, | |
| "learning_rate": 9.875337513892395e-06, | |
| "loss": 0.1336, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.07138413685847589, | |
| "grad_norm": 1.8093621923009064, | |
| "learning_rate": 9.874794827816696e-06, | |
| "loss": 0.245, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.07153965785381027, | |
| "grad_norm": 1.6343174119313473, | |
| "learning_rate": 9.874250978059489e-06, | |
| "loss": 0.1878, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.07169517884914463, | |
| "grad_norm": 1.2474757406216732, | |
| "learning_rate": 9.873705964750603e-06, | |
| "loss": 0.201, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.071850699844479, | |
| "grad_norm": 0.9854370189019162, | |
| "learning_rate": 9.873159788020135e-06, | |
| "loss": 0.1572, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.07200622083981338, | |
| "grad_norm": 1.2046716423202313, | |
| "learning_rate": 9.872612447998466e-06, | |
| "loss": 0.1644, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.07216174183514774, | |
| "grad_norm": 1.6657683984708445, | |
| "learning_rate": 9.872063944816257e-06, | |
| "loss": 0.2026, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.07231726283048212, | |
| "grad_norm": 1.6319780353610651, | |
| "learning_rate": 9.871514278604439e-06, | |
| "loss": 0.2361, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.07247278382581648, | |
| "grad_norm": 0.930626270347552, | |
| "learning_rate": 9.870963449494228e-06, | |
| "loss": 0.2334, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.07262830482115086, | |
| "grad_norm": 1.7347785771237878, | |
| "learning_rate": 9.870411457617115e-06, | |
| "loss": 0.3121, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.07278382581648522, | |
| "grad_norm": 1.6861297399111428, | |
| "learning_rate": 9.869858303104864e-06, | |
| "loss": 0.2234, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.0729393468118196, | |
| "grad_norm": 2.2175613812233856, | |
| "learning_rate": 9.869303986089525e-06, | |
| "loss": 0.215, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.07309486780715396, | |
| "grad_norm": 1.2151103786584494, | |
| "learning_rate": 9.86874850670342e-06, | |
| "loss": 0.143, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.07325038880248834, | |
| "grad_norm": 1.8347498082665927, | |
| "learning_rate": 9.868191865079149e-06, | |
| "loss": 0.1847, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.0734059097978227, | |
| "grad_norm": 0.7662001443118179, | |
| "learning_rate": 9.867634061349592e-06, | |
| "loss": 0.2132, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.07356143079315708, | |
| "grad_norm": 1.127229878211817, | |
| "learning_rate": 9.8670750956479e-06, | |
| "loss": 0.2405, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.07371695178849144, | |
| "grad_norm": 0.8919765028163983, | |
| "learning_rate": 9.866514968107511e-06, | |
| "loss": 0.2187, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.07387247278382582, | |
| "grad_norm": 0.8318099634868261, | |
| "learning_rate": 9.865953678862133e-06, | |
| "loss": 0.149, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.07402799377916018, | |
| "grad_norm": 1.577340616348031, | |
| "learning_rate": 9.865391228045753e-06, | |
| "loss": 0.2319, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.07418351477449456, | |
| "grad_norm": 1.116181816359047, | |
| "learning_rate": 9.864827615792637e-06, | |
| "loss": 0.1901, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.07433903576982892, | |
| "grad_norm": 1.105109643192386, | |
| "learning_rate": 9.864262842237327e-06, | |
| "loss": 0.2011, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.0744945567651633, | |
| "grad_norm": 1.9701207318396636, | |
| "learning_rate": 9.863696907514641e-06, | |
| "loss": 0.2409, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.07465007776049767, | |
| "grad_norm": 2.2498632028053507, | |
| "learning_rate": 9.863129811759678e-06, | |
| "loss": 0.3829, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.07480559875583204, | |
| "grad_norm": 1.1224194434111838, | |
| "learning_rate": 9.86256155510781e-06, | |
| "loss": 0.2114, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.07496111975116641, | |
| "grad_norm": 1.5539407325523458, | |
| "learning_rate": 9.861992137694687e-06, | |
| "loss": 0.1976, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.07511664074650078, | |
| "grad_norm": 1.962092802549792, | |
| "learning_rate": 9.86142155965624e-06, | |
| "loss": 0.2725, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.07527216174183515, | |
| "grad_norm": 0.8983695148666645, | |
| "learning_rate": 9.860849821128668e-06, | |
| "loss": 0.154, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.07542768273716952, | |
| "grad_norm": 1.398592267234838, | |
| "learning_rate": 9.86027692224846e-06, | |
| "loss": 0.1497, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.07558320373250389, | |
| "grad_norm": 1.0403186420901969, | |
| "learning_rate": 9.859702863152372e-06, | |
| "loss": 0.1936, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.07573872472783826, | |
| "grad_norm": 0.7470818354767621, | |
| "learning_rate": 9.859127643977438e-06, | |
| "loss": 0.1523, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.07589424572317263, | |
| "grad_norm": 1.2067693893481815, | |
| "learning_rate": 9.858551264860972e-06, | |
| "loss": 0.3168, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.076049766718507, | |
| "grad_norm": 1.5295551443098423, | |
| "learning_rate": 9.857973725940565e-06, | |
| "loss": 0.2194, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.07620528771384137, | |
| "grad_norm": 1.618418958541224, | |
| "learning_rate": 9.857395027354085e-06, | |
| "loss": 0.2209, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.07636080870917573, | |
| "grad_norm": 1.1696631104347366, | |
| "learning_rate": 9.856815169239671e-06, | |
| "loss": 0.1993, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.07651632970451011, | |
| "grad_norm": 1.4918786793556023, | |
| "learning_rate": 9.856234151735744e-06, | |
| "loss": 0.2657, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.07667185069984447, | |
| "grad_norm": 1.3100404095494855, | |
| "learning_rate": 9.855651974981005e-06, | |
| "loss": 0.2832, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.07682737169517885, | |
| "grad_norm": 13.98784357990924, | |
| "learning_rate": 9.855068639114425e-06, | |
| "loss": 0.2488, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.07698289269051321, | |
| "grad_norm": 1.3956332181045448, | |
| "learning_rate": 9.854484144275254e-06, | |
| "loss": 0.225, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.07713841368584759, | |
| "grad_norm": 1.1858198275947147, | |
| "learning_rate": 9.853898490603018e-06, | |
| "loss": 0.2041, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.07729393468118195, | |
| "grad_norm": 0.765411378364051, | |
| "learning_rate": 9.853311678237524e-06, | |
| "loss": 0.1492, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.07744945567651633, | |
| "grad_norm": 1.2288325537770441, | |
| "learning_rate": 9.85272370731885e-06, | |
| "loss": 0.1773, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.0776049766718507, | |
| "grad_norm": 1.3901203640607709, | |
| "learning_rate": 9.852134577987353e-06, | |
| "loss": 0.2091, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.07776049766718507, | |
| "grad_norm": 1.5991626946866644, | |
| "learning_rate": 9.85154429038367e-06, | |
| "loss": 0.2485, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07776049766718507, | |
| "eval_loss": 0.22800126671791077, | |
| "eval_runtime": 9.4446, | |
| "eval_samples_per_second": 2.753, | |
| "eval_steps_per_second": 0.741, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07791601866251945, | |
| "grad_norm": 0.9946822389547595, | |
| "learning_rate": 9.850952844648705e-06, | |
| "loss": 0.2324, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.07807153965785381, | |
| "grad_norm": 1.088817573789371, | |
| "learning_rate": 9.850360240923647e-06, | |
| "loss": 0.1813, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.07822706065318819, | |
| "grad_norm": 5.945777339639669, | |
| "learning_rate": 9.849766479349959e-06, | |
| "loss": 0.1976, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.07838258164852255, | |
| "grad_norm": 0.8593394406117729, | |
| "learning_rate": 9.84917156006938e-06, | |
| "loss": 0.2474, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.07853810264385692, | |
| "grad_norm": 1.3930666133589364, | |
| "learning_rate": 9.848575483223925e-06, | |
| "loss": 0.215, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.07869362363919129, | |
| "grad_norm": 1.6493288101835173, | |
| "learning_rate": 9.84797824895589e-06, | |
| "loss": 0.303, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.07884914463452566, | |
| "grad_norm": 1.1106903817577367, | |
| "learning_rate": 9.847379857407835e-06, | |
| "loss": 0.1654, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.07900466562986003, | |
| "grad_norm": 1.166896696404847, | |
| "learning_rate": 9.846780308722612e-06, | |
| "loss": 0.2046, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.0791601866251944, | |
| "grad_norm": 1.7221901123414272, | |
| "learning_rate": 9.846179603043338e-06, | |
| "loss": 0.2543, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.07931570762052877, | |
| "grad_norm": 1.0398664154595585, | |
| "learning_rate": 9.845577740513409e-06, | |
| "loss": 0.2616, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.07947122861586314, | |
| "grad_norm": 1.2062182369254026, | |
| "learning_rate": 9.8449747212765e-06, | |
| "loss": 0.1641, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.0796267496111975, | |
| "grad_norm": 1.3859318575453086, | |
| "learning_rate": 9.84437054547656e-06, | |
| "loss": 0.193, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.07978227060653188, | |
| "grad_norm": 3.5056235741823523, | |
| "learning_rate": 9.843765213257814e-06, | |
| "loss": 0.2399, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.07993779160186625, | |
| "grad_norm": 1.2578551416521373, | |
| "learning_rate": 9.843158724764762e-06, | |
| "loss": 0.2177, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.08009331259720062, | |
| "grad_norm": 1.4118043035204642, | |
| "learning_rate": 9.842551080142182e-06, | |
| "loss": 0.21, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.080248833592535, | |
| "grad_norm": 1.1155160124053434, | |
| "learning_rate": 9.841942279535128e-06, | |
| "loss": 0.2128, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.08040435458786936, | |
| "grad_norm": 1.0287833439256027, | |
| "learning_rate": 9.84133232308893e-06, | |
| "loss": 0.1846, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.08055987558320374, | |
| "grad_norm": 2.2894965228305377, | |
| "learning_rate": 9.84072121094919e-06, | |
| "loss": 0.1814, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.0807153965785381, | |
| "grad_norm": 1.345886098139959, | |
| "learning_rate": 9.84010894326179e-06, | |
| "loss": 0.1912, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.08087091757387248, | |
| "grad_norm": 1.9234609876851483, | |
| "learning_rate": 9.83949552017289e-06, | |
| "loss": 0.2982, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.08102643856920684, | |
| "grad_norm": 1.2452886345823744, | |
| "learning_rate": 9.83888094182892e-06, | |
| "loss": 0.2144, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.08118195956454122, | |
| "grad_norm": 1.2711995935698062, | |
| "learning_rate": 9.838265208376584e-06, | |
| "loss": 0.1799, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.08133748055987558, | |
| "grad_norm": 1.0755729955519457, | |
| "learning_rate": 9.837648319962876e-06, | |
| "loss": 0.3311, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.08149300155520996, | |
| "grad_norm": 1.5025152130217085, | |
| "learning_rate": 9.837030276735049e-06, | |
| "loss": 0.203, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.08164852255054432, | |
| "grad_norm": 1.4271542860149822, | |
| "learning_rate": 9.83641107884064e-06, | |
| "loss": 0.2055, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.0818040435458787, | |
| "grad_norm": 1.1896665999932865, | |
| "learning_rate": 9.83579072642746e-06, | |
| "loss": 0.2191, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.08195956454121306, | |
| "grad_norm": 1.6391797544527267, | |
| "learning_rate": 9.835169219643597e-06, | |
| "loss": 0.2164, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.08211508553654744, | |
| "grad_norm": 1.3905263994766632, | |
| "learning_rate": 9.834546558637412e-06, | |
| "loss": 0.2188, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.0822706065318818, | |
| "grad_norm": 1.1325886941547982, | |
| "learning_rate": 9.833922743557545e-06, | |
| "loss": 0.3596, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.08242612752721618, | |
| "grad_norm": 1.58458236573862, | |
| "learning_rate": 9.833297774552905e-06, | |
| "loss": 0.2725, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.08258164852255054, | |
| "grad_norm": 1.2630288499628133, | |
| "learning_rate": 9.832671651772685e-06, | |
| "loss": 0.3327, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.08273716951788491, | |
| "grad_norm": 1.1472998381036559, | |
| "learning_rate": 9.832044375366347e-06, | |
| "loss": 0.1758, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.08289269051321929, | |
| "grad_norm": 1.148175513948538, | |
| "learning_rate": 9.831415945483634e-06, | |
| "loss": 0.189, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.08304821150855365, | |
| "grad_norm": 1.2010115460022994, | |
| "learning_rate": 9.830786362274556e-06, | |
| "loss": 0.2065, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.08320373250388803, | |
| "grad_norm": 1.357353814240526, | |
| "learning_rate": 9.830155625889406e-06, | |
| "loss": 0.1505, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.0833592534992224, | |
| "grad_norm": 1.2541527971168078, | |
| "learning_rate": 9.829523736478748e-06, | |
| "loss": 0.2309, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.08351477449455677, | |
| "grad_norm": 1.0453169347517781, | |
| "learning_rate": 9.828890694193425e-06, | |
| "loss": 0.1593, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.08367029548989113, | |
| "grad_norm": 1.256435896986176, | |
| "learning_rate": 9.828256499184553e-06, | |
| "loss": 0.2081, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.08382581648522551, | |
| "grad_norm": 1.4617851677608784, | |
| "learning_rate": 9.827621151603522e-06, | |
| "loss": 0.2181, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.08398133748055987, | |
| "grad_norm": 2.512069587946666, | |
| "learning_rate": 9.826984651601998e-06, | |
| "loss": 0.4003, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.08413685847589425, | |
| "grad_norm": 1.1168842922612399, | |
| "learning_rate": 9.826346999331923e-06, | |
| "loss": 0.2823, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.08429237947122861, | |
| "grad_norm": 1.0417831453973136, | |
| "learning_rate": 9.825708194945514e-06, | |
| "loss": 0.1889, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.08444790046656299, | |
| "grad_norm": 0.9096481087872343, | |
| "learning_rate": 9.82506823859526e-06, | |
| "loss": 0.2351, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.08460342146189735, | |
| "grad_norm": 1.0430082881953087, | |
| "learning_rate": 9.824427130433932e-06, | |
| "loss": 0.1953, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.08475894245723173, | |
| "grad_norm": 0.6608850743713712, | |
| "learning_rate": 9.823784870614568e-06, | |
| "loss": 0.1854, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.08491446345256609, | |
| "grad_norm": 0.9535990803944258, | |
| "learning_rate": 9.823141459290486e-06, | |
| "loss": 0.3623, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.08506998444790047, | |
| "grad_norm": 1.2084813471627978, | |
| "learning_rate": 9.822496896615276e-06, | |
| "loss": 0.2088, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.08522550544323483, | |
| "grad_norm": 1.751880921507202, | |
| "learning_rate": 9.821851182742806e-06, | |
| "loss": 0.2367, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.08538102643856921, | |
| "grad_norm": 0.859776642879622, | |
| "learning_rate": 9.821204317827214e-06, | |
| "loss": 0.249, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.08553654743390357, | |
| "grad_norm": 1.127529266910784, | |
| "learning_rate": 9.820556302022916e-06, | |
| "loss": 0.2038, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.08569206842923795, | |
| "grad_norm": 1.1762380712487397, | |
| "learning_rate": 9.819907135484607e-06, | |
| "loss": 0.1408, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.08584758942457232, | |
| "grad_norm": 1.1841316789710945, | |
| "learning_rate": 9.819256818367247e-06, | |
| "loss": 0.1971, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.08600311041990669, | |
| "grad_norm": 0.9978225930526609, | |
| "learning_rate": 9.818605350826078e-06, | |
| "loss": 0.2221, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.08615863141524106, | |
| "grad_norm": 1.6694424755142652, | |
| "learning_rate": 9.817952733016614e-06, | |
| "loss": 0.1549, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.08631415241057543, | |
| "grad_norm": 0.9346983450738274, | |
| "learning_rate": 9.817298965094644e-06, | |
| "loss": 0.1579, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.0864696734059098, | |
| "grad_norm": 1.147526345911482, | |
| "learning_rate": 9.816644047216231e-06, | |
| "loss": 0.1873, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.08662519440124417, | |
| "grad_norm": 1.1886850012764587, | |
| "learning_rate": 9.815987979537713e-06, | |
| "loss": 0.2347, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.08678071539657854, | |
| "grad_norm": 1.6793433753087175, | |
| "learning_rate": 9.815330762215704e-06, | |
| "loss": 0.2773, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.0869362363919129, | |
| "grad_norm": 0.7389091927152867, | |
| "learning_rate": 9.81467239540709e-06, | |
| "loss": 0.2376, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.08709175738724728, | |
| "grad_norm": 1.5501383478894555, | |
| "learning_rate": 9.814012879269031e-06, | |
| "loss": 0.249, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.08724727838258164, | |
| "grad_norm": 1.985092307546573, | |
| "learning_rate": 9.813352213958966e-06, | |
| "loss": 0.2293, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.08740279937791602, | |
| "grad_norm": 1.1408911673993625, | |
| "learning_rate": 9.812690399634601e-06, | |
| "loss": 0.29, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.08755832037325038, | |
| "grad_norm": 1.2461126532920535, | |
| "learning_rate": 9.812027436453924e-06, | |
| "loss": 0.2783, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.08771384136858476, | |
| "grad_norm": 1.764223151926025, | |
| "learning_rate": 9.81136332457519e-06, | |
| "loss": 0.2528, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.08786936236391912, | |
| "grad_norm": 1.0618642840366128, | |
| "learning_rate": 9.810698064156935e-06, | |
| "loss": 0.1723, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.0880248833592535, | |
| "grad_norm": 0.8569330765683667, | |
| "learning_rate": 9.810031655357964e-06, | |
| "loss": 0.2241, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.08818040435458786, | |
| "grad_norm": 1.0553303848822568, | |
| "learning_rate": 9.80936409833736e-06, | |
| "loss": 0.2312, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.08833592534992224, | |
| "grad_norm": 1.8702866312005988, | |
| "learning_rate": 9.808695393254474e-06, | |
| "loss": 0.1949, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.08849144634525662, | |
| "grad_norm": 0.9476538253542002, | |
| "learning_rate": 9.808025540268939e-06, | |
| "loss": 0.1783, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.08864696734059098, | |
| "grad_norm": 1.4661601306937122, | |
| "learning_rate": 9.80735453954066e-06, | |
| "loss": 0.2941, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.08880248833592536, | |
| "grad_norm": 1.1865752816456114, | |
| "learning_rate": 9.80668239122981e-06, | |
| "loss": 0.2196, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.08895800933125972, | |
| "grad_norm": 0.9682721759722641, | |
| "learning_rate": 9.80600909549684e-06, | |
| "loss": 0.2453, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.0891135303265941, | |
| "grad_norm": 1.0402552655035497, | |
| "learning_rate": 9.805334652502478e-06, | |
| "loss": 0.2528, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.08926905132192846, | |
| "grad_norm": 1.1058208608284787, | |
| "learning_rate": 9.804659062407721e-06, | |
| "loss": 0.1704, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.08942457231726283, | |
| "grad_norm": 0.9300562072054855, | |
| "learning_rate": 9.803982325373843e-06, | |
| "loss": 0.241, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.0895800933125972, | |
| "grad_norm": 1.3452145435832572, | |
| "learning_rate": 9.803304441562391e-06, | |
| "loss": 0.179, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.08973561430793157, | |
| "grad_norm": 0.934714522466104, | |
| "learning_rate": 9.802625411135183e-06, | |
| "loss": 0.2131, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.08989113530326594, | |
| "grad_norm": 1.2723518042915498, | |
| "learning_rate": 9.801945234254315e-06, | |
| "loss": 0.2342, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.09004665629860031, | |
| "grad_norm": 2.11692073632197, | |
| "learning_rate": 9.801263911082154e-06, | |
| "loss": 0.2148, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.09020217729393468, | |
| "grad_norm": 2.6365326907523396, | |
| "learning_rate": 9.800581441781342e-06, | |
| "loss": 0.2787, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.09035769828926905, | |
| "grad_norm": 1.3369047254369875, | |
| "learning_rate": 9.799897826514793e-06, | |
| "loss": 0.2365, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.09051321928460342, | |
| "grad_norm": 0.9493060685693816, | |
| "learning_rate": 9.799213065445696e-06, | |
| "loss": 0.1656, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.09066874027993779, | |
| "grad_norm": 1.0470819909783555, | |
| "learning_rate": 9.798527158737512e-06, | |
| "loss": 0.1578, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.09082426127527216, | |
| "grad_norm": 1.0969444747176942, | |
| "learning_rate": 9.797840106553977e-06, | |
| "loss": 0.2095, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.09097978227060653, | |
| "grad_norm": 1.6035875172395766, | |
| "learning_rate": 9.797151909059102e-06, | |
| "loss": 0.2682, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.0911353032659409, | |
| "grad_norm": 1.3049640527657593, | |
| "learning_rate": 9.796462566417169e-06, | |
| "loss": 0.2537, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.09129082426127527, | |
| "grad_norm": 1.365745492042764, | |
| "learning_rate": 9.79577207879273e-06, | |
| "loss": 0.2065, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.09144634525660965, | |
| "grad_norm": 0.9500261347653985, | |
| "learning_rate": 9.795080446350616e-06, | |
| "loss": 0.1885, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.09160186625194401, | |
| "grad_norm": 1.5405453397493063, | |
| "learning_rate": 9.79438766925593e-06, | |
| "loss": 0.2507, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.09175738724727839, | |
| "grad_norm": 0.9919977440587929, | |
| "learning_rate": 9.79369374767405e-06, | |
| "loss": 0.1607, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.09191290824261275, | |
| "grad_norm": 1.2052697190695243, | |
| "learning_rate": 9.79299868177062e-06, | |
| "loss": 0.2247, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.09206842923794713, | |
| "grad_norm": 1.5911347684916193, | |
| "learning_rate": 9.792302471711564e-06, | |
| "loss": 0.1812, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.09222395023328149, | |
| "grad_norm": 1.3772469912987155, | |
| "learning_rate": 9.791605117663076e-06, | |
| "loss": 0.1567, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 0.09237947122861587, | |
| "grad_norm": 1.456752513640415, | |
| "learning_rate": 9.790906619791627e-06, | |
| "loss": 0.2009, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.09253499222395023, | |
| "grad_norm": 0.9824754188966437, | |
| "learning_rate": 9.790206978263955e-06, | |
| "loss": 0.2041, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.0926905132192846, | |
| "grad_norm": 1.1576177882724517, | |
| "learning_rate": 9.789506193247075e-06, | |
| "loss": 0.2304, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.09284603421461897, | |
| "grad_norm": 1.3814578099918997, | |
| "learning_rate": 9.788804264908276e-06, | |
| "loss": 0.1935, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.09300155520995335, | |
| "grad_norm": 0.8483069008778095, | |
| "learning_rate": 9.788101193415116e-06, | |
| "loss": 0.2148, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.09315707620528771, | |
| "grad_norm": 1.3477202886979611, | |
| "learning_rate": 9.787396978935431e-06, | |
| "loss": 0.23, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 0.09331259720062209, | |
| "grad_norm": 1.4372703771133322, | |
| "learning_rate": 9.786691621637322e-06, | |
| "loss": 0.2496, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.09331259720062209, | |
| "eval_loss": 0.22426502406597137, | |
| "eval_runtime": 9.4405, | |
| "eval_samples_per_second": 2.754, | |
| "eval_steps_per_second": 0.741, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.09346811819595645, | |
| "grad_norm": 1.9416761367047068, | |
| "learning_rate": 9.785985121689171e-06, | |
| "loss": 0.6927, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 0.09362363919129083, | |
| "grad_norm": 1.268764907148312, | |
| "learning_rate": 9.785277479259629e-06, | |
| "loss": 0.2501, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 0.09377916018662519, | |
| "grad_norm": 2.3273439454641514, | |
| "learning_rate": 9.784568694517618e-06, | |
| "loss": 0.2469, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 0.09393468118195956, | |
| "grad_norm": 1.8747313801721335, | |
| "learning_rate": 9.783858767632338e-06, | |
| "loss": 0.2289, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 0.09409020217729394, | |
| "grad_norm": 1.2586569667037595, | |
| "learning_rate": 9.783147698773257e-06, | |
| "loss": 0.1962, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.0942457231726283, | |
| "grad_norm": 1.496316694651238, | |
| "learning_rate": 9.782435488110116e-06, | |
| "loss": 0.298, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 0.09440124416796268, | |
| "grad_norm": 1.1801510466185432, | |
| "learning_rate": 9.781722135812932e-06, | |
| "loss": 0.2189, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 0.09455676516329704, | |
| "grad_norm": 1.2565748248573585, | |
| "learning_rate": 9.78100764205199e-06, | |
| "loss": 0.2186, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.09471228615863142, | |
| "grad_norm": 0.9391168040034623, | |
| "learning_rate": 9.780292006997849e-06, | |
| "loss": 0.2144, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 0.09486780715396578, | |
| "grad_norm": 1.1387381134081225, | |
| "learning_rate": 9.779575230821344e-06, | |
| "loss": 0.1718, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.09502332814930016, | |
| "grad_norm": 1.0177855034745955, | |
| "learning_rate": 9.778857313693578e-06, | |
| "loss": 0.1586, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 0.09517884914463452, | |
| "grad_norm": 1.4624255805438011, | |
| "learning_rate": 9.778138255785928e-06, | |
| "loss": 0.2697, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 0.0953343701399689, | |
| "grad_norm": 1.0796167649791846, | |
| "learning_rate": 9.77741805727004e-06, | |
| "loss": 0.2668, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 0.09548989113530326, | |
| "grad_norm": 2.1747859377128806, | |
| "learning_rate": 9.776696718317842e-06, | |
| "loss": 0.2117, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 0.09564541213063764, | |
| "grad_norm": 1.6173977205310859, | |
| "learning_rate": 9.775974239101522e-06, | |
| "loss": 0.2048, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.095800933125972, | |
| "grad_norm": 1.281075534048029, | |
| "learning_rate": 9.775250619793548e-06, | |
| "loss": 0.2218, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 0.09595645412130638, | |
| "grad_norm": 1.5623409338338163, | |
| "learning_rate": 9.77452586056666e-06, | |
| "loss": 0.2843, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 0.09611197511664074, | |
| "grad_norm": 0.920135780872905, | |
| "learning_rate": 9.773799961593862e-06, | |
| "loss": 0.218, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 0.09626749611197512, | |
| "grad_norm": 1.6644765009913491, | |
| "learning_rate": 9.773072923048443e-06, | |
| "loss": 0.277, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 0.09642301710730948, | |
| "grad_norm": 1.0758387537045102, | |
| "learning_rate": 9.772344745103955e-06, | |
| "loss": 0.2405, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.09657853810264386, | |
| "grad_norm": 1.1751354263981124, | |
| "learning_rate": 9.77161542793422e-06, | |
| "loss": 0.2362, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 0.09673405909797823, | |
| "grad_norm": 2.7957127911749655, | |
| "learning_rate": 9.770884971713344e-06, | |
| "loss": 0.178, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 0.0968895800933126, | |
| "grad_norm": 5.021758252286217, | |
| "learning_rate": 9.770153376615692e-06, | |
| "loss": 0.2095, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 0.09704510108864697, | |
| "grad_norm": 0.8518883317455118, | |
| "learning_rate": 9.769420642815905e-06, | |
| "loss": 0.2174, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 0.09720062208398134, | |
| "grad_norm": 1.0603512343033086, | |
| "learning_rate": 9.7686867704889e-06, | |
| "loss": 0.2437, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.09735614307931571, | |
| "grad_norm": 2.7767054670419067, | |
| "learning_rate": 9.767951759809861e-06, | |
| "loss": 0.3072, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 0.09751166407465008, | |
| "grad_norm": 0.875830402681162, | |
| "learning_rate": 9.767215610954246e-06, | |
| "loss": 0.1865, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 0.09766718506998445, | |
| "grad_norm": 1.1746324049289305, | |
| "learning_rate": 9.766478324097784e-06, | |
| "loss": 0.1775, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 0.09782270606531882, | |
| "grad_norm": 1.3198405804921558, | |
| "learning_rate": 9.765739899416474e-06, | |
| "loss": 0.2202, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 0.09797822706065319, | |
| "grad_norm": 0.9040537149469751, | |
| "learning_rate": 9.76500033708659e-06, | |
| "loss": 0.134, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.09813374805598755, | |
| "grad_norm": 1.1116680855923542, | |
| "learning_rate": 9.764259637284674e-06, | |
| "loss": 0.2413, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 0.09828926905132193, | |
| "grad_norm": 1.816511140625042, | |
| "learning_rate": 9.763517800187543e-06, | |
| "loss": 0.1881, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 0.0984447900466563, | |
| "grad_norm": 1.1808179637924803, | |
| "learning_rate": 9.762774825972284e-06, | |
| "loss": 0.1797, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 0.09860031104199067, | |
| "grad_norm": 0.9260180174403776, | |
| "learning_rate": 9.762030714816255e-06, | |
| "loss": 0.1692, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 0.09875583203732503, | |
| "grad_norm": 0.9809663827224766, | |
| "learning_rate": 9.761285466897086e-06, | |
| "loss": 0.1971, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.09891135303265941, | |
| "grad_norm": 1.1818951833176021, | |
| "learning_rate": 9.760539082392678e-06, | |
| "loss": 0.3061, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 0.09906687402799377, | |
| "grad_norm": 1.5126562950843534, | |
| "learning_rate": 9.759791561481201e-06, | |
| "loss": 0.2214, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 0.09922239502332815, | |
| "grad_norm": 1.1563368410762391, | |
| "learning_rate": 9.759042904341103e-06, | |
| "loss": 0.1879, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 0.09937791601866251, | |
| "grad_norm": 1.7465834025848672, | |
| "learning_rate": 9.758293111151094e-06, | |
| "loss": 0.2936, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 0.09953343701399689, | |
| "grad_norm": 1.4420901394687415, | |
| "learning_rate": 9.757542182090165e-06, | |
| "loss": 0.1977, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.09968895800933127, | |
| "grad_norm": 1.4320029014579423, | |
| "learning_rate": 9.756790117337569e-06, | |
| "loss": 0.235, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 0.09984447900466563, | |
| "grad_norm": 1.0178157213981396, | |
| "learning_rate": 9.756036917072837e-06, | |
| "loss": 0.228, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.634337451034447, | |
| "learning_rate": 9.755282581475769e-06, | |
| "loss": 0.174, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 0.10015552099533437, | |
| "grad_norm": 1.3123622467109133, | |
| "learning_rate": 9.754527110726432e-06, | |
| "loss": 0.1854, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 0.10031104199066875, | |
| "grad_norm": 1.3700959071130703, | |
| "learning_rate": 9.753770505005171e-06, | |
| "loss": 0.271, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.10046656298600311, | |
| "grad_norm": 1.5589446061903662, | |
| "learning_rate": 9.753012764492596e-06, | |
| "loss": 0.1669, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 0.10062208398133748, | |
| "grad_norm": 1.3813884723817376, | |
| "learning_rate": 9.752253889369592e-06, | |
| "loss": 0.1525, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 0.10077760497667185, | |
| "grad_norm": 1.3858844961504873, | |
| "learning_rate": 9.75149387981731e-06, | |
| "loss": 0.2673, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 0.10093312597200622, | |
| "grad_norm": 0.9436000404569762, | |
| "learning_rate": 9.75073273601718e-06, | |
| "loss": 0.2058, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 0.10108864696734059, | |
| "grad_norm": 1.4599521330072638, | |
| "learning_rate": 9.749970458150893e-06, | |
| "loss": 0.2145, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.10124416796267496, | |
| "grad_norm": 1.3455835009343615, | |
| "learning_rate": 9.749207046400415e-06, | |
| "loss": 0.2353, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 0.10139968895800933, | |
| "grad_norm": 1.6299219848605395, | |
| "learning_rate": 9.748442500947988e-06, | |
| "loss": 0.2582, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 0.1015552099533437, | |
| "grad_norm": 2.1538893724554966, | |
| "learning_rate": 9.747676821976116e-06, | |
| "loss": 0.2128, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 0.10171073094867807, | |
| "grad_norm": 1.1642628978054306, | |
| "learning_rate": 9.746910009667577e-06, | |
| "loss": 0.2092, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 0.10186625194401244, | |
| "grad_norm": 0.9776673463806724, | |
| "learning_rate": 9.746142064205422e-06, | |
| "loss": 0.176, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.1020217729393468, | |
| "grad_norm": 1.350687490540933, | |
| "learning_rate": 9.745372985772968e-06, | |
| "loss": 0.2426, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 0.10217729393468118, | |
| "grad_norm": 1.7681295289484116, | |
| "learning_rate": 9.744602774553807e-06, | |
| "loss": 0.2204, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 0.10233281493001556, | |
| "grad_norm": 0.9199423619051535, | |
| "learning_rate": 9.743831430731796e-06, | |
| "loss": 0.1647, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 0.10248833592534992, | |
| "grad_norm": 5.138426947168042, | |
| "learning_rate": 9.743058954491067e-06, | |
| "loss": 0.2107, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 0.1026438569206843, | |
| "grad_norm": 1.446510693113484, | |
| "learning_rate": 9.742285346016024e-06, | |
| "loss": 0.2379, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.10279937791601866, | |
| "grad_norm": 1.4833539837619547, | |
| "learning_rate": 9.741510605491335e-06, | |
| "loss": 0.1714, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 0.10295489891135304, | |
| "grad_norm": 1.3228899574182327, | |
| "learning_rate": 9.74073473310194e-06, | |
| "loss": 0.2388, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 0.1031104199066874, | |
| "grad_norm": 1.0712502633957945, | |
| "learning_rate": 9.739957729033054e-06, | |
| "loss": 0.2289, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 0.10326594090202178, | |
| "grad_norm": 1.1587775220461487, | |
| "learning_rate": 9.739179593470156e-06, | |
| "loss": 0.1741, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 0.10342146189735614, | |
| "grad_norm": 1.0260279383302884, | |
| "learning_rate": 9.738400326599e-06, | |
| "loss": 0.2412, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.10357698289269052, | |
| "grad_norm": 1.491042707966078, | |
| "learning_rate": 9.737619928605605e-06, | |
| "loss": 0.1833, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 0.10373250388802488, | |
| "grad_norm": 1.6710832262506907, | |
| "learning_rate": 9.736838399676266e-06, | |
| "loss": 0.1712, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 0.10388802488335926, | |
| "grad_norm": 1.4001413138925893, | |
| "learning_rate": 9.736055739997543e-06, | |
| "loss": 0.2739, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 0.10404354587869362, | |
| "grad_norm": 1.0413982567358797, | |
| "learning_rate": 9.735271949756269e-06, | |
| "loss": 0.1655, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 0.104199066874028, | |
| "grad_norm": 2.062452927969995, | |
| "learning_rate": 9.734487029139544e-06, | |
| "loss": 0.2384, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.10435458786936236, | |
| "grad_norm": 1.1419346714711909, | |
| "learning_rate": 9.733700978334741e-06, | |
| "loss": 0.2176, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 0.10451010886469674, | |
| "grad_norm": 1.4704145498498906, | |
| "learning_rate": 9.7329137975295e-06, | |
| "loss": 0.2281, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 0.1046656298600311, | |
| "grad_norm": 1.7257595787120843, | |
| "learning_rate": 9.732125486911733e-06, | |
| "loss": 0.1964, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 0.10482115085536547, | |
| "grad_norm": 1.596182048450316, | |
| "learning_rate": 9.731336046669621e-06, | |
| "loss": 0.1863, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 0.10497667185069985, | |
| "grad_norm": 1.741565962255971, | |
| "learning_rate": 9.730545476991613e-06, | |
| "loss": 0.1358, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.10513219284603421, | |
| "grad_norm": 1.2105023861624677, | |
| "learning_rate": 9.729753778066431e-06, | |
| "loss": 0.2757, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 0.10528771384136859, | |
| "grad_norm": 1.1483441998296096, | |
| "learning_rate": 9.728960950083062e-06, | |
| "loss": 0.2327, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 0.10544323483670295, | |
| "grad_norm": 2.6827889453865255, | |
| "learning_rate": 9.728166993230768e-06, | |
| "loss": 0.2841, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 0.10559875583203733, | |
| "grad_norm": 1.3531013447523792, | |
| "learning_rate": 9.727371907699075e-06, | |
| "loss": 0.2742, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 0.1057542768273717, | |
| "grad_norm": 1.4165422039945663, | |
| "learning_rate": 9.726575693677782e-06, | |
| "loss": 0.1733, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.10590979782270607, | |
| "grad_norm": 1.1633994693280907, | |
| "learning_rate": 9.725778351356958e-06, | |
| "loss": 0.1752, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 0.10606531881804043, | |
| "grad_norm": 1.4801298044861129, | |
| "learning_rate": 9.724979880926937e-06, | |
| "loss": 0.1654, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 0.10622083981337481, | |
| "grad_norm": 1.038476254792903, | |
| "learning_rate": 9.724180282578327e-06, | |
| "loss": 0.1796, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 0.10637636080870917, | |
| "grad_norm": 1.1715546692057253, | |
| "learning_rate": 9.723379556502002e-06, | |
| "loss": 0.2615, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 0.10653188180404355, | |
| "grad_norm": 0.9669903775949065, | |
| "learning_rate": 9.722577702889106e-06, | |
| "loss": 0.2217, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.10668740279937791, | |
| "grad_norm": 0.9554324370526551, | |
| "learning_rate": 9.721774721931056e-06, | |
| "loss": 0.2067, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 0.10684292379471229, | |
| "grad_norm": 1.5055382554521828, | |
| "learning_rate": 9.720970613819532e-06, | |
| "loss": 0.2886, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 0.10699844479004665, | |
| "grad_norm": 1.4701983051316598, | |
| "learning_rate": 9.720165378746486e-06, | |
| "loss": 0.2461, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 0.10715396578538103, | |
| "grad_norm": 0.8955915121278603, | |
| "learning_rate": 9.719359016904137e-06, | |
| "loss": 0.1296, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 0.10730948678071539, | |
| "grad_norm": 1.1365940197104127, | |
| "learning_rate": 9.718551528484979e-06, | |
| "loss": 0.1756, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.10746500777604977, | |
| "grad_norm": 1.1309854500820393, | |
| "learning_rate": 9.717742913681769e-06, | |
| "loss": 0.1685, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 0.10762052877138413, | |
| "grad_norm": 1.228647590848163, | |
| "learning_rate": 9.716933172687533e-06, | |
| "loss": 0.1988, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 0.1077760497667185, | |
| "grad_norm": 1.8437087557242553, | |
| "learning_rate": 9.71612230569557e-06, | |
| "loss": 0.2259, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 0.10793157076205288, | |
| "grad_norm": 2.190128145243616, | |
| "learning_rate": 9.715310312899445e-06, | |
| "loss": 0.1593, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 0.10808709175738725, | |
| "grad_norm": 1.9542747095305757, | |
| "learning_rate": 9.714497194492988e-06, | |
| "loss": 0.1942, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.10824261275272162, | |
| "grad_norm": 1.190017072453523, | |
| "learning_rate": 9.713682950670305e-06, | |
| "loss": 0.184, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 0.10839813374805599, | |
| "grad_norm": 1.3702585397170965, | |
| "learning_rate": 9.712867581625769e-06, | |
| "loss": 0.2747, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 0.10855365474339036, | |
| "grad_norm": 1.1224607857205071, | |
| "learning_rate": 9.712051087554017e-06, | |
| "loss": 0.1851, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 0.10870917573872473, | |
| "grad_norm": 1.1610995749820388, | |
| "learning_rate": 9.711233468649958e-06, | |
| "loss": 0.1651, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 0.1088646967340591, | |
| "grad_norm": 1.0713548580433974, | |
| "learning_rate": 9.710414725108771e-06, | |
| "loss": 0.2798, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.1088646967340591, | |
| "eval_loss": 0.2192843109369278, | |
| "eval_runtime": 9.4454, | |
| "eval_samples_per_second": 2.753, | |
| "eval_steps_per_second": 0.741, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.10902021772939346, | |
| "grad_norm": 1.086974338576193, | |
| "learning_rate": 9.709594857125898e-06, | |
| "loss": 0.3235, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 0.10917573872472784, | |
| "grad_norm": 3.455927159294357, | |
| "learning_rate": 9.708773864897059e-06, | |
| "loss": 0.1502, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 0.1093312597200622, | |
| "grad_norm": 1.6070730415734276, | |
| "learning_rate": 9.707951748618229e-06, | |
| "loss": 0.2652, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 0.10948678071539658, | |
| "grad_norm": 1.0297377958380671, | |
| "learning_rate": 9.707128508485663e-06, | |
| "loss": 0.2352, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.10964230171073094, | |
| "grad_norm": 1.07292209906991, | |
| "learning_rate": 9.706304144695877e-06, | |
| "loss": 0.1471, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.10979782270606532, | |
| "grad_norm": 1.2095547877752455, | |
| "learning_rate": 9.705478657445661e-06, | |
| "loss": 0.2107, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 0.10995334370139968, | |
| "grad_norm": 1.307669146215221, | |
| "learning_rate": 9.70465204693207e-06, | |
| "loss": 0.2337, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 0.11010886469673406, | |
| "grad_norm": 0.8004125116368356, | |
| "learning_rate": 9.703824313352428e-06, | |
| "loss": 0.2042, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 0.11026438569206842, | |
| "grad_norm": 1.5202724190274493, | |
| "learning_rate": 9.702995456904323e-06, | |
| "loss": 0.2446, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 0.1104199066874028, | |
| "grad_norm": 1.3109419274601464, | |
| "learning_rate": 9.702165477785618e-06, | |
| "loss": 0.2791, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.11057542768273718, | |
| "grad_norm": 1.2175779348655416, | |
| "learning_rate": 9.70133437619444e-06, | |
| "loss": 0.2787, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 0.11073094867807154, | |
| "grad_norm": 2.4619987863193824, | |
| "learning_rate": 9.700502152329182e-06, | |
| "loss": 0.2184, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 0.11088646967340592, | |
| "grad_norm": 1.1204962171981678, | |
| "learning_rate": 9.69966880638851e-06, | |
| "loss": 0.1796, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 0.11104199066874028, | |
| "grad_norm": 1.3460375672771012, | |
| "learning_rate": 9.698834338571355e-06, | |
| "loss": 0.1536, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 0.11119751166407466, | |
| "grad_norm": 1.4551247859245915, | |
| "learning_rate": 9.697998749076916e-06, | |
| "loss": 0.1775, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.11135303265940902, | |
| "grad_norm": 1.64865769787968, | |
| "learning_rate": 9.69716203810466e-06, | |
| "loss": 0.2341, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 0.1115085536547434, | |
| "grad_norm": 1.8250018792840808, | |
| "learning_rate": 9.696324205854322e-06, | |
| "loss": 0.2058, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 0.11166407465007776, | |
| "grad_norm": 1.067050937242904, | |
| "learning_rate": 9.695485252525902e-06, | |
| "loss": 0.1463, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 0.11181959564541213, | |
| "grad_norm": 2.2821274396758127, | |
| "learning_rate": 9.694645178319673e-06, | |
| "loss": 0.2508, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 0.1119751166407465, | |
| "grad_norm": 1.388014808020173, | |
| "learning_rate": 9.69380398343617e-06, | |
| "loss": 0.1977, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.11213063763608087, | |
| "grad_norm": 1.5658859493501038, | |
| "learning_rate": 9.692961668076197e-06, | |
| "loss": 0.2291, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 0.11228615863141524, | |
| "grad_norm": 1.0853791710998715, | |
| "learning_rate": 9.69211823244083e-06, | |
| "loss": 0.2763, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 0.11244167962674961, | |
| "grad_norm": 1.27256020581809, | |
| "learning_rate": 9.691273676731408e-06, | |
| "loss": 0.195, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 0.11259720062208398, | |
| "grad_norm": 0.6768405188507002, | |
| "learning_rate": 9.690428001149537e-06, | |
| "loss": 0.1839, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 0.11275272161741835, | |
| "grad_norm": 3.309861478677342, | |
| "learning_rate": 9.68958120589709e-06, | |
| "loss": 0.1446, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.11290824261275272, | |
| "grad_norm": 1.3577561463931358, | |
| "learning_rate": 9.688733291176211e-06, | |
| "loss": 0.174, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 0.11306376360808709, | |
| "grad_norm": 0.7899130738957459, | |
| "learning_rate": 9.68788425718931e-06, | |
| "loss": 0.1819, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 0.11321928460342146, | |
| "grad_norm": 1.9374468863177388, | |
| "learning_rate": 9.68703410413906e-06, | |
| "loss": 0.2148, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 0.11337480559875583, | |
| "grad_norm": 0.9790173123360771, | |
| "learning_rate": 9.686182832228408e-06, | |
| "loss": 0.1842, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 0.11353032659409021, | |
| "grad_norm": 1.8838507925348544, | |
| "learning_rate": 9.685330441660564e-06, | |
| "loss": 0.2482, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.11368584758942457, | |
| "grad_norm": 1.7209011423931209, | |
| "learning_rate": 9.684476932639002e-06, | |
| "loss": 0.1938, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 0.11384136858475895, | |
| "grad_norm": 1.3133247484457822, | |
| "learning_rate": 9.68362230536747e-06, | |
| "loss": 0.1629, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 0.11399688958009331, | |
| "grad_norm": 1.4346328630835792, | |
| "learning_rate": 9.682766560049979e-06, | |
| "loss": 0.2393, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 0.11415241057542769, | |
| "grad_norm": 1.416880965769396, | |
| "learning_rate": 9.681909696890805e-06, | |
| "loss": 0.2149, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 0.11430793157076205, | |
| "grad_norm": 1.3604331981225013, | |
| "learning_rate": 9.681051716094497e-06, | |
| "loss": 0.2116, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.11446345256609643, | |
| "grad_norm": 1.370682231566524, | |
| "learning_rate": 9.680192617865862e-06, | |
| "loss": 0.1574, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 0.11461897356143079, | |
| "grad_norm": 3.11697026931608, | |
| "learning_rate": 9.679332402409983e-06, | |
| "loss": 0.1659, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 0.11477449455676517, | |
| "grad_norm": 1.0795485204091093, | |
| "learning_rate": 9.678471069932205e-06, | |
| "loss": 0.1843, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 0.11493001555209953, | |
| "grad_norm": 1.089003737321956, | |
| "learning_rate": 9.677608620638138e-06, | |
| "loss": 0.1289, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 0.1150855365474339, | |
| "grad_norm": 1.9816825572482675, | |
| "learning_rate": 9.676745054733661e-06, | |
| "loss": 0.183, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.11524105754276827, | |
| "grad_norm": 4.608323882578619, | |
| "learning_rate": 9.675880372424922e-06, | |
| "loss": 0.1797, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 0.11539657853810265, | |
| "grad_norm": 0.9751878331403108, | |
| "learning_rate": 9.675014573918328e-06, | |
| "loss": 0.2649, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 0.11555209953343701, | |
| "grad_norm": 0.913137520804308, | |
| "learning_rate": 9.67414765942056e-06, | |
| "loss": 0.1229, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 0.11570762052877138, | |
| "grad_norm": 1.1182409613228717, | |
| "learning_rate": 9.673279629138565e-06, | |
| "loss": 0.1554, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 0.11586314152410575, | |
| "grad_norm": 2.425925853364065, | |
| "learning_rate": 9.67241048327955e-06, | |
| "loss": 0.2414, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.11601866251944012, | |
| "grad_norm": 2.1643434151507024, | |
| "learning_rate": 9.671540222050995e-06, | |
| "loss": 0.2402, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 0.1161741835147745, | |
| "grad_norm": 1.1869224601016288, | |
| "learning_rate": 9.67066884566064e-06, | |
| "loss": 0.225, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 0.11632970451010886, | |
| "grad_norm": 1.1850496858694712, | |
| "learning_rate": 9.669796354316497e-06, | |
| "loss": 0.1732, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 0.11648522550544324, | |
| "grad_norm": 1.083880428656249, | |
| "learning_rate": 9.668922748226842e-06, | |
| "loss": 0.2256, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 0.1166407465007776, | |
| "grad_norm": 0.9290306352610638, | |
| "learning_rate": 9.668048027600217e-06, | |
| "loss": 0.1814, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.11679626749611198, | |
| "grad_norm": 1.1985316233321583, | |
| "learning_rate": 9.66717219264543e-06, | |
| "loss": 0.2646, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 0.11695178849144634, | |
| "grad_norm": 1.5752976014862634, | |
| "learning_rate": 9.666295243571553e-06, | |
| "loss": 0.2212, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 0.11710730948678072, | |
| "grad_norm": 1.554593030529623, | |
| "learning_rate": 9.665417180587928e-06, | |
| "loss": 0.2008, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 0.11726283048211508, | |
| "grad_norm": 1.802147426905897, | |
| "learning_rate": 9.664538003904162e-06, | |
| "loss": 0.1694, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 0.11741835147744946, | |
| "grad_norm": 1.117253074112765, | |
| "learning_rate": 9.663657713730123e-06, | |
| "loss": 0.1769, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.11757387247278382, | |
| "grad_norm": 1.2713208371120763, | |
| "learning_rate": 9.662776310275954e-06, | |
| "loss": 0.3356, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 0.1177293934681182, | |
| "grad_norm": 1.5049877808240208, | |
| "learning_rate": 9.661893793752053e-06, | |
| "loss": 0.2156, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 0.11788491446345256, | |
| "grad_norm": 1.3646831264890733, | |
| "learning_rate": 9.661010164369092e-06, | |
| "loss": 0.2077, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 0.11804043545878694, | |
| "grad_norm": 1.2057674637964264, | |
| "learning_rate": 9.660125422338003e-06, | |
| "loss": 0.234, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 0.1181959564541213, | |
| "grad_norm": 1.7059599899477969, | |
| "learning_rate": 9.659239567869989e-06, | |
| "loss": 0.2019, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.11835147744945568, | |
| "grad_norm": 1.359054263386884, | |
| "learning_rate": 9.658352601176514e-06, | |
| "loss": 0.2263, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 0.11850699844479004, | |
| "grad_norm": 1.4779502971821263, | |
| "learning_rate": 9.65746452246931e-06, | |
| "loss": 0.229, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 0.11866251944012442, | |
| "grad_norm": 1.2106031530437371, | |
| "learning_rate": 9.656575331960376e-06, | |
| "loss": 0.2075, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 0.1188180404354588, | |
| "grad_norm": 1.5750869920441555, | |
| "learning_rate": 9.655685029861969e-06, | |
| "loss": 0.2103, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 0.11897356143079316, | |
| "grad_norm": 1.328300416339256, | |
| "learning_rate": 9.654793616386621e-06, | |
| "loss": 0.1822, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.11912908242612753, | |
| "grad_norm": 2.218866258760128, | |
| "learning_rate": 9.653901091747124e-06, | |
| "loss": 0.1909, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 0.1192846034214619, | |
| "grad_norm": 1.8622051312400103, | |
| "learning_rate": 9.653007456156536e-06, | |
| "loss": 0.2241, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 0.11944012441679627, | |
| "grad_norm": 1.3832228672336278, | |
| "learning_rate": 9.652112709828179e-06, | |
| "loss": 0.2256, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.11959564541213064, | |
| "grad_norm": 1.0673171707909481, | |
| "learning_rate": 9.651216852975643e-06, | |
| "loss": 0.1959, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 0.11975116640746501, | |
| "grad_norm": 1.3393619429463375, | |
| "learning_rate": 9.650319885812777e-06, | |
| "loss": 0.2727, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.11990668740279938, | |
| "grad_norm": 1.0882111784771522, | |
| "learning_rate": 9.649421808553708e-06, | |
| "loss": 0.2259, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 0.12006220839813375, | |
| "grad_norm": 4.447919742603164, | |
| "learning_rate": 9.648522621412812e-06, | |
| "loss": 0.231, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 0.12021772939346811, | |
| "grad_norm": 1.5176403638597071, | |
| "learning_rate": 9.647622324604742e-06, | |
| "loss": 0.2824, | |
| "step": 773 | |
| }, | |
| { | |
| "epoch": 0.12037325038880249, | |
| "grad_norm": 1.7576074795768224, | |
| "learning_rate": 9.646720918344409e-06, | |
| "loss": 0.2034, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 0.12052877138413685, | |
| "grad_norm": 1.5792838723378395, | |
| "learning_rate": 9.645818402846992e-06, | |
| "loss": 0.1677, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.12068429237947123, | |
| "grad_norm": 1.0405000433648128, | |
| "learning_rate": 9.644914778327935e-06, | |
| "loss": 0.1742, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 0.1208398133748056, | |
| "grad_norm": 1.545200668981177, | |
| "learning_rate": 9.644010045002942e-06, | |
| "loss": 0.215, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 0.12099533437013997, | |
| "grad_norm": 1.203039484308954, | |
| "learning_rate": 9.64310420308799e-06, | |
| "loss": 0.1997, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 0.12115085536547433, | |
| "grad_norm": 1.038062251460105, | |
| "learning_rate": 9.642197252799315e-06, | |
| "loss": 0.2001, | |
| "step": 779 | |
| }, | |
| { | |
| "epoch": 0.12130637636080871, | |
| "grad_norm": 1.3963430783849184, | |
| "learning_rate": 9.641289194353418e-06, | |
| "loss": 0.2034, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.12146189735614307, | |
| "grad_norm": 1.7069918759015217, | |
| "learning_rate": 9.640380027967065e-06, | |
| "loss": 0.1763, | |
| "step": 781 | |
| }, | |
| { | |
| "epoch": 0.12161741835147745, | |
| "grad_norm": 1.1485309219449071, | |
| "learning_rate": 9.639469753857287e-06, | |
| "loss": 0.1946, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 0.12177293934681183, | |
| "grad_norm": 0.9976269624811838, | |
| "learning_rate": 9.63855837224138e-06, | |
| "loss": 0.1797, | |
| "step": 783 | |
| }, | |
| { | |
| "epoch": 0.12192846034214619, | |
| "grad_norm": 1.413148682632424, | |
| "learning_rate": 9.6376458833369e-06, | |
| "loss": 0.1873, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 0.12208398133748057, | |
| "grad_norm": 1.287068701523726, | |
| "learning_rate": 9.636732287361675e-06, | |
| "loss": 0.1964, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.12223950233281493, | |
| "grad_norm": 1.338092957612231, | |
| "learning_rate": 9.635817584533791e-06, | |
| "loss": 0.2353, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 0.1223950233281493, | |
| "grad_norm": 1.018985176065171, | |
| "learning_rate": 9.6349017750716e-06, | |
| "loss": 0.243, | |
| "step": 787 | |
| }, | |
| { | |
| "epoch": 0.12255054432348367, | |
| "grad_norm": 1.434405666961768, | |
| "learning_rate": 9.633984859193722e-06, | |
| "loss": 0.1622, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 0.12270606531881804, | |
| "grad_norm": 1.2392900109261706, | |
| "learning_rate": 9.633066837119034e-06, | |
| "loss": 0.2223, | |
| "step": 789 | |
| }, | |
| { | |
| "epoch": 0.12286158631415241, | |
| "grad_norm": 0.9045673894396051, | |
| "learning_rate": 9.632147709066682e-06, | |
| "loss": 0.2079, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.12301710730948678, | |
| "grad_norm": 1.14443309047443, | |
| "learning_rate": 9.631227475256072e-06, | |
| "loss": 0.1611, | |
| "step": 791 | |
| }, | |
| { | |
| "epoch": 0.12317262830482115, | |
| "grad_norm": 1.1564291271253233, | |
| "learning_rate": 9.630306135906882e-06, | |
| "loss": 0.1918, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 0.12332814930015552, | |
| "grad_norm": 2.1831582412646138, | |
| "learning_rate": 9.629383691239043e-06, | |
| "loss": 0.3687, | |
| "step": 793 | |
| }, | |
| { | |
| "epoch": 0.12348367029548989, | |
| "grad_norm": 1.0115623861000755, | |
| "learning_rate": 9.628460141472759e-06, | |
| "loss": 0.1589, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 0.12363919129082426, | |
| "grad_norm": 0.8936049036056027, | |
| "learning_rate": 9.627535486828491e-06, | |
| "loss": 0.1775, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.12379471228615863, | |
| "grad_norm": 1.3757750926899586, | |
| "learning_rate": 9.626609727526973e-06, | |
| "loss": 0.2, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 0.123950233281493, | |
| "grad_norm": 1.3462049704057701, | |
| "learning_rate": 9.62568286378919e-06, | |
| "loss": 0.2079, | |
| "step": 797 | |
| }, | |
| { | |
| "epoch": 0.12410575427682737, | |
| "grad_norm": 2.793319589376331, | |
| "learning_rate": 9.624754895836401e-06, | |
| "loss": 0.2297, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 0.12426127527216174, | |
| "grad_norm": 2.1016347336310357, | |
| "learning_rate": 9.623825823890123e-06, | |
| "loss": 0.3106, | |
| "step": 799 | |
| }, | |
| { | |
| "epoch": 0.12441679626749612, | |
| "grad_norm": 1.003756031018623, | |
| "learning_rate": 9.622895648172141e-06, | |
| "loss": 0.2143, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.12441679626749612, | |
| "eval_loss": 0.2170763909816742, | |
| "eval_runtime": 9.4305, | |
| "eval_samples_per_second": 2.757, | |
| "eval_steps_per_second": 0.742, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.12457231726283048, | |
| "grad_norm": 0.897563337381756, | |
| "learning_rate": 9.621964368904497e-06, | |
| "loss": 0.1512, | |
| "step": 801 | |
| }, | |
| { | |
| "epoch": 0.12472783825816486, | |
| "grad_norm": 1.4190659163727315, | |
| "learning_rate": 9.621031986309504e-06, | |
| "loss": 0.1372, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 0.12488335925349922, | |
| "grad_norm": 1.4031206175030444, | |
| "learning_rate": 9.620098500609734e-06, | |
| "loss": 0.1871, | |
| "step": 803 | |
| }, | |
| { | |
| "epoch": 0.12503888024883358, | |
| "grad_norm": 1.387547575925909, | |
| "learning_rate": 9.61916391202802e-06, | |
| "loss": 0.2899, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 0.12519440124416797, | |
| "grad_norm": 1.3476031364192975, | |
| "learning_rate": 9.618228220787466e-06, | |
| "loss": 0.1693, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.12534992223950234, | |
| "grad_norm": 2.5401419561208787, | |
| "learning_rate": 9.617291427111431e-06, | |
| "loss": 0.141, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 0.1255054432348367, | |
| "grad_norm": 1.918003643731122, | |
| "learning_rate": 9.616353531223543e-06, | |
| "loss": 0.2531, | |
| "step": 807 | |
| }, | |
| { | |
| "epoch": 0.12566096423017106, | |
| "grad_norm": 0.8824574964250353, | |
| "learning_rate": 9.61541453334769e-06, | |
| "loss": 0.2257, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 0.12581648522550545, | |
| "grad_norm": 1.2069677012195894, | |
| "learning_rate": 9.614474433708021e-06, | |
| "loss": 0.2012, | |
| "step": 809 | |
| }, | |
| { | |
| "epoch": 0.12597200622083982, | |
| "grad_norm": 0.8806254573901449, | |
| "learning_rate": 9.613533232528956e-06, | |
| "loss": 0.2312, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.12612752721617418, | |
| "grad_norm": 0.9758926813848963, | |
| "learning_rate": 9.61259093003517e-06, | |
| "loss": 0.1623, | |
| "step": 811 | |
| }, | |
| { | |
| "epoch": 0.12628304821150854, | |
| "grad_norm": 1.601541464183247, | |
| "learning_rate": 9.611647526451603e-06, | |
| "loss": 0.2448, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 0.12643856920684293, | |
| "grad_norm": 0.987236561765066, | |
| "learning_rate": 9.610703022003462e-06, | |
| "loss": 0.1833, | |
| "step": 813 | |
| }, | |
| { | |
| "epoch": 0.1265940902021773, | |
| "grad_norm": 1.1685078861500846, | |
| "learning_rate": 9.60975741691621e-06, | |
| "loss": 0.2708, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 0.12674961119751166, | |
| "grad_norm": 1.2818789908746795, | |
| "learning_rate": 9.608810711415577e-06, | |
| "loss": 0.2132, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.12690513219284602, | |
| "grad_norm": 1.7355503765107922, | |
| "learning_rate": 9.607862905727556e-06, | |
| "loss": 0.2316, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 0.1270606531881804, | |
| "grad_norm": 2.4291900998321614, | |
| "learning_rate": 9.6069140000784e-06, | |
| "loss": 0.2607, | |
| "step": 817 | |
| }, | |
| { | |
| "epoch": 0.12721617418351477, | |
| "grad_norm": 1.2126882446943306, | |
| "learning_rate": 9.605963994694625e-06, | |
| "loss": 0.2374, | |
| "step": 818 | |
| }, | |
| { | |
| "epoch": 0.12737169517884914, | |
| "grad_norm": 1.402793253608196, | |
| "learning_rate": 9.605012889803013e-06, | |
| "loss": 0.1854, | |
| "step": 819 | |
| }, | |
| { | |
| "epoch": 0.12752721617418353, | |
| "grad_norm": 1.1350096409875572, | |
| "learning_rate": 9.604060685630608e-06, | |
| "loss": 0.2353, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.1276827371695179, | |
| "grad_norm": 0.8605397955086846, | |
| "learning_rate": 9.603107382404708e-06, | |
| "loss": 0.1725, | |
| "step": 821 | |
| }, | |
| { | |
| "epoch": 0.12783825816485225, | |
| "grad_norm": 1.8193213761501528, | |
| "learning_rate": 9.602152980352884e-06, | |
| "loss": 0.191, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 0.12799377916018662, | |
| "grad_norm": 1.0560479092155457, | |
| "learning_rate": 9.601197479702963e-06, | |
| "loss": 0.2129, | |
| "step": 823 | |
| }, | |
| { | |
| "epoch": 0.128149300155521, | |
| "grad_norm": 0.9886146739779551, | |
| "learning_rate": 9.60024088068304e-06, | |
| "loss": 0.1349, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 0.12830482115085537, | |
| "grad_norm": 1.044208330213169, | |
| "learning_rate": 9.599283183521467e-06, | |
| "loss": 0.1611, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.12846034214618973, | |
| "grad_norm": 1.105951942629371, | |
| "learning_rate": 9.598324388446856e-06, | |
| "loss": 0.25, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 0.1286158631415241, | |
| "grad_norm": 1.2794645483672162, | |
| "learning_rate": 9.59736449568809e-06, | |
| "loss": 0.2132, | |
| "step": 827 | |
| }, | |
| { | |
| "epoch": 0.12877138413685849, | |
| "grad_norm": 1.3758053785309152, | |
| "learning_rate": 9.596403505474304e-06, | |
| "loss": 0.2149, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 0.12892690513219285, | |
| "grad_norm": 3.355818230170184, | |
| "learning_rate": 9.595441418034903e-06, | |
| "loss": 0.3682, | |
| "step": 829 | |
| }, | |
| { | |
| "epoch": 0.1290824261275272, | |
| "grad_norm": 1.837073128336488, | |
| "learning_rate": 9.594478233599551e-06, | |
| "loss": 0.2032, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.12923794712286157, | |
| "grad_norm": 1.5066969144898332, | |
| "learning_rate": 9.593513952398172e-06, | |
| "loss": 0.2378, | |
| "step": 831 | |
| }, | |
| { | |
| "epoch": 0.12939346811819596, | |
| "grad_norm": 2.2384679831338614, | |
| "learning_rate": 9.592548574660954e-06, | |
| "loss": 0.3073, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 0.12954898911353033, | |
| "grad_norm": 0.9921790422628257, | |
| "learning_rate": 9.591582100618345e-06, | |
| "loss": 0.1937, | |
| "step": 833 | |
| }, | |
| { | |
| "epoch": 0.1297045101088647, | |
| "grad_norm": 1.198440591432804, | |
| "learning_rate": 9.590614530501057e-06, | |
| "loss": 0.1925, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 0.12986003110419908, | |
| "grad_norm": 1.3748463927035848, | |
| "learning_rate": 9.589645864540061e-06, | |
| "loss": 0.1941, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.13001555209953344, | |
| "grad_norm": 1.3610943196332044, | |
| "learning_rate": 9.588676102966593e-06, | |
| "loss": 0.166, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 0.1301710730948678, | |
| "grad_norm": 0.8955532583487235, | |
| "learning_rate": 9.58770524601215e-06, | |
| "loss": 0.1495, | |
| "step": 837 | |
| }, | |
| { | |
| "epoch": 0.13032659409020217, | |
| "grad_norm": 1.285038495994977, | |
| "learning_rate": 9.586733293908486e-06, | |
| "loss": 0.2182, | |
| "step": 838 | |
| }, | |
| { | |
| "epoch": 0.13048211508553656, | |
| "grad_norm": 1.3128144306673817, | |
| "learning_rate": 9.585760246887618e-06, | |
| "loss": 0.2371, | |
| "step": 839 | |
| }, | |
| { | |
| "epoch": 0.13063763608087092, | |
| "grad_norm": 0.9827038114137296, | |
| "learning_rate": 9.584786105181831e-06, | |
| "loss": 0.2151, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.13079315707620529, | |
| "grad_norm": 1.0846767572687748, | |
| "learning_rate": 9.583810869023663e-06, | |
| "loss": 0.2757, | |
| "step": 841 | |
| }, | |
| { | |
| "epoch": 0.13094867807153965, | |
| "grad_norm": 2.4064875629004265, | |
| "learning_rate": 9.582834538645917e-06, | |
| "loss": 0.2357, | |
| "step": 842 | |
| }, | |
| { | |
| "epoch": 0.13110419906687404, | |
| "grad_norm": 1.2894114641673238, | |
| "learning_rate": 9.581857114281656e-06, | |
| "loss": 0.1877, | |
| "step": 843 | |
| }, | |
| { | |
| "epoch": 0.1312597200622084, | |
| "grad_norm": 1.5574730662344252, | |
| "learning_rate": 9.580878596164207e-06, | |
| "loss": 0.1623, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 0.13141524105754276, | |
| "grad_norm": 1.2710340775794473, | |
| "learning_rate": 9.579898984527154e-06, | |
| "loss": 0.187, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.13157076205287713, | |
| "grad_norm": 1.4508001676942102, | |
| "learning_rate": 9.578918279604346e-06, | |
| "loss": 0.1372, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 0.13172628304821152, | |
| "grad_norm": 1.0012618056091263, | |
| "learning_rate": 9.577936481629887e-06, | |
| "loss": 0.2201, | |
| "step": 847 | |
| }, | |
| { | |
| "epoch": 0.13188180404354588, | |
| "grad_norm": 0.8624233281967797, | |
| "learning_rate": 9.576953590838149e-06, | |
| "loss": 0.1979, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 0.13203732503888024, | |
| "grad_norm": 1.083965076436999, | |
| "learning_rate": 9.57596960746376e-06, | |
| "loss": 0.2404, | |
| "step": 849 | |
| }, | |
| { | |
| "epoch": 0.1321928460342146, | |
| "grad_norm": 2.717551231092263, | |
| "learning_rate": 9.574984531741613e-06, | |
| "loss": 0.2745, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.132348367029549, | |
| "grad_norm": 1.1408480485083061, | |
| "learning_rate": 9.573998363906858e-06, | |
| "loss": 0.207, | |
| "step": 851 | |
| }, | |
| { | |
| "epoch": 0.13250388802488336, | |
| "grad_norm": 1.6013848917828304, | |
| "learning_rate": 9.573011104194907e-06, | |
| "loss": 0.1826, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 0.13265940902021772, | |
| "grad_norm": 1.1178529036140945, | |
| "learning_rate": 9.572022752841433e-06, | |
| "loss": 0.1676, | |
| "step": 853 | |
| }, | |
| { | |
| "epoch": 0.1328149300155521, | |
| "grad_norm": 1.4964605327939924, | |
| "learning_rate": 9.571033310082367e-06, | |
| "loss": 0.1929, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 0.13297045101088648, | |
| "grad_norm": 1.1404147062516024, | |
| "learning_rate": 9.570042776153904e-06, | |
| "loss": 0.2274, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.13312597200622084, | |
| "grad_norm": 1.037410347500119, | |
| "learning_rate": 9.5690511512925e-06, | |
| "loss": 0.1577, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 0.1332814930015552, | |
| "grad_norm": 0.8366673014473697, | |
| "learning_rate": 9.56805843573487e-06, | |
| "loss": 0.1689, | |
| "step": 857 | |
| }, | |
| { | |
| "epoch": 0.1334370139968896, | |
| "grad_norm": 1.1452085152848681, | |
| "learning_rate": 9.567064629717986e-06, | |
| "loss": 0.1882, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 0.13359253499222395, | |
| "grad_norm": 1.574854487100182, | |
| "learning_rate": 9.566069733479087e-06, | |
| "loss": 0.31, | |
| "step": 859 | |
| }, | |
| { | |
| "epoch": 0.13374805598755832, | |
| "grad_norm": 1.593208427145828, | |
| "learning_rate": 9.565073747255665e-06, | |
| "loss": 0.2198, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.13390357698289268, | |
| "grad_norm": 1.1177720055491567, | |
| "learning_rate": 9.564076671285477e-06, | |
| "loss": 0.2164, | |
| "step": 861 | |
| }, | |
| { | |
| "epoch": 0.13405909797822707, | |
| "grad_norm": 0.8165999821951461, | |
| "learning_rate": 9.56307850580654e-06, | |
| "loss": 0.1506, | |
| "step": 862 | |
| }, | |
| { | |
| "epoch": 0.13421461897356143, | |
| "grad_norm": 1.6750367279986849, | |
| "learning_rate": 9.562079251057129e-06, | |
| "loss": 0.1732, | |
| "step": 863 | |
| }, | |
| { | |
| "epoch": 0.1343701399688958, | |
| "grad_norm": 0.8044448243559967, | |
| "learning_rate": 9.561078907275781e-06, | |
| "loss": 0.1922, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 0.13452566096423016, | |
| "grad_norm": 1.271960150991974, | |
| "learning_rate": 9.56007747470129e-06, | |
| "loss": 0.2229, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.13468118195956455, | |
| "grad_norm": 1.0004490456147865, | |
| "learning_rate": 9.559074953572713e-06, | |
| "loss": 0.171, | |
| "step": 866 | |
| }, | |
| { | |
| "epoch": 0.1348367029548989, | |
| "grad_norm": 1.312217862895249, | |
| "learning_rate": 9.558071344129368e-06, | |
| "loss": 0.1783, | |
| "step": 867 | |
| }, | |
| { | |
| "epoch": 0.13499222395023328, | |
| "grad_norm": 0.9356844106701133, | |
| "learning_rate": 9.557066646610826e-06, | |
| "loss": 0.1279, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 0.13514774494556764, | |
| "grad_norm": 1.4966712904656105, | |
| "learning_rate": 9.556060861256928e-06, | |
| "loss": 0.1971, | |
| "step": 869 | |
| }, | |
| { | |
| "epoch": 0.13530326594090203, | |
| "grad_norm": 0.9157016732315058, | |
| "learning_rate": 9.555053988307764e-06, | |
| "loss": 0.1739, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.1354587869362364, | |
| "grad_norm": 1.6187813697357434, | |
| "learning_rate": 9.554046028003691e-06, | |
| "loss": 0.2326, | |
| "step": 871 | |
| }, | |
| { | |
| "epoch": 0.13561430793157075, | |
| "grad_norm": 1.649258041134042, | |
| "learning_rate": 9.553036980585323e-06, | |
| "loss": 0.2775, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 0.13576982892690515, | |
| "grad_norm": 0.8386386166459481, | |
| "learning_rate": 9.552026846293532e-06, | |
| "loss": 0.2225, | |
| "step": 873 | |
| }, | |
| { | |
| "epoch": 0.1359253499222395, | |
| "grad_norm": 0.96771492040488, | |
| "learning_rate": 9.551015625369455e-06, | |
| "loss": 0.1999, | |
| "step": 874 | |
| }, | |
| { | |
| "epoch": 0.13608087091757387, | |
| "grad_norm": 1.4939182411934322, | |
| "learning_rate": 9.550003318054482e-06, | |
| "loss": 0.2427, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.13623639191290823, | |
| "grad_norm": 1.1599555983572944, | |
| "learning_rate": 9.548989924590263e-06, | |
| "loss": 0.2038, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 0.13639191290824262, | |
| "grad_norm": 1.094972018927162, | |
| "learning_rate": 9.547975445218712e-06, | |
| "loss": 0.1477, | |
| "step": 877 | |
| }, | |
| { | |
| "epoch": 0.136547433903577, | |
| "grad_norm": 1.5378516224601575, | |
| "learning_rate": 9.546959880181998e-06, | |
| "loss": 0.2411, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 0.13670295489891135, | |
| "grad_norm": 0.8702765312556789, | |
| "learning_rate": 9.545943229722553e-06, | |
| "loss": 0.1646, | |
| "step": 879 | |
| }, | |
| { | |
| "epoch": 0.1368584758942457, | |
| "grad_norm": 1.3664019719395564, | |
| "learning_rate": 9.544925494083062e-06, | |
| "loss": 0.1688, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.1370139968895801, | |
| "grad_norm": 1.3206104649159593, | |
| "learning_rate": 9.543906673506474e-06, | |
| "loss": 0.1623, | |
| "step": 881 | |
| }, | |
| { | |
| "epoch": 0.13716951788491447, | |
| "grad_norm": 1.3156230503659714, | |
| "learning_rate": 9.542886768235996e-06, | |
| "loss": 0.2297, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 0.13732503888024883, | |
| "grad_norm": 1.727680640232904, | |
| "learning_rate": 9.541865778515094e-06, | |
| "loss": 0.2824, | |
| "step": 883 | |
| }, | |
| { | |
| "epoch": 0.1374805598755832, | |
| "grad_norm": 1.3346266664784416, | |
| "learning_rate": 9.540843704587492e-06, | |
| "loss": 0.2533, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 0.13763608087091758, | |
| "grad_norm": 1.663603312691407, | |
| "learning_rate": 9.539820546697175e-06, | |
| "loss": 0.1889, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.13779160186625194, | |
| "grad_norm": 1.3931002570801638, | |
| "learning_rate": 9.53879630508838e-06, | |
| "loss": 0.2125, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 0.1379471228615863, | |
| "grad_norm": 1.0312695868953268, | |
| "learning_rate": 9.537770980005616e-06, | |
| "loss": 0.157, | |
| "step": 887 | |
| }, | |
| { | |
| "epoch": 0.13810264385692067, | |
| "grad_norm": 1.291055270497525, | |
| "learning_rate": 9.536744571693634e-06, | |
| "loss": 0.1542, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 0.13825816485225506, | |
| "grad_norm": 1.0586309772197517, | |
| "learning_rate": 9.535717080397458e-06, | |
| "loss": 0.1413, | |
| "step": 889 | |
| }, | |
| { | |
| "epoch": 0.13841368584758942, | |
| "grad_norm": 1.9142459890481243, | |
| "learning_rate": 9.53468850636236e-06, | |
| "loss": 0.2132, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.1385692068429238, | |
| "grad_norm": 1.57785159694773, | |
| "learning_rate": 9.533658849833879e-06, | |
| "loss": 0.2704, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 0.13872472783825818, | |
| "grad_norm": 0.6767899331815482, | |
| "learning_rate": 9.532628111057804e-06, | |
| "loss": 0.1994, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 0.13888024883359254, | |
| "grad_norm": 0.7786068585931847, | |
| "learning_rate": 9.531596290280191e-06, | |
| "loss": 0.2215, | |
| "step": 893 | |
| }, | |
| { | |
| "epoch": 0.1390357698289269, | |
| "grad_norm": 1.1907351307303637, | |
| "learning_rate": 9.530563387747348e-06, | |
| "loss": 0.1597, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 0.13919129082426127, | |
| "grad_norm": 0.994862972128769, | |
| "learning_rate": 9.529529403705844e-06, | |
| "loss": 0.2586, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.13934681181959566, | |
| "grad_norm": 0.9549652766512168, | |
| "learning_rate": 9.528494338402502e-06, | |
| "loss": 0.1332, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 0.13950233281493002, | |
| "grad_norm": 1.1799329518454007, | |
| "learning_rate": 9.527458192084413e-06, | |
| "loss": 0.1884, | |
| "step": 897 | |
| }, | |
| { | |
| "epoch": 0.13965785381026438, | |
| "grad_norm": 0.7863314952979764, | |
| "learning_rate": 9.526420964998915e-06, | |
| "loss": 0.1679, | |
| "step": 898 | |
| }, | |
| { | |
| "epoch": 0.13981337480559874, | |
| "grad_norm": 0.937917950726602, | |
| "learning_rate": 9.52538265739361e-06, | |
| "loss": 0.2024, | |
| "step": 899 | |
| }, | |
| { | |
| "epoch": 0.13996889580093314, | |
| "grad_norm": 1.7160775693106616, | |
| "learning_rate": 9.524343269516354e-06, | |
| "loss": 0.2127, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.13996889580093314, | |
| "eval_loss": 0.21867091953754425, | |
| "eval_runtime": 9.4128, | |
| "eval_samples_per_second": 2.762, | |
| "eval_steps_per_second": 0.744, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.1401244167962675, | |
| "grad_norm": 1.4496209630087886, | |
| "learning_rate": 9.523302801615266e-06, | |
| "loss": 0.2026, | |
| "step": 901 | |
| }, | |
| { | |
| "epoch": 0.14027993779160186, | |
| "grad_norm": 0.9035504049737524, | |
| "learning_rate": 9.522261253938721e-06, | |
| "loss": 0.237, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 0.14043545878693622, | |
| "grad_norm": 1.0344016899215176, | |
| "learning_rate": 9.521218626735347e-06, | |
| "loss": 0.2079, | |
| "step": 903 | |
| }, | |
| { | |
| "epoch": 0.14059097978227061, | |
| "grad_norm": 0.8764502702407341, | |
| "learning_rate": 9.52017492025404e-06, | |
| "loss": 0.1512, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 0.14074650077760498, | |
| "grad_norm": 0.78362955023232, | |
| "learning_rate": 9.519130134743938e-06, | |
| "loss": 0.1544, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.14090202177293934, | |
| "grad_norm": 1.331879071297993, | |
| "learning_rate": 9.518084270454456e-06, | |
| "loss": 0.208, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 0.14105754276827373, | |
| "grad_norm": 1.0576721252655992, | |
| "learning_rate": 9.51703732763525e-06, | |
| "loss": 0.1777, | |
| "step": 907 | |
| }, | |
| { | |
| "epoch": 0.1412130637636081, | |
| "grad_norm": 0.9777650095779323, | |
| "learning_rate": 9.515989306536241e-06, | |
| "loss": 0.2431, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 0.14136858475894246, | |
| "grad_norm": 1.2351460184737522, | |
| "learning_rate": 9.514940207407608e-06, | |
| "loss": 0.164, | |
| "step": 909 | |
| }, | |
| { | |
| "epoch": 0.14152410575427682, | |
| "grad_norm": 1.0466682687606328, | |
| "learning_rate": 9.513890030499786e-06, | |
| "loss": 0.1862, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.1416796267496112, | |
| "grad_norm": 1.667573553968496, | |
| "learning_rate": 9.512838776063464e-06, | |
| "loss": 0.1881, | |
| "step": 911 | |
| }, | |
| { | |
| "epoch": 0.14183514774494557, | |
| "grad_norm": 1.0309274313381354, | |
| "learning_rate": 9.51178644434959e-06, | |
| "loss": 0.1894, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 0.14199066874027994, | |
| "grad_norm": 1.1516030880613233, | |
| "learning_rate": 9.510733035609376e-06, | |
| "loss": 0.1906, | |
| "step": 913 | |
| }, | |
| { | |
| "epoch": 0.1421461897356143, | |
| "grad_norm": 1.1964374362259393, | |
| "learning_rate": 9.509678550094282e-06, | |
| "loss": 0.2193, | |
| "step": 914 | |
| }, | |
| { | |
| "epoch": 0.1423017107309487, | |
| "grad_norm": 1.018131456622998, | |
| "learning_rate": 9.508622988056026e-06, | |
| "loss": 0.18, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.14245723172628305, | |
| "grad_norm": 0.9878879365994556, | |
| "learning_rate": 9.50756634974659e-06, | |
| "loss": 0.2303, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 0.14261275272161741, | |
| "grad_norm": 0.9092163587106824, | |
| "learning_rate": 9.506508635418203e-06, | |
| "loss": 0.1565, | |
| "step": 917 | |
| }, | |
| { | |
| "epoch": 0.14276827371695178, | |
| "grad_norm": 1.2600000274625656, | |
| "learning_rate": 9.505449845323362e-06, | |
| "loss": 0.2203, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 0.14292379471228617, | |
| "grad_norm": 1.0177653430547444, | |
| "learning_rate": 9.504389979714812e-06, | |
| "loss": 0.1708, | |
| "step": 919 | |
| }, | |
| { | |
| "epoch": 0.14307931570762053, | |
| "grad_norm": 1.323141251202386, | |
| "learning_rate": 9.503329038845556e-06, | |
| "loss": 0.2041, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.1432348367029549, | |
| "grad_norm": 0.8666613786933973, | |
| "learning_rate": 9.50226702296886e-06, | |
| "loss": 0.1709, | |
| "step": 921 | |
| }, | |
| { | |
| "epoch": 0.14339035769828926, | |
| "grad_norm": 1.4717207003269144, | |
| "learning_rate": 9.501203932338238e-06, | |
| "loss": 0.1531, | |
| "step": 922 | |
| }, | |
| { | |
| "epoch": 0.14354587869362365, | |
| "grad_norm": 0.9850527774643847, | |
| "learning_rate": 9.500139767207465e-06, | |
| "loss": 0.2673, | |
| "step": 923 | |
| }, | |
| { | |
| "epoch": 0.143701399688958, | |
| "grad_norm": 0.795383661376322, | |
| "learning_rate": 9.499074527830576e-06, | |
| "loss": 0.1514, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 0.14385692068429237, | |
| "grad_norm": 1.5926732733378721, | |
| "learning_rate": 9.498008214461854e-06, | |
| "loss": 0.1919, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.14401244167962676, | |
| "grad_norm": 1.0577956165619293, | |
| "learning_rate": 9.496940827355843e-06, | |
| "loss": 0.2541, | |
| "step": 926 | |
| }, | |
| { | |
| "epoch": 0.14416796267496113, | |
| "grad_norm": 1.0853608193427453, | |
| "learning_rate": 9.495872366767345e-06, | |
| "loss": 0.3026, | |
| "step": 927 | |
| }, | |
| { | |
| "epoch": 0.1443234836702955, | |
| "grad_norm": 1.5841584604687593, | |
| "learning_rate": 9.494802832951416e-06, | |
| "loss": 0.237, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 0.14447900466562985, | |
| "grad_norm": 1.2668912692543315, | |
| "learning_rate": 9.493732226163368e-06, | |
| "loss": 0.1962, | |
| "step": 929 | |
| }, | |
| { | |
| "epoch": 0.14463452566096424, | |
| "grad_norm": 1.1865934879383473, | |
| "learning_rate": 9.492660546658771e-06, | |
| "loss": 0.205, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.1447900466562986, | |
| "grad_norm": 1.16907334182334, | |
| "learning_rate": 9.491587794693448e-06, | |
| "loss": 0.1649, | |
| "step": 931 | |
| }, | |
| { | |
| "epoch": 0.14494556765163297, | |
| "grad_norm": 2.6694118671679035, | |
| "learning_rate": 9.490513970523482e-06, | |
| "loss": 0.1716, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 0.14510108864696733, | |
| "grad_norm": 1.2693916754547256, | |
| "learning_rate": 9.489439074405211e-06, | |
| "loss": 0.2102, | |
| "step": 933 | |
| }, | |
| { | |
| "epoch": 0.14525660964230172, | |
| "grad_norm": 1.4815910522621762, | |
| "learning_rate": 9.488363106595223e-06, | |
| "loss": 0.2146, | |
| "step": 934 | |
| }, | |
| { | |
| "epoch": 0.14541213063763608, | |
| "grad_norm": 1.5330200808441012, | |
| "learning_rate": 9.48728606735037e-06, | |
| "loss": 0.1767, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.14556765163297045, | |
| "grad_norm": 1.123567228978502, | |
| "learning_rate": 9.486207956927756e-06, | |
| "loss": 0.1864, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 0.1457231726283048, | |
| "grad_norm": 0.9960966752159592, | |
| "learning_rate": 9.485128775584737e-06, | |
| "loss": 0.2118, | |
| "step": 937 | |
| }, | |
| { | |
| "epoch": 0.1458786936236392, | |
| "grad_norm": 1.2303193618017887, | |
| "learning_rate": 9.484048523578934e-06, | |
| "loss": 0.2106, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 0.14603421461897356, | |
| "grad_norm": 1.2867421133114936, | |
| "learning_rate": 9.482967201168218e-06, | |
| "loss": 0.2252, | |
| "step": 939 | |
| }, | |
| { | |
| "epoch": 0.14618973561430793, | |
| "grad_norm": 1.3372951799730566, | |
| "learning_rate": 9.481884808610712e-06, | |
| "loss": 0.2662, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.1463452566096423, | |
| "grad_norm": 0.6808561025624517, | |
| "learning_rate": 9.4808013461648e-06, | |
| "loss": 0.1613, | |
| "step": 941 | |
| }, | |
| { | |
| "epoch": 0.14650077760497668, | |
| "grad_norm": 1.0617639952793092, | |
| "learning_rate": 9.479716814089119e-06, | |
| "loss": 0.22, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 0.14665629860031104, | |
| "grad_norm": 1.2088515247514138, | |
| "learning_rate": 9.478631212642565e-06, | |
| "loss": 0.2027, | |
| "step": 943 | |
| }, | |
| { | |
| "epoch": 0.1468118195956454, | |
| "grad_norm": 0.9673478217504623, | |
| "learning_rate": 9.477544542084283e-06, | |
| "loss": 0.2291, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 0.1469673405909798, | |
| "grad_norm": 1.3295783157520016, | |
| "learning_rate": 9.476456802673677e-06, | |
| "loss": 0.2153, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.14712286158631416, | |
| "grad_norm": 1.1001160858062626, | |
| "learning_rate": 9.475367994670406e-06, | |
| "loss": 0.2195, | |
| "step": 946 | |
| }, | |
| { | |
| "epoch": 0.14727838258164852, | |
| "grad_norm": 1.291866801296516, | |
| "learning_rate": 9.474278118334382e-06, | |
| "loss": 0.2213, | |
| "step": 947 | |
| }, | |
| { | |
| "epoch": 0.14743390357698288, | |
| "grad_norm": 1.2674302718543788, | |
| "learning_rate": 9.473187173925777e-06, | |
| "loss": 0.1371, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 0.14758942457231727, | |
| "grad_norm": 1.4168689609608738, | |
| "learning_rate": 9.472095161705014e-06, | |
| "loss": 0.1902, | |
| "step": 949 | |
| }, | |
| { | |
| "epoch": 0.14774494556765164, | |
| "grad_norm": 1.0439332293475743, | |
| "learning_rate": 9.471002081932767e-06, | |
| "loss": 0.2069, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.147900466562986, | |
| "grad_norm": 1.346490441102045, | |
| "learning_rate": 9.469907934869974e-06, | |
| "loss": 0.1982, | |
| "step": 951 | |
| }, | |
| { | |
| "epoch": 0.14805598755832036, | |
| "grad_norm": 1.1817129831636979, | |
| "learning_rate": 9.468812720777822e-06, | |
| "loss": 0.1626, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 0.14821150855365475, | |
| "grad_norm": 0.846186520557803, | |
| "learning_rate": 9.467716439917753e-06, | |
| "loss": 0.1659, | |
| "step": 953 | |
| }, | |
| { | |
| "epoch": 0.14836702954898912, | |
| "grad_norm": 1.77057726290962, | |
| "learning_rate": 9.466619092551467e-06, | |
| "loss": 0.1571, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 0.14852255054432348, | |
| "grad_norm": 1.503606666530362, | |
| "learning_rate": 9.465520678940913e-06, | |
| "loss": 0.2317, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.14867807153965784, | |
| "grad_norm": 1.2988561500793663, | |
| "learning_rate": 9.4644211993483e-06, | |
| "loss": 0.184, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 0.14883359253499223, | |
| "grad_norm": 0.9494708116205622, | |
| "learning_rate": 9.463320654036088e-06, | |
| "loss": 0.2061, | |
| "step": 957 | |
| }, | |
| { | |
| "epoch": 0.1489891135303266, | |
| "grad_norm": 1.1960711999747602, | |
| "learning_rate": 9.462219043266993e-06, | |
| "loss": 0.1595, | |
| "step": 958 | |
| }, | |
| { | |
| "epoch": 0.14914463452566096, | |
| "grad_norm": 1.456286481771, | |
| "learning_rate": 9.461116367303985e-06, | |
| "loss": 0.1803, | |
| "step": 959 | |
| }, | |
| { | |
| "epoch": 0.14930015552099535, | |
| "grad_norm": 2.193608162058263, | |
| "learning_rate": 9.460012626410286e-06, | |
| "loss": 0.2372, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.1494556765163297, | |
| "grad_norm": 1.1257027932111565, | |
| "learning_rate": 9.458907820849378e-06, | |
| "loss": 0.2183, | |
| "step": 961 | |
| }, | |
| { | |
| "epoch": 0.14961119751166407, | |
| "grad_norm": 1.2699403552308035, | |
| "learning_rate": 9.457801950884991e-06, | |
| "loss": 0.2112, | |
| "step": 962 | |
| }, | |
| { | |
| "epoch": 0.14976671850699844, | |
| "grad_norm": 2.0211225561288986, | |
| "learning_rate": 9.456695016781112e-06, | |
| "loss": 0.3771, | |
| "step": 963 | |
| }, | |
| { | |
| "epoch": 0.14992223950233283, | |
| "grad_norm": 1.6233952494139523, | |
| "learning_rate": 9.455587018801979e-06, | |
| "loss": 0.1654, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 0.1500777604976672, | |
| "grad_norm": 0.9536635356305013, | |
| "learning_rate": 9.454477957212092e-06, | |
| "loss": 0.1971, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.15023328149300155, | |
| "grad_norm": 1.2024688455270478, | |
| "learning_rate": 9.453367832276196e-06, | |
| "loss": 0.2073, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 0.15038880248833592, | |
| "grad_norm": 1.0163258023024337, | |
| "learning_rate": 9.452256644259296e-06, | |
| "loss": 0.1622, | |
| "step": 967 | |
| }, | |
| { | |
| "epoch": 0.1505443234836703, | |
| "grad_norm": 1.4838973791587633, | |
| "learning_rate": 9.451144393426643e-06, | |
| "loss": 0.2058, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 0.15069984447900467, | |
| "grad_norm": 1.0443777554962437, | |
| "learning_rate": 9.450031080043752e-06, | |
| "loss": 0.165, | |
| "step": 969 | |
| }, | |
| { | |
| "epoch": 0.15085536547433903, | |
| "grad_norm": 1.1175170370729908, | |
| "learning_rate": 9.448916704376384e-06, | |
| "loss": 0.1419, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.1510108864696734, | |
| "grad_norm": 1.2857861611804626, | |
| "learning_rate": 9.447801266690557e-06, | |
| "loss": 0.2171, | |
| "step": 971 | |
| }, | |
| { | |
| "epoch": 0.15116640746500778, | |
| "grad_norm": 0.7407729973632995, | |
| "learning_rate": 9.446684767252539e-06, | |
| "loss": 0.1714, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 0.15132192846034215, | |
| "grad_norm": 2.195989894115042, | |
| "learning_rate": 9.445567206328857e-06, | |
| "loss": 0.1989, | |
| "step": 973 | |
| }, | |
| { | |
| "epoch": 0.1514774494556765, | |
| "grad_norm": 0.989971668490221, | |
| "learning_rate": 9.444448584186288e-06, | |
| "loss": 0.1664, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 0.15163297045101087, | |
| "grad_norm": 1.081538706581427, | |
| "learning_rate": 9.44332890109186e-06, | |
| "loss": 0.2066, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.15178849144634526, | |
| "grad_norm": 1.4377035491264887, | |
| "learning_rate": 9.442208157312859e-06, | |
| "loss": 0.2057, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 0.15194401244167963, | |
| "grad_norm": 1.5898783963503191, | |
| "learning_rate": 9.441086353116825e-06, | |
| "loss": 0.1665, | |
| "step": 977 | |
| }, | |
| { | |
| "epoch": 0.152099533437014, | |
| "grad_norm": 0.899579074969373, | |
| "learning_rate": 9.439963488771543e-06, | |
| "loss": 0.2091, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 0.15225505443234838, | |
| "grad_norm": 1.4218933674345213, | |
| "learning_rate": 9.438839564545059e-06, | |
| "loss": 0.2344, | |
| "step": 979 | |
| }, | |
| { | |
| "epoch": 0.15241057542768274, | |
| "grad_norm": 1.2490316562718224, | |
| "learning_rate": 9.437714580705671e-06, | |
| "loss": 0.1771, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.1525660964230171, | |
| "grad_norm": 1.3535600594171835, | |
| "learning_rate": 9.436588537521925e-06, | |
| "loss": 0.2402, | |
| "step": 981 | |
| }, | |
| { | |
| "epoch": 0.15272161741835147, | |
| "grad_norm": 1.2653882449622933, | |
| "learning_rate": 9.435461435262623e-06, | |
| "loss": 0.2368, | |
| "step": 982 | |
| }, | |
| { | |
| "epoch": 0.15287713841368586, | |
| "grad_norm": 1.4171554003791706, | |
| "learning_rate": 9.434333274196822e-06, | |
| "loss": 0.16, | |
| "step": 983 | |
| }, | |
| { | |
| "epoch": 0.15303265940902022, | |
| "grad_norm": 0.9372171947174371, | |
| "learning_rate": 9.433204054593832e-06, | |
| "loss": 0.1464, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 0.15318818040435458, | |
| "grad_norm": 0.9807519101904891, | |
| "learning_rate": 9.43207377672321e-06, | |
| "loss": 0.1743, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.15334370139968895, | |
| "grad_norm": 1.9830197584350164, | |
| "learning_rate": 9.430942440854772e-06, | |
| "loss": 0.2979, | |
| "step": 986 | |
| }, | |
| { | |
| "epoch": 0.15349922239502334, | |
| "grad_norm": 1.013327149062581, | |
| "learning_rate": 9.429810047258578e-06, | |
| "loss": 0.2257, | |
| "step": 987 | |
| }, | |
| { | |
| "epoch": 0.1536547433903577, | |
| "grad_norm": 1.3644569563063227, | |
| "learning_rate": 9.428676596204953e-06, | |
| "loss": 0.227, | |
| "step": 988 | |
| }, | |
| { | |
| "epoch": 0.15381026438569206, | |
| "grad_norm": 1.2971192291816034, | |
| "learning_rate": 9.427542087964462e-06, | |
| "loss": 0.2012, | |
| "step": 989 | |
| }, | |
| { | |
| "epoch": 0.15396578538102643, | |
| "grad_norm": 1.063681975107411, | |
| "learning_rate": 9.426406522807932e-06, | |
| "loss": 0.2299, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.15412130637636082, | |
| "grad_norm": 1.0390353297783406, | |
| "learning_rate": 9.425269901006435e-06, | |
| "loss": 0.1438, | |
| "step": 991 | |
| }, | |
| { | |
| "epoch": 0.15427682737169518, | |
| "grad_norm": 1.821321152512482, | |
| "learning_rate": 9.424132222831301e-06, | |
| "loss": 0.1797, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 0.15443234836702954, | |
| "grad_norm": 1.0266940584964872, | |
| "learning_rate": 9.422993488554108e-06, | |
| "loss": 0.1524, | |
| "step": 993 | |
| }, | |
| { | |
| "epoch": 0.1545878693623639, | |
| "grad_norm": 1.2357982408354415, | |
| "learning_rate": 9.42185369844669e-06, | |
| "loss": 0.1765, | |
| "step": 994 | |
| }, | |
| { | |
| "epoch": 0.1547433903576983, | |
| "grad_norm": 1.3007180654461126, | |
| "learning_rate": 9.420712852781129e-06, | |
| "loss": 0.2278, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.15489891135303266, | |
| "grad_norm": 1.3519816843089092, | |
| "learning_rate": 9.419570951829761e-06, | |
| "loss": 0.2261, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 0.15505443234836702, | |
| "grad_norm": 0.814621189176537, | |
| "learning_rate": 9.418427995865174e-06, | |
| "loss": 0.2172, | |
| "step": 997 | |
| }, | |
| { | |
| "epoch": 0.1552099533437014, | |
| "grad_norm": 1.7543842879443927, | |
| "learning_rate": 9.417283985160206e-06, | |
| "loss": 0.2164, | |
| "step": 998 | |
| }, | |
| { | |
| "epoch": 0.15536547433903578, | |
| "grad_norm": 0.8276231350286671, | |
| "learning_rate": 9.41613891998795e-06, | |
| "loss": 0.1975, | |
| "step": 999 | |
| }, | |
| { | |
| "epoch": 0.15552099533437014, | |
| "grad_norm": 1.1550898822511304, | |
| "learning_rate": 9.414992800621749e-06, | |
| "loss": 0.1501, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.15552099533437014, | |
| "eval_loss": 0.21367190778255463, | |
| "eval_runtime": 9.4284, | |
| "eval_samples_per_second": 2.758, | |
| "eval_steps_per_second": 0.742, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1556765163297045, | |
| "grad_norm": 1.6764153048318766, | |
| "learning_rate": 9.413845627335197e-06, | |
| "loss": 0.2071, | |
| "step": 1001 | |
| }, | |
| { | |
| "epoch": 0.1558320373250389, | |
| "grad_norm": 1.1886246410449919, | |
| "learning_rate": 9.41269740040214e-06, | |
| "loss": 0.1956, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 0.15598755832037325, | |
| "grad_norm": 1.0793500722611682, | |
| "learning_rate": 9.411548120096676e-06, | |
| "loss": 0.144, | |
| "step": 1003 | |
| }, | |
| { | |
| "epoch": 0.15614307931570762, | |
| "grad_norm": 1.2449924636096124, | |
| "learning_rate": 9.410397786693157e-06, | |
| "loss": 0.2734, | |
| "step": 1004 | |
| }, | |
| { | |
| "epoch": 0.15629860031104198, | |
| "grad_norm": 0.8611732851449306, | |
| "learning_rate": 9.409246400466178e-06, | |
| "loss": 0.1923, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.15645412130637637, | |
| "grad_norm": 6.74577569453225, | |
| "learning_rate": 9.408093961690596e-06, | |
| "loss": 0.1956, | |
| "step": 1006 | |
| }, | |
| { | |
| "epoch": 0.15660964230171073, | |
| "grad_norm": 1.2060004741533563, | |
| "learning_rate": 9.406940470641512e-06, | |
| "loss": 0.2739, | |
| "step": 1007 | |
| }, | |
| { | |
| "epoch": 0.1567651632970451, | |
| "grad_norm": 1.6202727992084955, | |
| "learning_rate": 9.405785927594281e-06, | |
| "loss": 0.3171, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 0.15692068429237946, | |
| "grad_norm": 2.0124632761977534, | |
| "learning_rate": 9.404630332824509e-06, | |
| "loss": 0.2104, | |
| "step": 1009 | |
| }, | |
| { | |
| "epoch": 0.15707620528771385, | |
| "grad_norm": 2.0142886633624286, | |
| "learning_rate": 9.40347368660805e-06, | |
| "loss": 0.2548, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.1572317262830482, | |
| "grad_norm": 1.3434989581281018, | |
| "learning_rate": 9.402315989221013e-06, | |
| "loss": 0.2411, | |
| "step": 1011 | |
| }, | |
| { | |
| "epoch": 0.15738724727838257, | |
| "grad_norm": 1.3315974814677487, | |
| "learning_rate": 9.40115724093976e-06, | |
| "loss": 0.2839, | |
| "step": 1012 | |
| }, | |
| { | |
| "epoch": 0.15754276827371697, | |
| "grad_norm": 1.1186058721777734, | |
| "learning_rate": 9.399997442040894e-06, | |
| "loss": 0.167, | |
| "step": 1013 | |
| }, | |
| { | |
| "epoch": 0.15769828926905133, | |
| "grad_norm": 1.4492217703231243, | |
| "learning_rate": 9.39883659280128e-06, | |
| "loss": 0.1268, | |
| "step": 1014 | |
| }, | |
| { | |
| "epoch": 0.1578538102643857, | |
| "grad_norm": 1.257425749091041, | |
| "learning_rate": 9.39767469349803e-06, | |
| "loss": 0.1433, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.15800933125972005, | |
| "grad_norm": 1.7996939549666984, | |
| "learning_rate": 9.396511744408498e-06, | |
| "loss": 0.2012, | |
| "step": 1016 | |
| }, | |
| { | |
| "epoch": 0.15816485225505444, | |
| "grad_norm": 0.8429015986655448, | |
| "learning_rate": 9.395347745810304e-06, | |
| "loss": 0.1935, | |
| "step": 1017 | |
| }, | |
| { | |
| "epoch": 0.1583203732503888, | |
| "grad_norm": 1.370521795316769, | |
| "learning_rate": 9.394182697981306e-06, | |
| "loss": 0.2183, | |
| "step": 1018 | |
| }, | |
| { | |
| "epoch": 0.15847589424572317, | |
| "grad_norm": 1.622770939923456, | |
| "learning_rate": 9.393016601199622e-06, | |
| "loss": 0.1593, | |
| "step": 1019 | |
| }, | |
| { | |
| "epoch": 0.15863141524105753, | |
| "grad_norm": 1.011909638401176, | |
| "learning_rate": 9.39184945574361e-06, | |
| "loss": 0.2053, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.15878693623639192, | |
| "grad_norm": 1.6110438711648936, | |
| "learning_rate": 9.390681261891887e-06, | |
| "loss": 0.222, | |
| "step": 1021 | |
| }, | |
| { | |
| "epoch": 0.1589424572317263, | |
| "grad_norm": 1.4859951673056488, | |
| "learning_rate": 9.389512019923318e-06, | |
| "loss": 0.231, | |
| "step": 1022 | |
| }, | |
| { | |
| "epoch": 0.15909797822706065, | |
| "grad_norm": 1.166598629738374, | |
| "learning_rate": 9.388341730117015e-06, | |
| "loss": 0.1917, | |
| "step": 1023 | |
| }, | |
| { | |
| "epoch": 0.159253499222395, | |
| "grad_norm": 1.0987845208229972, | |
| "learning_rate": 9.387170392752342e-06, | |
| "loss": 0.184, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.1594090202177294, | |
| "grad_norm": 1.5795930559063704, | |
| "learning_rate": 9.385998008108917e-06, | |
| "loss": 0.2097, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.15956454121306377, | |
| "grad_norm": 1.4302193933514027, | |
| "learning_rate": 9.384824576466601e-06, | |
| "loss": 0.2194, | |
| "step": 1026 | |
| }, | |
| { | |
| "epoch": 0.15972006220839813, | |
| "grad_norm": 0.9372034033824603, | |
| "learning_rate": 9.383650098105512e-06, | |
| "loss": 0.243, | |
| "step": 1027 | |
| }, | |
| { | |
| "epoch": 0.1598755832037325, | |
| "grad_norm": 1.0038945695499553, | |
| "learning_rate": 9.382474573306011e-06, | |
| "loss": 0.1861, | |
| "step": 1028 | |
| }, | |
| { | |
| "epoch": 0.16003110419906688, | |
| "grad_norm": 0.9989868346004813, | |
| "learning_rate": 9.381298002348713e-06, | |
| "loss": 0.2324, | |
| "step": 1029 | |
| }, | |
| { | |
| "epoch": 0.16018662519440124, | |
| "grad_norm": 1.4240189031581216, | |
| "learning_rate": 9.380120385514484e-06, | |
| "loss": 0.1974, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.1603421461897356, | |
| "grad_norm": 1.378754367931683, | |
| "learning_rate": 9.378941723084436e-06, | |
| "loss": 0.245, | |
| "step": 1031 | |
| }, | |
| { | |
| "epoch": 0.16049766718507, | |
| "grad_norm": 1.8715129600892846, | |
| "learning_rate": 9.37776201533993e-06, | |
| "loss": 0.3174, | |
| "step": 1032 | |
| }, | |
| { | |
| "epoch": 0.16065318818040436, | |
| "grad_norm": 1.1921962243878195, | |
| "learning_rate": 9.376581262562584e-06, | |
| "loss": 0.1917, | |
| "step": 1033 | |
| }, | |
| { | |
| "epoch": 0.16080870917573872, | |
| "grad_norm": 1.2635206395103649, | |
| "learning_rate": 9.375399465034257e-06, | |
| "loss": 0.1878, | |
| "step": 1034 | |
| }, | |
| { | |
| "epoch": 0.16096423017107309, | |
| "grad_norm": 1.2398545424205532, | |
| "learning_rate": 9.374216623037057e-06, | |
| "loss": 0.2344, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.16111975116640748, | |
| "grad_norm": 0.9462934166321078, | |
| "learning_rate": 9.373032736853352e-06, | |
| "loss": 0.187, | |
| "step": 1036 | |
| }, | |
| { | |
| "epoch": 0.16127527216174184, | |
| "grad_norm": 1.5590735847268282, | |
| "learning_rate": 9.371847806765749e-06, | |
| "loss": 0.2097, | |
| "step": 1037 | |
| }, | |
| { | |
| "epoch": 0.1614307931570762, | |
| "grad_norm": 1.160888284446341, | |
| "learning_rate": 9.370661833057103e-06, | |
| "loss": 0.1506, | |
| "step": 1038 | |
| }, | |
| { | |
| "epoch": 0.16158631415241057, | |
| "grad_norm": 1.1778543046473768, | |
| "learning_rate": 9.36947481601053e-06, | |
| "loss": 0.1716, | |
| "step": 1039 | |
| }, | |
| { | |
| "epoch": 0.16174183514774496, | |
| "grad_norm": 1.4532605779910739, | |
| "learning_rate": 9.368286755909383e-06, | |
| "loss": 0.182, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.16189735614307932, | |
| "grad_norm": 0.9502972420425978, | |
| "learning_rate": 9.36709765303727e-06, | |
| "loss": 0.2161, | |
| "step": 1041 | |
| }, | |
| { | |
| "epoch": 0.16205287713841368, | |
| "grad_norm": 1.4588748874097772, | |
| "learning_rate": 9.365907507678045e-06, | |
| "loss": 0.2338, | |
| "step": 1042 | |
| }, | |
| { | |
| "epoch": 0.16220839813374804, | |
| "grad_norm": 1.4225573142040282, | |
| "learning_rate": 9.364716320115813e-06, | |
| "loss": 0.1781, | |
| "step": 1043 | |
| }, | |
| { | |
| "epoch": 0.16236391912908243, | |
| "grad_norm": 1.029996429205044, | |
| "learning_rate": 9.363524090634928e-06, | |
| "loss": 0.2257, | |
| "step": 1044 | |
| }, | |
| { | |
| "epoch": 0.1625194401244168, | |
| "grad_norm": 1.379085736135871, | |
| "learning_rate": 9.362330819519991e-06, | |
| "loss": 0.2186, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.16267496111975116, | |
| "grad_norm": 1.2962827183429935, | |
| "learning_rate": 9.361136507055853e-06, | |
| "loss": 0.1916, | |
| "step": 1046 | |
| }, | |
| { | |
| "epoch": 0.16283048211508552, | |
| "grad_norm": 0.9451500150098339, | |
| "learning_rate": 9.359941153527612e-06, | |
| "loss": 0.1859, | |
| "step": 1047 | |
| }, | |
| { | |
| "epoch": 0.1629860031104199, | |
| "grad_norm": 1.0944328685975881, | |
| "learning_rate": 9.358744759220614e-06, | |
| "loss": 0.2225, | |
| "step": 1048 | |
| }, | |
| { | |
| "epoch": 0.16314152410575428, | |
| "grad_norm": 1.1266179070522002, | |
| "learning_rate": 9.357547324420461e-06, | |
| "loss": 0.2039, | |
| "step": 1049 | |
| }, | |
| { | |
| "epoch": 0.16329704510108864, | |
| "grad_norm": 1.26823288307141, | |
| "learning_rate": 9.356348849412991e-06, | |
| "loss": 0.2686, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.16345256609642303, | |
| "grad_norm": 1.3783372129870655, | |
| "learning_rate": 9.355149334484302e-06, | |
| "loss": 0.2715, | |
| "step": 1051 | |
| }, | |
| { | |
| "epoch": 0.1636080870917574, | |
| "grad_norm": 0.950454440753535, | |
| "learning_rate": 9.35394877992073e-06, | |
| "loss": 0.1697, | |
| "step": 1052 | |
| }, | |
| { | |
| "epoch": 0.16376360808709176, | |
| "grad_norm": 2.4437577046740895, | |
| "learning_rate": 9.352747186008865e-06, | |
| "loss": 0.2087, | |
| "step": 1053 | |
| }, | |
| { | |
| "epoch": 0.16391912908242612, | |
| "grad_norm": 1.4140943006046114, | |
| "learning_rate": 9.351544553035547e-06, | |
| "loss": 0.2063, | |
| "step": 1054 | |
| }, | |
| { | |
| "epoch": 0.1640746500777605, | |
| "grad_norm": 0.967217619359645, | |
| "learning_rate": 9.350340881287861e-06, | |
| "loss": 0.2008, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.16423017107309487, | |
| "grad_norm": 1.4590565286071695, | |
| "learning_rate": 9.349136171053139e-06, | |
| "loss": 0.1897, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 0.16438569206842923, | |
| "grad_norm": 1.0794053199949247, | |
| "learning_rate": 9.34793042261896e-06, | |
| "loss": 0.1037, | |
| "step": 1057 | |
| }, | |
| { | |
| "epoch": 0.1645412130637636, | |
| "grad_norm": 1.15272662266887, | |
| "learning_rate": 9.346723636273157e-06, | |
| "loss": 0.239, | |
| "step": 1058 | |
| }, | |
| { | |
| "epoch": 0.164696734059098, | |
| "grad_norm": 1.3755496055051248, | |
| "learning_rate": 9.345515812303802e-06, | |
| "loss": 0.2655, | |
| "step": 1059 | |
| }, | |
| { | |
| "epoch": 0.16485225505443235, | |
| "grad_norm": 1.1623669619389423, | |
| "learning_rate": 9.344306950999226e-06, | |
| "loss": 0.2254, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.1650077760497667, | |
| "grad_norm": 1.1373510201117636, | |
| "learning_rate": 9.343097052647996e-06, | |
| "loss": 0.2515, | |
| "step": 1061 | |
| }, | |
| { | |
| "epoch": 0.16516329704510108, | |
| "grad_norm": 1.349812652007435, | |
| "learning_rate": 9.341886117538931e-06, | |
| "loss": 0.2367, | |
| "step": 1062 | |
| }, | |
| { | |
| "epoch": 0.16531881804043547, | |
| "grad_norm": 1.0436524504014346, | |
| "learning_rate": 9.340674145961101e-06, | |
| "loss": 0.1552, | |
| "step": 1063 | |
| }, | |
| { | |
| "epoch": 0.16547433903576983, | |
| "grad_norm": 1.3297059840324263, | |
| "learning_rate": 9.339461138203821e-06, | |
| "loss": 0.2201, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 0.1656298600311042, | |
| "grad_norm": 1.7541537167845238, | |
| "learning_rate": 9.338247094556651e-06, | |
| "loss": 0.2076, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.16578538102643858, | |
| "grad_norm": 1.442252163275357, | |
| "learning_rate": 9.3370320153094e-06, | |
| "loss": 0.1753, | |
| "step": 1066 | |
| }, | |
| { | |
| "epoch": 0.16594090202177295, | |
| "grad_norm": 1.143025605577321, | |
| "learning_rate": 9.335815900752125e-06, | |
| "loss": 0.2217, | |
| "step": 1067 | |
| }, | |
| { | |
| "epoch": 0.1660964230171073, | |
| "grad_norm": 1.178025675869792, | |
| "learning_rate": 9.33459875117513e-06, | |
| "loss": 0.1621, | |
| "step": 1068 | |
| }, | |
| { | |
| "epoch": 0.16625194401244167, | |
| "grad_norm": 0.8859479026343935, | |
| "learning_rate": 9.333380566868963e-06, | |
| "loss": 0.2214, | |
| "step": 1069 | |
| }, | |
| { | |
| "epoch": 0.16640746500777606, | |
| "grad_norm": 1.1580516447127225, | |
| "learning_rate": 9.332161348124426e-06, | |
| "loss": 0.2104, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.16656298600311042, | |
| "grad_norm": 0.9322363288405592, | |
| "learning_rate": 9.33094109523256e-06, | |
| "loss": 0.1524, | |
| "step": 1071 | |
| }, | |
| { | |
| "epoch": 0.1667185069984448, | |
| "grad_norm": 1.2071920671355123, | |
| "learning_rate": 9.32971980848466e-06, | |
| "loss": 0.2204, | |
| "step": 1072 | |
| }, | |
| { | |
| "epoch": 0.16687402799377915, | |
| "grad_norm": 1.4321090820471434, | |
| "learning_rate": 9.328497488172256e-06, | |
| "loss": 0.2185, | |
| "step": 1073 | |
| }, | |
| { | |
| "epoch": 0.16702954898911354, | |
| "grad_norm": 1.5323210185604608, | |
| "learning_rate": 9.327274134587144e-06, | |
| "loss": 0.1967, | |
| "step": 1074 | |
| }, | |
| { | |
| "epoch": 0.1671850699844479, | |
| "grad_norm": 1.2827697157454871, | |
| "learning_rate": 9.326049748021348e-06, | |
| "loss": 0.1835, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.16734059097978227, | |
| "grad_norm": 0.9598851088099357, | |
| "learning_rate": 9.324824328767148e-06, | |
| "loss": 0.1524, | |
| "step": 1076 | |
| }, | |
| { | |
| "epoch": 0.16749611197511663, | |
| "grad_norm": 1.1012363230038584, | |
| "learning_rate": 9.323597877117069e-06, | |
| "loss": 0.1934, | |
| "step": 1077 | |
| }, | |
| { | |
| "epoch": 0.16765163297045102, | |
| "grad_norm": 1.7979943018863753, | |
| "learning_rate": 9.322370393363881e-06, | |
| "loss": 0.2809, | |
| "step": 1078 | |
| }, | |
| { | |
| "epoch": 0.16780715396578538, | |
| "grad_norm": 0.9525483556320685, | |
| "learning_rate": 9.321141877800604e-06, | |
| "loss": 0.1544, | |
| "step": 1079 | |
| }, | |
| { | |
| "epoch": 0.16796267496111975, | |
| "grad_norm": 1.1079754408286966, | |
| "learning_rate": 9.319912330720502e-06, | |
| "loss": 0.1939, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.1681181959564541, | |
| "grad_norm": 1.4615045454023567, | |
| "learning_rate": 9.31868175241708e-06, | |
| "loss": 0.1879, | |
| "step": 1081 | |
| }, | |
| { | |
| "epoch": 0.1682737169517885, | |
| "grad_norm": 0.9677318917431114, | |
| "learning_rate": 9.3174501431841e-06, | |
| "loss": 0.1572, | |
| "step": 1082 | |
| }, | |
| { | |
| "epoch": 0.16842923794712286, | |
| "grad_norm": 1.1156223371393144, | |
| "learning_rate": 9.316217503315562e-06, | |
| "loss": 0.2477, | |
| "step": 1083 | |
| }, | |
| { | |
| "epoch": 0.16858475894245722, | |
| "grad_norm": 0.9283556985369971, | |
| "learning_rate": 9.314983833105713e-06, | |
| "loss": 0.1855, | |
| "step": 1084 | |
| }, | |
| { | |
| "epoch": 0.16874027993779162, | |
| "grad_norm": 0.9107625137180413, | |
| "learning_rate": 9.313749132849048e-06, | |
| "loss": 0.1941, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.16889580093312598, | |
| "grad_norm": 1.1200752990922627, | |
| "learning_rate": 9.312513402840308e-06, | |
| "loss": 0.1714, | |
| "step": 1086 | |
| }, | |
| { | |
| "epoch": 0.16905132192846034, | |
| "grad_norm": 1.5919484746453285, | |
| "learning_rate": 9.311276643374478e-06, | |
| "loss": 0.1907, | |
| "step": 1087 | |
| }, | |
| { | |
| "epoch": 0.1692068429237947, | |
| "grad_norm": 1.6737891841333687, | |
| "learning_rate": 9.310038854746793e-06, | |
| "loss": 0.3096, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 0.1693623639191291, | |
| "grad_norm": 0.9356610939198378, | |
| "learning_rate": 9.308800037252726e-06, | |
| "loss": 0.215, | |
| "step": 1089 | |
| }, | |
| { | |
| "epoch": 0.16951788491446346, | |
| "grad_norm": 0.9978911792591384, | |
| "learning_rate": 9.307560191188e-06, | |
| "loss": 0.2023, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.16967340590979782, | |
| "grad_norm": 0.8618605808228078, | |
| "learning_rate": 9.30631931684859e-06, | |
| "loss": 0.1835, | |
| "step": 1091 | |
| }, | |
| { | |
| "epoch": 0.16982892690513218, | |
| "grad_norm": 1.073899023320524, | |
| "learning_rate": 9.305077414530701e-06, | |
| "loss": 0.2856, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 0.16998444790046657, | |
| "grad_norm": 1.390799646940327, | |
| "learning_rate": 9.303834484530798e-06, | |
| "loss": 0.1768, | |
| "step": 1093 | |
| }, | |
| { | |
| "epoch": 0.17013996889580094, | |
| "grad_norm": 1.1517992631531213, | |
| "learning_rate": 9.302590527145585e-06, | |
| "loss": 0.1661, | |
| "step": 1094 | |
| }, | |
| { | |
| "epoch": 0.1702954898911353, | |
| "grad_norm": 1.0942354595322217, | |
| "learning_rate": 9.301345542672012e-06, | |
| "loss": 0.2161, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.17045101088646966, | |
| "grad_norm": 0.8079291053355052, | |
| "learning_rate": 9.300099531407273e-06, | |
| "loss": 0.1768, | |
| "step": 1096 | |
| }, | |
| { | |
| "epoch": 0.17060653188180405, | |
| "grad_norm": 0.8090971826904667, | |
| "learning_rate": 9.298852493648808e-06, | |
| "loss": 0.1761, | |
| "step": 1097 | |
| }, | |
| { | |
| "epoch": 0.17076205287713841, | |
| "grad_norm": 1.2570428694136606, | |
| "learning_rate": 9.297604429694305e-06, | |
| "loss": 0.1742, | |
| "step": 1098 | |
| }, | |
| { | |
| "epoch": 0.17091757387247278, | |
| "grad_norm": 1.4714283316352859, | |
| "learning_rate": 9.296355339841692e-06, | |
| "loss": 0.2716, | |
| "step": 1099 | |
| }, | |
| { | |
| "epoch": 0.17107309486780714, | |
| "grad_norm": 1.07865700806752, | |
| "learning_rate": 9.295105224389144e-06, | |
| "loss": 0.1507, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.17107309486780714, | |
| "eval_loss": 0.21004652976989746, | |
| "eval_runtime": 9.4236, | |
| "eval_samples_per_second": 2.759, | |
| "eval_steps_per_second": 0.743, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.17122861586314153, | |
| "grad_norm": 0.8784655316390252, | |
| "learning_rate": 9.293854083635081e-06, | |
| "loss": 0.1673, | |
| "step": 1101 | |
| }, | |
| { | |
| "epoch": 0.1713841368584759, | |
| "grad_norm": 1.025281186756548, | |
| "learning_rate": 9.292601917878169e-06, | |
| "loss": 0.1715, | |
| "step": 1102 | |
| }, | |
| { | |
| "epoch": 0.17153965785381026, | |
| "grad_norm": 1.409333718683306, | |
| "learning_rate": 9.291348727417318e-06, | |
| "loss": 0.2155, | |
| "step": 1103 | |
| }, | |
| { | |
| "epoch": 0.17169517884914465, | |
| "grad_norm": 1.0469534251307742, | |
| "learning_rate": 9.290094512551679e-06, | |
| "loss": 0.1918, | |
| "step": 1104 | |
| }, | |
| { | |
| "epoch": 0.171850699844479, | |
| "grad_norm": 1.275008024365504, | |
| "learning_rate": 9.288839273580652e-06, | |
| "loss": 0.1264, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.17200622083981337, | |
| "grad_norm": 1.2168876399929267, | |
| "learning_rate": 9.287583010803882e-06, | |
| "loss": 0.2855, | |
| "step": 1106 | |
| }, | |
| { | |
| "epoch": 0.17216174183514774, | |
| "grad_norm": 1.2066762279123466, | |
| "learning_rate": 9.286325724521254e-06, | |
| "loss": 0.2242, | |
| "step": 1107 | |
| }, | |
| { | |
| "epoch": 0.17231726283048213, | |
| "grad_norm": 2.4948253959447144, | |
| "learning_rate": 9.285067415032902e-06, | |
| "loss": 0.2875, | |
| "step": 1108 | |
| }, | |
| { | |
| "epoch": 0.1724727838258165, | |
| "grad_norm": 1.8284540511597713, | |
| "learning_rate": 9.283808082639198e-06, | |
| "loss": 0.2049, | |
| "step": 1109 | |
| }, | |
| { | |
| "epoch": 0.17262830482115085, | |
| "grad_norm": 1.3355119525104016, | |
| "learning_rate": 9.282547727640767e-06, | |
| "loss": 0.1717, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.17278382581648521, | |
| "grad_norm": 1.0266534905254066, | |
| "learning_rate": 9.281286350338472e-06, | |
| "loss": 0.2066, | |
| "step": 1111 | |
| }, | |
| { | |
| "epoch": 0.1729393468118196, | |
| "grad_norm": 1.2099083780797275, | |
| "learning_rate": 9.280023951033418e-06, | |
| "loss": 0.2807, | |
| "step": 1112 | |
| }, | |
| { | |
| "epoch": 0.17309486780715397, | |
| "grad_norm": 0.949550488293792, | |
| "learning_rate": 9.278760530026963e-06, | |
| "loss": 0.1992, | |
| "step": 1113 | |
| }, | |
| { | |
| "epoch": 0.17325038880248833, | |
| "grad_norm": 1.0598653084819885, | |
| "learning_rate": 9.277496087620696e-06, | |
| "loss": 0.2358, | |
| "step": 1114 | |
| }, | |
| { | |
| "epoch": 0.1734059097978227, | |
| "grad_norm": 1.4050304182051088, | |
| "learning_rate": 9.276230624116464e-06, | |
| "loss": 0.2222, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.17356143079315708, | |
| "grad_norm": 0.9817712530234229, | |
| "learning_rate": 9.274964139816347e-06, | |
| "loss": 0.1931, | |
| "step": 1116 | |
| }, | |
| { | |
| "epoch": 0.17371695178849145, | |
| "grad_norm": 1.7060543693066812, | |
| "learning_rate": 9.273696635022674e-06, | |
| "loss": 0.2343, | |
| "step": 1117 | |
| }, | |
| { | |
| "epoch": 0.1738724727838258, | |
| "grad_norm": 1.2527360379181598, | |
| "learning_rate": 9.272428110038016e-06, | |
| "loss": 0.1717, | |
| "step": 1118 | |
| }, | |
| { | |
| "epoch": 0.17402799377916017, | |
| "grad_norm": 1.0592648467758805, | |
| "learning_rate": 9.271158565165186e-06, | |
| "loss": 0.1338, | |
| "step": 1119 | |
| }, | |
| { | |
| "epoch": 0.17418351477449456, | |
| "grad_norm": 1.1697431614729739, | |
| "learning_rate": 9.269888000707243e-06, | |
| "loss": 0.0937, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.17433903576982893, | |
| "grad_norm": 1.3666630215902802, | |
| "learning_rate": 9.26861641696749e-06, | |
| "loss": 0.195, | |
| "step": 1121 | |
| }, | |
| { | |
| "epoch": 0.1744945567651633, | |
| "grad_norm": 0.9618565647030869, | |
| "learning_rate": 9.267343814249468e-06, | |
| "loss": 0.175, | |
| "step": 1122 | |
| }, | |
| { | |
| "epoch": 0.17465007776049768, | |
| "grad_norm": 1.4220832361635052, | |
| "learning_rate": 9.266070192856968e-06, | |
| "loss": 0.1593, | |
| "step": 1123 | |
| }, | |
| { | |
| "epoch": 0.17480559875583204, | |
| "grad_norm": 0.776257033559064, | |
| "learning_rate": 9.264795553094022e-06, | |
| "loss": 0.2249, | |
| "step": 1124 | |
| }, | |
| { | |
| "epoch": 0.1749611197511664, | |
| "grad_norm": 1.2113799530837854, | |
| "learning_rate": 9.263519895264901e-06, | |
| "loss": 0.1907, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.17511664074650077, | |
| "grad_norm": 1.3082437362032786, | |
| "learning_rate": 9.262243219674126e-06, | |
| "loss": 0.2666, | |
| "step": 1126 | |
| }, | |
| { | |
| "epoch": 0.17527216174183516, | |
| "grad_norm": 1.872862944531211, | |
| "learning_rate": 9.260965526626452e-06, | |
| "loss": 0.1784, | |
| "step": 1127 | |
| }, | |
| { | |
| "epoch": 0.17542768273716952, | |
| "grad_norm": 1.3432522813757912, | |
| "learning_rate": 9.25968681642689e-06, | |
| "loss": 0.1451, | |
| "step": 1128 | |
| }, | |
| { | |
| "epoch": 0.17558320373250388, | |
| "grad_norm": 0.9703679937198076, | |
| "learning_rate": 9.258407089380679e-06, | |
| "loss": 0.1297, | |
| "step": 1129 | |
| }, | |
| { | |
| "epoch": 0.17573872472783825, | |
| "grad_norm": 1.0365436632456377, | |
| "learning_rate": 9.25712634579331e-06, | |
| "loss": 0.1761, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.17589424572317264, | |
| "grad_norm": 2.1522303469420994, | |
| "learning_rate": 9.255844585970516e-06, | |
| "loss": 0.1296, | |
| "step": 1131 | |
| }, | |
| { | |
| "epoch": 0.176049766718507, | |
| "grad_norm": 1.291217930882477, | |
| "learning_rate": 9.254561810218269e-06, | |
| "loss": 0.2044, | |
| "step": 1132 | |
| }, | |
| { | |
| "epoch": 0.17620528771384136, | |
| "grad_norm": 0.9937462574500329, | |
| "learning_rate": 9.253278018842786e-06, | |
| "loss": 0.1997, | |
| "step": 1133 | |
| }, | |
| { | |
| "epoch": 0.17636080870917573, | |
| "grad_norm": 0.9450489875743622, | |
| "learning_rate": 9.251993212150525e-06, | |
| "loss": 0.1747, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 0.17651632970451012, | |
| "grad_norm": 1.4735357191672043, | |
| "learning_rate": 9.250707390448187e-06, | |
| "loss": 0.2377, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.17667185069984448, | |
| "grad_norm": 0.957023692443933, | |
| "learning_rate": 9.24942055404272e-06, | |
| "loss": 0.1319, | |
| "step": 1136 | |
| }, | |
| { | |
| "epoch": 0.17682737169517884, | |
| "grad_norm": 0.9533362941250507, | |
| "learning_rate": 9.248132703241306e-06, | |
| "loss": 0.142, | |
| "step": 1137 | |
| }, | |
| { | |
| "epoch": 0.17698289269051323, | |
| "grad_norm": 1.1321821260027138, | |
| "learning_rate": 9.246843838351371e-06, | |
| "loss": 0.185, | |
| "step": 1138 | |
| }, | |
| { | |
| "epoch": 0.1771384136858476, | |
| "grad_norm": 0.6564569809439412, | |
| "learning_rate": 9.24555395968059e-06, | |
| "loss": 0.1511, | |
| "step": 1139 | |
| }, | |
| { | |
| "epoch": 0.17729393468118196, | |
| "grad_norm": 0.8235534803965409, | |
| "learning_rate": 9.244263067536872e-06, | |
| "loss": 0.1851, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.17744945567651632, | |
| "grad_norm": 0.97851675810554, | |
| "learning_rate": 9.24297116222837e-06, | |
| "loss": 0.2184, | |
| "step": 1141 | |
| }, | |
| { | |
| "epoch": 0.1776049766718507, | |
| "grad_norm": 1.1485004351012151, | |
| "learning_rate": 9.241678244063482e-06, | |
| "loss": 0.2106, | |
| "step": 1142 | |
| }, | |
| { | |
| "epoch": 0.17776049766718507, | |
| "grad_norm": 1.081146125371241, | |
| "learning_rate": 9.240384313350845e-06, | |
| "loss": 0.1844, | |
| "step": 1143 | |
| }, | |
| { | |
| "epoch": 0.17791601866251944, | |
| "grad_norm": 1.4013409835542678, | |
| "learning_rate": 9.239089370399338e-06, | |
| "loss": 0.2538, | |
| "step": 1144 | |
| }, | |
| { | |
| "epoch": 0.1780715396578538, | |
| "grad_norm": 6.587281038828778, | |
| "learning_rate": 9.237793415518083e-06, | |
| "loss": 0.2319, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.1782270606531882, | |
| "grad_norm": 1.463087775034242, | |
| "learning_rate": 9.23649644901644e-06, | |
| "loss": 0.1833, | |
| "step": 1146 | |
| }, | |
| { | |
| "epoch": 0.17838258164852255, | |
| "grad_norm": 0.8603221586452274, | |
| "learning_rate": 9.235198471204017e-06, | |
| "loss": 0.1652, | |
| "step": 1147 | |
| }, | |
| { | |
| "epoch": 0.17853810264385692, | |
| "grad_norm": 1.243900965186844, | |
| "learning_rate": 9.233899482390654e-06, | |
| "loss": 0.1688, | |
| "step": 1148 | |
| }, | |
| { | |
| "epoch": 0.17869362363919128, | |
| "grad_norm": 2.2219504182745964, | |
| "learning_rate": 9.232599482886444e-06, | |
| "loss": 0.2472, | |
| "step": 1149 | |
| }, | |
| { | |
| "epoch": 0.17884914463452567, | |
| "grad_norm": 0.8152250444616337, | |
| "learning_rate": 9.23129847300171e-06, | |
| "loss": 0.1542, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.17900466562986003, | |
| "grad_norm": 0.8972000242254355, | |
| "learning_rate": 9.229996453047022e-06, | |
| "loss": 0.1914, | |
| "step": 1151 | |
| }, | |
| { | |
| "epoch": 0.1791601866251944, | |
| "grad_norm": 1.3946215944007783, | |
| "learning_rate": 9.228693423333192e-06, | |
| "loss": 0.2517, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 0.17931570762052876, | |
| "grad_norm": 1.7211813642698215, | |
| "learning_rate": 9.227389384171272e-06, | |
| "loss": 0.1639, | |
| "step": 1153 | |
| }, | |
| { | |
| "epoch": 0.17947122861586315, | |
| "grad_norm": 1.045567391255685, | |
| "learning_rate": 9.22608433587255e-06, | |
| "loss": 0.1269, | |
| "step": 1154 | |
| }, | |
| { | |
| "epoch": 0.1796267496111975, | |
| "grad_norm": 1.6046875031988923, | |
| "learning_rate": 9.224778278748567e-06, | |
| "loss": 0.279, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.17978227060653187, | |
| "grad_norm": 1.204453994991899, | |
| "learning_rate": 9.223471213111089e-06, | |
| "loss": 0.1925, | |
| "step": 1156 | |
| }, | |
| { | |
| "epoch": 0.17993779160186626, | |
| "grad_norm": 1.3023690662744187, | |
| "learning_rate": 9.222163139272134e-06, | |
| "loss": 0.1788, | |
| "step": 1157 | |
| }, | |
| { | |
| "epoch": 0.18009331259720063, | |
| "grad_norm": 1.1433449264456945, | |
| "learning_rate": 9.220854057543958e-06, | |
| "loss": 0.2228, | |
| "step": 1158 | |
| }, | |
| { | |
| "epoch": 0.180248833592535, | |
| "grad_norm": 1.2657407961939997, | |
| "learning_rate": 9.219543968239057e-06, | |
| "loss": 0.1985, | |
| "step": 1159 | |
| }, | |
| { | |
| "epoch": 0.18040435458786935, | |
| "grad_norm": 1.0010295228905417, | |
| "learning_rate": 9.218232871670168e-06, | |
| "loss": 0.1976, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.18055987558320374, | |
| "grad_norm": 1.0125003198196167, | |
| "learning_rate": 9.216920768150266e-06, | |
| "loss": 0.1886, | |
| "step": 1161 | |
| }, | |
| { | |
| "epoch": 0.1807153965785381, | |
| "grad_norm": 1.2238980097949077, | |
| "learning_rate": 9.215607657992569e-06, | |
| "loss": 0.2848, | |
| "step": 1162 | |
| }, | |
| { | |
| "epoch": 0.18087091757387247, | |
| "grad_norm": 1.6192762802858285, | |
| "learning_rate": 9.214293541510537e-06, | |
| "loss": 0.1714, | |
| "step": 1163 | |
| }, | |
| { | |
| "epoch": 0.18102643856920683, | |
| "grad_norm": 1.0273533259054548, | |
| "learning_rate": 9.212978419017864e-06, | |
| "loss": 0.2001, | |
| "step": 1164 | |
| }, | |
| { | |
| "epoch": 0.18118195956454122, | |
| "grad_norm": 1.1061300881511378, | |
| "learning_rate": 9.211662290828493e-06, | |
| "loss": 0.2214, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.18133748055987559, | |
| "grad_norm": 1.061080909625091, | |
| "learning_rate": 9.210345157256597e-06, | |
| "loss": 0.1914, | |
| "step": 1166 | |
| }, | |
| { | |
| "epoch": 0.18149300155520995, | |
| "grad_norm": 0.6997117059310394, | |
| "learning_rate": 9.209027018616598e-06, | |
| "loss": 0.1434, | |
| "step": 1167 | |
| }, | |
| { | |
| "epoch": 0.1816485225505443, | |
| "grad_norm": 2.4894170703666125, | |
| "learning_rate": 9.207707875223153e-06, | |
| "loss": 0.154, | |
| "step": 1168 | |
| }, | |
| { | |
| "epoch": 0.1818040435458787, | |
| "grad_norm": 1.294402841120763, | |
| "learning_rate": 9.20638772739116e-06, | |
| "loss": 0.1398, | |
| "step": 1169 | |
| }, | |
| { | |
| "epoch": 0.18195956454121306, | |
| "grad_norm": 1.4691556974020672, | |
| "learning_rate": 9.205066575435754e-06, | |
| "loss": 0.2599, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.18211508553654743, | |
| "grad_norm": 1.5109698106731952, | |
| "learning_rate": 9.203744419672318e-06, | |
| "loss": 0.2715, | |
| "step": 1171 | |
| }, | |
| { | |
| "epoch": 0.1822706065318818, | |
| "grad_norm": 0.7824851605920647, | |
| "learning_rate": 9.202421260416464e-06, | |
| "loss": 0.155, | |
| "step": 1172 | |
| }, | |
| { | |
| "epoch": 0.18242612752721618, | |
| "grad_norm": 1.229104135640711, | |
| "learning_rate": 9.20109709798405e-06, | |
| "loss": 0.173, | |
| "step": 1173 | |
| }, | |
| { | |
| "epoch": 0.18258164852255054, | |
| "grad_norm": 1.493187696337834, | |
| "learning_rate": 9.199771932691172e-06, | |
| "loss": 0.1874, | |
| "step": 1174 | |
| }, | |
| { | |
| "epoch": 0.1827371695178849, | |
| "grad_norm": 1.3355865457774434, | |
| "learning_rate": 9.198445764854166e-06, | |
| "loss": 0.1868, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.1828926905132193, | |
| "grad_norm": 1.4822915990950787, | |
| "learning_rate": 9.19711859478961e-06, | |
| "loss": 0.1936, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 0.18304821150855366, | |
| "grad_norm": 1.3568523945836255, | |
| "learning_rate": 9.19579042281431e-06, | |
| "loss": 0.2351, | |
| "step": 1177 | |
| }, | |
| { | |
| "epoch": 0.18320373250388802, | |
| "grad_norm": 1.1221237537622042, | |
| "learning_rate": 9.194461249245326e-06, | |
| "loss": 0.1651, | |
| "step": 1178 | |
| }, | |
| { | |
| "epoch": 0.18335925349922239, | |
| "grad_norm": 1.0427220049147299, | |
| "learning_rate": 9.193131074399949e-06, | |
| "loss": 0.2095, | |
| "step": 1179 | |
| }, | |
| { | |
| "epoch": 0.18351477449455678, | |
| "grad_norm": 1.1443234808493088, | |
| "learning_rate": 9.191799898595706e-06, | |
| "loss": 0.1987, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.18367029548989114, | |
| "grad_norm": 0.8812799774315752, | |
| "learning_rate": 9.190467722150373e-06, | |
| "loss": 0.2529, | |
| "step": 1181 | |
| }, | |
| { | |
| "epoch": 0.1838258164852255, | |
| "grad_norm": 0.9190808713383141, | |
| "learning_rate": 9.189134545381954e-06, | |
| "loss": 0.2043, | |
| "step": 1182 | |
| }, | |
| { | |
| "epoch": 0.18398133748055986, | |
| "grad_norm": 1.1496814316391453, | |
| "learning_rate": 9.187800368608703e-06, | |
| "loss": 0.2166, | |
| "step": 1183 | |
| }, | |
| { | |
| "epoch": 0.18413685847589426, | |
| "grad_norm": 1.3800541644049227, | |
| "learning_rate": 9.1864651921491e-06, | |
| "loss": 0.2258, | |
| "step": 1184 | |
| }, | |
| { | |
| "epoch": 0.18429237947122862, | |
| "grad_norm": 0.91743359427612, | |
| "learning_rate": 9.185129016321877e-06, | |
| "loss": 0.1383, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.18444790046656298, | |
| "grad_norm": 1.4610869068656602, | |
| "learning_rate": 9.18379184144599e-06, | |
| "loss": 0.1508, | |
| "step": 1186 | |
| }, | |
| { | |
| "epoch": 0.18460342146189734, | |
| "grad_norm": 1.675711445184492, | |
| "learning_rate": 9.18245366784065e-06, | |
| "loss": 0.303, | |
| "step": 1187 | |
| }, | |
| { | |
| "epoch": 0.18475894245723173, | |
| "grad_norm": 0.7182617914658281, | |
| "learning_rate": 9.18111449582529e-06, | |
| "loss": 0.1663, | |
| "step": 1188 | |
| }, | |
| { | |
| "epoch": 0.1849144634525661, | |
| "grad_norm": 2.5919566299762105, | |
| "learning_rate": 9.179774325719593e-06, | |
| "loss": 0.1913, | |
| "step": 1189 | |
| }, | |
| { | |
| "epoch": 0.18506998444790046, | |
| "grad_norm": 1.5246187638405735, | |
| "learning_rate": 9.178433157843474e-06, | |
| "loss": 0.1974, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.18522550544323485, | |
| "grad_norm": 1.1870049850604156, | |
| "learning_rate": 9.17709099251709e-06, | |
| "loss": 0.1889, | |
| "step": 1191 | |
| }, | |
| { | |
| "epoch": 0.1853810264385692, | |
| "grad_norm": 1.2780528349437963, | |
| "learning_rate": 9.175747830060837e-06, | |
| "loss": 0.1682, | |
| "step": 1192 | |
| }, | |
| { | |
| "epoch": 0.18553654743390358, | |
| "grad_norm": 0.852064776201917, | |
| "learning_rate": 9.174403670795342e-06, | |
| "loss": 0.1786, | |
| "step": 1193 | |
| }, | |
| { | |
| "epoch": 0.18569206842923794, | |
| "grad_norm": 0.982736851978155, | |
| "learning_rate": 9.173058515041477e-06, | |
| "loss": 0.1759, | |
| "step": 1194 | |
| }, | |
| { | |
| "epoch": 0.18584758942457233, | |
| "grad_norm": 5.383045313258924, | |
| "learning_rate": 9.171712363120351e-06, | |
| "loss": 0.3862, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.1860031104199067, | |
| "grad_norm": 0.9849374073337689, | |
| "learning_rate": 9.170365215353306e-06, | |
| "loss": 0.1981, | |
| "step": 1196 | |
| }, | |
| { | |
| "epoch": 0.18615863141524105, | |
| "grad_norm": 1.1001803535527055, | |
| "learning_rate": 9.169017072061926e-06, | |
| "loss": 0.1989, | |
| "step": 1197 | |
| }, | |
| { | |
| "epoch": 0.18631415241057542, | |
| "grad_norm": 1.1570335250140034, | |
| "learning_rate": 9.167667933568032e-06, | |
| "loss": 0.1822, | |
| "step": 1198 | |
| }, | |
| { | |
| "epoch": 0.1864696734059098, | |
| "grad_norm": 1.6984581879530103, | |
| "learning_rate": 9.166317800193683e-06, | |
| "loss": 0.2171, | |
| "step": 1199 | |
| }, | |
| { | |
| "epoch": 0.18662519440124417, | |
| "grad_norm": 1.650860536979747, | |
| "learning_rate": 9.164966672261171e-06, | |
| "loss": 0.3055, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.18662519440124417, | |
| "eval_loss": 0.210090771317482, | |
| "eval_runtime": 9.4293, | |
| "eval_samples_per_second": 2.757, | |
| "eval_steps_per_second": 0.742, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.18678071539657853, | |
| "grad_norm": 1.416615143797259, | |
| "learning_rate": 9.163614550093035e-06, | |
| "loss": 0.1347, | |
| "step": 1201 | |
| }, | |
| { | |
| "epoch": 0.1869362363919129, | |
| "grad_norm": 1.3794733777830905, | |
| "learning_rate": 9.16226143401204e-06, | |
| "loss": 0.2041, | |
| "step": 1202 | |
| }, | |
| { | |
| "epoch": 0.1870917573872473, | |
| "grad_norm": 0.9282701621282511, | |
| "learning_rate": 9.160907324341199e-06, | |
| "loss": 0.1589, | |
| "step": 1203 | |
| }, | |
| { | |
| "epoch": 0.18724727838258165, | |
| "grad_norm": 1.4894253244171338, | |
| "learning_rate": 9.159552221403752e-06, | |
| "loss": 0.174, | |
| "step": 1204 | |
| }, | |
| { | |
| "epoch": 0.187402799377916, | |
| "grad_norm": 1.1504157025776975, | |
| "learning_rate": 9.158196125523182e-06, | |
| "loss": 0.1942, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.18755832037325038, | |
| "grad_norm": 0.7255523870962133, | |
| "learning_rate": 9.156839037023209e-06, | |
| "loss": 0.1925, | |
| "step": 1206 | |
| }, | |
| { | |
| "epoch": 0.18771384136858477, | |
| "grad_norm": 1.3297160614851913, | |
| "learning_rate": 9.155480956227789e-06, | |
| "loss": 0.2448, | |
| "step": 1207 | |
| }, | |
| { | |
| "epoch": 0.18786936236391913, | |
| "grad_norm": 1.2394203928257357, | |
| "learning_rate": 9.154121883461115e-06, | |
| "loss": 0.1644, | |
| "step": 1208 | |
| }, | |
| { | |
| "epoch": 0.1880248833592535, | |
| "grad_norm": 1.110942304313815, | |
| "learning_rate": 9.152761819047617e-06, | |
| "loss": 0.158, | |
| "step": 1209 | |
| }, | |
| { | |
| "epoch": 0.18818040435458788, | |
| "grad_norm": 0.8597754146450871, | |
| "learning_rate": 9.151400763311958e-06, | |
| "loss": 0.1765, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.18833592534992225, | |
| "grad_norm": 1.1244255534137637, | |
| "learning_rate": 9.150038716579046e-06, | |
| "loss": 0.14, | |
| "step": 1211 | |
| }, | |
| { | |
| "epoch": 0.1884914463452566, | |
| "grad_norm": 0.9441808017939254, | |
| "learning_rate": 9.148675679174017e-06, | |
| "loss": 0.1685, | |
| "step": 1212 | |
| }, | |
| { | |
| "epoch": 0.18864696734059097, | |
| "grad_norm": 1.49569762403274, | |
| "learning_rate": 9.147311651422248e-06, | |
| "loss": 0.1637, | |
| "step": 1213 | |
| }, | |
| { | |
| "epoch": 0.18880248833592536, | |
| "grad_norm": 1.0568658204953814, | |
| "learning_rate": 9.145946633649352e-06, | |
| "loss": 0.1713, | |
| "step": 1214 | |
| }, | |
| { | |
| "epoch": 0.18895800933125972, | |
| "grad_norm": 1.2127109888393217, | |
| "learning_rate": 9.144580626181176e-06, | |
| "loss": 0.161, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.1891135303265941, | |
| "grad_norm": 0.8503234486008238, | |
| "learning_rate": 9.143213629343807e-06, | |
| "loss": 0.1489, | |
| "step": 1216 | |
| }, | |
| { | |
| "epoch": 0.18926905132192845, | |
| "grad_norm": 0.9120088478974758, | |
| "learning_rate": 9.141845643463565e-06, | |
| "loss": 0.1939, | |
| "step": 1217 | |
| }, | |
| { | |
| "epoch": 0.18942457231726284, | |
| "grad_norm": 1.0121267789823751, | |
| "learning_rate": 9.140476668867008e-06, | |
| "loss": 0.15, | |
| "step": 1218 | |
| }, | |
| { | |
| "epoch": 0.1895800933125972, | |
| "grad_norm": 1.3638566134338714, | |
| "learning_rate": 9.13910670588093e-06, | |
| "loss": 0.2105, | |
| "step": 1219 | |
| }, | |
| { | |
| "epoch": 0.18973561430793157, | |
| "grad_norm": 1.6276021550806605, | |
| "learning_rate": 9.13773575483236e-06, | |
| "loss": 0.2869, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.18989113530326593, | |
| "grad_norm": 1.6764188720931026, | |
| "learning_rate": 9.136363816048562e-06, | |
| "loss": 0.1458, | |
| "step": 1221 | |
| }, | |
| { | |
| "epoch": 0.19004665629860032, | |
| "grad_norm": 0.6701780576831128, | |
| "learning_rate": 9.134990889857036e-06, | |
| "loss": 0.1842, | |
| "step": 1222 | |
| }, | |
| { | |
| "epoch": 0.19020217729393468, | |
| "grad_norm": 1.1322931167082202, | |
| "learning_rate": 9.133616976585522e-06, | |
| "loss": 0.2556, | |
| "step": 1223 | |
| }, | |
| { | |
| "epoch": 0.19035769828926905, | |
| "grad_norm": 1.2524154763717683, | |
| "learning_rate": 9.13224207656199e-06, | |
| "loss": 0.2104, | |
| "step": 1224 | |
| }, | |
| { | |
| "epoch": 0.1905132192846034, | |
| "grad_norm": 0.9592897430767787, | |
| "learning_rate": 9.130866190114649e-06, | |
| "loss": 0.2833, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.1906687402799378, | |
| "grad_norm": 1.7651472837705433, | |
| "learning_rate": 9.12948931757194e-06, | |
| "loss": 0.2524, | |
| "step": 1226 | |
| }, | |
| { | |
| "epoch": 0.19082426127527216, | |
| "grad_norm": 0.9879072001537496, | |
| "learning_rate": 9.128111459262543e-06, | |
| "loss": 0.1624, | |
| "step": 1227 | |
| }, | |
| { | |
| "epoch": 0.19097978227060652, | |
| "grad_norm": 1.320308534660155, | |
| "learning_rate": 9.126732615515373e-06, | |
| "loss": 0.2937, | |
| "step": 1228 | |
| }, | |
| { | |
| "epoch": 0.19113530326594091, | |
| "grad_norm": 1.6528470759003213, | |
| "learning_rate": 9.125352786659577e-06, | |
| "loss": 0.1824, | |
| "step": 1229 | |
| }, | |
| { | |
| "epoch": 0.19129082426127528, | |
| "grad_norm": 1.099113810582022, | |
| "learning_rate": 9.123971973024543e-06, | |
| "loss": 0.2282, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.19144634525660964, | |
| "grad_norm": 0.9906932002367946, | |
| "learning_rate": 9.122590174939887e-06, | |
| "loss": 0.1908, | |
| "step": 1231 | |
| }, | |
| { | |
| "epoch": 0.191601866251944, | |
| "grad_norm": 1.3700619269813867, | |
| "learning_rate": 9.121207392735465e-06, | |
| "loss": 0.1736, | |
| "step": 1232 | |
| }, | |
| { | |
| "epoch": 0.1917573872472784, | |
| "grad_norm": 0.9132669255091096, | |
| "learning_rate": 9.119823626741367e-06, | |
| "loss": 0.2559, | |
| "step": 1233 | |
| }, | |
| { | |
| "epoch": 0.19191290824261276, | |
| "grad_norm": 1.0158832597362466, | |
| "learning_rate": 9.118438877287913e-06, | |
| "loss": 0.218, | |
| "step": 1234 | |
| }, | |
| { | |
| "epoch": 0.19206842923794712, | |
| "grad_norm": 0.9172450560816615, | |
| "learning_rate": 9.11705314470567e-06, | |
| "loss": 0.2038, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.19222395023328148, | |
| "grad_norm": 1.0457809289045787, | |
| "learning_rate": 9.115666429325424e-06, | |
| "loss": 0.2383, | |
| "step": 1236 | |
| }, | |
| { | |
| "epoch": 0.19237947122861587, | |
| "grad_norm": 0.6123808194220389, | |
| "learning_rate": 9.114278731478207e-06, | |
| "loss": 0.1059, | |
| "step": 1237 | |
| }, | |
| { | |
| "epoch": 0.19253499222395024, | |
| "grad_norm": 0.8957445923668392, | |
| "learning_rate": 9.112890051495281e-06, | |
| "loss": 0.1753, | |
| "step": 1238 | |
| }, | |
| { | |
| "epoch": 0.1926905132192846, | |
| "grad_norm": 1.010302756648279, | |
| "learning_rate": 9.111500389708144e-06, | |
| "loss": 0.2162, | |
| "step": 1239 | |
| }, | |
| { | |
| "epoch": 0.19284603421461896, | |
| "grad_norm": 1.26307408847368, | |
| "learning_rate": 9.110109746448527e-06, | |
| "loss": 0.1901, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.19300155520995335, | |
| "grad_norm": 1.237621554432501, | |
| "learning_rate": 9.108718122048395e-06, | |
| "loss": 0.1746, | |
| "step": 1241 | |
| }, | |
| { | |
| "epoch": 0.19315707620528771, | |
| "grad_norm": 0.9172927280641415, | |
| "learning_rate": 9.107325516839952e-06, | |
| "loss": 0.1556, | |
| "step": 1242 | |
| }, | |
| { | |
| "epoch": 0.19331259720062208, | |
| "grad_norm": 1.7268710214147918, | |
| "learning_rate": 9.105931931155626e-06, | |
| "loss": 0.2808, | |
| "step": 1243 | |
| }, | |
| { | |
| "epoch": 0.19346811819595647, | |
| "grad_norm": 0.8932022562830918, | |
| "learning_rate": 9.10453736532809e-06, | |
| "loss": 0.1527, | |
| "step": 1244 | |
| }, | |
| { | |
| "epoch": 0.19362363919129083, | |
| "grad_norm": 1.2202712676463288, | |
| "learning_rate": 9.103141819690246e-06, | |
| "loss": 0.1376, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.1937791601866252, | |
| "grad_norm": 1.0826681500025592, | |
| "learning_rate": 9.101745294575227e-06, | |
| "loss": 0.1449, | |
| "step": 1246 | |
| }, | |
| { | |
| "epoch": 0.19393468118195956, | |
| "grad_norm": 1.1807575757930213, | |
| "learning_rate": 9.100347790316409e-06, | |
| "loss": 0.2126, | |
| "step": 1247 | |
| }, | |
| { | |
| "epoch": 0.19409020217729395, | |
| "grad_norm": 0.941763687751761, | |
| "learning_rate": 9.098949307247391e-06, | |
| "loss": 0.1632, | |
| "step": 1248 | |
| }, | |
| { | |
| "epoch": 0.1942457231726283, | |
| "grad_norm": 1.378441641768549, | |
| "learning_rate": 9.097549845702009e-06, | |
| "loss": 0.1906, | |
| "step": 1249 | |
| }, | |
| { | |
| "epoch": 0.19440124416796267, | |
| "grad_norm": 1.2339116886059447, | |
| "learning_rate": 9.09614940601434e-06, | |
| "loss": 0.2006, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.19455676516329704, | |
| "grad_norm": 1.1239344680494445, | |
| "learning_rate": 9.094747988518683e-06, | |
| "loss": 0.2336, | |
| "step": 1251 | |
| }, | |
| { | |
| "epoch": 0.19471228615863143, | |
| "grad_norm": 0.927588276459713, | |
| "learning_rate": 9.093345593549579e-06, | |
| "loss": 0.1449, | |
| "step": 1252 | |
| }, | |
| { | |
| "epoch": 0.1948678071539658, | |
| "grad_norm": 1.13724282637735, | |
| "learning_rate": 9.091942221441797e-06, | |
| "loss": 0.2126, | |
| "step": 1253 | |
| }, | |
| { | |
| "epoch": 0.19502332814930015, | |
| "grad_norm": 1.0365698182525573, | |
| "learning_rate": 9.090537872530343e-06, | |
| "loss": 0.1867, | |
| "step": 1254 | |
| }, | |
| { | |
| "epoch": 0.19517884914463451, | |
| "grad_norm": 0.9371814591941575, | |
| "learning_rate": 9.089132547150453e-06, | |
| "loss": 0.1618, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.1953343701399689, | |
| "grad_norm": 1.0697225550230685, | |
| "learning_rate": 9.0877262456376e-06, | |
| "loss": 0.1849, | |
| "step": 1256 | |
| }, | |
| { | |
| "epoch": 0.19548989113530327, | |
| "grad_norm": 1.5583498729530745, | |
| "learning_rate": 9.086318968327488e-06, | |
| "loss": 0.2014, | |
| "step": 1257 | |
| }, | |
| { | |
| "epoch": 0.19564541213063763, | |
| "grad_norm": 1.2271229677253923, | |
| "learning_rate": 9.084910715556052e-06, | |
| "loss": 0.2017, | |
| "step": 1258 | |
| }, | |
| { | |
| "epoch": 0.195800933125972, | |
| "grad_norm": 1.0026506309270833, | |
| "learning_rate": 9.083501487659461e-06, | |
| "loss": 0.1646, | |
| "step": 1259 | |
| }, | |
| { | |
| "epoch": 0.19595645412130638, | |
| "grad_norm": 1.2598951391108157, | |
| "learning_rate": 9.08209128497412e-06, | |
| "loss": 0.1851, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.19611197511664075, | |
| "grad_norm": 1.5838356552966606, | |
| "learning_rate": 9.080680107836662e-06, | |
| "loss": 0.1948, | |
| "step": 1261 | |
| }, | |
| { | |
| "epoch": 0.1962674961119751, | |
| "grad_norm": 1.1087104243969894, | |
| "learning_rate": 9.079267956583953e-06, | |
| "loss": 0.1687, | |
| "step": 1262 | |
| }, | |
| { | |
| "epoch": 0.1964230171073095, | |
| "grad_norm": 1.6020412697904411, | |
| "learning_rate": 9.077854831553097e-06, | |
| "loss": 0.1854, | |
| "step": 1263 | |
| }, | |
| { | |
| "epoch": 0.19657853810264386, | |
| "grad_norm": 1.0315547992066338, | |
| "learning_rate": 9.076440733081426e-06, | |
| "loss": 0.2211, | |
| "step": 1264 | |
| }, | |
| { | |
| "epoch": 0.19673405909797823, | |
| "grad_norm": 1.0349194289967332, | |
| "learning_rate": 9.075025661506505e-06, | |
| "loss": 0.182, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.1968895800933126, | |
| "grad_norm": 0.8148640872234216, | |
| "learning_rate": 9.073609617166129e-06, | |
| "loss": 0.2319, | |
| "step": 1266 | |
| }, | |
| { | |
| "epoch": 0.19704510108864698, | |
| "grad_norm": 0.8956967698145264, | |
| "learning_rate": 9.072192600398328e-06, | |
| "loss": 0.2318, | |
| "step": 1267 | |
| }, | |
| { | |
| "epoch": 0.19720062208398134, | |
| "grad_norm": 1.512397062737358, | |
| "learning_rate": 9.070774611541366e-06, | |
| "loss": 0.1279, | |
| "step": 1268 | |
| }, | |
| { | |
| "epoch": 0.1973561430793157, | |
| "grad_norm": 1.089155641459757, | |
| "learning_rate": 9.069355650933732e-06, | |
| "loss": 0.132, | |
| "step": 1269 | |
| }, | |
| { | |
| "epoch": 0.19751166407465007, | |
| "grad_norm": 1.15341700389814, | |
| "learning_rate": 9.06793571891416e-06, | |
| "loss": 0.1416, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.19766718506998446, | |
| "grad_norm": 1.2188604321419376, | |
| "learning_rate": 9.0665148158216e-06, | |
| "loss": 0.1635, | |
| "step": 1271 | |
| }, | |
| { | |
| "epoch": 0.19782270606531882, | |
| "grad_norm": 1.6133883720632236, | |
| "learning_rate": 9.065092941995245e-06, | |
| "loss": 0.185, | |
| "step": 1272 | |
| }, | |
| { | |
| "epoch": 0.19797822706065318, | |
| "grad_norm": 1.4486872766212289, | |
| "learning_rate": 9.063670097774513e-06, | |
| "loss": 0.2325, | |
| "step": 1273 | |
| }, | |
| { | |
| "epoch": 0.19813374805598755, | |
| "grad_norm": 1.557263365124596, | |
| "learning_rate": 9.062246283499058e-06, | |
| "loss": 0.1712, | |
| "step": 1274 | |
| }, | |
| { | |
| "epoch": 0.19828926905132194, | |
| "grad_norm": 1.9875754585690109, | |
| "learning_rate": 9.060821499508769e-06, | |
| "loss": 0.1843, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.1984447900466563, | |
| "grad_norm": 1.1418131416263584, | |
| "learning_rate": 9.059395746143756e-06, | |
| "loss": 0.1777, | |
| "step": 1276 | |
| }, | |
| { | |
| "epoch": 0.19860031104199066, | |
| "grad_norm": 1.0395361627239141, | |
| "learning_rate": 9.057969023744367e-06, | |
| "loss": 0.2194, | |
| "step": 1277 | |
| }, | |
| { | |
| "epoch": 0.19875583203732503, | |
| "grad_norm": 1.305159234748547, | |
| "learning_rate": 9.056541332651183e-06, | |
| "loss": 0.2141, | |
| "step": 1278 | |
| }, | |
| { | |
| "epoch": 0.19891135303265942, | |
| "grad_norm": 1.0849932011185046, | |
| "learning_rate": 9.055112673205014e-06, | |
| "loss": 0.1821, | |
| "step": 1279 | |
| }, | |
| { | |
| "epoch": 0.19906687402799378, | |
| "grad_norm": 0.979089764226756, | |
| "learning_rate": 9.053683045746897e-06, | |
| "loss": 0.269, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.19922239502332814, | |
| "grad_norm": 1.078405593629792, | |
| "learning_rate": 9.052252450618106e-06, | |
| "loss": 0.1413, | |
| "step": 1281 | |
| }, | |
| { | |
| "epoch": 0.19937791601866253, | |
| "grad_norm": 1.2031448135959215, | |
| "learning_rate": 9.050820888160145e-06, | |
| "loss": 0.2268, | |
| "step": 1282 | |
| }, | |
| { | |
| "epoch": 0.1995334370139969, | |
| "grad_norm": 0.9432997632179643, | |
| "learning_rate": 9.049388358714747e-06, | |
| "loss": 0.0856, | |
| "step": 1283 | |
| }, | |
| { | |
| "epoch": 0.19968895800933126, | |
| "grad_norm": 1.1798467376681538, | |
| "learning_rate": 9.04795486262388e-06, | |
| "loss": 0.1487, | |
| "step": 1284 | |
| }, | |
| { | |
| "epoch": 0.19984447900466562, | |
| "grad_norm": 0.9959594825238516, | |
| "learning_rate": 9.046520400229734e-06, | |
| "loss": 0.1363, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.2777597650080654, | |
| "learning_rate": 9.045084971874738e-06, | |
| "loss": 0.2053, | |
| "step": 1286 | |
| }, | |
| { | |
| "epoch": 0.20015552099533437, | |
| "grad_norm": 1.3807813898572032, | |
| "learning_rate": 9.04364857790155e-06, | |
| "loss": 0.1608, | |
| "step": 1287 | |
| }, | |
| { | |
| "epoch": 0.20031104199066874, | |
| "grad_norm": 1.213101350130223, | |
| "learning_rate": 9.042211218653054e-06, | |
| "loss": 0.1783, | |
| "step": 1288 | |
| }, | |
| { | |
| "epoch": 0.2004665629860031, | |
| "grad_norm": 1.270497799974636, | |
| "learning_rate": 9.040772894472369e-06, | |
| "loss": 0.1335, | |
| "step": 1289 | |
| }, | |
| { | |
| "epoch": 0.2006220839813375, | |
| "grad_norm": 1.143678584624158, | |
| "learning_rate": 9.039333605702844e-06, | |
| "loss": 0.2566, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.20077760497667185, | |
| "grad_norm": 0.9321591383595857, | |
| "learning_rate": 9.03789335268806e-06, | |
| "loss": 0.1517, | |
| "step": 1291 | |
| }, | |
| { | |
| "epoch": 0.20093312597200622, | |
| "grad_norm": 0.8482625172580437, | |
| "learning_rate": 9.036452135771818e-06, | |
| "loss": 0.2284, | |
| "step": 1292 | |
| }, | |
| { | |
| "epoch": 0.20108864696734058, | |
| "grad_norm": 1.5799008472731184, | |
| "learning_rate": 9.035009955298163e-06, | |
| "loss": 0.2491, | |
| "step": 1293 | |
| }, | |
| { | |
| "epoch": 0.20124416796267497, | |
| "grad_norm": 1.5021594414320747, | |
| "learning_rate": 9.03356681161136e-06, | |
| "loss": 0.1623, | |
| "step": 1294 | |
| }, | |
| { | |
| "epoch": 0.20139968895800933, | |
| "grad_norm": 1.1207507593154515, | |
| "learning_rate": 9.032122705055912e-06, | |
| "loss": 0.1996, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.2015552099533437, | |
| "grad_norm": 1.1753346897113919, | |
| "learning_rate": 9.030677635976542e-06, | |
| "loss": 0.156, | |
| "step": 1296 | |
| }, | |
| { | |
| "epoch": 0.20171073094867809, | |
| "grad_norm": 1.582912014985177, | |
| "learning_rate": 9.02923160471821e-06, | |
| "loss": 0.2852, | |
| "step": 1297 | |
| }, | |
| { | |
| "epoch": 0.20186625194401245, | |
| "grad_norm": 4.24419003235004, | |
| "learning_rate": 9.027784611626108e-06, | |
| "loss": 0.1857, | |
| "step": 1298 | |
| }, | |
| { | |
| "epoch": 0.2020217729393468, | |
| "grad_norm": 3.465507316165179, | |
| "learning_rate": 9.026336657045646e-06, | |
| "loss": 0.1331, | |
| "step": 1299 | |
| }, | |
| { | |
| "epoch": 0.20217729393468117, | |
| "grad_norm": 0.8992554022243577, | |
| "learning_rate": 9.024887741322475e-06, | |
| "loss": 0.1649, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.20217729393468117, | |
| "eval_loss": 0.20873166620731354, | |
| "eval_runtime": 9.4107, | |
| "eval_samples_per_second": 2.763, | |
| "eval_steps_per_second": 0.744, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.20233281493001556, | |
| "grad_norm": 1.2089278079623347, | |
| "learning_rate": 9.023437864802472e-06, | |
| "loss": 0.2705, | |
| "step": 1301 | |
| }, | |
| { | |
| "epoch": 0.20248833592534993, | |
| "grad_norm": 1.2901991665649666, | |
| "learning_rate": 9.021987027831743e-06, | |
| "loss": 0.1672, | |
| "step": 1302 | |
| }, | |
| { | |
| "epoch": 0.2026438569206843, | |
| "grad_norm": 1.5354719963652408, | |
| "learning_rate": 9.02053523075662e-06, | |
| "loss": 0.239, | |
| "step": 1303 | |
| }, | |
| { | |
| "epoch": 0.20279937791601865, | |
| "grad_norm": 1.214882523492219, | |
| "learning_rate": 9.01908247392367e-06, | |
| "loss": 0.1566, | |
| "step": 1304 | |
| }, | |
| { | |
| "epoch": 0.20295489891135304, | |
| "grad_norm": 1.473765899129253, | |
| "learning_rate": 9.017628757679685e-06, | |
| "loss": 0.1931, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.2031104199066874, | |
| "grad_norm": 2.6517165969707683, | |
| "learning_rate": 9.01617408237169e-06, | |
| "loss": 0.1307, | |
| "step": 1306 | |
| }, | |
| { | |
| "epoch": 0.20326594090202177, | |
| "grad_norm": 1.4993932954062734, | |
| "learning_rate": 9.01471844834693e-06, | |
| "loss": 0.2079, | |
| "step": 1307 | |
| }, | |
| { | |
| "epoch": 0.20342146189735613, | |
| "grad_norm": 1.0866992812991043, | |
| "learning_rate": 9.013261855952893e-06, | |
| "loss": 0.2361, | |
| "step": 1308 | |
| }, | |
| { | |
| "epoch": 0.20357698289269052, | |
| "grad_norm": 1.4691858213747517, | |
| "learning_rate": 9.011804305537281e-06, | |
| "loss": 0.2062, | |
| "step": 1309 | |
| }, | |
| { | |
| "epoch": 0.20373250388802489, | |
| "grad_norm": 1.218397331201916, | |
| "learning_rate": 9.010345797448037e-06, | |
| "loss": 0.1295, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.20388802488335925, | |
| "grad_norm": 1.317910015288317, | |
| "learning_rate": 9.008886332033323e-06, | |
| "loss": 0.221, | |
| "step": 1311 | |
| }, | |
| { | |
| "epoch": 0.2040435458786936, | |
| "grad_norm": 1.4368413534493716, | |
| "learning_rate": 9.007425909641538e-06, | |
| "loss": 0.3292, | |
| "step": 1312 | |
| }, | |
| { | |
| "epoch": 0.204199066874028, | |
| "grad_norm": 1.24467623609956, | |
| "learning_rate": 9.005964530621301e-06, | |
| "loss": 0.2276, | |
| "step": 1313 | |
| }, | |
| { | |
| "epoch": 0.20435458786936236, | |
| "grad_norm": 0.9849662601801316, | |
| "learning_rate": 9.004502195321468e-06, | |
| "loss": 0.1825, | |
| "step": 1314 | |
| }, | |
| { | |
| "epoch": 0.20451010886469673, | |
| "grad_norm": 3.783152250453029, | |
| "learning_rate": 9.003038904091113e-06, | |
| "loss": 0.1834, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.20466562986003112, | |
| "grad_norm": 1.0234608190416166, | |
| "learning_rate": 9.001574657279548e-06, | |
| "loss": 0.2172, | |
| "step": 1316 | |
| }, | |
| { | |
| "epoch": 0.20482115085536548, | |
| "grad_norm": 1.3240981295825394, | |
| "learning_rate": 9.00010945523631e-06, | |
| "loss": 0.1857, | |
| "step": 1317 | |
| }, | |
| { | |
| "epoch": 0.20497667185069984, | |
| "grad_norm": 1.1823107793426477, | |
| "learning_rate": 8.99864329831116e-06, | |
| "loss": 0.2747, | |
| "step": 1318 | |
| }, | |
| { | |
| "epoch": 0.2051321928460342, | |
| "grad_norm": 1.183188676477308, | |
| "learning_rate": 8.997176186854091e-06, | |
| "loss": 0.2091, | |
| "step": 1319 | |
| }, | |
| { | |
| "epoch": 0.2052877138413686, | |
| "grad_norm": 1.1306812200844953, | |
| "learning_rate": 8.995708121215325e-06, | |
| "loss": 0.1789, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.20544323483670296, | |
| "grad_norm": 1.3914844708441778, | |
| "learning_rate": 8.994239101745309e-06, | |
| "loss": 0.1626, | |
| "step": 1321 | |
| }, | |
| { | |
| "epoch": 0.20559875583203732, | |
| "grad_norm": 1.3328736681097808, | |
| "learning_rate": 8.992769128794717e-06, | |
| "loss": 0.1699, | |
| "step": 1322 | |
| }, | |
| { | |
| "epoch": 0.20575427682737168, | |
| "grad_norm": 1.3262550452320387, | |
| "learning_rate": 8.991298202714453e-06, | |
| "loss": 0.1985, | |
| "step": 1323 | |
| }, | |
| { | |
| "epoch": 0.20590979782270608, | |
| "grad_norm": 1.5863201904107513, | |
| "learning_rate": 8.989826323855647e-06, | |
| "loss": 0.2729, | |
| "step": 1324 | |
| }, | |
| { | |
| "epoch": 0.20606531881804044, | |
| "grad_norm": 1.0484153422588192, | |
| "learning_rate": 8.988353492569657e-06, | |
| "loss": 0.2243, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.2062208398133748, | |
| "grad_norm": 0.9724310873787251, | |
| "learning_rate": 8.986879709208069e-06, | |
| "loss": 0.2349, | |
| "step": 1326 | |
| }, | |
| { | |
| "epoch": 0.20637636080870916, | |
| "grad_norm": 1.319839764006134, | |
| "learning_rate": 8.985404974122699e-06, | |
| "loss": 0.1796, | |
| "step": 1327 | |
| }, | |
| { | |
| "epoch": 0.20653188180404355, | |
| "grad_norm": 1.7134943634197457, | |
| "learning_rate": 8.983929287665579e-06, | |
| "loss": 0.2289, | |
| "step": 1328 | |
| }, | |
| { | |
| "epoch": 0.20668740279937792, | |
| "grad_norm": 1.1812406274342315, | |
| "learning_rate": 8.98245265018898e-06, | |
| "loss": 0.2123, | |
| "step": 1329 | |
| }, | |
| { | |
| "epoch": 0.20684292379471228, | |
| "grad_norm": 1.4771839041530355, | |
| "learning_rate": 8.980975062045398e-06, | |
| "loss": 0.2228, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.20699844479004664, | |
| "grad_norm": 1.0073337669892177, | |
| "learning_rate": 8.979496523587552e-06, | |
| "loss": 0.1455, | |
| "step": 1331 | |
| }, | |
| { | |
| "epoch": 0.20715396578538103, | |
| "grad_norm": 0.8665969448850475, | |
| "learning_rate": 8.978017035168389e-06, | |
| "loss": 0.1689, | |
| "step": 1332 | |
| }, | |
| { | |
| "epoch": 0.2073094867807154, | |
| "grad_norm": 1.0555827692971853, | |
| "learning_rate": 8.976536597141085e-06, | |
| "loss": 0.1708, | |
| "step": 1333 | |
| }, | |
| { | |
| "epoch": 0.20746500777604976, | |
| "grad_norm": 0.8842215270037568, | |
| "learning_rate": 8.97505520985904e-06, | |
| "loss": 0.1751, | |
| "step": 1334 | |
| }, | |
| { | |
| "epoch": 0.20762052877138415, | |
| "grad_norm": 1.6924145041248846, | |
| "learning_rate": 8.973572873675882e-06, | |
| "loss": 0.1697, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.2077760497667185, | |
| "grad_norm": 1.1225115788471978, | |
| "learning_rate": 8.972089588945467e-06, | |
| "loss": 0.22, | |
| "step": 1336 | |
| }, | |
| { | |
| "epoch": 0.20793157076205288, | |
| "grad_norm": 1.1964311921620439, | |
| "learning_rate": 8.970605356021873e-06, | |
| "loss": 0.1953, | |
| "step": 1337 | |
| }, | |
| { | |
| "epoch": 0.20808709175738724, | |
| "grad_norm": 1.1874827397504135, | |
| "learning_rate": 8.96912017525941e-06, | |
| "loss": 0.1541, | |
| "step": 1338 | |
| }, | |
| { | |
| "epoch": 0.20824261275272163, | |
| "grad_norm": 1.2996586003784654, | |
| "learning_rate": 8.967634047012607e-06, | |
| "loss": 0.2543, | |
| "step": 1339 | |
| }, | |
| { | |
| "epoch": 0.208398133748056, | |
| "grad_norm": 1.9568915465615424, | |
| "learning_rate": 8.96614697163623e-06, | |
| "loss": 0.1742, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.20855365474339035, | |
| "grad_norm": 1.327702070183964, | |
| "learning_rate": 8.96465894948526e-06, | |
| "loss": 0.1688, | |
| "step": 1341 | |
| }, | |
| { | |
| "epoch": 0.20870917573872472, | |
| "grad_norm": 0.998729186682604, | |
| "learning_rate": 8.963169980914908e-06, | |
| "loss": 0.2165, | |
| "step": 1342 | |
| }, | |
| { | |
| "epoch": 0.2088646967340591, | |
| "grad_norm": 0.9250328323650552, | |
| "learning_rate": 8.961680066280614e-06, | |
| "loss": 0.1978, | |
| "step": 1343 | |
| }, | |
| { | |
| "epoch": 0.20902021772939347, | |
| "grad_norm": 0.685484375204563, | |
| "learning_rate": 8.96018920593804e-06, | |
| "loss": 0.1521, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 0.20917573872472783, | |
| "grad_norm": 1.2194077898180222, | |
| "learning_rate": 8.958697400243077e-06, | |
| "loss": 0.129, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.2093312597200622, | |
| "grad_norm": 1.3390006867631312, | |
| "learning_rate": 8.957204649551838e-06, | |
| "loss": 0.2295, | |
| "step": 1346 | |
| }, | |
| { | |
| "epoch": 0.2094867807153966, | |
| "grad_norm": 1.0791715779616644, | |
| "learning_rate": 8.955710954220664e-06, | |
| "loss": 0.1922, | |
| "step": 1347 | |
| }, | |
| { | |
| "epoch": 0.20964230171073095, | |
| "grad_norm": 1.0448818497216468, | |
| "learning_rate": 8.954216314606123e-06, | |
| "loss": 0.2074, | |
| "step": 1348 | |
| }, | |
| { | |
| "epoch": 0.2097978227060653, | |
| "grad_norm": 1.0968024521734823, | |
| "learning_rate": 8.952720731065e-06, | |
| "loss": 0.1956, | |
| "step": 1349 | |
| }, | |
| { | |
| "epoch": 0.2099533437013997, | |
| "grad_norm": 1.1729159260054676, | |
| "learning_rate": 8.95122420395432e-06, | |
| "loss": 0.1032, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.21010886469673407, | |
| "grad_norm": 0.7605452577854958, | |
| "learning_rate": 8.949726733631319e-06, | |
| "loss": 0.2173, | |
| "step": 1351 | |
| }, | |
| { | |
| "epoch": 0.21026438569206843, | |
| "grad_norm": 0.7896405561018206, | |
| "learning_rate": 8.948228320453465e-06, | |
| "loss": 0.1411, | |
| "step": 1352 | |
| }, | |
| { | |
| "epoch": 0.2104199066874028, | |
| "grad_norm": 1.3664851820052848, | |
| "learning_rate": 8.946728964778452e-06, | |
| "loss": 0.2043, | |
| "step": 1353 | |
| }, | |
| { | |
| "epoch": 0.21057542768273718, | |
| "grad_norm": 1.0930532560076165, | |
| "learning_rate": 8.945228666964197e-06, | |
| "loss": 0.2112, | |
| "step": 1354 | |
| }, | |
| { | |
| "epoch": 0.21073094867807154, | |
| "grad_norm": 1.3370376996193614, | |
| "learning_rate": 8.94372742736884e-06, | |
| "loss": 0.2763, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.2108864696734059, | |
| "grad_norm": 1.1733695403983486, | |
| "learning_rate": 8.942225246350748e-06, | |
| "loss": 0.1383, | |
| "step": 1356 | |
| }, | |
| { | |
| "epoch": 0.21104199066874027, | |
| "grad_norm": 1.518123240050466, | |
| "learning_rate": 8.940722124268515e-06, | |
| "loss": 0.2035, | |
| "step": 1357 | |
| }, | |
| { | |
| "epoch": 0.21119751166407466, | |
| "grad_norm": 0.7154774393150748, | |
| "learning_rate": 8.939218061480955e-06, | |
| "loss": 0.1513, | |
| "step": 1358 | |
| }, | |
| { | |
| "epoch": 0.21135303265940902, | |
| "grad_norm": 1.7277749667928948, | |
| "learning_rate": 8.937713058347109e-06, | |
| "loss": 0.1852, | |
| "step": 1359 | |
| }, | |
| { | |
| "epoch": 0.2115085536547434, | |
| "grad_norm": 0.8101754008908368, | |
| "learning_rate": 8.936207115226242e-06, | |
| "loss": 0.1755, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.21166407465007775, | |
| "grad_norm": 2.154263107894285, | |
| "learning_rate": 8.934700232477845e-06, | |
| "loss": 0.2284, | |
| "step": 1361 | |
| }, | |
| { | |
| "epoch": 0.21181959564541214, | |
| "grad_norm": 2.9946702775104552, | |
| "learning_rate": 8.933192410461632e-06, | |
| "loss": 0.1571, | |
| "step": 1362 | |
| }, | |
| { | |
| "epoch": 0.2119751166407465, | |
| "grad_norm": 1.3293853025848206, | |
| "learning_rate": 8.931683649537539e-06, | |
| "loss": 0.1818, | |
| "step": 1363 | |
| }, | |
| { | |
| "epoch": 0.21213063763608087, | |
| "grad_norm": 1.069623910831374, | |
| "learning_rate": 8.93017395006573e-06, | |
| "loss": 0.2389, | |
| "step": 1364 | |
| }, | |
| { | |
| "epoch": 0.21228615863141523, | |
| "grad_norm": 1.2692486168753456, | |
| "learning_rate": 8.928663312406593e-06, | |
| "loss": 0.1725, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.21244167962674962, | |
| "grad_norm": 2.31269662319102, | |
| "learning_rate": 8.927151736920733e-06, | |
| "loss": 0.3472, | |
| "step": 1366 | |
| }, | |
| { | |
| "epoch": 0.21259720062208398, | |
| "grad_norm": 1.3024374295612378, | |
| "learning_rate": 8.925639223968989e-06, | |
| "loss": 0.1601, | |
| "step": 1367 | |
| }, | |
| { | |
| "epoch": 0.21275272161741834, | |
| "grad_norm": 1.475662600105692, | |
| "learning_rate": 8.924125773912418e-06, | |
| "loss": 0.1652, | |
| "step": 1368 | |
| }, | |
| { | |
| "epoch": 0.21290824261275273, | |
| "grad_norm": 0.8719883727219597, | |
| "learning_rate": 8.9226113871123e-06, | |
| "loss": 0.2406, | |
| "step": 1369 | |
| }, | |
| { | |
| "epoch": 0.2130637636080871, | |
| "grad_norm": 1.355947295843189, | |
| "learning_rate": 8.921096063930141e-06, | |
| "loss": 0.2387, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.21321928460342146, | |
| "grad_norm": 1.462171782992857, | |
| "learning_rate": 8.919579804727671e-06, | |
| "loss": 0.2075, | |
| "step": 1371 | |
| }, | |
| { | |
| "epoch": 0.21337480559875582, | |
| "grad_norm": 1.4186556891621878, | |
| "learning_rate": 8.91806260986684e-06, | |
| "loss": 0.1906, | |
| "step": 1372 | |
| }, | |
| { | |
| "epoch": 0.21353032659409021, | |
| "grad_norm": 1.0297515081183366, | |
| "learning_rate": 8.916544479709826e-06, | |
| "loss": 0.1813, | |
| "step": 1373 | |
| }, | |
| { | |
| "epoch": 0.21368584758942458, | |
| "grad_norm": 0.8517207332254344, | |
| "learning_rate": 8.915025414619025e-06, | |
| "loss": 0.2314, | |
| "step": 1374 | |
| }, | |
| { | |
| "epoch": 0.21384136858475894, | |
| "grad_norm": 1.4500725099182117, | |
| "learning_rate": 8.91350541495706e-06, | |
| "loss": 0.2702, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.2139968895800933, | |
| "grad_norm": 1.4840249529134437, | |
| "learning_rate": 8.911984481086779e-06, | |
| "loss": 0.1957, | |
| "step": 1376 | |
| }, | |
| { | |
| "epoch": 0.2141524105754277, | |
| "grad_norm": 1.0812621557572404, | |
| "learning_rate": 8.910462613371246e-06, | |
| "loss": 0.1773, | |
| "step": 1377 | |
| }, | |
| { | |
| "epoch": 0.21430793157076206, | |
| "grad_norm": 0.8285771638848516, | |
| "learning_rate": 8.908939812173756e-06, | |
| "loss": 0.1879, | |
| "step": 1378 | |
| }, | |
| { | |
| "epoch": 0.21446345256609642, | |
| "grad_norm": 1.5413069191948623, | |
| "learning_rate": 8.907416077857818e-06, | |
| "loss": 0.2024, | |
| "step": 1379 | |
| }, | |
| { | |
| "epoch": 0.21461897356143078, | |
| "grad_norm": 1.5546998088262725, | |
| "learning_rate": 8.905891410787174e-06, | |
| "loss": 0.1297, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.21477449455676517, | |
| "grad_norm": 1.0276705986435684, | |
| "learning_rate": 8.904365811325779e-06, | |
| "loss": 0.1777, | |
| "step": 1381 | |
| }, | |
| { | |
| "epoch": 0.21493001555209953, | |
| "grad_norm": 2.186178551364591, | |
| "learning_rate": 8.902839279837818e-06, | |
| "loss": 0.1936, | |
| "step": 1382 | |
| }, | |
| { | |
| "epoch": 0.2150855365474339, | |
| "grad_norm": 1.409142378067793, | |
| "learning_rate": 8.901311816687693e-06, | |
| "loss": 0.2347, | |
| "step": 1383 | |
| }, | |
| { | |
| "epoch": 0.21524105754276826, | |
| "grad_norm": 0.909249039104448, | |
| "learning_rate": 8.899783422240031e-06, | |
| "loss": 0.1858, | |
| "step": 1384 | |
| }, | |
| { | |
| "epoch": 0.21539657853810265, | |
| "grad_norm": 1.389710830109919, | |
| "learning_rate": 8.898254096859681e-06, | |
| "loss": 0.2546, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.215552099533437, | |
| "grad_norm": 1.1722812780197163, | |
| "learning_rate": 8.896723840911718e-06, | |
| "loss": 0.2451, | |
| "step": 1386 | |
| }, | |
| { | |
| "epoch": 0.21570762052877138, | |
| "grad_norm": 1.0186256750739588, | |
| "learning_rate": 8.89519265476143e-06, | |
| "loss": 0.1423, | |
| "step": 1387 | |
| }, | |
| { | |
| "epoch": 0.21586314152410577, | |
| "grad_norm": 1.463755060922718, | |
| "learning_rate": 8.893660538774335e-06, | |
| "loss": 0.678, | |
| "step": 1388 | |
| }, | |
| { | |
| "epoch": 0.21601866251944013, | |
| "grad_norm": 1.2144290461428764, | |
| "learning_rate": 8.892127493316172e-06, | |
| "loss": 0.1289, | |
| "step": 1389 | |
| }, | |
| { | |
| "epoch": 0.2161741835147745, | |
| "grad_norm": 1.2754281076641276, | |
| "learning_rate": 8.8905935187529e-06, | |
| "loss": 0.1775, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.21632970451010886, | |
| "grad_norm": 0.8239843617970345, | |
| "learning_rate": 8.889058615450695e-06, | |
| "loss": 0.1379, | |
| "step": 1391 | |
| }, | |
| { | |
| "epoch": 0.21648522550544325, | |
| "grad_norm": 0.8183516543340216, | |
| "learning_rate": 8.887522783775965e-06, | |
| "loss": 0.4396, | |
| "step": 1392 | |
| }, | |
| { | |
| "epoch": 0.2166407465007776, | |
| "grad_norm": 1.163898200737944, | |
| "learning_rate": 8.885986024095334e-06, | |
| "loss": 0.1788, | |
| "step": 1393 | |
| }, | |
| { | |
| "epoch": 0.21679626749611197, | |
| "grad_norm": 1.0398663598746642, | |
| "learning_rate": 8.884448336775647e-06, | |
| "loss": 0.2058, | |
| "step": 1394 | |
| }, | |
| { | |
| "epoch": 0.21695178849144633, | |
| "grad_norm": 1.1038527572141106, | |
| "learning_rate": 8.882909722183973e-06, | |
| "loss": 0.1603, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.21710730948678073, | |
| "grad_norm": 1.1407224011212185, | |
| "learning_rate": 8.881370180687597e-06, | |
| "loss": 0.212, | |
| "step": 1396 | |
| }, | |
| { | |
| "epoch": 0.2172628304821151, | |
| "grad_norm": 1.171491183176733, | |
| "learning_rate": 8.879829712654032e-06, | |
| "loss": 0.156, | |
| "step": 1397 | |
| }, | |
| { | |
| "epoch": 0.21741835147744945, | |
| "grad_norm": 1.0914587320494888, | |
| "learning_rate": 8.878288318451006e-06, | |
| "loss": 0.0999, | |
| "step": 1398 | |
| }, | |
| { | |
| "epoch": 0.2175738724727838, | |
| "grad_norm": 1.0719935831541472, | |
| "learning_rate": 8.876745998446477e-06, | |
| "loss": 0.2026, | |
| "step": 1399 | |
| }, | |
| { | |
| "epoch": 0.2177293934681182, | |
| "grad_norm": 0.8893812774700685, | |
| "learning_rate": 8.875202753008614e-06, | |
| "loss": 0.1152, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.2177293934681182, | |
| "eval_loss": 0.20550738275051117, | |
| "eval_runtime": 9.4165, | |
| "eval_samples_per_second": 2.761, | |
| "eval_steps_per_second": 0.743, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.21788491446345257, | |
| "grad_norm": 1.1732595194107243, | |
| "learning_rate": 8.873658582505813e-06, | |
| "loss": 0.184, | |
| "step": 1401 | |
| }, | |
| { | |
| "epoch": 0.21804043545878693, | |
| "grad_norm": 1.3681566501491238, | |
| "learning_rate": 8.872113487306686e-06, | |
| "loss": 0.1787, | |
| "step": 1402 | |
| }, | |
| { | |
| "epoch": 0.2181959564541213, | |
| "grad_norm": 0.9384518321736989, | |
| "learning_rate": 8.870567467780073e-06, | |
| "loss": 0.1907, | |
| "step": 1403 | |
| }, | |
| { | |
| "epoch": 0.21835147744945568, | |
| "grad_norm": 1.6918321800893066, | |
| "learning_rate": 8.86902052429503e-06, | |
| "loss": 0.1814, | |
| "step": 1404 | |
| }, | |
| { | |
| "epoch": 0.21850699844479005, | |
| "grad_norm": 1.0615675392544648, | |
| "learning_rate": 8.867472657220829e-06, | |
| "loss": 0.1807, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.2186625194401244, | |
| "grad_norm": 1.2104557155019795, | |
| "learning_rate": 8.865923866926973e-06, | |
| "loss": 0.2046, | |
| "step": 1406 | |
| }, | |
| { | |
| "epoch": 0.2188180404354588, | |
| "grad_norm": 1.409015102478802, | |
| "learning_rate": 8.864374153783177e-06, | |
| "loss": 0.2415, | |
| "step": 1407 | |
| }, | |
| { | |
| "epoch": 0.21897356143079316, | |
| "grad_norm": 1.2088161026937052, | |
| "learning_rate": 8.86282351815938e-06, | |
| "loss": 0.1573, | |
| "step": 1408 | |
| }, | |
| { | |
| "epoch": 0.21912908242612752, | |
| "grad_norm": 1.0288225427805875, | |
| "learning_rate": 8.861271960425741e-06, | |
| "loss": 0.1812, | |
| "step": 1409 | |
| }, | |
| { | |
| "epoch": 0.2192846034214619, | |
| "grad_norm": 1.1067487085965078, | |
| "learning_rate": 8.859719480952637e-06, | |
| "loss": 0.1955, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.21944012441679628, | |
| "grad_norm": 1.331527983707418, | |
| "learning_rate": 8.858166080110666e-06, | |
| "loss": 0.2153, | |
| "step": 1411 | |
| }, | |
| { | |
| "epoch": 0.21959564541213064, | |
| "grad_norm": 1.3966673201995545, | |
| "learning_rate": 8.85661175827065e-06, | |
| "loss": 0.1861, | |
| "step": 1412 | |
| }, | |
| { | |
| "epoch": 0.219751166407465, | |
| "grad_norm": 1.7346922539447693, | |
| "learning_rate": 8.855056515803624e-06, | |
| "loss": 0.2217, | |
| "step": 1413 | |
| }, | |
| { | |
| "epoch": 0.21990668740279937, | |
| "grad_norm": 1.0429561703393233, | |
| "learning_rate": 8.853500353080848e-06, | |
| "loss": 0.137, | |
| "step": 1414 | |
| }, | |
| { | |
| "epoch": 0.22006220839813376, | |
| "grad_norm": 1.099146007367247, | |
| "learning_rate": 8.851943270473797e-06, | |
| "loss": 0.1888, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.22021772939346812, | |
| "grad_norm": 1.0622173162674204, | |
| "learning_rate": 8.850385268354171e-06, | |
| "loss": 0.2054, | |
| "step": 1416 | |
| }, | |
| { | |
| "epoch": 0.22037325038880248, | |
| "grad_norm": 1.7275165681110787, | |
| "learning_rate": 8.848826347093887e-06, | |
| "loss": 0.1839, | |
| "step": 1417 | |
| }, | |
| { | |
| "epoch": 0.22052877138413685, | |
| "grad_norm": 1.4049206778214125, | |
| "learning_rate": 8.84726650706508e-06, | |
| "loss": 0.2719, | |
| "step": 1418 | |
| }, | |
| { | |
| "epoch": 0.22068429237947124, | |
| "grad_norm": 0.984134518775913, | |
| "learning_rate": 8.845705748640104e-06, | |
| "loss": 0.2118, | |
| "step": 1419 | |
| }, | |
| { | |
| "epoch": 0.2208398133748056, | |
| "grad_norm": 0.8575267757080008, | |
| "learning_rate": 8.844144072191537e-06, | |
| "loss": 0.1633, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.22099533437013996, | |
| "grad_norm": 1.2572159208716647, | |
| "learning_rate": 8.842581478092172e-06, | |
| "loss": 0.2397, | |
| "step": 1421 | |
| }, | |
| { | |
| "epoch": 0.22115085536547435, | |
| "grad_norm": 1.2016606507273602, | |
| "learning_rate": 8.841017966715019e-06, | |
| "loss": 0.2033, | |
| "step": 1422 | |
| }, | |
| { | |
| "epoch": 0.22130637636080872, | |
| "grad_norm": 1.3276461025791215, | |
| "learning_rate": 8.839453538433314e-06, | |
| "loss": 0.1925, | |
| "step": 1423 | |
| }, | |
| { | |
| "epoch": 0.22146189735614308, | |
| "grad_norm": 0.8224092915871075, | |
| "learning_rate": 8.837888193620506e-06, | |
| "loss": 0.1358, | |
| "step": 1424 | |
| }, | |
| { | |
| "epoch": 0.22161741835147744, | |
| "grad_norm": 1.4495835386689406, | |
| "learning_rate": 8.836321932650266e-06, | |
| "loss": 0.2432, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.22177293934681183, | |
| "grad_norm": 1.4755944744177818, | |
| "learning_rate": 8.83475475589648e-06, | |
| "loss": 0.1231, | |
| "step": 1426 | |
| }, | |
| { | |
| "epoch": 0.2219284603421462, | |
| "grad_norm": 0.8119316049057401, | |
| "learning_rate": 8.833186663733258e-06, | |
| "loss": 0.2097, | |
| "step": 1427 | |
| }, | |
| { | |
| "epoch": 0.22208398133748056, | |
| "grad_norm": 1.0060829041279713, | |
| "learning_rate": 8.83161765653492e-06, | |
| "loss": 0.1738, | |
| "step": 1428 | |
| }, | |
| { | |
| "epoch": 0.22223950233281492, | |
| "grad_norm": 2.4145754711073733, | |
| "learning_rate": 8.830047734676018e-06, | |
| "loss": 0.2858, | |
| "step": 1429 | |
| }, | |
| { | |
| "epoch": 0.2223950233281493, | |
| "grad_norm": 1.1242173153619541, | |
| "learning_rate": 8.828476898531308e-06, | |
| "loss": 0.2166, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.22255054432348367, | |
| "grad_norm": 0.9324040289076934, | |
| "learning_rate": 8.826905148475772e-06, | |
| "loss": 0.1157, | |
| "step": 1431 | |
| }, | |
| { | |
| "epoch": 0.22270606531881804, | |
| "grad_norm": 1.1091005510043248, | |
| "learning_rate": 8.82533248488461e-06, | |
| "loss": 0.2387, | |
| "step": 1432 | |
| }, | |
| { | |
| "epoch": 0.2228615863141524, | |
| "grad_norm": 1.5660091935097067, | |
| "learning_rate": 8.823758908133237e-06, | |
| "loss": 0.1783, | |
| "step": 1433 | |
| }, | |
| { | |
| "epoch": 0.2230171073094868, | |
| "grad_norm": 1.7595194847301099, | |
| "learning_rate": 8.822184418597289e-06, | |
| "loss": 0.1971, | |
| "step": 1434 | |
| }, | |
| { | |
| "epoch": 0.22317262830482115, | |
| "grad_norm": 1.1991294408769844, | |
| "learning_rate": 8.820609016652616e-06, | |
| "loss": 0.1993, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.22332814930015552, | |
| "grad_norm": 1.16155323748872, | |
| "learning_rate": 8.819032702675293e-06, | |
| "loss": 0.1663, | |
| "step": 1436 | |
| }, | |
| { | |
| "epoch": 0.22348367029548988, | |
| "grad_norm": 1.144471577400653, | |
| "learning_rate": 8.817455477041605e-06, | |
| "loss": 0.1553, | |
| "step": 1437 | |
| }, | |
| { | |
| "epoch": 0.22363919129082427, | |
| "grad_norm": 1.1758756635872867, | |
| "learning_rate": 8.815877340128059e-06, | |
| "loss": 0.1997, | |
| "step": 1438 | |
| }, | |
| { | |
| "epoch": 0.22379471228615863, | |
| "grad_norm": 1.0774573442962538, | |
| "learning_rate": 8.814298292311376e-06, | |
| "loss": 0.224, | |
| "step": 1439 | |
| }, | |
| { | |
| "epoch": 0.223950233281493, | |
| "grad_norm": 1.018897824496983, | |
| "learning_rate": 8.812718333968498e-06, | |
| "loss": 0.1969, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.22410575427682738, | |
| "grad_norm": 0.7464671714955523, | |
| "learning_rate": 8.811137465476584e-06, | |
| "loss": 0.1704, | |
| "step": 1441 | |
| }, | |
| { | |
| "epoch": 0.22426127527216175, | |
| "grad_norm": 1.120267062163412, | |
| "learning_rate": 8.80955568721301e-06, | |
| "loss": 0.1941, | |
| "step": 1442 | |
| }, | |
| { | |
| "epoch": 0.2244167962674961, | |
| "grad_norm": 1.57559360058438, | |
| "learning_rate": 8.807972999555368e-06, | |
| "loss": 0.2603, | |
| "step": 1443 | |
| }, | |
| { | |
| "epoch": 0.22457231726283047, | |
| "grad_norm": 0.939702806688543, | |
| "learning_rate": 8.806389402881466e-06, | |
| "loss": 0.2116, | |
| "step": 1444 | |
| }, | |
| { | |
| "epoch": 0.22472783825816486, | |
| "grad_norm": 1.2188628504615986, | |
| "learning_rate": 8.80480489756933e-06, | |
| "loss": 0.2424, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.22488335925349923, | |
| "grad_norm": 1.0944670304974327, | |
| "learning_rate": 8.803219483997205e-06, | |
| "loss": 0.1828, | |
| "step": 1446 | |
| }, | |
| { | |
| "epoch": 0.2250388802488336, | |
| "grad_norm": 1.6182633129569433, | |
| "learning_rate": 8.801633162543555e-06, | |
| "loss": 0.1964, | |
| "step": 1447 | |
| }, | |
| { | |
| "epoch": 0.22519440124416795, | |
| "grad_norm": 0.8387542620173406, | |
| "learning_rate": 8.800045933587052e-06, | |
| "loss": 0.1585, | |
| "step": 1448 | |
| }, | |
| { | |
| "epoch": 0.22534992223950234, | |
| "grad_norm": 1.2464384825217707, | |
| "learning_rate": 8.798457797506588e-06, | |
| "loss": 0.1134, | |
| "step": 1449 | |
| }, | |
| { | |
| "epoch": 0.2255054432348367, | |
| "grad_norm": 0.9893884401535724, | |
| "learning_rate": 8.79686875468128e-06, | |
| "loss": 0.2421, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.22566096423017107, | |
| "grad_norm": 1.0480572189617101, | |
| "learning_rate": 8.79527880549045e-06, | |
| "loss": 0.1921, | |
| "step": 1451 | |
| }, | |
| { | |
| "epoch": 0.22581648522550543, | |
| "grad_norm": 1.3152458887234093, | |
| "learning_rate": 8.793687950313643e-06, | |
| "loss": 0.1597, | |
| "step": 1452 | |
| }, | |
| { | |
| "epoch": 0.22597200622083982, | |
| "grad_norm": 1.0970403207876425, | |
| "learning_rate": 8.792096189530614e-06, | |
| "loss": 0.1854, | |
| "step": 1453 | |
| }, | |
| { | |
| "epoch": 0.22612752721617418, | |
| "grad_norm": 1.0705518033654797, | |
| "learning_rate": 8.790503523521346e-06, | |
| "loss": 0.1839, | |
| "step": 1454 | |
| }, | |
| { | |
| "epoch": 0.22628304821150855, | |
| "grad_norm": 1.132932961220967, | |
| "learning_rate": 8.788909952666024e-06, | |
| "loss": 0.1871, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.2264385692068429, | |
| "grad_norm": 1.4797221000535143, | |
| "learning_rate": 8.787315477345059e-06, | |
| "loss": 0.2295, | |
| "step": 1456 | |
| }, | |
| { | |
| "epoch": 0.2265940902021773, | |
| "grad_norm": 1.0944162670416104, | |
| "learning_rate": 8.785720097939075e-06, | |
| "loss": 0.1745, | |
| "step": 1457 | |
| }, | |
| { | |
| "epoch": 0.22674961119751166, | |
| "grad_norm": 1.6430830107526284, | |
| "learning_rate": 8.784123814828908e-06, | |
| "loss": 0.3592, | |
| "step": 1458 | |
| }, | |
| { | |
| "epoch": 0.22690513219284603, | |
| "grad_norm": 1.1438907408683774, | |
| "learning_rate": 8.782526628395616e-06, | |
| "loss": 0.1613, | |
| "step": 1459 | |
| }, | |
| { | |
| "epoch": 0.22706065318818042, | |
| "grad_norm": 2.655708868757693, | |
| "learning_rate": 8.780928539020467e-06, | |
| "loss": 0.1821, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.22721617418351478, | |
| "grad_norm": 0.9605535718803637, | |
| "learning_rate": 8.779329547084949e-06, | |
| "loss": 0.1707, | |
| "step": 1461 | |
| }, | |
| { | |
| "epoch": 0.22737169517884914, | |
| "grad_norm": 2.2075086894366036, | |
| "learning_rate": 8.777729652970765e-06, | |
| "loss": 0.1383, | |
| "step": 1462 | |
| }, | |
| { | |
| "epoch": 0.2275272161741835, | |
| "grad_norm": 1.1974721511606266, | |
| "learning_rate": 8.77612885705983e-06, | |
| "loss": 0.2615, | |
| "step": 1463 | |
| }, | |
| { | |
| "epoch": 0.2276827371695179, | |
| "grad_norm": 1.076273551290465, | |
| "learning_rate": 8.774527159734277e-06, | |
| "loss": 0.2094, | |
| "step": 1464 | |
| }, | |
| { | |
| "epoch": 0.22783825816485226, | |
| "grad_norm": 1.3601919661341624, | |
| "learning_rate": 8.772924561376454e-06, | |
| "loss": 0.2324, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.22799377916018662, | |
| "grad_norm": 1.4328079078867457, | |
| "learning_rate": 8.771321062368922e-06, | |
| "loss": 0.1763, | |
| "step": 1466 | |
| }, | |
| { | |
| "epoch": 0.22814930015552098, | |
| "grad_norm": 1.1869126356200645, | |
| "learning_rate": 8.76971666309446e-06, | |
| "loss": 0.1093, | |
| "step": 1467 | |
| }, | |
| { | |
| "epoch": 0.22830482115085537, | |
| "grad_norm": 0.8016043523305539, | |
| "learning_rate": 8.768111363936058e-06, | |
| "loss": 0.1716, | |
| "step": 1468 | |
| }, | |
| { | |
| "epoch": 0.22846034214618974, | |
| "grad_norm": 1.1279000832737547, | |
| "learning_rate": 8.766505165276928e-06, | |
| "loss": 0.1415, | |
| "step": 1469 | |
| }, | |
| { | |
| "epoch": 0.2286158631415241, | |
| "grad_norm": 1.4632653437041683, | |
| "learning_rate": 8.764898067500488e-06, | |
| "loss": 0.1682, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.22877138413685846, | |
| "grad_norm": 1.427331448842405, | |
| "learning_rate": 8.763290070990377e-06, | |
| "loss": 0.261, | |
| "step": 1471 | |
| }, | |
| { | |
| "epoch": 0.22892690513219285, | |
| "grad_norm": 0.9926126679211552, | |
| "learning_rate": 8.761681176130443e-06, | |
| "loss": 0.1625, | |
| "step": 1472 | |
| }, | |
| { | |
| "epoch": 0.22908242612752722, | |
| "grad_norm": 1.690385156533882, | |
| "learning_rate": 8.760071383304755e-06, | |
| "loss": 0.2803, | |
| "step": 1473 | |
| }, | |
| { | |
| "epoch": 0.22923794712286158, | |
| "grad_norm": 1.0976612977720204, | |
| "learning_rate": 8.758460692897593e-06, | |
| "loss": 0.1802, | |
| "step": 1474 | |
| }, | |
| { | |
| "epoch": 0.22939346811819597, | |
| "grad_norm": 1.2314757179900722, | |
| "learning_rate": 8.756849105293447e-06, | |
| "loss": 0.1768, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.22954898911353033, | |
| "grad_norm": 1.1327643054428198, | |
| "learning_rate": 8.755236620877033e-06, | |
| "loss": 0.1865, | |
| "step": 1476 | |
| }, | |
| { | |
| "epoch": 0.2297045101088647, | |
| "grad_norm": 1.1639229615649782, | |
| "learning_rate": 8.753623240033265e-06, | |
| "loss": 0.1524, | |
| "step": 1477 | |
| }, | |
| { | |
| "epoch": 0.22986003110419906, | |
| "grad_norm": 0.9603164098229106, | |
| "learning_rate": 8.752008963147285e-06, | |
| "loss": 0.1721, | |
| "step": 1478 | |
| }, | |
| { | |
| "epoch": 0.23001555209953345, | |
| "grad_norm": 1.38792631561096, | |
| "learning_rate": 8.750393790604442e-06, | |
| "loss": 0.2342, | |
| "step": 1479 | |
| }, | |
| { | |
| "epoch": 0.2301710730948678, | |
| "grad_norm": 1.2479053691859538, | |
| "learning_rate": 8.7487777227903e-06, | |
| "loss": 0.1938, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.23032659409020217, | |
| "grad_norm": 1.2509939431760002, | |
| "learning_rate": 8.747160760090637e-06, | |
| "loss": 0.1844, | |
| "step": 1481 | |
| }, | |
| { | |
| "epoch": 0.23048211508553654, | |
| "grad_norm": 1.465934150389407, | |
| "learning_rate": 8.745542902891444e-06, | |
| "loss": 0.205, | |
| "step": 1482 | |
| }, | |
| { | |
| "epoch": 0.23063763608087093, | |
| "grad_norm": 1.0510694170069674, | |
| "learning_rate": 8.743924151578928e-06, | |
| "loss": 0.1759, | |
| "step": 1483 | |
| }, | |
| { | |
| "epoch": 0.2307931570762053, | |
| "grad_norm": 1.2869382169156265, | |
| "learning_rate": 8.742304506539506e-06, | |
| "loss": 0.1634, | |
| "step": 1484 | |
| }, | |
| { | |
| "epoch": 0.23094867807153965, | |
| "grad_norm": 2.0849533877813067, | |
| "learning_rate": 8.740683968159808e-06, | |
| "loss": 0.1834, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.23110419906687402, | |
| "grad_norm": 0.5391088701503829, | |
| "learning_rate": 8.739062536826683e-06, | |
| "loss": 0.1062, | |
| "step": 1486 | |
| }, | |
| { | |
| "epoch": 0.2312597200622084, | |
| "grad_norm": 1.339043790882886, | |
| "learning_rate": 8.737440212927188e-06, | |
| "loss": 0.154, | |
| "step": 1487 | |
| }, | |
| { | |
| "epoch": 0.23141524105754277, | |
| "grad_norm": 1.2239049109865379, | |
| "learning_rate": 8.735816996848592e-06, | |
| "loss": 0.1694, | |
| "step": 1488 | |
| }, | |
| { | |
| "epoch": 0.23157076205287713, | |
| "grad_norm": 0.8785721668205927, | |
| "learning_rate": 8.734192888978381e-06, | |
| "loss": 0.1501, | |
| "step": 1489 | |
| }, | |
| { | |
| "epoch": 0.2317262830482115, | |
| "grad_norm": 1.1018359589714184, | |
| "learning_rate": 8.732567889704253e-06, | |
| "loss": 0.2004, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.23188180404354589, | |
| "grad_norm": 1.2782960384351885, | |
| "learning_rate": 8.730941999414117e-06, | |
| "loss": 0.1514, | |
| "step": 1491 | |
| }, | |
| { | |
| "epoch": 0.23203732503888025, | |
| "grad_norm": 0.7470536578634075, | |
| "learning_rate": 8.729315218496097e-06, | |
| "loss": 0.1828, | |
| "step": 1492 | |
| }, | |
| { | |
| "epoch": 0.2321928460342146, | |
| "grad_norm": 1.0314729949458916, | |
| "learning_rate": 8.727687547338527e-06, | |
| "loss": 0.1766, | |
| "step": 1493 | |
| }, | |
| { | |
| "epoch": 0.232348367029549, | |
| "grad_norm": 1.435780946058732, | |
| "learning_rate": 8.726058986329954e-06, | |
| "loss": 0.2574, | |
| "step": 1494 | |
| }, | |
| { | |
| "epoch": 0.23250388802488337, | |
| "grad_norm": 1.3013711909380183, | |
| "learning_rate": 8.72442953585914e-06, | |
| "loss": 0.2304, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.23265940902021773, | |
| "grad_norm": 1.3258835525000316, | |
| "learning_rate": 8.722799196315057e-06, | |
| "loss": 0.1649, | |
| "step": 1496 | |
| }, | |
| { | |
| "epoch": 0.2328149300155521, | |
| "grad_norm": 1.4810824648278473, | |
| "learning_rate": 8.721167968086888e-06, | |
| "loss": 0.2786, | |
| "step": 1497 | |
| }, | |
| { | |
| "epoch": 0.23297045101088648, | |
| "grad_norm": 0.8879588001193606, | |
| "learning_rate": 8.719535851564034e-06, | |
| "loss": 0.1662, | |
| "step": 1498 | |
| }, | |
| { | |
| "epoch": 0.23312597200622084, | |
| "grad_norm": 1.0006636128134747, | |
| "learning_rate": 8.7179028471361e-06, | |
| "loss": 0.144, | |
| "step": 1499 | |
| }, | |
| { | |
| "epoch": 0.2332814930015552, | |
| "grad_norm": 1.0732426035660707, | |
| "learning_rate": 8.716268955192908e-06, | |
| "loss": 0.1799, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.2332814930015552, | |
| "eval_loss": 0.20381511747837067, | |
| "eval_runtime": 9.4315, | |
| "eval_samples_per_second": 2.757, | |
| "eval_steps_per_second": 0.742, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.23343701399688957, | |
| "grad_norm": 1.1848798776210054, | |
| "learning_rate": 8.714634176124492e-06, | |
| "loss": 0.2192, | |
| "step": 1501 | |
| }, | |
| { | |
| "epoch": 0.23359253499222396, | |
| "grad_norm": 1.1734962627193575, | |
| "learning_rate": 8.712998510321095e-06, | |
| "loss": 0.2218, | |
| "step": 1502 | |
| }, | |
| { | |
| "epoch": 0.23374805598755832, | |
| "grad_norm": 1.0346380522248477, | |
| "learning_rate": 8.711361958173175e-06, | |
| "loss": 0.1561, | |
| "step": 1503 | |
| }, | |
| { | |
| "epoch": 0.23390357698289269, | |
| "grad_norm": 0.8380236750022618, | |
| "learning_rate": 8.709724520071399e-06, | |
| "loss": 0.1238, | |
| "step": 1504 | |
| }, | |
| { | |
| "epoch": 0.23405909797822705, | |
| "grad_norm": 0.8234400155679666, | |
| "learning_rate": 8.708086196406646e-06, | |
| "loss": 0.1887, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.23421461897356144, | |
| "grad_norm": 1.3627952832885772, | |
| "learning_rate": 8.706446987570005e-06, | |
| "loss": 0.1739, | |
| "step": 1506 | |
| }, | |
| { | |
| "epoch": 0.2343701399688958, | |
| "grad_norm": 2.486707766460104, | |
| "learning_rate": 8.704806893952782e-06, | |
| "loss": 0.1462, | |
| "step": 1507 | |
| }, | |
| { | |
| "epoch": 0.23452566096423016, | |
| "grad_norm": 1.041812062354574, | |
| "learning_rate": 8.703165915946488e-06, | |
| "loss": 0.2247, | |
| "step": 1508 | |
| }, | |
| { | |
| "epoch": 0.23468118195956453, | |
| "grad_norm": 1.2090827115985525, | |
| "learning_rate": 8.701524053942846e-06, | |
| "loss": 0.1931, | |
| "step": 1509 | |
| }, | |
| { | |
| "epoch": 0.23483670295489892, | |
| "grad_norm": 0.7956311279751848, | |
| "learning_rate": 8.699881308333794e-06, | |
| "loss": 0.1801, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.23499222395023328, | |
| "grad_norm": 2.3005427634248017, | |
| "learning_rate": 8.698237679511476e-06, | |
| "loss": 0.2116, | |
| "step": 1511 | |
| }, | |
| { | |
| "epoch": 0.23514774494556764, | |
| "grad_norm": 1.1297158899245439, | |
| "learning_rate": 8.696593167868252e-06, | |
| "loss": 0.2319, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 0.23530326594090203, | |
| "grad_norm": 0.960775125545338, | |
| "learning_rate": 8.694947773796685e-06, | |
| "loss": 0.1543, | |
| "step": 1513 | |
| }, | |
| { | |
| "epoch": 0.2354587869362364, | |
| "grad_norm": 1.213893040863673, | |
| "learning_rate": 8.69330149768956e-06, | |
| "loss": 0.2041, | |
| "step": 1514 | |
| }, | |
| { | |
| "epoch": 0.23561430793157076, | |
| "grad_norm": 0.8074468351762752, | |
| "learning_rate": 8.69165433993986e-06, | |
| "loss": 0.1965, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.23576982892690512, | |
| "grad_norm": 1.1267774919804718, | |
| "learning_rate": 8.690006300940789e-06, | |
| "loss": 0.1823, | |
| "step": 1516 | |
| }, | |
| { | |
| "epoch": 0.2359253499222395, | |
| "grad_norm": 1.4711843699980223, | |
| "learning_rate": 8.688357381085753e-06, | |
| "loss": 0.1753, | |
| "step": 1517 | |
| }, | |
| { | |
| "epoch": 0.23608087091757388, | |
| "grad_norm": 1.0215570051060534, | |
| "learning_rate": 8.686707580768376e-06, | |
| "loss": 0.214, | |
| "step": 1518 | |
| }, | |
| { | |
| "epoch": 0.23623639191290824, | |
| "grad_norm": 1.4485746749390973, | |
| "learning_rate": 8.685056900382486e-06, | |
| "loss": 0.1742, | |
| "step": 1519 | |
| }, | |
| { | |
| "epoch": 0.2363919129082426, | |
| "grad_norm": 1.6525523323599767, | |
| "learning_rate": 8.683405340322123e-06, | |
| "loss": 0.3261, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.236547433903577, | |
| "grad_norm": 1.411135121552525, | |
| "learning_rate": 8.681752900981539e-06, | |
| "loss": 0.1753, | |
| "step": 1521 | |
| }, | |
| { | |
| "epoch": 0.23670295489891136, | |
| "grad_norm": 1.4707330597490842, | |
| "learning_rate": 8.680099582755196e-06, | |
| "loss": 0.1668, | |
| "step": 1522 | |
| }, | |
| { | |
| "epoch": 0.23685847589424572, | |
| "grad_norm": 1.0942391175538886, | |
| "learning_rate": 8.678445386037759e-06, | |
| "loss": 0.1601, | |
| "step": 1523 | |
| }, | |
| { | |
| "epoch": 0.23701399688958008, | |
| "grad_norm": 1.470588177448403, | |
| "learning_rate": 8.67679031122411e-06, | |
| "loss": 0.246, | |
| "step": 1524 | |
| }, | |
| { | |
| "epoch": 0.23716951788491447, | |
| "grad_norm": 0.9581346042453303, | |
| "learning_rate": 8.675134358709341e-06, | |
| "loss": 0.1574, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.23732503888024883, | |
| "grad_norm": 1.4763786660245666, | |
| "learning_rate": 8.67347752888875e-06, | |
| "loss": 0.1907, | |
| "step": 1526 | |
| }, | |
| { | |
| "epoch": 0.2374805598755832, | |
| "grad_norm": 1.0363167034974192, | |
| "learning_rate": 8.671819822157842e-06, | |
| "loss": 0.1531, | |
| "step": 1527 | |
| }, | |
| { | |
| "epoch": 0.2376360808709176, | |
| "grad_norm": 1.1924345869848432, | |
| "learning_rate": 8.670161238912338e-06, | |
| "loss": 0.1347, | |
| "step": 1528 | |
| }, | |
| { | |
| "epoch": 0.23779160186625195, | |
| "grad_norm": 1.3358065512422586, | |
| "learning_rate": 8.668501779548165e-06, | |
| "loss": 0.1827, | |
| "step": 1529 | |
| }, | |
| { | |
| "epoch": 0.2379471228615863, | |
| "grad_norm": 2.021234266844145, | |
| "learning_rate": 8.666841444461456e-06, | |
| "loss": 0.1368, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.23810264385692068, | |
| "grad_norm": 1.4808660901110622, | |
| "learning_rate": 8.665180234048561e-06, | |
| "loss": 0.2527, | |
| "step": 1531 | |
| }, | |
| { | |
| "epoch": 0.23825816485225507, | |
| "grad_norm": 1.065494309629267, | |
| "learning_rate": 8.66351814870603e-06, | |
| "loss": 0.1645, | |
| "step": 1532 | |
| }, | |
| { | |
| "epoch": 0.23841368584758943, | |
| "grad_norm": 1.154174016882306, | |
| "learning_rate": 8.661855188830626e-06, | |
| "loss": 0.2328, | |
| "step": 1533 | |
| }, | |
| { | |
| "epoch": 0.2385692068429238, | |
| "grad_norm": 1.1447203609781391, | |
| "learning_rate": 8.660191354819324e-06, | |
| "loss": 0.1794, | |
| "step": 1534 | |
| }, | |
| { | |
| "epoch": 0.23872472783825816, | |
| "grad_norm": 0.9991428522588004, | |
| "learning_rate": 8.658526647069303e-06, | |
| "loss": 0.1233, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.23888024883359255, | |
| "grad_norm": 0.7670014014044277, | |
| "learning_rate": 8.65686106597795e-06, | |
| "loss": 0.1834, | |
| "step": 1536 | |
| }, | |
| { | |
| "epoch": 0.2390357698289269, | |
| "grad_norm": 1.5945089662017708, | |
| "learning_rate": 8.655194611942863e-06, | |
| "loss": 0.1921, | |
| "step": 1537 | |
| }, | |
| { | |
| "epoch": 0.23919129082426127, | |
| "grad_norm": 1.2997434550841578, | |
| "learning_rate": 8.65352728536185e-06, | |
| "loss": 0.1873, | |
| "step": 1538 | |
| }, | |
| { | |
| "epoch": 0.23934681181959563, | |
| "grad_norm": 0.7625665208100638, | |
| "learning_rate": 8.651859086632924e-06, | |
| "loss": 0.1049, | |
| "step": 1539 | |
| }, | |
| { | |
| "epoch": 0.23950233281493002, | |
| "grad_norm": 2.315830524891549, | |
| "learning_rate": 8.650190016154307e-06, | |
| "loss": 0.2199, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.2396578538102644, | |
| "grad_norm": 0.893513036921711, | |
| "learning_rate": 8.648520074324429e-06, | |
| "loss": 0.1486, | |
| "step": 1541 | |
| }, | |
| { | |
| "epoch": 0.23981337480559875, | |
| "grad_norm": 1.0954057776977126, | |
| "learning_rate": 8.64684926154193e-06, | |
| "loss": 0.143, | |
| "step": 1542 | |
| }, | |
| { | |
| "epoch": 0.2399688958009331, | |
| "grad_norm": 1.1636396222045602, | |
| "learning_rate": 8.645177578205654e-06, | |
| "loss": 0.1386, | |
| "step": 1543 | |
| }, | |
| { | |
| "epoch": 0.2401244167962675, | |
| "grad_norm": 1.6636278556595083, | |
| "learning_rate": 8.643505024714656e-06, | |
| "loss": 0.2057, | |
| "step": 1544 | |
| }, | |
| { | |
| "epoch": 0.24027993779160187, | |
| "grad_norm": 0.847583750776468, | |
| "learning_rate": 8.641831601468198e-06, | |
| "loss": 0.1272, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.24043545878693623, | |
| "grad_norm": 1.1676164916999088, | |
| "learning_rate": 8.640157308865751e-06, | |
| "loss": 0.2057, | |
| "step": 1546 | |
| }, | |
| { | |
| "epoch": 0.24059097978227062, | |
| "grad_norm": 1.1944835161358125, | |
| "learning_rate": 8.63848214730699e-06, | |
| "loss": 0.2237, | |
| "step": 1547 | |
| }, | |
| { | |
| "epoch": 0.24074650077760498, | |
| "grad_norm": 1.3051952058816747, | |
| "learning_rate": 8.6368061171918e-06, | |
| "loss": 0.1398, | |
| "step": 1548 | |
| }, | |
| { | |
| "epoch": 0.24090202177293935, | |
| "grad_norm": 1.2433159998532273, | |
| "learning_rate": 8.635129218920272e-06, | |
| "loss": 0.1514, | |
| "step": 1549 | |
| }, | |
| { | |
| "epoch": 0.2410575427682737, | |
| "grad_norm": 1.6469350149721569, | |
| "learning_rate": 8.633451452892707e-06, | |
| "loss": 0.2141, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.2412130637636081, | |
| "grad_norm": 1.0473985194623197, | |
| "learning_rate": 8.631772819509609e-06, | |
| "loss": 0.1629, | |
| "step": 1551 | |
| }, | |
| { | |
| "epoch": 0.24136858475894246, | |
| "grad_norm": 1.081030634052537, | |
| "learning_rate": 8.630093319171692e-06, | |
| "loss": 0.1647, | |
| "step": 1552 | |
| }, | |
| { | |
| "epoch": 0.24152410575427682, | |
| "grad_norm": 1.0002048515938975, | |
| "learning_rate": 8.628412952279879e-06, | |
| "loss": 0.1636, | |
| "step": 1553 | |
| }, | |
| { | |
| "epoch": 0.2416796267496112, | |
| "grad_norm": 1.2635804994332953, | |
| "learning_rate": 8.62673171923529e-06, | |
| "loss": 0.1922, | |
| "step": 1554 | |
| }, | |
| { | |
| "epoch": 0.24183514774494558, | |
| "grad_norm": 1.0841589283406547, | |
| "learning_rate": 8.625049620439266e-06, | |
| "loss": 0.1796, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.24199066874027994, | |
| "grad_norm": 1.2588626615586416, | |
| "learning_rate": 8.623366656293345e-06, | |
| "loss": 0.2045, | |
| "step": 1556 | |
| }, | |
| { | |
| "epoch": 0.2421461897356143, | |
| "grad_norm": 1.114070429674418, | |
| "learning_rate": 8.621682827199271e-06, | |
| "loss": 0.2155, | |
| "step": 1557 | |
| }, | |
| { | |
| "epoch": 0.24230171073094867, | |
| "grad_norm": 1.122877032526039, | |
| "learning_rate": 8.619998133559001e-06, | |
| "loss": 0.1647, | |
| "step": 1558 | |
| }, | |
| { | |
| "epoch": 0.24245723172628306, | |
| "grad_norm": 2.039494379737774, | |
| "learning_rate": 8.618312575774696e-06, | |
| "loss": 0.2327, | |
| "step": 1559 | |
| }, | |
| { | |
| "epoch": 0.24261275272161742, | |
| "grad_norm": 1.1450723191422727, | |
| "learning_rate": 8.616626154248717e-06, | |
| "loss": 0.1879, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.24276827371695178, | |
| "grad_norm": 1.1035439479736404, | |
| "learning_rate": 8.614938869383643e-06, | |
| "loss": 0.1987, | |
| "step": 1561 | |
| }, | |
| { | |
| "epoch": 0.24292379471228615, | |
| "grad_norm": 9.183796995970361, | |
| "learning_rate": 8.613250721582244e-06, | |
| "loss": 0.1657, | |
| "step": 1562 | |
| }, | |
| { | |
| "epoch": 0.24307931570762054, | |
| "grad_norm": 10.346790090579951, | |
| "learning_rate": 8.611561711247512e-06, | |
| "loss": 0.1277, | |
| "step": 1563 | |
| }, | |
| { | |
| "epoch": 0.2432348367029549, | |
| "grad_norm": 1.0950378522648088, | |
| "learning_rate": 8.609871838782636e-06, | |
| "loss": 0.1792, | |
| "step": 1564 | |
| }, | |
| { | |
| "epoch": 0.24339035769828926, | |
| "grad_norm": 1.2442899837619454, | |
| "learning_rate": 8.608181104591008e-06, | |
| "loss": 0.2481, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.24354587869362365, | |
| "grad_norm": 0.9579587283389649, | |
| "learning_rate": 8.606489509076232e-06, | |
| "loss": 0.1464, | |
| "step": 1566 | |
| }, | |
| { | |
| "epoch": 0.24370139968895801, | |
| "grad_norm": 1.3434609920952423, | |
| "learning_rate": 8.604797052642118e-06, | |
| "loss": 0.167, | |
| "step": 1567 | |
| }, | |
| { | |
| "epoch": 0.24385692068429238, | |
| "grad_norm": 1.3932778191886934, | |
| "learning_rate": 8.603103735692678e-06, | |
| "loss": 0.222, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 0.24401244167962674, | |
| "grad_norm": 1.2606515150004263, | |
| "learning_rate": 8.601409558632125e-06, | |
| "loss": 0.1734, | |
| "step": 1569 | |
| }, | |
| { | |
| "epoch": 0.24416796267496113, | |
| "grad_norm": 0.7524170445152542, | |
| "learning_rate": 8.59971452186489e-06, | |
| "loss": 0.1377, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.2443234836702955, | |
| "grad_norm": 1.8039225543958133, | |
| "learning_rate": 8.5980186257956e-06, | |
| "loss": 0.1645, | |
| "step": 1571 | |
| }, | |
| { | |
| "epoch": 0.24447900466562986, | |
| "grad_norm": 1.2660119379119157, | |
| "learning_rate": 8.596321870829084e-06, | |
| "loss": 0.1297, | |
| "step": 1572 | |
| }, | |
| { | |
| "epoch": 0.24463452566096422, | |
| "grad_norm": 0.9837487875887194, | |
| "learning_rate": 8.594624257370388e-06, | |
| "loss": 0.2292, | |
| "step": 1573 | |
| }, | |
| { | |
| "epoch": 0.2447900466562986, | |
| "grad_norm": 1.4946436207685003, | |
| "learning_rate": 8.592925785824753e-06, | |
| "loss": 0.171, | |
| "step": 1574 | |
| }, | |
| { | |
| "epoch": 0.24494556765163297, | |
| "grad_norm": 1.0654266730537136, | |
| "learning_rate": 8.591226456597626e-06, | |
| "loss": 0.1375, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.24510108864696734, | |
| "grad_norm": 0.971876018180366, | |
| "learning_rate": 8.589526270094664e-06, | |
| "loss": 0.1924, | |
| "step": 1576 | |
| }, | |
| { | |
| "epoch": 0.2452566096423017, | |
| "grad_norm": 1.0087644300116139, | |
| "learning_rate": 8.587825226721722e-06, | |
| "loss": 0.1687, | |
| "step": 1577 | |
| }, | |
| { | |
| "epoch": 0.2454121306376361, | |
| "grad_norm": 1.1652659496533695, | |
| "learning_rate": 8.586123326884865e-06, | |
| "loss": 0.186, | |
| "step": 1578 | |
| }, | |
| { | |
| "epoch": 0.24556765163297045, | |
| "grad_norm": 1.4775732365533967, | |
| "learning_rate": 8.584420570990361e-06, | |
| "loss": 0.1889, | |
| "step": 1579 | |
| }, | |
| { | |
| "epoch": 0.24572317262830481, | |
| "grad_norm": 1.0459439420285532, | |
| "learning_rate": 8.582716959444679e-06, | |
| "loss": 0.1928, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.2458786936236392, | |
| "grad_norm": 1.5372117734449058, | |
| "learning_rate": 8.581012492654495e-06, | |
| "loss": 0.1877, | |
| "step": 1581 | |
| }, | |
| { | |
| "epoch": 0.24603421461897357, | |
| "grad_norm": 1.9347395817267816, | |
| "learning_rate": 8.579307171026693e-06, | |
| "loss": 0.2777, | |
| "step": 1582 | |
| }, | |
| { | |
| "epoch": 0.24618973561430793, | |
| "grad_norm": 0.9029125279631515, | |
| "learning_rate": 8.577600994968352e-06, | |
| "loss": 0.1297, | |
| "step": 1583 | |
| }, | |
| { | |
| "epoch": 0.2463452566096423, | |
| "grad_norm": 0.8355029037365392, | |
| "learning_rate": 8.575893964886763e-06, | |
| "loss": 0.2099, | |
| "step": 1584 | |
| }, | |
| { | |
| "epoch": 0.24650077760497668, | |
| "grad_norm": 1.6899413873191795, | |
| "learning_rate": 8.574186081189416e-06, | |
| "loss": 0.2022, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.24665629860031105, | |
| "grad_norm": 1.087509710593699, | |
| "learning_rate": 8.572477344284009e-06, | |
| "loss": 0.1751, | |
| "step": 1586 | |
| }, | |
| { | |
| "epoch": 0.2468118195956454, | |
| "grad_norm": 1.0292806428751466, | |
| "learning_rate": 8.570767754578438e-06, | |
| "loss": 0.1593, | |
| "step": 1587 | |
| }, | |
| { | |
| "epoch": 0.24696734059097977, | |
| "grad_norm": 1.188609591991913, | |
| "learning_rate": 8.56905731248081e-06, | |
| "loss": 0.1491, | |
| "step": 1588 | |
| }, | |
| { | |
| "epoch": 0.24712286158631416, | |
| "grad_norm": 1.2300883239133906, | |
| "learning_rate": 8.567346018399427e-06, | |
| "loss": 0.165, | |
| "step": 1589 | |
| }, | |
| { | |
| "epoch": 0.24727838258164853, | |
| "grad_norm": 1.2064414577216789, | |
| "learning_rate": 8.565633872742803e-06, | |
| "loss": 0.2524, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.2474339035769829, | |
| "grad_norm": 0.8406003864640567, | |
| "learning_rate": 8.56392087591965e-06, | |
| "loss": 0.1658, | |
| "step": 1591 | |
| }, | |
| { | |
| "epoch": 0.24758942457231725, | |
| "grad_norm": 2.634699334807654, | |
| "learning_rate": 8.56220702833888e-06, | |
| "loss": 0.1692, | |
| "step": 1592 | |
| }, | |
| { | |
| "epoch": 0.24774494556765164, | |
| "grad_norm": 0.9815581638651881, | |
| "learning_rate": 8.560492330409618e-06, | |
| "loss": 0.1678, | |
| "step": 1593 | |
| }, | |
| { | |
| "epoch": 0.247900466562986, | |
| "grad_norm": 1.3909573488426212, | |
| "learning_rate": 8.558776782541183e-06, | |
| "loss": 0.2397, | |
| "step": 1594 | |
| }, | |
| { | |
| "epoch": 0.24805598755832037, | |
| "grad_norm": 1.2613818557792364, | |
| "learning_rate": 8.557060385143102e-06, | |
| "loss": 0.2273, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.24821150855365473, | |
| "grad_norm": 0.9777010646149178, | |
| "learning_rate": 8.5553431386251e-06, | |
| "loss": 0.1713, | |
| "step": 1596 | |
| }, | |
| { | |
| "epoch": 0.24836702954898912, | |
| "grad_norm": 1.2012423072130696, | |
| "learning_rate": 8.553625043397112e-06, | |
| "loss": 0.2192, | |
| "step": 1597 | |
| }, | |
| { | |
| "epoch": 0.24852255054432348, | |
| "grad_norm": 1.0747389022970961, | |
| "learning_rate": 8.551906099869269e-06, | |
| "loss": 0.1555, | |
| "step": 1598 | |
| }, | |
| { | |
| "epoch": 0.24867807153965785, | |
| "grad_norm": 0.9987345212261577, | |
| "learning_rate": 8.550186308451906e-06, | |
| "loss": 0.2117, | |
| "step": 1599 | |
| }, | |
| { | |
| "epoch": 0.24883359253499224, | |
| "grad_norm": 1.1743809541983374, | |
| "learning_rate": 8.548465669555564e-06, | |
| "loss": 0.1547, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.24883359253499224, | |
| "eval_loss": 0.2037108987569809, | |
| "eval_runtime": 9.4238, | |
| "eval_samples_per_second": 2.759, | |
| "eval_steps_per_second": 0.743, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2489891135303266, | |
| "grad_norm": 1.0755504197866683, | |
| "learning_rate": 8.546744183590979e-06, | |
| "loss": 0.1448, | |
| "step": 1601 | |
| }, | |
| { | |
| "epoch": 0.24914463452566096, | |
| "grad_norm": 1.293645268455303, | |
| "learning_rate": 8.545021850969097e-06, | |
| "loss": 0.2045, | |
| "step": 1602 | |
| }, | |
| { | |
| "epoch": 0.24930015552099533, | |
| "grad_norm": 1.644496498579518, | |
| "learning_rate": 8.543298672101063e-06, | |
| "loss": 0.1745, | |
| "step": 1603 | |
| }, | |
| { | |
| "epoch": 0.24945567651632972, | |
| "grad_norm": 1.8853737644375217, | |
| "learning_rate": 8.541574647398224e-06, | |
| "loss": 0.1785, | |
| "step": 1604 | |
| }, | |
| { | |
| "epoch": 0.24961119751166408, | |
| "grad_norm": 0.8348472318309339, | |
| "learning_rate": 8.539849777272125e-06, | |
| "loss": 0.1976, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.24976671850699844, | |
| "grad_norm": 1.6007239985640846, | |
| "learning_rate": 8.538124062134521e-06, | |
| "loss": 0.1766, | |
| "step": 1606 | |
| }, | |
| { | |
| "epoch": 0.2499222395023328, | |
| "grad_norm": 2.1944156006209194, | |
| "learning_rate": 8.53639750239736e-06, | |
| "loss": 0.2715, | |
| "step": 1607 | |
| }, | |
| { | |
| "epoch": 0.25007776049766717, | |
| "grad_norm": 1.105749977206952, | |
| "learning_rate": 8.534670098472802e-06, | |
| "loss": 0.1564, | |
| "step": 1608 | |
| }, | |
| { | |
| "epoch": 0.25023328149300156, | |
| "grad_norm": 0.8083237797522677, | |
| "learning_rate": 8.532941850773195e-06, | |
| "loss": 0.1668, | |
| "step": 1609 | |
| }, | |
| { | |
| "epoch": 0.25038880248833595, | |
| "grad_norm": 1.172486307255137, | |
| "learning_rate": 8.531212759711103e-06, | |
| "loss": 0.2302, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.2505443234836703, | |
| "grad_norm": 1.268322758173216, | |
| "learning_rate": 8.52948282569928e-06, | |
| "loss": 0.1789, | |
| "step": 1611 | |
| }, | |
| { | |
| "epoch": 0.2506998444790047, | |
| "grad_norm": 0.9091823227567202, | |
| "learning_rate": 8.527752049150685e-06, | |
| "loss": 0.0784, | |
| "step": 1612 | |
| }, | |
| { | |
| "epoch": 0.250855365474339, | |
| "grad_norm": 1.3902158634610304, | |
| "learning_rate": 8.52602043047848e-06, | |
| "loss": 0.1681, | |
| "step": 1613 | |
| }, | |
| { | |
| "epoch": 0.2510108864696734, | |
| "grad_norm": 1.4942303280111533, | |
| "learning_rate": 8.524287970096026e-06, | |
| "loss": 0.217, | |
| "step": 1614 | |
| }, | |
| { | |
| "epoch": 0.2511664074650078, | |
| "grad_norm": 0.8627158934582907, | |
| "learning_rate": 8.522554668416887e-06, | |
| "loss": 0.2181, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.2513219284603421, | |
| "grad_norm": 1.0390290867530942, | |
| "learning_rate": 8.520820525854824e-06, | |
| "loss": 0.1764, | |
| "step": 1616 | |
| }, | |
| { | |
| "epoch": 0.2514774494556765, | |
| "grad_norm": 1.4108685539031005, | |
| "learning_rate": 8.519085542823802e-06, | |
| "loss": 0.2164, | |
| "step": 1617 | |
| }, | |
| { | |
| "epoch": 0.2516329704510109, | |
| "grad_norm": 1.371077345528009, | |
| "learning_rate": 8.517349719737984e-06, | |
| "loss": 0.1561, | |
| "step": 1618 | |
| }, | |
| { | |
| "epoch": 0.25178849144634524, | |
| "grad_norm": 1.2763042021188964, | |
| "learning_rate": 8.51561305701174e-06, | |
| "loss": 0.1526, | |
| "step": 1619 | |
| }, | |
| { | |
| "epoch": 0.25194401244167963, | |
| "grad_norm": 1.077695325158449, | |
| "learning_rate": 8.51387555505963e-06, | |
| "loss": 0.1876, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.252099533437014, | |
| "grad_norm": 1.3164226998591637, | |
| "learning_rate": 8.512137214296422e-06, | |
| "loss": 0.2131, | |
| "step": 1621 | |
| }, | |
| { | |
| "epoch": 0.25225505443234836, | |
| "grad_norm": 1.7522341912294899, | |
| "learning_rate": 8.510398035137083e-06, | |
| "loss": 0.133, | |
| "step": 1622 | |
| }, | |
| { | |
| "epoch": 0.25241057542768275, | |
| "grad_norm": 4.615604310333582, | |
| "learning_rate": 8.50865801799678e-06, | |
| "loss": 0.1955, | |
| "step": 1623 | |
| }, | |
| { | |
| "epoch": 0.2525660964230171, | |
| "grad_norm": 2.3506074867763536, | |
| "learning_rate": 8.506917163290877e-06, | |
| "loss": 0.3199, | |
| "step": 1624 | |
| }, | |
| { | |
| "epoch": 0.2527216174183515, | |
| "grad_norm": 0.7483739763165084, | |
| "learning_rate": 8.505175471434943e-06, | |
| "loss": 0.2213, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.25287713841368586, | |
| "grad_norm": 2.0095572169442333, | |
| "learning_rate": 8.50343294284474e-06, | |
| "loss": 0.2356, | |
| "step": 1626 | |
| }, | |
| { | |
| "epoch": 0.2530326594090202, | |
| "grad_norm": 0.9367298995041891, | |
| "learning_rate": 8.501689577936238e-06, | |
| "loss": 0.1567, | |
| "step": 1627 | |
| }, | |
| { | |
| "epoch": 0.2531881804043546, | |
| "grad_norm": 1.2746896918156698, | |
| "learning_rate": 8.499945377125602e-06, | |
| "loss": 0.1465, | |
| "step": 1628 | |
| }, | |
| { | |
| "epoch": 0.253343701399689, | |
| "grad_norm": 0.7971645300115215, | |
| "learning_rate": 8.498200340829195e-06, | |
| "loss": 0.1419, | |
| "step": 1629 | |
| }, | |
| { | |
| "epoch": 0.2534992223950233, | |
| "grad_norm": 1.7131432725110083, | |
| "learning_rate": 8.496454469463583e-06, | |
| "loss": 0.1437, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.2536547433903577, | |
| "grad_norm": 1.3945635968284718, | |
| "learning_rate": 8.494707763445526e-06, | |
| "loss": 0.2116, | |
| "step": 1631 | |
| }, | |
| { | |
| "epoch": 0.25381026438569204, | |
| "grad_norm": 1.130700720901677, | |
| "learning_rate": 8.492960223191994e-06, | |
| "loss": 0.1783, | |
| "step": 1632 | |
| }, | |
| { | |
| "epoch": 0.25396578538102643, | |
| "grad_norm": 0.9910207975897489, | |
| "learning_rate": 8.491211849120146e-06, | |
| "loss": 0.1275, | |
| "step": 1633 | |
| }, | |
| { | |
| "epoch": 0.2541213063763608, | |
| "grad_norm": 1.6819299813099522, | |
| "learning_rate": 8.48946264164734e-06, | |
| "loss": 0.2092, | |
| "step": 1634 | |
| }, | |
| { | |
| "epoch": 0.25427682737169516, | |
| "grad_norm": 0.8070165110990363, | |
| "learning_rate": 8.487712601191143e-06, | |
| "loss": 0.2104, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.25443234836702955, | |
| "grad_norm": 0.7832453865024183, | |
| "learning_rate": 8.485961728169308e-06, | |
| "loss": 0.1491, | |
| "step": 1636 | |
| }, | |
| { | |
| "epoch": 0.25458786936236394, | |
| "grad_norm": 1.570863259158348, | |
| "learning_rate": 8.484210022999795e-06, | |
| "loss": 0.1337, | |
| "step": 1637 | |
| }, | |
| { | |
| "epoch": 0.2547433903576983, | |
| "grad_norm": 2.094162070797788, | |
| "learning_rate": 8.482457486100761e-06, | |
| "loss": 0.1732, | |
| "step": 1638 | |
| }, | |
| { | |
| "epoch": 0.25489891135303266, | |
| "grad_norm": 1.3293274255208316, | |
| "learning_rate": 8.48070411789056e-06, | |
| "loss": 0.1587, | |
| "step": 1639 | |
| }, | |
| { | |
| "epoch": 0.25505443234836706, | |
| "grad_norm": 0.9704592907631973, | |
| "learning_rate": 8.478949918787746e-06, | |
| "loss": 0.167, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.2552099533437014, | |
| "grad_norm": 2.2927511192581935, | |
| "learning_rate": 8.47719488921107e-06, | |
| "loss": 0.1731, | |
| "step": 1641 | |
| }, | |
| { | |
| "epoch": 0.2553654743390358, | |
| "grad_norm": 1.2113969832398468, | |
| "learning_rate": 8.475439029579487e-06, | |
| "loss": 0.1636, | |
| "step": 1642 | |
| }, | |
| { | |
| "epoch": 0.2555209953343701, | |
| "grad_norm": 1.2700840486141427, | |
| "learning_rate": 8.473682340312136e-06, | |
| "loss": 0.2251, | |
| "step": 1643 | |
| }, | |
| { | |
| "epoch": 0.2556765163297045, | |
| "grad_norm": 0.8692629936958125, | |
| "learning_rate": 8.47192482182837e-06, | |
| "loss": 0.1944, | |
| "step": 1644 | |
| }, | |
| { | |
| "epoch": 0.2558320373250389, | |
| "grad_norm": 1.0546031026829716, | |
| "learning_rate": 8.470166474547731e-06, | |
| "loss": 0.1963, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.25598755832037323, | |
| "grad_norm": 1.8035421603246344, | |
| "learning_rate": 8.468407298889962e-06, | |
| "loss": 0.1678, | |
| "step": 1646 | |
| }, | |
| { | |
| "epoch": 0.2561430793157076, | |
| "grad_norm": 0.8593243264529278, | |
| "learning_rate": 8.466647295275002e-06, | |
| "loss": 0.1272, | |
| "step": 1647 | |
| }, | |
| { | |
| "epoch": 0.256298600311042, | |
| "grad_norm": 1.5174530612382813, | |
| "learning_rate": 8.464886464122988e-06, | |
| "loss": 0.2685, | |
| "step": 1648 | |
| }, | |
| { | |
| "epoch": 0.25645412130637635, | |
| "grad_norm": 1.5250972376290421, | |
| "learning_rate": 8.463124805854257e-06, | |
| "loss": 0.1674, | |
| "step": 1649 | |
| }, | |
| { | |
| "epoch": 0.25660964230171074, | |
| "grad_norm": 1.1663575092987046, | |
| "learning_rate": 8.461362320889338e-06, | |
| "loss": 0.1577, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.2567651632970451, | |
| "grad_norm": 1.474673013106268, | |
| "learning_rate": 8.459599009648964e-06, | |
| "loss": 0.1769, | |
| "step": 1651 | |
| }, | |
| { | |
| "epoch": 0.25692068429237946, | |
| "grad_norm": 1.1672631965692757, | |
| "learning_rate": 8.45783487255406e-06, | |
| "loss": 0.2249, | |
| "step": 1652 | |
| }, | |
| { | |
| "epoch": 0.25707620528771385, | |
| "grad_norm": 1.1953181883355133, | |
| "learning_rate": 8.456069910025751e-06, | |
| "loss": 0.2018, | |
| "step": 1653 | |
| }, | |
| { | |
| "epoch": 0.2572317262830482, | |
| "grad_norm": 1.1089828464331577, | |
| "learning_rate": 8.454304122485358e-06, | |
| "loss": 0.1419, | |
| "step": 1654 | |
| }, | |
| { | |
| "epoch": 0.2573872472783826, | |
| "grad_norm": 1.2716710060074294, | |
| "learning_rate": 8.452537510354397e-06, | |
| "loss": 0.1966, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.25754276827371697, | |
| "grad_norm": 1.952579937166782, | |
| "learning_rate": 8.450770074054586e-06, | |
| "loss": 0.2699, | |
| "step": 1656 | |
| }, | |
| { | |
| "epoch": 0.2576982892690513, | |
| "grad_norm": 0.7319931402583304, | |
| "learning_rate": 8.449001814007838e-06, | |
| "loss": 0.1401, | |
| "step": 1657 | |
| }, | |
| { | |
| "epoch": 0.2578538102643857, | |
| "grad_norm": 1.627013708512288, | |
| "learning_rate": 8.447232730636257e-06, | |
| "loss": 0.2617, | |
| "step": 1658 | |
| }, | |
| { | |
| "epoch": 0.2580093312597201, | |
| "grad_norm": 1.0492953509552387, | |
| "learning_rate": 8.44546282436215e-06, | |
| "loss": 0.1922, | |
| "step": 1659 | |
| }, | |
| { | |
| "epoch": 0.2581648522550544, | |
| "grad_norm": 0.9166534435780459, | |
| "learning_rate": 8.443692095608019e-06, | |
| "loss": 0.2099, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.2583203732503888, | |
| "grad_norm": 1.1458120209760718, | |
| "learning_rate": 8.441920544796558e-06, | |
| "loss": 0.1724, | |
| "step": 1661 | |
| }, | |
| { | |
| "epoch": 0.25847589424572315, | |
| "grad_norm": 1.071395804244241, | |
| "learning_rate": 8.440148172350666e-06, | |
| "loss": 0.1728, | |
| "step": 1662 | |
| }, | |
| { | |
| "epoch": 0.25863141524105754, | |
| "grad_norm": 1.2413704662622753, | |
| "learning_rate": 8.43837497869343e-06, | |
| "loss": 0.2031, | |
| "step": 1663 | |
| }, | |
| { | |
| "epoch": 0.25878693623639193, | |
| "grad_norm": 1.1068242296182698, | |
| "learning_rate": 8.436600964248138e-06, | |
| "loss": 0.1951, | |
| "step": 1664 | |
| }, | |
| { | |
| "epoch": 0.25894245723172626, | |
| "grad_norm": 0.8699381605693407, | |
| "learning_rate": 8.43482612943827e-06, | |
| "loss": 0.1764, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.25909797822706065, | |
| "grad_norm": 1.2048052321069596, | |
| "learning_rate": 8.433050474687505e-06, | |
| "loss": 0.2311, | |
| "step": 1666 | |
| }, | |
| { | |
| "epoch": 0.25925349922239505, | |
| "grad_norm": 1.315498269766704, | |
| "learning_rate": 8.431274000419716e-06, | |
| "loss": 0.2412, | |
| "step": 1667 | |
| }, | |
| { | |
| "epoch": 0.2594090202177294, | |
| "grad_norm": 0.6128855898398873, | |
| "learning_rate": 8.42949670705897e-06, | |
| "loss": 0.1068, | |
| "step": 1668 | |
| }, | |
| { | |
| "epoch": 0.25956454121306377, | |
| "grad_norm": 0.9552988172621262, | |
| "learning_rate": 8.427718595029537e-06, | |
| "loss": 0.1458, | |
| "step": 1669 | |
| }, | |
| { | |
| "epoch": 0.25972006220839816, | |
| "grad_norm": 1.411892967173632, | |
| "learning_rate": 8.425939664755874e-06, | |
| "loss": 0.2327, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.2598755832037325, | |
| "grad_norm": 1.066036249369497, | |
| "learning_rate": 8.424159916662636e-06, | |
| "loss": 0.1845, | |
| "step": 1671 | |
| }, | |
| { | |
| "epoch": 0.2600311041990669, | |
| "grad_norm": 1.0078601069914832, | |
| "learning_rate": 8.422379351174673e-06, | |
| "loss": 0.129, | |
| "step": 1672 | |
| }, | |
| { | |
| "epoch": 0.2601866251944012, | |
| "grad_norm": 0.9627418389301211, | |
| "learning_rate": 8.420597968717033e-06, | |
| "loss": 0.2346, | |
| "step": 1673 | |
| }, | |
| { | |
| "epoch": 0.2603421461897356, | |
| "grad_norm": 1.0190302705099263, | |
| "learning_rate": 8.418815769714956e-06, | |
| "loss": 0.1291, | |
| "step": 1674 | |
| }, | |
| { | |
| "epoch": 0.26049766718507, | |
| "grad_norm": 0.8536213147159897, | |
| "learning_rate": 8.417032754593879e-06, | |
| "loss": 0.1759, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.26065318818040434, | |
| "grad_norm": 0.9477728405361937, | |
| "learning_rate": 8.415248923779431e-06, | |
| "loss": 0.1708, | |
| "step": 1676 | |
| }, | |
| { | |
| "epoch": 0.26080870917573873, | |
| "grad_norm": 1.0305276755799404, | |
| "learning_rate": 8.413464277697436e-06, | |
| "loss": 0.3205, | |
| "step": 1677 | |
| }, | |
| { | |
| "epoch": 0.2609642301710731, | |
| "grad_norm": 1.324545893865915, | |
| "learning_rate": 8.411678816773916e-06, | |
| "loss": 0.2936, | |
| "step": 1678 | |
| }, | |
| { | |
| "epoch": 0.26111975116640745, | |
| "grad_norm": 1.3383489149505705, | |
| "learning_rate": 8.409892541435085e-06, | |
| "loss": 0.2406, | |
| "step": 1679 | |
| }, | |
| { | |
| "epoch": 0.26127527216174184, | |
| "grad_norm": 0.9651270377598534, | |
| "learning_rate": 8.408105452107353e-06, | |
| "loss": 0.1511, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.2614307931570762, | |
| "grad_norm": 0.6783781205233194, | |
| "learning_rate": 8.40631754921732e-06, | |
| "loss": 0.1567, | |
| "step": 1681 | |
| }, | |
| { | |
| "epoch": 0.26158631415241057, | |
| "grad_norm": 1.198981860000486, | |
| "learning_rate": 8.404528833191786e-06, | |
| "loss": 0.2125, | |
| "step": 1682 | |
| }, | |
| { | |
| "epoch": 0.26174183514774496, | |
| "grad_norm": 0.7449630196962097, | |
| "learning_rate": 8.402739304457743e-06, | |
| "loss": 0.179, | |
| "step": 1683 | |
| }, | |
| { | |
| "epoch": 0.2618973561430793, | |
| "grad_norm": 1.3499907032342544, | |
| "learning_rate": 8.400948963442373e-06, | |
| "loss": 0.1492, | |
| "step": 1684 | |
| }, | |
| { | |
| "epoch": 0.2620528771384137, | |
| "grad_norm": 1.2324653573954145, | |
| "learning_rate": 8.39915781057306e-06, | |
| "loss": 0.1442, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.2622083981337481, | |
| "grad_norm": 1.5240761421711815, | |
| "learning_rate": 8.397365846277371e-06, | |
| "loss": 0.3141, | |
| "step": 1686 | |
| }, | |
| { | |
| "epoch": 0.2623639191290824, | |
| "grad_norm": 0.9242701212113029, | |
| "learning_rate": 8.39557307098308e-06, | |
| "loss": 0.175, | |
| "step": 1687 | |
| }, | |
| { | |
| "epoch": 0.2625194401244168, | |
| "grad_norm": 1.0215723172112428, | |
| "learning_rate": 8.393779485118142e-06, | |
| "loss": 0.1572, | |
| "step": 1688 | |
| }, | |
| { | |
| "epoch": 0.2626749611197512, | |
| "grad_norm": 1.4272441271545482, | |
| "learning_rate": 8.391985089110715e-06, | |
| "loss": 0.2086, | |
| "step": 1689 | |
| }, | |
| { | |
| "epoch": 0.26283048211508553, | |
| "grad_norm": 0.98493015131112, | |
| "learning_rate": 8.390189883389143e-06, | |
| "loss": 0.1758, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.2629860031104199, | |
| "grad_norm": 1.412962012368002, | |
| "learning_rate": 8.388393868381967e-06, | |
| "loss": 0.137, | |
| "step": 1691 | |
| }, | |
| { | |
| "epoch": 0.26314152410575425, | |
| "grad_norm": 0.8439849086089997, | |
| "learning_rate": 8.386597044517923e-06, | |
| "loss": 0.1794, | |
| "step": 1692 | |
| }, | |
| { | |
| "epoch": 0.26329704510108864, | |
| "grad_norm": 0.9027272166442722, | |
| "learning_rate": 8.384799412225936e-06, | |
| "loss": 0.1827, | |
| "step": 1693 | |
| }, | |
| { | |
| "epoch": 0.26345256609642304, | |
| "grad_norm": 1.0861962602589315, | |
| "learning_rate": 8.383000971935129e-06, | |
| "loss": 0.1736, | |
| "step": 1694 | |
| }, | |
| { | |
| "epoch": 0.26360808709175737, | |
| "grad_norm": 1.4467531133479765, | |
| "learning_rate": 8.38120172407481e-06, | |
| "loss": 0.2872, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.26376360808709176, | |
| "grad_norm": 0.7243899321635017, | |
| "learning_rate": 8.379401669074489e-06, | |
| "loss": 0.1568, | |
| "step": 1696 | |
| }, | |
| { | |
| "epoch": 0.26391912908242615, | |
| "grad_norm": 0.8947544881090379, | |
| "learning_rate": 8.37760080736386e-06, | |
| "loss": 0.1516, | |
| "step": 1697 | |
| }, | |
| { | |
| "epoch": 0.2640746500777605, | |
| "grad_norm": 1.1759725115418023, | |
| "learning_rate": 8.375799139372818e-06, | |
| "loss": 0.1384, | |
| "step": 1698 | |
| }, | |
| { | |
| "epoch": 0.2642301710730949, | |
| "grad_norm": 0.8519187195565056, | |
| "learning_rate": 8.373996665531443e-06, | |
| "loss": 0.2027, | |
| "step": 1699 | |
| }, | |
| { | |
| "epoch": 0.2643856920684292, | |
| "grad_norm": 1.4756118825078526, | |
| "learning_rate": 8.37219338627001e-06, | |
| "loss": 0.2323, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.2643856920684292, | |
| "eval_loss": 0.19943906366825104, | |
| "eval_runtime": 9.4244, | |
| "eval_samples_per_second": 2.759, | |
| "eval_steps_per_second": 0.743, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.2645412130637636, | |
| "grad_norm": 1.1415194682343677, | |
| "learning_rate": 8.370389302018993e-06, | |
| "loss": 0.1627, | |
| "step": 1701 | |
| }, | |
| { | |
| "epoch": 0.264696734059098, | |
| "grad_norm": 0.9887030475180681, | |
| "learning_rate": 8.368584413209044e-06, | |
| "loss": 0.1913, | |
| "step": 1702 | |
| }, | |
| { | |
| "epoch": 0.26485225505443233, | |
| "grad_norm": 1.579433234849522, | |
| "learning_rate": 8.366778720271022e-06, | |
| "loss": 0.2494, | |
| "step": 1703 | |
| }, | |
| { | |
| "epoch": 0.2650077760497667, | |
| "grad_norm": 1.1581416599961576, | |
| "learning_rate": 8.364972223635967e-06, | |
| "loss": 0.1984, | |
| "step": 1704 | |
| }, | |
| { | |
| "epoch": 0.2651632970451011, | |
| "grad_norm": 1.4481396852315895, | |
| "learning_rate": 8.363164923735116e-06, | |
| "loss": 0.1772, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.26531881804043544, | |
| "grad_norm": 2.2248131911902918, | |
| "learning_rate": 8.361356820999897e-06, | |
| "loss": 0.2035, | |
| "step": 1706 | |
| }, | |
| { | |
| "epoch": 0.26547433903576984, | |
| "grad_norm": 1.296906679431483, | |
| "learning_rate": 8.359547915861927e-06, | |
| "loss": 0.1906, | |
| "step": 1707 | |
| }, | |
| { | |
| "epoch": 0.2656298600311042, | |
| "grad_norm": 1.4510599043288837, | |
| "learning_rate": 8.357738208753022e-06, | |
| "loss": 0.215, | |
| "step": 1708 | |
| }, | |
| { | |
| "epoch": 0.26578538102643856, | |
| "grad_norm": 1.3812180344156422, | |
| "learning_rate": 8.35592770010518e-06, | |
| "loss": 0.2366, | |
| "step": 1709 | |
| }, | |
| { | |
| "epoch": 0.26594090202177295, | |
| "grad_norm": 0.7624028953564842, | |
| "learning_rate": 8.354116390350594e-06, | |
| "loss": 0.1337, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.2660964230171073, | |
| "grad_norm": 1.0938571817018024, | |
| "learning_rate": 8.352304279921655e-06, | |
| "loss": 0.1739, | |
| "step": 1711 | |
| }, | |
| { | |
| "epoch": 0.2662519440124417, | |
| "grad_norm": 1.3112579396126312, | |
| "learning_rate": 8.350491369250933e-06, | |
| "loss": 0.2866, | |
| "step": 1712 | |
| }, | |
| { | |
| "epoch": 0.26640746500777607, | |
| "grad_norm": 1.4175431035953647, | |
| "learning_rate": 8.348677658771197e-06, | |
| "loss": 0.1308, | |
| "step": 1713 | |
| }, | |
| { | |
| "epoch": 0.2665629860031104, | |
| "grad_norm": 2.1014926949253327, | |
| "learning_rate": 8.346863148915402e-06, | |
| "loss": 0.1549, | |
| "step": 1714 | |
| }, | |
| { | |
| "epoch": 0.2667185069984448, | |
| "grad_norm": 1.132911689146343, | |
| "learning_rate": 8.345047840116704e-06, | |
| "loss": 0.2182, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.2668740279937792, | |
| "grad_norm": 0.6535130581015213, | |
| "learning_rate": 8.343231732808435e-06, | |
| "loss": 0.1748, | |
| "step": 1716 | |
| }, | |
| { | |
| "epoch": 0.2670295489891135, | |
| "grad_norm": 0.9808104365320156, | |
| "learning_rate": 8.34141482742413e-06, | |
| "loss": 0.1512, | |
| "step": 1717 | |
| }, | |
| { | |
| "epoch": 0.2671850699844479, | |
| "grad_norm": 1.2630125658621263, | |
| "learning_rate": 8.339597124397509e-06, | |
| "loss": 0.1698, | |
| "step": 1718 | |
| }, | |
| { | |
| "epoch": 0.26734059097978224, | |
| "grad_norm": 1.279259047820582, | |
| "learning_rate": 8.33777862416248e-06, | |
| "loss": 0.1769, | |
| "step": 1719 | |
| }, | |
| { | |
| "epoch": 0.26749611197511663, | |
| "grad_norm": 1.1242790219258612, | |
| "learning_rate": 8.335959327153148e-06, | |
| "loss": 0.2224, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.267651632970451, | |
| "grad_norm": 1.0035835372337707, | |
| "learning_rate": 8.334139233803801e-06, | |
| "loss": 0.1697, | |
| "step": 1721 | |
| }, | |
| { | |
| "epoch": 0.26780715396578536, | |
| "grad_norm": 1.9776796243145607, | |
| "learning_rate": 8.332318344548926e-06, | |
| "loss": 0.2033, | |
| "step": 1722 | |
| }, | |
| { | |
| "epoch": 0.26796267496111975, | |
| "grad_norm": 1.1521258085682824, | |
| "learning_rate": 8.330496659823189e-06, | |
| "loss": 0.1729, | |
| "step": 1723 | |
| }, | |
| { | |
| "epoch": 0.26811819595645414, | |
| "grad_norm": 1.0253842887133877, | |
| "learning_rate": 8.328674180061453e-06, | |
| "loss": 0.2185, | |
| "step": 1724 | |
| }, | |
| { | |
| "epoch": 0.2682737169517885, | |
| "grad_norm": 0.871091469827773, | |
| "learning_rate": 8.326850905698774e-06, | |
| "loss": 0.1359, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.26842923794712287, | |
| "grad_norm": 1.7009594103702224, | |
| "learning_rate": 8.325026837170386e-06, | |
| "loss": 0.2348, | |
| "step": 1726 | |
| }, | |
| { | |
| "epoch": 0.26858475894245726, | |
| "grad_norm": 1.367926551681483, | |
| "learning_rate": 8.323201974911723e-06, | |
| "loss": 0.1842, | |
| "step": 1727 | |
| }, | |
| { | |
| "epoch": 0.2687402799377916, | |
| "grad_norm": 1.148927442910907, | |
| "learning_rate": 8.321376319358407e-06, | |
| "loss": 0.1096, | |
| "step": 1728 | |
| }, | |
| { | |
| "epoch": 0.268895800933126, | |
| "grad_norm": 1.3075658909675654, | |
| "learning_rate": 8.319549870946244e-06, | |
| "loss": 0.1543, | |
| "step": 1729 | |
| }, | |
| { | |
| "epoch": 0.2690513219284603, | |
| "grad_norm": 0.8291270774545968, | |
| "learning_rate": 8.317722630111233e-06, | |
| "loss": 0.1093, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.2692068429237947, | |
| "grad_norm": 2.2622896049282706, | |
| "learning_rate": 8.315894597289565e-06, | |
| "loss": 0.2042, | |
| "step": 1731 | |
| }, | |
| { | |
| "epoch": 0.2693623639191291, | |
| "grad_norm": 0.7046996138148661, | |
| "learning_rate": 8.314065772917612e-06, | |
| "loss": 0.1303, | |
| "step": 1732 | |
| }, | |
| { | |
| "epoch": 0.26951788491446343, | |
| "grad_norm": 0.9333196367153322, | |
| "learning_rate": 8.312236157431946e-06, | |
| "loss": 0.169, | |
| "step": 1733 | |
| }, | |
| { | |
| "epoch": 0.2696734059097978, | |
| "grad_norm": 1.1869718333049797, | |
| "learning_rate": 8.310405751269318e-06, | |
| "loss": 0.2494, | |
| "step": 1734 | |
| }, | |
| { | |
| "epoch": 0.2698289269051322, | |
| "grad_norm": 0.9186255111712875, | |
| "learning_rate": 8.30857455486667e-06, | |
| "loss": 0.1449, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.26998444790046655, | |
| "grad_norm": 1.7158457711756847, | |
| "learning_rate": 8.306742568661137e-06, | |
| "loss": 0.2472, | |
| "step": 1736 | |
| }, | |
| { | |
| "epoch": 0.27013996889580094, | |
| "grad_norm": 0.9091734067747751, | |
| "learning_rate": 8.304909793090039e-06, | |
| "loss": 0.1517, | |
| "step": 1737 | |
| }, | |
| { | |
| "epoch": 0.2702954898911353, | |
| "grad_norm": 0.9472038650945157, | |
| "learning_rate": 8.303076228590885e-06, | |
| "loss": 0.1293, | |
| "step": 1738 | |
| }, | |
| { | |
| "epoch": 0.27045101088646967, | |
| "grad_norm": 1.359961162735269, | |
| "learning_rate": 8.301241875601371e-06, | |
| "loss": 0.1687, | |
| "step": 1739 | |
| }, | |
| { | |
| "epoch": 0.27060653188180406, | |
| "grad_norm": 1.3706412614563859, | |
| "learning_rate": 8.299406734559385e-06, | |
| "loss": 0.1151, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.2707620528771384, | |
| "grad_norm": 1.4633698039358347, | |
| "learning_rate": 8.297570805903e-06, | |
| "loss": 0.1834, | |
| "step": 1741 | |
| }, | |
| { | |
| "epoch": 0.2709175738724728, | |
| "grad_norm": 1.2706325476878815, | |
| "learning_rate": 8.295734090070477e-06, | |
| "loss": 0.1889, | |
| "step": 1742 | |
| }, | |
| { | |
| "epoch": 0.2710730948678072, | |
| "grad_norm": 1.40063937560449, | |
| "learning_rate": 8.293896587500266e-06, | |
| "loss": 0.1644, | |
| "step": 1743 | |
| }, | |
| { | |
| "epoch": 0.2712286158631415, | |
| "grad_norm": 1.756399176307069, | |
| "learning_rate": 8.292058298631003e-06, | |
| "loss": 0.2121, | |
| "step": 1744 | |
| }, | |
| { | |
| "epoch": 0.2713841368584759, | |
| "grad_norm": 1.3118943702099763, | |
| "learning_rate": 8.290219223901517e-06, | |
| "loss": 0.1657, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.2715396578538103, | |
| "grad_norm": 1.221070247479925, | |
| "learning_rate": 8.288379363750818e-06, | |
| "loss": 0.1799, | |
| "step": 1746 | |
| }, | |
| { | |
| "epoch": 0.2716951788491446, | |
| "grad_norm": 1.30049039400021, | |
| "learning_rate": 8.286538718618107e-06, | |
| "loss": 0.1659, | |
| "step": 1747 | |
| }, | |
| { | |
| "epoch": 0.271850699844479, | |
| "grad_norm": 0.8218052779463395, | |
| "learning_rate": 8.28469728894277e-06, | |
| "loss": 0.1417, | |
| "step": 1748 | |
| }, | |
| { | |
| "epoch": 0.27200622083981335, | |
| "grad_norm": 1.318881683721639, | |
| "learning_rate": 8.282855075164386e-06, | |
| "loss": 0.2086, | |
| "step": 1749 | |
| }, | |
| { | |
| "epoch": 0.27216174183514774, | |
| "grad_norm": 1.168225071909074, | |
| "learning_rate": 8.281012077722712e-06, | |
| "loss": 0.1481, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.27231726283048213, | |
| "grad_norm": 1.387527553498744, | |
| "learning_rate": 8.2791682970577e-06, | |
| "loss": 0.224, | |
| "step": 1751 | |
| }, | |
| { | |
| "epoch": 0.27247278382581647, | |
| "grad_norm": 0.9455523699522945, | |
| "learning_rate": 8.277323733609488e-06, | |
| "loss": 0.1689, | |
| "step": 1752 | |
| }, | |
| { | |
| "epoch": 0.27262830482115086, | |
| "grad_norm": 1.301993231412919, | |
| "learning_rate": 8.275478387818394e-06, | |
| "loss": 0.17, | |
| "step": 1753 | |
| }, | |
| { | |
| "epoch": 0.27278382581648525, | |
| "grad_norm": 1.1753804485169133, | |
| "learning_rate": 8.273632260124934e-06, | |
| "loss": 0.2231, | |
| "step": 1754 | |
| }, | |
| { | |
| "epoch": 0.2729393468118196, | |
| "grad_norm": 1.080698611275427, | |
| "learning_rate": 8.271785350969799e-06, | |
| "loss": 0.1796, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.273094867807154, | |
| "grad_norm": 1.290015540604507, | |
| "learning_rate": 8.269937660793875e-06, | |
| "loss": 0.1941, | |
| "step": 1756 | |
| }, | |
| { | |
| "epoch": 0.2732503888024883, | |
| "grad_norm": 1.070538218943679, | |
| "learning_rate": 8.268089190038228e-06, | |
| "loss": 0.1909, | |
| "step": 1757 | |
| }, | |
| { | |
| "epoch": 0.2734059097978227, | |
| "grad_norm": 1.2252798699112468, | |
| "learning_rate": 8.266239939144118e-06, | |
| "loss": 0.1569, | |
| "step": 1758 | |
| }, | |
| { | |
| "epoch": 0.2735614307931571, | |
| "grad_norm": 1.2346475130597931, | |
| "learning_rate": 8.264389908552987e-06, | |
| "loss": 0.1881, | |
| "step": 1759 | |
| }, | |
| { | |
| "epoch": 0.2737169517884914, | |
| "grad_norm": 0.8909529676508143, | |
| "learning_rate": 8.26253909870646e-06, | |
| "loss": 0.1635, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.2738724727838258, | |
| "grad_norm": 1.3801819199807877, | |
| "learning_rate": 8.260687510046352e-06, | |
| "loss": 0.1957, | |
| "step": 1761 | |
| }, | |
| { | |
| "epoch": 0.2740279937791602, | |
| "grad_norm": 0.9098604615543268, | |
| "learning_rate": 8.258835143014663e-06, | |
| "loss": 0.1556, | |
| "step": 1762 | |
| }, | |
| { | |
| "epoch": 0.27418351477449454, | |
| "grad_norm": 1.479953181946323, | |
| "learning_rate": 8.25698199805358e-06, | |
| "loss": 0.1673, | |
| "step": 1763 | |
| }, | |
| { | |
| "epoch": 0.27433903576982893, | |
| "grad_norm": 1.0391961011580078, | |
| "learning_rate": 8.255128075605475e-06, | |
| "loss": 0.1678, | |
| "step": 1764 | |
| }, | |
| { | |
| "epoch": 0.2744945567651633, | |
| "grad_norm": 1.1674213515628957, | |
| "learning_rate": 8.253273376112902e-06, | |
| "loss": 0.1575, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.27465007776049766, | |
| "grad_norm": 0.776827674790433, | |
| "learning_rate": 8.251417900018606e-06, | |
| "loss": 0.2087, | |
| "step": 1766 | |
| }, | |
| { | |
| "epoch": 0.27480559875583205, | |
| "grad_norm": 1.0737505366105782, | |
| "learning_rate": 8.249561647765515e-06, | |
| "loss": 0.202, | |
| "step": 1767 | |
| }, | |
| { | |
| "epoch": 0.2749611197511664, | |
| "grad_norm": 1.0278179070478979, | |
| "learning_rate": 8.247704619796743e-06, | |
| "loss": 0.2246, | |
| "step": 1768 | |
| }, | |
| { | |
| "epoch": 0.2751166407465008, | |
| "grad_norm": 1.3308057309065462, | |
| "learning_rate": 8.245846816555588e-06, | |
| "loss": 0.1781, | |
| "step": 1769 | |
| }, | |
| { | |
| "epoch": 0.27527216174183516, | |
| "grad_norm": 1.171891225152092, | |
| "learning_rate": 8.24398823848553e-06, | |
| "loss": 0.2838, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.2754276827371695, | |
| "grad_norm": 0.9162549134019579, | |
| "learning_rate": 8.242128886030243e-06, | |
| "loss": 0.153, | |
| "step": 1771 | |
| }, | |
| { | |
| "epoch": 0.2755832037325039, | |
| "grad_norm": 1.7094368421056838, | |
| "learning_rate": 8.240268759633576e-06, | |
| "loss": 0.1769, | |
| "step": 1772 | |
| }, | |
| { | |
| "epoch": 0.2757387247278383, | |
| "grad_norm": 1.088761334959302, | |
| "learning_rate": 8.23840785973957e-06, | |
| "loss": 0.1872, | |
| "step": 1773 | |
| }, | |
| { | |
| "epoch": 0.2758942457231726, | |
| "grad_norm": 1.0467068106039534, | |
| "learning_rate": 8.236546186792446e-06, | |
| "loss": 0.1941, | |
| "step": 1774 | |
| }, | |
| { | |
| "epoch": 0.276049766718507, | |
| "grad_norm": 1.469925204114295, | |
| "learning_rate": 8.234683741236612e-06, | |
| "loss": 0.2439, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.27620528771384134, | |
| "grad_norm": 1.286843667284798, | |
| "learning_rate": 8.23282052351666e-06, | |
| "loss": 0.1825, | |
| "step": 1776 | |
| }, | |
| { | |
| "epoch": 0.27636080870917573, | |
| "grad_norm": 1.5684518100667084, | |
| "learning_rate": 8.230956534077366e-06, | |
| "loss": 0.2088, | |
| "step": 1777 | |
| }, | |
| { | |
| "epoch": 0.2765163297045101, | |
| "grad_norm": 1.3158757876867857, | |
| "learning_rate": 8.22909177336369e-06, | |
| "loss": 0.1965, | |
| "step": 1778 | |
| }, | |
| { | |
| "epoch": 0.27667185069984446, | |
| "grad_norm": 0.7862541895693009, | |
| "learning_rate": 8.227226241820779e-06, | |
| "loss": 0.1388, | |
| "step": 1779 | |
| }, | |
| { | |
| "epoch": 0.27682737169517885, | |
| "grad_norm": 0.9288123715441376, | |
| "learning_rate": 8.225359939893954e-06, | |
| "loss": 0.243, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.27698289269051324, | |
| "grad_norm": 1.491008802108701, | |
| "learning_rate": 8.223492868028736e-06, | |
| "loss": 0.2521, | |
| "step": 1781 | |
| }, | |
| { | |
| "epoch": 0.2771384136858476, | |
| "grad_norm": 1.1202886550853388, | |
| "learning_rate": 8.221625026670814e-06, | |
| "loss": 0.1688, | |
| "step": 1782 | |
| }, | |
| { | |
| "epoch": 0.27729393468118196, | |
| "grad_norm": 1.1962734383960754, | |
| "learning_rate": 8.219756416266073e-06, | |
| "loss": 0.1294, | |
| "step": 1783 | |
| }, | |
| { | |
| "epoch": 0.27744945567651635, | |
| "grad_norm": 0.6740427840476089, | |
| "learning_rate": 8.217887037260575e-06, | |
| "loss": 0.1501, | |
| "step": 1784 | |
| }, | |
| { | |
| "epoch": 0.2776049766718507, | |
| "grad_norm": 1.8752578154372959, | |
| "learning_rate": 8.216016890100564e-06, | |
| "loss": 0.2524, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.2777604976671851, | |
| "grad_norm": 1.3276982202120067, | |
| "learning_rate": 8.214145975232474e-06, | |
| "loss": 0.1611, | |
| "step": 1786 | |
| }, | |
| { | |
| "epoch": 0.2779160186625194, | |
| "grad_norm": 0.9180331686214024, | |
| "learning_rate": 8.212274293102917e-06, | |
| "loss": 0.2069, | |
| "step": 1787 | |
| }, | |
| { | |
| "epoch": 0.2780715396578538, | |
| "grad_norm": 1.1644000920434754, | |
| "learning_rate": 8.210401844158688e-06, | |
| "loss": 0.2113, | |
| "step": 1788 | |
| }, | |
| { | |
| "epoch": 0.2782270606531882, | |
| "grad_norm": 1.6247680870264813, | |
| "learning_rate": 8.20852862884677e-06, | |
| "loss": 0.2167, | |
| "step": 1789 | |
| }, | |
| { | |
| "epoch": 0.27838258164852253, | |
| "grad_norm": 2.465352962757943, | |
| "learning_rate": 8.206654647614323e-06, | |
| "loss": 0.2917, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.2785381026438569, | |
| "grad_norm": 0.9826147561106185, | |
| "learning_rate": 8.204779900908694e-06, | |
| "loss": 0.1513, | |
| "step": 1791 | |
| }, | |
| { | |
| "epoch": 0.2786936236391913, | |
| "grad_norm": 1.1924827625995933, | |
| "learning_rate": 8.202904389177409e-06, | |
| "loss": 0.2069, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 0.27884914463452565, | |
| "grad_norm": 1.2507233550051102, | |
| "learning_rate": 8.201028112868182e-06, | |
| "loss": 0.1713, | |
| "step": 1793 | |
| }, | |
| { | |
| "epoch": 0.27900466562986004, | |
| "grad_norm": 1.056564405898492, | |
| "learning_rate": 8.199151072428903e-06, | |
| "loss": 0.152, | |
| "step": 1794 | |
| }, | |
| { | |
| "epoch": 0.27916018662519443, | |
| "grad_norm": 1.0582767182694146, | |
| "learning_rate": 8.19727326830765e-06, | |
| "loss": 0.1313, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.27931570762052876, | |
| "grad_norm": 0.9960646193169612, | |
| "learning_rate": 8.195394700952681e-06, | |
| "loss": 0.1663, | |
| "step": 1796 | |
| }, | |
| { | |
| "epoch": 0.27947122861586315, | |
| "grad_norm": 0.8580536351756373, | |
| "learning_rate": 8.193515370812433e-06, | |
| "loss": 0.1595, | |
| "step": 1797 | |
| }, | |
| { | |
| "epoch": 0.2796267496111975, | |
| "grad_norm": 1.0831765474333348, | |
| "learning_rate": 8.191635278335533e-06, | |
| "loss": 0.1646, | |
| "step": 1798 | |
| }, | |
| { | |
| "epoch": 0.2797822706065319, | |
| "grad_norm": 1.0292790758688968, | |
| "learning_rate": 8.189754423970783e-06, | |
| "loss": 0.1294, | |
| "step": 1799 | |
| }, | |
| { | |
| "epoch": 0.27993779160186627, | |
| "grad_norm": 0.6900206697382273, | |
| "learning_rate": 8.18787280816717e-06, | |
| "loss": 0.1962, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.27993779160186627, | |
| "eval_loss": 0.1942623406648636, | |
| "eval_runtime": 9.4402, | |
| "eval_samples_per_second": 2.754, | |
| "eval_steps_per_second": 0.742, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.2800933125972006, | |
| "grad_norm": 1.225359903420926, | |
| "learning_rate": 8.18599043137386e-06, | |
| "loss": 0.1613, | |
| "step": 1801 | |
| }, | |
| { | |
| "epoch": 0.280248833592535, | |
| "grad_norm": 1.6844618005986576, | |
| "learning_rate": 8.184107294040204e-06, | |
| "loss": 0.2253, | |
| "step": 1802 | |
| }, | |
| { | |
| "epoch": 0.2804043545878694, | |
| "grad_norm": 1.0175001190789204, | |
| "learning_rate": 8.182223396615733e-06, | |
| "loss": 0.1912, | |
| "step": 1803 | |
| }, | |
| { | |
| "epoch": 0.2805598755832037, | |
| "grad_norm": 1.050408004024866, | |
| "learning_rate": 8.18033873955016e-06, | |
| "loss": 0.2061, | |
| "step": 1804 | |
| }, | |
| { | |
| "epoch": 0.2807153965785381, | |
| "grad_norm": 1.4763046668239692, | |
| "learning_rate": 8.178453323293378e-06, | |
| "loss": 0.2781, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.28087091757387245, | |
| "grad_norm": 0.8219546561822222, | |
| "learning_rate": 8.176567148295462e-06, | |
| "loss": 0.2129, | |
| "step": 1806 | |
| }, | |
| { | |
| "epoch": 0.28102643856920684, | |
| "grad_norm": 0.9534941567105831, | |
| "learning_rate": 8.174680215006671e-06, | |
| "loss": 0.1653, | |
| "step": 1807 | |
| }, | |
| { | |
| "epoch": 0.28118195956454123, | |
| "grad_norm": 1.0531235123680651, | |
| "learning_rate": 8.172792523877439e-06, | |
| "loss": 0.1384, | |
| "step": 1808 | |
| }, | |
| { | |
| "epoch": 0.28133748055987556, | |
| "grad_norm": 1.3227244850484494, | |
| "learning_rate": 8.170904075358386e-06, | |
| "loss": 0.1878, | |
| "step": 1809 | |
| }, | |
| { | |
| "epoch": 0.28149300155520995, | |
| "grad_norm": 0.8199812475506189, | |
| "learning_rate": 8.169014869900308e-06, | |
| "loss": 0.1583, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.28164852255054434, | |
| "grad_norm": 1.1873233496157647, | |
| "learning_rate": 8.167124907954188e-06, | |
| "loss": 0.1689, | |
| "step": 1811 | |
| }, | |
| { | |
| "epoch": 0.2818040435458787, | |
| "grad_norm": 1.332689389458692, | |
| "learning_rate": 8.165234189971188e-06, | |
| "loss": 0.1509, | |
| "step": 1812 | |
| }, | |
| { | |
| "epoch": 0.28195956454121307, | |
| "grad_norm": 1.4288659016319332, | |
| "learning_rate": 8.163342716402645e-06, | |
| "loss": 0.1862, | |
| "step": 1813 | |
| }, | |
| { | |
| "epoch": 0.28211508553654746, | |
| "grad_norm": 1.314918590717926, | |
| "learning_rate": 8.16145048770008e-06, | |
| "loss": 0.226, | |
| "step": 1814 | |
| }, | |
| { | |
| "epoch": 0.2822706065318818, | |
| "grad_norm": 0.9155638179955898, | |
| "learning_rate": 8.159557504315197e-06, | |
| "loss": 0.1929, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.2824261275272162, | |
| "grad_norm": 1.0431139463881003, | |
| "learning_rate": 8.157663766699875e-06, | |
| "loss": 0.1443, | |
| "step": 1816 | |
| }, | |
| { | |
| "epoch": 0.2825816485225505, | |
| "grad_norm": 1.3294250069242237, | |
| "learning_rate": 8.155769275306178e-06, | |
| "loss": 0.193, | |
| "step": 1817 | |
| }, | |
| { | |
| "epoch": 0.2827371695178849, | |
| "grad_norm": 0.9943106694297035, | |
| "learning_rate": 8.153874030586343e-06, | |
| "loss": 0.1421, | |
| "step": 1818 | |
| }, | |
| { | |
| "epoch": 0.2828926905132193, | |
| "grad_norm": 1.165982265832558, | |
| "learning_rate": 8.151978032992798e-06, | |
| "loss": 0.1739, | |
| "step": 1819 | |
| }, | |
| { | |
| "epoch": 0.28304821150855364, | |
| "grad_norm": 0.7428727580266941, | |
| "learning_rate": 8.150081282978139e-06, | |
| "loss": 0.1572, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.28320373250388803, | |
| "grad_norm": 1.3026564844558632, | |
| "learning_rate": 8.14818378099515e-06, | |
| "loss": 0.1805, | |
| "step": 1821 | |
| }, | |
| { | |
| "epoch": 0.2833592534992224, | |
| "grad_norm": 1.2645554368075294, | |
| "learning_rate": 8.146285527496789e-06, | |
| "loss": 0.1798, | |
| "step": 1822 | |
| }, | |
| { | |
| "epoch": 0.28351477449455675, | |
| "grad_norm": 1.456481044065325, | |
| "learning_rate": 8.144386522936195e-06, | |
| "loss": 0.1598, | |
| "step": 1823 | |
| }, | |
| { | |
| "epoch": 0.28367029548989114, | |
| "grad_norm": 1.1300906728090474, | |
| "learning_rate": 8.142486767766688e-06, | |
| "loss": 0.1648, | |
| "step": 1824 | |
| }, | |
| { | |
| "epoch": 0.2838258164852255, | |
| "grad_norm": 1.1388001178297193, | |
| "learning_rate": 8.140586262441767e-06, | |
| "loss": 0.2733, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.28398133748055987, | |
| "grad_norm": 0.7532300919484063, | |
| "learning_rate": 8.138685007415109e-06, | |
| "loss": 0.1213, | |
| "step": 1826 | |
| }, | |
| { | |
| "epoch": 0.28413685847589426, | |
| "grad_norm": 1.0796067807349936, | |
| "learning_rate": 8.136783003140568e-06, | |
| "loss": 0.2189, | |
| "step": 1827 | |
| }, | |
| { | |
| "epoch": 0.2842923794712286, | |
| "grad_norm": 1.331438012696905, | |
| "learning_rate": 8.134880250072179e-06, | |
| "loss": 0.1804, | |
| "step": 1828 | |
| }, | |
| { | |
| "epoch": 0.284447900466563, | |
| "grad_norm": 1.2091930191659346, | |
| "learning_rate": 8.13297674866416e-06, | |
| "loss": 0.2194, | |
| "step": 1829 | |
| }, | |
| { | |
| "epoch": 0.2846034214618974, | |
| "grad_norm": 1.0049073814467957, | |
| "learning_rate": 8.131072499370897e-06, | |
| "loss": 0.1333, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.2847589424572317, | |
| "grad_norm": 1.0223717163539678, | |
| "learning_rate": 8.129167502646966e-06, | |
| "loss": 0.1988, | |
| "step": 1831 | |
| }, | |
| { | |
| "epoch": 0.2849144634525661, | |
| "grad_norm": 1.4867747212307119, | |
| "learning_rate": 8.127261758947114e-06, | |
| "loss": 0.1467, | |
| "step": 1832 | |
| }, | |
| { | |
| "epoch": 0.2850699844479005, | |
| "grad_norm": 0.8173079980321136, | |
| "learning_rate": 8.125355268726266e-06, | |
| "loss": 0.1058, | |
| "step": 1833 | |
| }, | |
| { | |
| "epoch": 0.28522550544323483, | |
| "grad_norm": 1.570586484505542, | |
| "learning_rate": 8.123448032439534e-06, | |
| "loss": 0.2065, | |
| "step": 1834 | |
| }, | |
| { | |
| "epoch": 0.2853810264385692, | |
| "grad_norm": 1.5595299992669573, | |
| "learning_rate": 8.121540050542198e-06, | |
| "loss": 0.2193, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.28553654743390355, | |
| "grad_norm": 1.007755342730857, | |
| "learning_rate": 8.119631323489722e-06, | |
| "loss": 0.1371, | |
| "step": 1836 | |
| }, | |
| { | |
| "epoch": 0.28569206842923794, | |
| "grad_norm": 1.301433540358406, | |
| "learning_rate": 8.117721851737744e-06, | |
| "loss": 0.176, | |
| "step": 1837 | |
| }, | |
| { | |
| "epoch": 0.28584758942457233, | |
| "grad_norm": 0.8910831403501445, | |
| "learning_rate": 8.115811635742079e-06, | |
| "loss": 0.1626, | |
| "step": 1838 | |
| }, | |
| { | |
| "epoch": 0.28600311041990667, | |
| "grad_norm": 0.7467945070581918, | |
| "learning_rate": 8.113900675958728e-06, | |
| "loss": 0.1821, | |
| "step": 1839 | |
| }, | |
| { | |
| "epoch": 0.28615863141524106, | |
| "grad_norm": 1.1530448700016815, | |
| "learning_rate": 8.111988972843859e-06, | |
| "loss": 0.1923, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.28631415241057545, | |
| "grad_norm": 2.088923036862537, | |
| "learning_rate": 8.110076526853824e-06, | |
| "loss": 0.1206, | |
| "step": 1841 | |
| }, | |
| { | |
| "epoch": 0.2864696734059098, | |
| "grad_norm": 1.2835755029352423, | |
| "learning_rate": 8.108163338445152e-06, | |
| "loss": 0.2546, | |
| "step": 1842 | |
| }, | |
| { | |
| "epoch": 0.2866251944012442, | |
| "grad_norm": 0.8829913186503389, | |
| "learning_rate": 8.106249408074544e-06, | |
| "loss": 0.1445, | |
| "step": 1843 | |
| }, | |
| { | |
| "epoch": 0.2867807153965785, | |
| "grad_norm": 1.7629923458811358, | |
| "learning_rate": 8.104334736198887e-06, | |
| "loss": 0.1544, | |
| "step": 1844 | |
| }, | |
| { | |
| "epoch": 0.2869362363919129, | |
| "grad_norm": 0.9657174681831697, | |
| "learning_rate": 8.102419323275234e-06, | |
| "loss": 0.2351, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.2870917573872473, | |
| "grad_norm": 1.492589192357126, | |
| "learning_rate": 8.100503169760827e-06, | |
| "loss": 0.186, | |
| "step": 1846 | |
| }, | |
| { | |
| "epoch": 0.28724727838258163, | |
| "grad_norm": 1.1233971084394738, | |
| "learning_rate": 8.098586276113073e-06, | |
| "loss": 0.1946, | |
| "step": 1847 | |
| }, | |
| { | |
| "epoch": 0.287402799377916, | |
| "grad_norm": 0.8577653456028049, | |
| "learning_rate": 8.096668642789565e-06, | |
| "loss": 0.1633, | |
| "step": 1848 | |
| }, | |
| { | |
| "epoch": 0.2875583203732504, | |
| "grad_norm": 1.1536545920544707, | |
| "learning_rate": 8.094750270248065e-06, | |
| "loss": 0.1603, | |
| "step": 1849 | |
| }, | |
| { | |
| "epoch": 0.28771384136858474, | |
| "grad_norm": 0.9838814306399297, | |
| "learning_rate": 8.09283115894652e-06, | |
| "loss": 0.1623, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.28786936236391913, | |
| "grad_norm": 0.9601384951644616, | |
| "learning_rate": 8.090911309343045e-06, | |
| "loss": 0.1252, | |
| "step": 1851 | |
| }, | |
| { | |
| "epoch": 0.2880248833592535, | |
| "grad_norm": 0.9976176427201153, | |
| "learning_rate": 8.088990721895938e-06, | |
| "loss": 0.1815, | |
| "step": 1852 | |
| }, | |
| { | |
| "epoch": 0.28818040435458786, | |
| "grad_norm": 0.7583399424827217, | |
| "learning_rate": 8.087069397063666e-06, | |
| "loss": 0.141, | |
| "step": 1853 | |
| }, | |
| { | |
| "epoch": 0.28833592534992225, | |
| "grad_norm": 1.5185081715928586, | |
| "learning_rate": 8.085147335304879e-06, | |
| "loss": 0.1887, | |
| "step": 1854 | |
| }, | |
| { | |
| "epoch": 0.2884914463452566, | |
| "grad_norm": 1.3061310770751247, | |
| "learning_rate": 8.083224537078401e-06, | |
| "loss": 0.1451, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.288646967340591, | |
| "grad_norm": 1.7351614485797129, | |
| "learning_rate": 8.081301002843226e-06, | |
| "loss": 0.1264, | |
| "step": 1856 | |
| }, | |
| { | |
| "epoch": 0.28880248833592537, | |
| "grad_norm": 0.9943204442132273, | |
| "learning_rate": 8.079376733058532e-06, | |
| "loss": 0.1743, | |
| "step": 1857 | |
| }, | |
| { | |
| "epoch": 0.2889580093312597, | |
| "grad_norm": 1.1594102001358773, | |
| "learning_rate": 8.07745172818367e-06, | |
| "loss": 0.1607, | |
| "step": 1858 | |
| }, | |
| { | |
| "epoch": 0.2891135303265941, | |
| "grad_norm": 1.6350163238448654, | |
| "learning_rate": 8.075525988678163e-06, | |
| "loss": 0.1813, | |
| "step": 1859 | |
| }, | |
| { | |
| "epoch": 0.2892690513219285, | |
| "grad_norm": 1.083878957563236, | |
| "learning_rate": 8.073599515001713e-06, | |
| "loss": 0.1194, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.2894245723172628, | |
| "grad_norm": 0.8178116527073355, | |
| "learning_rate": 8.071672307614195e-06, | |
| "loss": 0.228, | |
| "step": 1861 | |
| }, | |
| { | |
| "epoch": 0.2895800933125972, | |
| "grad_norm": 1.1118324651261078, | |
| "learning_rate": 8.069744366975664e-06, | |
| "loss": 0.197, | |
| "step": 1862 | |
| }, | |
| { | |
| "epoch": 0.28973561430793154, | |
| "grad_norm": 1.149751561349185, | |
| "learning_rate": 8.06781569354634e-06, | |
| "loss": 0.269, | |
| "step": 1863 | |
| }, | |
| { | |
| "epoch": 0.28989113530326593, | |
| "grad_norm": 1.1618468357632399, | |
| "learning_rate": 8.06588628778663e-06, | |
| "loss": 0.1846, | |
| "step": 1864 | |
| }, | |
| { | |
| "epoch": 0.2900466562986003, | |
| "grad_norm": 1.3277875865938236, | |
| "learning_rate": 8.063956150157107e-06, | |
| "loss": 0.1273, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.29020217729393466, | |
| "grad_norm": 2.436613602568436, | |
| "learning_rate": 8.062025281118524e-06, | |
| "loss": 0.2442, | |
| "step": 1866 | |
| }, | |
| { | |
| "epoch": 0.29035769828926905, | |
| "grad_norm": 1.1958552059286012, | |
| "learning_rate": 8.060093681131804e-06, | |
| "loss": 0.1874, | |
| "step": 1867 | |
| }, | |
| { | |
| "epoch": 0.29051321928460344, | |
| "grad_norm": 0.8401669076143116, | |
| "learning_rate": 8.058161350658047e-06, | |
| "loss": 0.1901, | |
| "step": 1868 | |
| }, | |
| { | |
| "epoch": 0.2906687402799378, | |
| "grad_norm": 0.9487811357677395, | |
| "learning_rate": 8.056228290158528e-06, | |
| "loss": 0.1346, | |
| "step": 1869 | |
| }, | |
| { | |
| "epoch": 0.29082426127527217, | |
| "grad_norm": 0.957901732333534, | |
| "learning_rate": 8.054294500094697e-06, | |
| "loss": 0.1411, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.29097978227060656, | |
| "grad_norm": 1.292131532239805, | |
| "learning_rate": 8.052359980928172e-06, | |
| "loss": 0.1827, | |
| "step": 1871 | |
| }, | |
| { | |
| "epoch": 0.2911353032659409, | |
| "grad_norm": 0.9675272253773427, | |
| "learning_rate": 8.050424733120757e-06, | |
| "loss": 0.1738, | |
| "step": 1872 | |
| }, | |
| { | |
| "epoch": 0.2912908242612753, | |
| "grad_norm": 1.367184606419033, | |
| "learning_rate": 8.048488757134416e-06, | |
| "loss": 0.1787, | |
| "step": 1873 | |
| }, | |
| { | |
| "epoch": 0.2914463452566096, | |
| "grad_norm": 1.2673549853684765, | |
| "learning_rate": 8.046552053431298e-06, | |
| "loss": 0.2333, | |
| "step": 1874 | |
| }, | |
| { | |
| "epoch": 0.291601866251944, | |
| "grad_norm": 1.9351495105907597, | |
| "learning_rate": 8.044614622473717e-06, | |
| "loss": 0.1987, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.2917573872472784, | |
| "grad_norm": 0.8606527150680897, | |
| "learning_rate": 8.042676464724169e-06, | |
| "loss": 0.172, | |
| "step": 1876 | |
| }, | |
| { | |
| "epoch": 0.29191290824261273, | |
| "grad_norm": 1.4901933699318817, | |
| "learning_rate": 8.040737580645316e-06, | |
| "loss": 0.1735, | |
| "step": 1877 | |
| }, | |
| { | |
| "epoch": 0.2920684292379471, | |
| "grad_norm": 1.1691647071434712, | |
| "learning_rate": 8.038797970699998e-06, | |
| "loss": 0.2316, | |
| "step": 1878 | |
| }, | |
| { | |
| "epoch": 0.2922239502332815, | |
| "grad_norm": 1.240770117738676, | |
| "learning_rate": 8.036857635351226e-06, | |
| "loss": 0.1667, | |
| "step": 1879 | |
| }, | |
| { | |
| "epoch": 0.29237947122861585, | |
| "grad_norm": 1.0351622011955766, | |
| "learning_rate": 8.034916575062188e-06, | |
| "loss": 0.1405, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.29253499222395024, | |
| "grad_norm": 0.9153491401389935, | |
| "learning_rate": 8.032974790296239e-06, | |
| "loss": 0.1726, | |
| "step": 1881 | |
| }, | |
| { | |
| "epoch": 0.2926905132192846, | |
| "grad_norm": 1.4362209764019978, | |
| "learning_rate": 8.031032281516913e-06, | |
| "loss": 0.1827, | |
| "step": 1882 | |
| }, | |
| { | |
| "epoch": 0.29284603421461897, | |
| "grad_norm": 1.183605507206791, | |
| "learning_rate": 8.029089049187909e-06, | |
| "loss": 0.1883, | |
| "step": 1883 | |
| }, | |
| { | |
| "epoch": 0.29300155520995336, | |
| "grad_norm": 1.0539400115284923, | |
| "learning_rate": 8.02714509377311e-06, | |
| "loss": 0.1208, | |
| "step": 1884 | |
| }, | |
| { | |
| "epoch": 0.2931570762052877, | |
| "grad_norm": 1.0217425114195149, | |
| "learning_rate": 8.02520041573656e-06, | |
| "loss": 0.174, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.2933125972006221, | |
| "grad_norm": 1.0405110359742253, | |
| "learning_rate": 8.023255015542482e-06, | |
| "loss": 0.249, | |
| "step": 1886 | |
| }, | |
| { | |
| "epoch": 0.2934681181959565, | |
| "grad_norm": 0.9949747841829932, | |
| "learning_rate": 8.021308893655273e-06, | |
| "loss": 0.1861, | |
| "step": 1887 | |
| }, | |
| { | |
| "epoch": 0.2936236391912908, | |
| "grad_norm": 0.9631918396707634, | |
| "learning_rate": 8.019362050539497e-06, | |
| "loss": 0.22, | |
| "step": 1888 | |
| }, | |
| { | |
| "epoch": 0.2937791601866252, | |
| "grad_norm": 1.471400212660711, | |
| "learning_rate": 8.017414486659894e-06, | |
| "loss": 0.2831, | |
| "step": 1889 | |
| }, | |
| { | |
| "epoch": 0.2939346811819596, | |
| "grad_norm": 1.6502542476240603, | |
| "learning_rate": 8.015466202481371e-06, | |
| "loss": 0.1856, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.2940902021772939, | |
| "grad_norm": 1.0678255046461738, | |
| "learning_rate": 8.013517198469017e-06, | |
| "loss": 0.2714, | |
| "step": 1891 | |
| }, | |
| { | |
| "epoch": 0.2942457231726283, | |
| "grad_norm": 1.5419672646129527, | |
| "learning_rate": 8.01156747508808e-06, | |
| "loss": 0.2432, | |
| "step": 1892 | |
| }, | |
| { | |
| "epoch": 0.29440124416796265, | |
| "grad_norm": 1.691620262630438, | |
| "learning_rate": 8.009617032803989e-06, | |
| "loss": 0.2494, | |
| "step": 1893 | |
| }, | |
| { | |
| "epoch": 0.29455676516329704, | |
| "grad_norm": 1.0149866152436102, | |
| "learning_rate": 8.007665872082343e-06, | |
| "loss": 0.1446, | |
| "step": 1894 | |
| }, | |
| { | |
| "epoch": 0.29471228615863143, | |
| "grad_norm": 1.2593397067130077, | |
| "learning_rate": 8.005713993388908e-06, | |
| "loss": 0.1813, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.29486780715396577, | |
| "grad_norm": 1.751259190433369, | |
| "learning_rate": 8.003761397189629e-06, | |
| "loss": 0.3067, | |
| "step": 1896 | |
| }, | |
| { | |
| "epoch": 0.29502332814930016, | |
| "grad_norm": 1.0592944557403567, | |
| "learning_rate": 8.001808083950615e-06, | |
| "loss": 0.1774, | |
| "step": 1897 | |
| }, | |
| { | |
| "epoch": 0.29517884914463455, | |
| "grad_norm": 0.7601316574689209, | |
| "learning_rate": 7.999854054138148e-06, | |
| "loss": 0.1986, | |
| "step": 1898 | |
| }, | |
| { | |
| "epoch": 0.2953343701399689, | |
| "grad_norm": 1.0763633141744329, | |
| "learning_rate": 7.997899308218687e-06, | |
| "loss": 0.1693, | |
| "step": 1899 | |
| }, | |
| { | |
| "epoch": 0.2954898911353033, | |
| "grad_norm": 0.848192935949934, | |
| "learning_rate": 7.995943846658852e-06, | |
| "loss": 0.1785, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.2954898911353033, | |
| "eval_loss": 0.19579939544200897, | |
| "eval_runtime": 9.4258, | |
| "eval_samples_per_second": 2.758, | |
| "eval_steps_per_second": 0.743, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.29564541213063766, | |
| "grad_norm": 1.1366949640186217, | |
| "learning_rate": 7.99398766992544e-06, | |
| "loss": 0.3427, | |
| "step": 1901 | |
| }, | |
| { | |
| "epoch": 0.295800933125972, | |
| "grad_norm": 1.3011369731626548, | |
| "learning_rate": 7.99203077848542e-06, | |
| "loss": 0.128, | |
| "step": 1902 | |
| }, | |
| { | |
| "epoch": 0.2959564541213064, | |
| "grad_norm": 1.6239083693901217, | |
| "learning_rate": 7.990073172805927e-06, | |
| "loss": 0.2033, | |
| "step": 1903 | |
| }, | |
| { | |
| "epoch": 0.2961119751166407, | |
| "grad_norm": 2.136757506768007, | |
| "learning_rate": 7.98811485335427e-06, | |
| "loss": 0.8244, | |
| "step": 1904 | |
| }, | |
| { | |
| "epoch": 0.2962674961119751, | |
| "grad_norm": 1.4156103108687226, | |
| "learning_rate": 7.986155820597927e-06, | |
| "loss": 0.2266, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.2964230171073095, | |
| "grad_norm": 1.3059948518525273, | |
| "learning_rate": 7.984196075004547e-06, | |
| "loss": 0.1772, | |
| "step": 1906 | |
| }, | |
| { | |
| "epoch": 0.29657853810264384, | |
| "grad_norm": 1.1897397554067446, | |
| "learning_rate": 7.982235617041947e-06, | |
| "loss": 0.2153, | |
| "step": 1907 | |
| }, | |
| { | |
| "epoch": 0.29673405909797823, | |
| "grad_norm": 1.8814984942898336, | |
| "learning_rate": 7.980274447178116e-06, | |
| "loss": 0.163, | |
| "step": 1908 | |
| }, | |
| { | |
| "epoch": 0.2968895800933126, | |
| "grad_norm": 0.8490191091642275, | |
| "learning_rate": 7.978312565881212e-06, | |
| "loss": 0.1929, | |
| "step": 1909 | |
| }, | |
| { | |
| "epoch": 0.29704510108864696, | |
| "grad_norm": 1.0730207253151238, | |
| "learning_rate": 7.976349973619567e-06, | |
| "loss": 0.152, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.29720062208398135, | |
| "grad_norm": 1.0988494794101311, | |
| "learning_rate": 7.974386670861676e-06, | |
| "loss": 0.1796, | |
| "step": 1911 | |
| }, | |
| { | |
| "epoch": 0.2973561430793157, | |
| "grad_norm": 0.8890702707468837, | |
| "learning_rate": 7.972422658076206e-06, | |
| "loss": 0.1658, | |
| "step": 1912 | |
| }, | |
| { | |
| "epoch": 0.2975116640746501, | |
| "grad_norm": 1.5485447290305507, | |
| "learning_rate": 7.970457935731996e-06, | |
| "loss": 0.219, | |
| "step": 1913 | |
| }, | |
| { | |
| "epoch": 0.29766718506998446, | |
| "grad_norm": 1.1870158533528972, | |
| "learning_rate": 7.968492504298053e-06, | |
| "loss": 0.1678, | |
| "step": 1914 | |
| }, | |
| { | |
| "epoch": 0.2978227060653188, | |
| "grad_norm": 0.8791513734953905, | |
| "learning_rate": 7.966526364243553e-06, | |
| "loss": 0.1379, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.2979782270606532, | |
| "grad_norm": 1.1547532699065137, | |
| "learning_rate": 7.96455951603784e-06, | |
| "loss": 0.1578, | |
| "step": 1916 | |
| }, | |
| { | |
| "epoch": 0.2981337480559876, | |
| "grad_norm": 1.2343036137247707, | |
| "learning_rate": 7.962591960150426e-06, | |
| "loss": 0.167, | |
| "step": 1917 | |
| }, | |
| { | |
| "epoch": 0.2982892690513219, | |
| "grad_norm": 1.199679900214714, | |
| "learning_rate": 7.960623697051e-06, | |
| "loss": 0.2216, | |
| "step": 1918 | |
| }, | |
| { | |
| "epoch": 0.2984447900466563, | |
| "grad_norm": 0.8701547919093023, | |
| "learning_rate": 7.958654727209406e-06, | |
| "loss": 0.1334, | |
| "step": 1919 | |
| }, | |
| { | |
| "epoch": 0.2986003110419907, | |
| "grad_norm": 1.0186941275746395, | |
| "learning_rate": 7.956685051095672e-06, | |
| "loss": 0.1992, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.29875583203732503, | |
| "grad_norm": 1.677907044209659, | |
| "learning_rate": 7.954714669179981e-06, | |
| "loss": 0.2557, | |
| "step": 1921 | |
| }, | |
| { | |
| "epoch": 0.2989113530326594, | |
| "grad_norm": 1.0741276350489937, | |
| "learning_rate": 7.952743581932696e-06, | |
| "loss": 0.2228, | |
| "step": 1922 | |
| }, | |
| { | |
| "epoch": 0.29906687402799376, | |
| "grad_norm": 1.1370483443720154, | |
| "learning_rate": 7.950771789824341e-06, | |
| "loss": 0.1822, | |
| "step": 1923 | |
| }, | |
| { | |
| "epoch": 0.29922239502332815, | |
| "grad_norm": 1.4805485099895457, | |
| "learning_rate": 7.948799293325607e-06, | |
| "loss": 0.2066, | |
| "step": 1924 | |
| }, | |
| { | |
| "epoch": 0.29937791601866254, | |
| "grad_norm": 1.0841471732459598, | |
| "learning_rate": 7.946826092907362e-06, | |
| "loss": 0.2086, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.2995334370139969, | |
| "grad_norm": 0.9923801848699839, | |
| "learning_rate": 7.944852189040633e-06, | |
| "loss": 0.1457, | |
| "step": 1926 | |
| }, | |
| { | |
| "epoch": 0.29968895800933126, | |
| "grad_norm": 1.1826489754247185, | |
| "learning_rate": 7.942877582196618e-06, | |
| "loss": 0.1335, | |
| "step": 1927 | |
| }, | |
| { | |
| "epoch": 0.29984447900466565, | |
| "grad_norm": 1.0374422374980892, | |
| "learning_rate": 7.940902272846684e-06, | |
| "loss": 0.1747, | |
| "step": 1928 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.9355242713051211, | |
| "learning_rate": 7.938926261462366e-06, | |
| "loss": 0.2035, | |
| "step": 1929 | |
| }, | |
| { | |
| "epoch": 0.3001555209953344, | |
| "grad_norm": 1.3452996964657524, | |
| "learning_rate": 7.936949548515364e-06, | |
| "loss": 0.2284, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.3003110419906687, | |
| "grad_norm": 0.7948433007606517, | |
| "learning_rate": 7.93497213447755e-06, | |
| "loss": 0.2051, | |
| "step": 1931 | |
| }, | |
| { | |
| "epoch": 0.3004665629860031, | |
| "grad_norm": 1.130699049423352, | |
| "learning_rate": 7.932994019820956e-06, | |
| "loss": 0.174, | |
| "step": 1932 | |
| }, | |
| { | |
| "epoch": 0.3006220839813375, | |
| "grad_norm": 4.331642107991714, | |
| "learning_rate": 7.931015205017788e-06, | |
| "loss": 0.2259, | |
| "step": 1933 | |
| }, | |
| { | |
| "epoch": 0.30077760497667183, | |
| "grad_norm": 1.5306684316210843, | |
| "learning_rate": 7.929035690540414e-06, | |
| "loss": 0.1917, | |
| "step": 1934 | |
| }, | |
| { | |
| "epoch": 0.3009331259720062, | |
| "grad_norm": 0.8871970028065491, | |
| "learning_rate": 7.927055476861376e-06, | |
| "loss": 0.1765, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.3010886469673406, | |
| "grad_norm": 0.9400711133682595, | |
| "learning_rate": 7.925074564453376e-06, | |
| "loss": 0.1824, | |
| "step": 1936 | |
| }, | |
| { | |
| "epoch": 0.30124416796267495, | |
| "grad_norm": 0.9734009328190283, | |
| "learning_rate": 7.923092953789287e-06, | |
| "loss": 0.1575, | |
| "step": 1937 | |
| }, | |
| { | |
| "epoch": 0.30139968895800934, | |
| "grad_norm": 1.1309704131602631, | |
| "learning_rate": 7.921110645342144e-06, | |
| "loss": 0.2438, | |
| "step": 1938 | |
| }, | |
| { | |
| "epoch": 0.30155520995334373, | |
| "grad_norm": 1.2491112218817273, | |
| "learning_rate": 7.919127639585153e-06, | |
| "loss": 0.2252, | |
| "step": 1939 | |
| }, | |
| { | |
| "epoch": 0.30171073094867806, | |
| "grad_norm": 0.9626959898568382, | |
| "learning_rate": 7.917143936991688e-06, | |
| "loss": 0.1416, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.30186625194401245, | |
| "grad_norm": 0.933932349728071, | |
| "learning_rate": 7.915159538035284e-06, | |
| "loss": 0.1924, | |
| "step": 1941 | |
| }, | |
| { | |
| "epoch": 0.3020217729393468, | |
| "grad_norm": 1.198922066826054, | |
| "learning_rate": 7.913174443189645e-06, | |
| "loss": 0.1918, | |
| "step": 1942 | |
| }, | |
| { | |
| "epoch": 0.3021772939346812, | |
| "grad_norm": 0.711619672728743, | |
| "learning_rate": 7.911188652928639e-06, | |
| "loss": 0.1322, | |
| "step": 1943 | |
| }, | |
| { | |
| "epoch": 0.30233281493001557, | |
| "grad_norm": 0.9224372120486194, | |
| "learning_rate": 7.909202167726306e-06, | |
| "loss": 0.1775, | |
| "step": 1944 | |
| }, | |
| { | |
| "epoch": 0.3024883359253499, | |
| "grad_norm": 1.3276511094955517, | |
| "learning_rate": 7.907214988056844e-06, | |
| "loss": 0.2187, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.3026438569206843, | |
| "grad_norm": 0.8655219464600901, | |
| "learning_rate": 7.905227114394623e-06, | |
| "loss": 0.1465, | |
| "step": 1946 | |
| }, | |
| { | |
| "epoch": 0.3027993779160187, | |
| "grad_norm": 0.995295145761775, | |
| "learning_rate": 7.903238547214173e-06, | |
| "loss": 0.2004, | |
| "step": 1947 | |
| }, | |
| { | |
| "epoch": 0.302954898911353, | |
| "grad_norm": 1.1948454763354273, | |
| "learning_rate": 7.901249286990196e-06, | |
| "loss": 0.1755, | |
| "step": 1948 | |
| }, | |
| { | |
| "epoch": 0.3031104199066874, | |
| "grad_norm": 0.8682961110627464, | |
| "learning_rate": 7.899259334197554e-06, | |
| "loss": 0.1999, | |
| "step": 1949 | |
| }, | |
| { | |
| "epoch": 0.30326594090202175, | |
| "grad_norm": 1.0906703142458485, | |
| "learning_rate": 7.897268689311278e-06, | |
| "loss": 0.1014, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.30342146189735614, | |
| "grad_norm": 1.2664526681944839, | |
| "learning_rate": 7.895277352806562e-06, | |
| "loss": 0.2251, | |
| "step": 1951 | |
| }, | |
| { | |
| "epoch": 0.30357698289269053, | |
| "grad_norm": 0.9627019771781781, | |
| "learning_rate": 7.893285325158766e-06, | |
| "loss": 0.1591, | |
| "step": 1952 | |
| }, | |
| { | |
| "epoch": 0.30373250388802486, | |
| "grad_norm": 1.9216322578695895, | |
| "learning_rate": 7.891292606843414e-06, | |
| "loss": 0.2066, | |
| "step": 1953 | |
| }, | |
| { | |
| "epoch": 0.30388802488335925, | |
| "grad_norm": 0.9086586156841527, | |
| "learning_rate": 7.889299198336197e-06, | |
| "loss": 0.2196, | |
| "step": 1954 | |
| }, | |
| { | |
| "epoch": 0.30404354587869364, | |
| "grad_norm": 1.4203649142405548, | |
| "learning_rate": 7.887305100112967e-06, | |
| "loss": 0.1804, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.304199066874028, | |
| "grad_norm": 1.2381428600296667, | |
| "learning_rate": 7.885310312649747e-06, | |
| "loss": 0.1434, | |
| "step": 1956 | |
| }, | |
| { | |
| "epoch": 0.30435458786936237, | |
| "grad_norm": 0.7952770821447226, | |
| "learning_rate": 7.883314836422717e-06, | |
| "loss": 0.1955, | |
| "step": 1957 | |
| }, | |
| { | |
| "epoch": 0.30451010886469676, | |
| "grad_norm": 0.998488522800322, | |
| "learning_rate": 7.881318671908228e-06, | |
| "loss": 0.2239, | |
| "step": 1958 | |
| }, | |
| { | |
| "epoch": 0.3046656298600311, | |
| "grad_norm": 1.0829580403987296, | |
| "learning_rate": 7.879321819582788e-06, | |
| "loss": 0.2401, | |
| "step": 1959 | |
| }, | |
| { | |
| "epoch": 0.3048211508553655, | |
| "grad_norm": 1.043363355464928, | |
| "learning_rate": 7.877324279923078e-06, | |
| "loss": 0.1821, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.3049766718506998, | |
| "grad_norm": 1.7533270649933215, | |
| "learning_rate": 7.875326053405936e-06, | |
| "loss": 0.2513, | |
| "step": 1961 | |
| }, | |
| { | |
| "epoch": 0.3051321928460342, | |
| "grad_norm": 1.3436274607263432, | |
| "learning_rate": 7.873327140508367e-06, | |
| "loss": 0.2352, | |
| "step": 1962 | |
| }, | |
| { | |
| "epoch": 0.3052877138413686, | |
| "grad_norm": 2.0633364771352274, | |
| "learning_rate": 7.87132754170754e-06, | |
| "loss": 0.2125, | |
| "step": 1963 | |
| }, | |
| { | |
| "epoch": 0.30544323483670294, | |
| "grad_norm": 0.9097966158633792, | |
| "learning_rate": 7.869327257480787e-06, | |
| "loss": 0.1627, | |
| "step": 1964 | |
| }, | |
| { | |
| "epoch": 0.3055987558320373, | |
| "grad_norm": 1.8317277834761483, | |
| "learning_rate": 7.867326288305603e-06, | |
| "loss": 0.211, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.3057542768273717, | |
| "grad_norm": 1.1448361049962872, | |
| "learning_rate": 7.865324634659647e-06, | |
| "loss": 0.1683, | |
| "step": 1966 | |
| }, | |
| { | |
| "epoch": 0.30590979782270605, | |
| "grad_norm": 1.14865744697956, | |
| "learning_rate": 7.863322297020743e-06, | |
| "loss": 0.2238, | |
| "step": 1967 | |
| }, | |
| { | |
| "epoch": 0.30606531881804044, | |
| "grad_norm": 1.0967845312311937, | |
| "learning_rate": 7.861319275866877e-06, | |
| "loss": 0.1889, | |
| "step": 1968 | |
| }, | |
| { | |
| "epoch": 0.3062208398133748, | |
| "grad_norm": 1.2461473684464468, | |
| "learning_rate": 7.859315571676198e-06, | |
| "loss": 0.2138, | |
| "step": 1969 | |
| }, | |
| { | |
| "epoch": 0.30637636080870917, | |
| "grad_norm": 1.1992165952645324, | |
| "learning_rate": 7.857311184927015e-06, | |
| "loss": 0.2289, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.30653188180404356, | |
| "grad_norm": 0.9734656478980178, | |
| "learning_rate": 7.855306116097807e-06, | |
| "loss": 0.1798, | |
| "step": 1971 | |
| }, | |
| { | |
| "epoch": 0.3066874027993779, | |
| "grad_norm": 0.8576094794110676, | |
| "learning_rate": 7.853300365667211e-06, | |
| "loss": 0.1849, | |
| "step": 1972 | |
| }, | |
| { | |
| "epoch": 0.3068429237947123, | |
| "grad_norm": 0.9320489557446329, | |
| "learning_rate": 7.851293934114026e-06, | |
| "loss": 0.1663, | |
| "step": 1973 | |
| }, | |
| { | |
| "epoch": 0.3069984447900467, | |
| "grad_norm": 1.5628965027384294, | |
| "learning_rate": 7.849286821917217e-06, | |
| "loss": 0.2741, | |
| "step": 1974 | |
| }, | |
| { | |
| "epoch": 0.307153965785381, | |
| "grad_norm": 1.1064029390023975, | |
| "learning_rate": 7.847279029555908e-06, | |
| "loss": 0.1655, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.3073094867807154, | |
| "grad_norm": 1.1272492512254035, | |
| "learning_rate": 7.845270557509389e-06, | |
| "loss": 0.1473, | |
| "step": 1976 | |
| }, | |
| { | |
| "epoch": 0.3074650077760498, | |
| "grad_norm": 0.8321910160414181, | |
| "learning_rate": 7.843261406257108e-06, | |
| "loss": 0.1571, | |
| "step": 1977 | |
| }, | |
| { | |
| "epoch": 0.3076205287713841, | |
| "grad_norm": 0.9606345664210296, | |
| "learning_rate": 7.841251576278681e-06, | |
| "loss": 0.227, | |
| "step": 1978 | |
| }, | |
| { | |
| "epoch": 0.3077760497667185, | |
| "grad_norm": 1.0695344914586096, | |
| "learning_rate": 7.839241068053878e-06, | |
| "loss": 0.1616, | |
| "step": 1979 | |
| }, | |
| { | |
| "epoch": 0.30793157076205285, | |
| "grad_norm": 2.415757365845339, | |
| "learning_rate": 7.837229882062638e-06, | |
| "loss": 0.2091, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.30808709175738724, | |
| "grad_norm": 0.8945861519999385, | |
| "learning_rate": 7.83521801878506e-06, | |
| "loss": 0.1769, | |
| "step": 1981 | |
| }, | |
| { | |
| "epoch": 0.30824261275272163, | |
| "grad_norm": 1.1779736396630833, | |
| "learning_rate": 7.8332054787014e-06, | |
| "loss": 0.2311, | |
| "step": 1982 | |
| }, | |
| { | |
| "epoch": 0.30839813374805597, | |
| "grad_norm": 1.372493149836755, | |
| "learning_rate": 7.831192262292082e-06, | |
| "loss": 0.172, | |
| "step": 1983 | |
| }, | |
| { | |
| "epoch": 0.30855365474339036, | |
| "grad_norm": 2.4487535069237407, | |
| "learning_rate": 7.82917837003769e-06, | |
| "loss": 0.1395, | |
| "step": 1984 | |
| }, | |
| { | |
| "epoch": 0.30870917573872475, | |
| "grad_norm": 0.871516091971395, | |
| "learning_rate": 7.827163802418967e-06, | |
| "loss": 0.1437, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.3088646967340591, | |
| "grad_norm": 1.2731269701284036, | |
| "learning_rate": 7.825148559916817e-06, | |
| "loss": 0.1857, | |
| "step": 1986 | |
| }, | |
| { | |
| "epoch": 0.3090202177293935, | |
| "grad_norm": 0.9768725926218434, | |
| "learning_rate": 7.823132643012308e-06, | |
| "loss": 0.195, | |
| "step": 1987 | |
| }, | |
| { | |
| "epoch": 0.3091757387247278, | |
| "grad_norm": 0.9369720572131188, | |
| "learning_rate": 7.821116052186668e-06, | |
| "loss": 0.2034, | |
| "step": 1988 | |
| }, | |
| { | |
| "epoch": 0.3093312597200622, | |
| "grad_norm": 1.1315495162839369, | |
| "learning_rate": 7.819098787921283e-06, | |
| "loss": 0.1755, | |
| "step": 1989 | |
| }, | |
| { | |
| "epoch": 0.3094867807153966, | |
| "grad_norm": 1.0958800584806985, | |
| "learning_rate": 7.817080850697705e-06, | |
| "loss": 0.2575, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.3096423017107309, | |
| "grad_norm": 0.9336006846477578, | |
| "learning_rate": 7.815062240997642e-06, | |
| "loss": 0.1376, | |
| "step": 1991 | |
| }, | |
| { | |
| "epoch": 0.3097978227060653, | |
| "grad_norm": 0.9121065280126879, | |
| "learning_rate": 7.813042959302963e-06, | |
| "loss": 0.1212, | |
| "step": 1992 | |
| }, | |
| { | |
| "epoch": 0.3099533437013997, | |
| "grad_norm": 0.6936258475052076, | |
| "learning_rate": 7.811023006095703e-06, | |
| "loss": 0.13, | |
| "step": 1993 | |
| }, | |
| { | |
| "epoch": 0.31010886469673404, | |
| "grad_norm": 1.278051470184625, | |
| "learning_rate": 7.809002381858048e-06, | |
| "loss": 0.1686, | |
| "step": 1994 | |
| }, | |
| { | |
| "epoch": 0.31026438569206843, | |
| "grad_norm": 1.2807241898257353, | |
| "learning_rate": 7.806981087072354e-06, | |
| "loss": 0.2569, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.3104199066874028, | |
| "grad_norm": 1.6449581085415006, | |
| "learning_rate": 7.804959122221127e-06, | |
| "loss": 0.3075, | |
| "step": 1996 | |
| }, | |
| { | |
| "epoch": 0.31057542768273716, | |
| "grad_norm": 0.9051580549498448, | |
| "learning_rate": 7.802936487787045e-06, | |
| "loss": 0.1603, | |
| "step": 1997 | |
| }, | |
| { | |
| "epoch": 0.31073094867807155, | |
| "grad_norm": 1.5451818475345835, | |
| "learning_rate": 7.800913184252931e-06, | |
| "loss": 0.2057, | |
| "step": 1998 | |
| }, | |
| { | |
| "epoch": 0.3108864696734059, | |
| "grad_norm": 1.0935081143315897, | |
| "learning_rate": 7.79888921210178e-06, | |
| "loss": 0.2238, | |
| "step": 1999 | |
| }, | |
| { | |
| "epoch": 0.3110419906687403, | |
| "grad_norm": 1.262020237993972, | |
| "learning_rate": 7.796864571816745e-06, | |
| "loss": 0.1977, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3110419906687403, | |
| "eval_loss": 0.19129334390163422, | |
| "eval_runtime": 9.4405, | |
| "eval_samples_per_second": 2.754, | |
| "eval_steps_per_second": 0.741, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 6430, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 191987712000000.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |