Instructions to use tkhangg/Dual-Explain-2round-Q1_first with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use tkhangg/Dual-Explain-2round-Q1_first with PEFT:
    from peft import PeftModel
    from transformers import AutoModelForCausalLM

    base_model = AutoModelForCausalLM.from_pretrained("tkhangg0910/Merged-Dual-Explain-Stage_full-Q1_first")
    model = PeftModel.from_pretrained(base_model, "tkhangg/Dual-Explain-2round-Q1_first")
- Transformers
How to use tkhangg/Dual-Explain-2round-Q1_first with Transformers:
    # Use a pipeline as a high-level helper
    from transformers import pipeline

    pipe = pipeline("text-generation", model="tkhangg/Dual-Explain-2round-Q1_first")
    messages = [
        {"role": "user", "content": "Who are you?"},
    ]
    pipe(messages)

    # Load model directly
    from transformers import AutoModel

    model = AutoModel.from_pretrained("tkhangg/Dual-Explain-2round-Q1_first", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use tkhangg/Dual-Explain-2round-Q1_first with vLLM:
Install from pip and serve model
    # Install vLLM from pip:
    pip install vllm

    # Start the vLLM server:
    vllm serve "tkhangg/Dual-Explain-2round-Q1_first"

    # Call the server using curl (OpenAI-compatible API):
    curl -X POST "http://localhost:8000/v1/chat/completions" \
        -H "Content-Type: application/json" \
        --data '{
            "model": "tkhangg/Dual-Explain-2round-Q1_first",
            "messages": [
                {
                    "role": "user",
                    "content": "What is the capital of France?"
                }
            ]
        }'
Use Docker
docker model run hf.co/tkhangg/Dual-Explain-2round-Q1_first
- SGLang
How to use tkhangg/Dual-Explain-2round-Q1_first with SGLang:
Install from pip and serve model
    # Install SGLang from pip:
    pip install sglang

    # Start the SGLang server:
    python3 -m sglang.launch_server \
        --model-path "tkhangg/Dual-Explain-2round-Q1_first" \
        --host 0.0.0.0 \
        --port 30000

    # Call the server using curl (OpenAI-compatible API):
    curl -X POST "http://localhost:30000/v1/chat/completions" \
        -H "Content-Type: application/json" \
        --data '{
            "model": "tkhangg/Dual-Explain-2round-Q1_first",
            "messages": [
                {
                    "role": "user",
                    "content": "What is the capital of France?"
                }
            ]
        }'
Use Docker images
    docker run --gpus all \
        --shm-size 32g \
        -p 30000:30000 \
        -v ~/.cache/huggingface:/root/.cache/huggingface \
        --env "HF_TOKEN=<secret>" \
        --ipc=host \
        lmsysorg/sglang:latest \
        python3 -m sglang.launch_server \
            --model-path "tkhangg/Dual-Explain-2round-Q1_first" \
            --host 0.0.0.0 \
            --port 30000

    # Call the server using curl (OpenAI-compatible API):
    curl -X POST "http://localhost:30000/v1/chat/completions" \
        -H "Content-Type: application/json" \
        --data '{
            "model": "tkhangg/Dual-Explain-2round-Q1_first",
            "messages": [
                {
                    "role": "user",
                    "content": "What is the capital of France?"
                }
            ]
        }'
- Docker Model Runner
How to use tkhangg/Dual-Explain-2round-Q1_first with Docker Model Runner:
docker model run hf.co/tkhangg/Dual-Explain-2round-Q1_first
| { | |
| "best_global_step": null, | |
| "best_metric": 2.056363105773926, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 2022, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.001483679525222552, | |
| "grad_norm": 8.939897537231445, | |
| "learning_rate": 0.0, | |
| "loss": 0.8632, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.004451038575667656, | |
| "grad_norm": 9.648813247680664, | |
| "learning_rate": 2.8314346251117696e-06, | |
| "loss": 0.801, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.008902077151335312, | |
| "grad_norm": 6.610266208648682, | |
| "learning_rate": 4.61787097538723e-06, | |
| "loss": 0.8504, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.013353115727002967, | |
| "grad_norm": 6.382794380187988, | |
| "learning_rate": 5.662869250223539e-06, | |
| "loss": 0.8868, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.017804154302670624, | |
| "grad_norm": 11.73421859741211, | |
| "learning_rate": 6.40430732566269e-06, | |
| "loss": 0.7982, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.02225519287833828, | |
| "grad_norm": 9.07333755493164, | |
| "learning_rate": 6.979411376544402e-06, | |
| "loss": 0.7539, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.026706231454005934, | |
| "grad_norm": 5.257121562957764, | |
| "learning_rate": 7.449305600498998e-06, | |
| "loss": 0.7841, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.03115727002967359, | |
| "grad_norm": 10.231422424316406, | |
| "learning_rate": 7.846595506000205e-06, | |
| "loss": 0.773, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.03560830860534125, | |
| "grad_norm": 6.98926305770874, | |
| "learning_rate": 8.190743675938152e-06, | |
| "loss": 0.794, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.040059347181008904, | |
| "grad_norm": 11.664831161499023, | |
| "learning_rate": 8.494303875335309e-06, | |
| "loss": 0.7717, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.04451038575667656, | |
| "grad_norm": 7.91517448425293, | |
| "learning_rate": 8.765847726819862e-06, | |
| "loss": 0.7704, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.04896142433234421, | |
| "grad_norm": 4.517396926879883, | |
| "learning_rate": 9.01148901993771e-06, | |
| "loss": 0.7508, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.05341246290801187, | |
| "grad_norm": 6.36966609954834, | |
| "learning_rate": 9.23574195077446e-06, | |
| "loss": 0.8119, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.057863501483679525, | |
| "grad_norm": 11.161980628967285, | |
| "learning_rate": 9.442034649602095e-06, | |
| "loss": 0.8007, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.06231454005934718, | |
| "grad_norm": 8.65380859375, | |
| "learning_rate": 9.633031856275666e-06, | |
| "loss": 0.7474, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.06676557863501484, | |
| "grad_norm": 8.394684791564941, | |
| "learning_rate": 9.81084600165617e-06, | |
| "loss": 0.7685, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.0712166172106825, | |
| "grad_norm": 9.330730438232422, | |
| "learning_rate": 9.977180026213612e-06, | |
| "loss": 0.7716, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.07566765578635015, | |
| "grad_norm": 6.538848400115967, | |
| "learning_rate": 1.013342682512159e-05, | |
| "loss": 0.8483, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.08011869436201781, | |
| "grad_norm": 5.0795793533325195, | |
| "learning_rate": 1.0280740225610769e-05, | |
| "loss": 0.8217, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.08456973293768547, | |
| "grad_norm": 4.957094192504883, | |
| "learning_rate": 1.042008674846264e-05, | |
| "loss": 0.8032, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.08902077151335312, | |
| "grad_norm": 11.510953903198242, | |
| "learning_rate": 1.0552284077095323e-05, | |
| "loss": 0.8015, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.09347181008902077, | |
| "grad_norm": 5.8856096267700195, | |
| "learning_rate": 1.0678030131111975e-05, | |
| "loss": 0.7644, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.09792284866468842, | |
| "grad_norm": 6.385069847106934, | |
| "learning_rate": 1.0797925370213172e-05, | |
| "loss": 0.782, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.10237388724035608, | |
| "grad_norm": 7.3922295570373535, | |
| "learning_rate": 1.0912490136135183e-05, | |
| "loss": 0.8079, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.10682492581602374, | |
| "grad_norm": 6.742562770843506, | |
| "learning_rate": 1.102217830104992e-05, | |
| "loss": 0.7727, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.11127596439169139, | |
| "grad_norm": 10.546209335327148, | |
| "learning_rate": 1.1127388127977034e-05, | |
| "loss": 0.8124, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.11572700296735905, | |
| "grad_norm": 10.506328582763672, | |
| "learning_rate": 1.1228470999877556e-05, | |
| "loss": 0.7427, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.1201780415430267, | |
| "grad_norm": 8.066429138183594, | |
| "learning_rate": 1.1325738500447079e-05, | |
| "loss": 0.7873, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.12462908011869436, | |
| "grad_norm": 9.615484237670898, | |
| "learning_rate": 1.1419468206551126e-05, | |
| "loss": 0.8364, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.129080118694362, | |
| "grad_norm": 14.365260124206543, | |
| "learning_rate": 1.150990846375502e-05, | |
| "loss": 0.7708, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.13353115727002968, | |
| "grad_norm": 8.187057495117188, | |
| "learning_rate": 1.1597282351931633e-05, | |
| "loss": 0.7984, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.13798219584569732, | |
| "grad_norm": 14.292377471923828, | |
| "learning_rate": 1.1681791000387452e-05, | |
| "loss": 0.7983, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.142433234421365, | |
| "grad_norm": 12.673680305480957, | |
| "learning_rate": 1.1763616376489071e-05, | |
| "loss": 0.8131, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.14688427299703263, | |
| "grad_norm": 5.767789363861084, | |
| "learning_rate": 1.1842923645049482e-05, | |
| "loss": 0.8275, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.1513353115727003, | |
| "grad_norm": 10.027091026306152, | |
| "learning_rate": 1.1919863175397048e-05, | |
| "loss": 0.7671, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.15578635014836795, | |
| "grad_norm": 5.067603588104248, | |
| "learning_rate": 1.1994572257432837e-05, | |
| "loss": 0.7883, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.16023738872403562, | |
| "grad_norm": 12.904479026794434, | |
| "learning_rate": 1.206717657588623e-05, | |
| "loss": 0.7876, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.16468842729970326, | |
| "grad_norm": 8.401494979858398, | |
| "learning_rate": 1.2137791482536164e-05, | |
| "loss": 0.8008, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.16913946587537093, | |
| "grad_norm": 11.146966934204102, | |
| "learning_rate": 1.2206523098738102e-05, | |
| "loss": 0.7833, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.17359050445103857, | |
| "grad_norm": 9.299595832824707, | |
| "learning_rate": 1.2273469274713866e-05, | |
| "loss": 0.7752, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.17804154302670624, | |
| "grad_norm": 9.26494312286377, | |
| "learning_rate": 1.2338720427370783e-05, | |
| "loss": 0.8126, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.1824925816023739, | |
| "grad_norm": 9.886981964111328, | |
| "learning_rate": 1.2402360274652675e-05, | |
| "loss": 0.7478, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.18694362017804153, | |
| "grad_norm": 11.463534355163574, | |
| "learning_rate": 1.2464466481387436e-05, | |
| "loss": 0.7334, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.1913946587537092, | |
| "grad_norm": 7.69536018371582, | |
| "learning_rate": 1.252511122913015e-05, | |
| "loss": 0.7756, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.19584569732937684, | |
| "grad_norm": 9.41036605834961, | |
| "learning_rate": 1.2584361720488632e-05, | |
| "loss": 0.7992, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.20029673590504452, | |
| "grad_norm": 6.2021803855896, | |
| "learning_rate": 1.264228062676794e-05, | |
| "loss": 0.7504, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.20474777448071216, | |
| "grad_norm": 6.035690784454346, | |
| "learning_rate": 1.2698926486410644e-05, | |
| "loss": 0.8319, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.20919881305637983, | |
| "grad_norm": 6.217399597167969, | |
| "learning_rate": 1.275435406058353e-05, | |
| "loss": 0.8151, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.21364985163204747, | |
| "grad_norm": 10.224953651428223, | |
| "learning_rate": 1.280861465132538e-05, | |
| "loss": 0.8005, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.21810089020771514, | |
| "grad_norm": 5.470182418823242, | |
| "learning_rate": 1.286175638688864e-05, | |
| "loss": 0.8147, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.22255192878338279, | |
| "grad_norm": 6.979781627655029, | |
| "learning_rate": 1.2913824478252495e-05, | |
| "loss": 0.7623, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.22700296735905046, | |
| "grad_norm": 7.8118414878845215, | |
| "learning_rate": 1.2964861450233358e-05, | |
| "loss": 0.7851, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.2314540059347181, | |
| "grad_norm": 8.010553359985352, | |
| "learning_rate": 1.3014907350153016e-05, | |
| "loss": 0.8243, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.23590504451038577, | |
| "grad_norm": 6.6702470779418945, | |
| "learning_rate": 1.3063999936629808e-05, | |
| "loss": 0.8348, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.2403560830860534, | |
| "grad_norm": 6.347877025604248, | |
| "learning_rate": 1.3112174850722537e-05, | |
| "loss": 0.8093, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.24480712166172106, | |
| "grad_norm": 8.773872375488281, | |
| "learning_rate": 1.3159465771370344e-05, | |
| "loss": 0.8424, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.24925816023738873, | |
| "grad_norm": 9.952154159545898, | |
| "learning_rate": 1.3205904556826587e-05, | |
| "loss": 0.8068, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.25370919881305637, | |
| "grad_norm": 4.658379554748535, | |
| "learning_rate": 1.325152137357441e-05, | |
| "loss": 0.8352, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.258160237388724, | |
| "grad_norm": 8.045439720153809, | |
| "learning_rate": 1.329634481403048e-05, | |
| "loss": 0.7586, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.2626112759643917, | |
| "grad_norm": 13.365344047546387, | |
| "learning_rate": 1.3340402004187042e-05, | |
| "loss": 0.7838, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.26706231454005935, | |
| "grad_norm": 10.028172492980957, | |
| "learning_rate": 1.3383718702207093e-05, | |
| "loss": 0.8142, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.271513353115727, | |
| "grad_norm": 7.544151782989502, | |
| "learning_rate": 1.3426319388870015e-05, | |
| "loss": 0.8386, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.27596439169139464, | |
| "grad_norm": 11.747024536132812, | |
| "learning_rate": 1.3468227350662914e-05, | |
| "loss": 0.7884, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.28041543026706234, | |
| "grad_norm": 13.382097244262695, | |
| "learning_rate": 1.3509464756223744e-05, | |
| "loss": 0.7878, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.28486646884273, | |
| "grad_norm": 6.687026023864746, | |
| "learning_rate": 1.3550052726764533e-05, | |
| "loss": 0.7692, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.2893175074183976, | |
| "grad_norm": 5.648706912994385, | |
| "learning_rate": 1.3590011401034729e-05, | |
| "loss": 0.8231, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.29376854599406527, | |
| "grad_norm": 12.061844825744629, | |
| "learning_rate": 1.3629359995324941e-05, | |
| "loss": 0.7478, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.29821958456973297, | |
| "grad_norm": 7.498600006103516, | |
| "learning_rate": 1.3668116858958576e-05, | |
| "loss": 0.7725, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.3026706231454006, | |
| "grad_norm": 9.396763801574707, | |
| "learning_rate": 1.370629952567251e-05, | |
| "loss": 0.7833, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.30712166172106825, | |
| "grad_norm": 8.891100883483887, | |
| "learning_rate": 1.3743924761246951e-05, | |
| "loss": 0.8031, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.3115727002967359, | |
| "grad_norm": 4.5692830085754395, | |
| "learning_rate": 1.3781008607708299e-05, | |
| "loss": 0.7857, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.31602373887240354, | |
| "grad_norm": 6.108200550079346, | |
| "learning_rate": 1.381756642439674e-05, | |
| "loss": 0.7977, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.32047477744807124, | |
| "grad_norm": 11.626425743103027, | |
| "learning_rate": 1.385361292616169e-05, | |
| "loss": 0.7656, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.3249258160237389, | |
| "grad_norm": 7.599254131317139, | |
| "learning_rate": 1.38891622189228e-05, | |
| "loss": 0.8164, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.3293768545994065, | |
| "grad_norm": 13.067089080810547, | |
| "learning_rate": 1.3924227832811623e-05, | |
| "loss": 0.8302, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.33382789317507416, | |
| "grad_norm": 8.126757621765137, | |
| "learning_rate": 1.3958822753088804e-05, | |
| "loss": 0.772, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.33827893175074186, | |
| "grad_norm": 4.585634231567383, | |
| "learning_rate": 1.3992959449013562e-05, | |
| "loss": 0.8119, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.3427299703264095, | |
| "grad_norm": 7.228199481964111, | |
| "learning_rate": 1.4026649900826146e-05, | |
| "loss": 0.7462, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.34718100890207715, | |
| "grad_norm": 8.683415412902832, | |
| "learning_rate": 1.4059905624989326e-05, | |
| "loss": 0.7746, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.3516320474777448, | |
| "grad_norm": 7.1446990966796875, | |
| "learning_rate": 1.4092737697821986e-05, | |
| "loss": 0.9112, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.3560830860534125, | |
| "grad_norm": 7.082102298736572, | |
| "learning_rate": 1.4125156777646244e-05, | |
| "loss": 0.7838, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.36053412462908013, | |
| "grad_norm": 4.828745365142822, | |
| "learning_rate": 1.4157173125558845e-05, | |
| "loss": 0.806, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.3649851632047478, | |
| "grad_norm": 9.047410011291504, | |
| "learning_rate": 1.4188796624928136e-05, | |
| "loss": 0.8594, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.3694362017804154, | |
| "grad_norm": 4.606753349304199, | |
| "learning_rate": 1.4220036799709316e-05, | |
| "loss": 0.7615, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.37388724035608306, | |
| "grad_norm": 6.191976547241211, | |
| "learning_rate": 1.4250902831662896e-05, | |
| "loss": 0.8045, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.37833827893175076, | |
| "grad_norm": 5.661036491394043, | |
| "learning_rate": 1.4281403576554221e-05, | |
| "loss": 0.8259, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.3827893175074184, | |
| "grad_norm": 11.667348861694336, | |
| "learning_rate": 1.4311547579405614e-05, | |
| "loss": 0.8555, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.38724035608308605, | |
| "grad_norm": 7.464338302612305, | |
| "learning_rate": 1.4341343088866789e-05, | |
| "loss": 0.8525, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.3916913946587537, | |
| "grad_norm": 4.3380584716796875, | |
| "learning_rate": 1.4370798070764093e-05, | |
| "loss": 0.7874, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.3961424332344214, | |
| "grad_norm": 9.564691543579102, | |
| "learning_rate": 1.4399920220884169e-05, | |
| "loss": 0.8047, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.40059347181008903, | |
| "grad_norm": 8.386067390441895, | |
| "learning_rate": 1.4428716977043402e-05, | |
| "loss": 0.8484, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.4050445103857567, | |
| "grad_norm": 6.571981430053711, | |
| "learning_rate": 1.4457195530490532e-05, | |
| "loss": 0.8236, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.4094955489614243, | |
| "grad_norm": 4.435903549194336, | |
| "learning_rate": 1.4485362836686102e-05, | |
| "loss": 0.8457, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.413946587537092, | |
| "grad_norm": 6.691802024841309, | |
| "learning_rate": 1.451322562549922e-05, | |
| "loss": 0.8016, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.41839762611275966, | |
| "grad_norm": 15.839409828186035, | |
| "learning_rate": 1.454079041085899e-05, | |
| "loss": 0.845, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.4228486646884273, | |
| "grad_norm": 7.598920822143555, | |
| "learning_rate": 1.4568063499895273e-05, | |
| "loss": 0.7626, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.42729970326409494, | |
| "grad_norm": 10.305091857910156, | |
| "learning_rate": 1.4595051001600841e-05, | |
| "loss": 0.7926, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.4317507418397626, | |
| "grad_norm": 5.350500106811523, | |
| "learning_rate": 1.4621758835044685e-05, | |
| "loss": 0.8092, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.4362017804154303, | |
| "grad_norm": 18.546098709106445, | |
| "learning_rate": 1.4648192737164102e-05, | |
| "loss": 0.815, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.4406528189910979, | |
| "grad_norm": 8.379362106323242, | |
| "learning_rate": 1.4674358270161251e-05, | |
| "loss": 0.823, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.44510385756676557, | |
| "grad_norm": 8.076055526733398, | |
| "learning_rate": 1.4700260828527957e-05, | |
| "loss": 0.8321, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4495548961424332, | |
| "grad_norm": 16.41190528869629, | |
| "learning_rate": 1.4725905645721047e-05, | |
| "loss": 0.8191, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.4540059347181009, | |
| "grad_norm": 6.45059871673584, | |
| "learning_rate": 1.475129780050882e-05, | |
| "loss": 0.7658, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.45845697329376855, | |
| "grad_norm": 10.349370002746582, | |
| "learning_rate": 1.4776442223007901e-05, | |
| "loss": 0.7575, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.4629080118694362, | |
| "grad_norm": 3.693134069442749, | |
| "learning_rate": 1.4801343700428479e-05, | |
| "loss": 0.8456, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.46735905044510384, | |
| "grad_norm": 9.323659896850586, | |
| "learning_rate": 1.4826006882544607e-05, | |
| "loss": 0.8387, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.47181008902077154, | |
| "grad_norm": 7.65170431137085, | |
| "learning_rate": 1.4850436286905268e-05, | |
| "loss": 0.8939, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.4762611275964392, | |
| "grad_norm": 15.972323417663574, | |
| "learning_rate": 1.4874636303800742e-05, | |
| "loss": 0.8639, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.4807121661721068, | |
| "grad_norm": 22.23072052001953, | |
| "learning_rate": 1.4898611200997996e-05, | |
| "loss": 0.8339, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.48516320474777447, | |
| "grad_norm": 6.0545430183410645, | |
| "learning_rate": 1.4922365128257845e-05, | |
| "loss": 0.8315, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.4896142433234421, | |
| "grad_norm": 15.703563690185547, | |
| "learning_rate": 1.4945902121645804e-05, | |
| "loss": 0.8343, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.4940652818991098, | |
| "grad_norm": 9.006149291992188, | |
| "learning_rate": 1.4969226107647933e-05, | |
| "loss": 0.9232, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.49851632047477745, | |
| "grad_norm": 9.227169036865234, | |
| "learning_rate": 1.4992340907102047e-05, | |
| "loss": 0.8832, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.5029673590504451, | |
| "grad_norm": 13.163078308105469, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8781, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.5074183976261127, | |
| "grad_norm": 7.369640350341797, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8778, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.5118694362017804, | |
| "grad_norm": 11.006775856018066, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8643, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.516320474777448, | |
| "grad_norm": 3.54710054397583, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.843, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.5207715133531158, | |
| "grad_norm": 9.564830780029297, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8245, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.5252225519287834, | |
| "grad_norm": 8.431432723999023, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.886, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.5296735905044511, | |
| "grad_norm": 7.211839199066162, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8578, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.5341246290801187, | |
| "grad_norm": 7.788987159729004, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.872, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5385756676557863, | |
| "grad_norm": 4.075163841247559, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8802, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.543026706231454, | |
| "grad_norm": 13.805707931518555, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8325, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.5474777448071216, | |
| "grad_norm": 7.167026519775391, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9119, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.5519287833827893, | |
| "grad_norm": 8.409590721130371, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8572, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.5563798219584569, | |
| "grad_norm": 11.929038047790527, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8327, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.5608308605341247, | |
| "grad_norm": 4.433465480804443, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7959, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.5652818991097923, | |
| "grad_norm": 7.223580360412598, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8157, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.56973293768546, | |
| "grad_norm": 6.028378009796143, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8469, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.5741839762611276, | |
| "grad_norm": 10.804591178894043, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8891, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.5786350148367952, | |
| "grad_norm": 10.0234956741333, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.906, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5830860534124629, | |
| "grad_norm": 4.883424758911133, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8422, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.5875370919881305, | |
| "grad_norm": 8.042715072631836, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8673, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.5919881305637982, | |
| "grad_norm": 6.206501007080078, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8115, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.5964391691394659, | |
| "grad_norm": 14.539153099060059, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8752, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.6008902077151336, | |
| "grad_norm": 9.136768341064453, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8752, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.6053412462908012, | |
| "grad_norm": 4.936409950256348, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8807, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.6097922848664689, | |
| "grad_norm": 12.717706680297852, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8133, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.6142433234421365, | |
| "grad_norm": 13.034161567687988, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8437, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.6186943620178041, | |
| "grad_norm": 4.76663064956665, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8141, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.6231454005934718, | |
| "grad_norm": 6.521324634552002, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8687, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6275964391691394, | |
| "grad_norm": 19.489913940429688, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8154, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.6320474777448071, | |
| "grad_norm": 13.211241722106934, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8417, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.6364985163204748, | |
| "grad_norm": 8.362677574157715, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8387, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.6409495548961425, | |
| "grad_norm": 13.372685432434082, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8563, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.6454005934718101, | |
| "grad_norm": 4.561835765838623, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8571, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.6498516320474778, | |
| "grad_norm": 12.67446231842041, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8822, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.6543026706231454, | |
| "grad_norm": 11.653807640075684, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8925, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.658753709198813, | |
| "grad_norm": 9.325271606445312, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7984, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.6632047477744807, | |
| "grad_norm": 5.574127674102783, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8253, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.6676557863501483, | |
| "grad_norm": 5.725862979888916, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9019, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.672106824925816, | |
| "grad_norm": 8.04867172241211, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8331, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.6765578635014837, | |
| "grad_norm": 8.826385498046875, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8056, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.6810089020771514, | |
| "grad_norm": 7.501665115356445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8137, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.685459940652819, | |
| "grad_norm": 3.8520801067352295, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8345, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.6899109792284867, | |
| "grad_norm": 15.48876953125, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.859, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.6943620178041543, | |
| "grad_norm": 12.54112720489502, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8799, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.6988130563798219, | |
| "grad_norm": 7.668098449707031, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8135, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.7032640949554896, | |
| "grad_norm": 13.388195991516113, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8104, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.7077151335311572, | |
| "grad_norm": 8.277421951293945, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.792, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.712166172106825, | |
| "grad_norm": 10.426804542541504, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8533, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.7166172106824926, | |
| "grad_norm": 15.068408012390137, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8912, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.7210682492581603, | |
| "grad_norm": 7.566452980041504, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8625, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.7255192878338279, | |
| "grad_norm": 14.359679222106934, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7973, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.7299703264094956, | |
| "grad_norm": 8.746999740600586, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9004, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.7344213649851632, | |
| "grad_norm": 15.901468276977539, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8499, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.7388724035608308, | |
| "grad_norm": 5.345223903656006, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9112, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.7433234421364985, | |
| "grad_norm": 8.00938892364502, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8521, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.7477744807121661, | |
| "grad_norm": 7.601090431213379, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8708, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.7522255192878339, | |
| "grad_norm": 14.48643970489502, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8476, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.7566765578635015, | |
| "grad_norm": 16.052143096923828, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8846, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.7611275964391692, | |
| "grad_norm": 11.447772979736328, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8062, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.7655786350148368, | |
| "grad_norm": 7.909060955047607, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8715, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.7700296735905044, | |
| "grad_norm": 10.86801528930664, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8102, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.7744807121661721, | |
| "grad_norm": 6.530400276184082, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8667, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.7789317507418397, | |
| "grad_norm": 13.691858291625977, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8159, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.7833827893175074, | |
| "grad_norm": 8.416064262390137, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8298, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.787833827893175, | |
| "grad_norm": 8.614116668701172, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8542, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.7922848664688428, | |
| "grad_norm": 10.818787574768066, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8476, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.7967359050445104, | |
| "grad_norm": 12.394455909729004, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.884, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.8011869436201781, | |
| "grad_norm": 7.285090446472168, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8706, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.8056379821958457, | |
| "grad_norm": 6.0529632568359375, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8389, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.8100890207715133, | |
| "grad_norm": 5.722935199737549, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9546, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.814540059347181, | |
| "grad_norm": 8.289714813232422, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8797, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.8189910979228486, | |
| "grad_norm": 11.81054973602295, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9221, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.8234421364985163, | |
| "grad_norm": 8.692460060119629, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8738, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.827893175074184, | |
| "grad_norm": 12.485095024108887, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.824, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.8323442136498517, | |
| "grad_norm": 15.875768661499023, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8862, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.8367952522255193, | |
| "grad_norm": 11.125205039978027, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8969, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.841246290801187, | |
| "grad_norm": 9.451800346374512, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8265, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.8456973293768546, | |
| "grad_norm": 8.435980796813965, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9067, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.8501483679525222, | |
| "grad_norm": 7.994401931762695, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8713, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.8545994065281899, | |
| "grad_norm": 8.249290466308594, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8681, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.8590504451038575, | |
| "grad_norm": 10.910624504089355, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8686, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.8635014836795252, | |
| "grad_norm": 9.459715843200684, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8453, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.8679525222551929, | |
| "grad_norm": 11.252153396606445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8494, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.8724035608308606, | |
| "grad_norm": 13.70508098602295, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8768, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.8768545994065282, | |
| "grad_norm": 5.890571117401123, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.855, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.8813056379821959, | |
| "grad_norm": 9.50145149230957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8869, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.8857566765578635, | |
| "grad_norm": 14.335087776184082, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8234, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.8902077151335311, | |
| "grad_norm": 7.306372165679932, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8548, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8946587537091988, | |
| "grad_norm": 8.26121711730957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8765, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 0.8991097922848664, | |
| "grad_norm": 9.257493019104004, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8978, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 0.9035608308605341, | |
| "grad_norm": 5.7045817375183105, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8315, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 0.9080118694362018, | |
| "grad_norm": 13.060375213623047, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8272, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 0.9124629080118695, | |
| "grad_norm": 7.294022560119629, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8223, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.9169139465875371, | |
| "grad_norm": 5.998393535614014, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8029, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 0.9213649851632048, | |
| "grad_norm": 4.604220390319824, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8613, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 0.9258160237388724, | |
| "grad_norm": 10.009174346923828, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.856, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 0.93026706231454, | |
| "grad_norm": 6.437814235687256, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9074, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 0.9347181008902077, | |
| "grad_norm": 16.38670539855957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8116, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.9391691394658753, | |
| "grad_norm": 7.195847511291504, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8411, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 0.9436201780415431, | |
| "grad_norm": 4.928689479827881, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8057, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 0.9480712166172107, | |
| "grad_norm": 4.637528896331787, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8651, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 0.9525222551928784, | |
| "grad_norm": 10.825565338134766, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8757, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 0.956973293768546, | |
| "grad_norm": 9.891039848327637, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7774, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.9614243323442137, | |
| "grad_norm": 6.30767297744751, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8346, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 0.9658753709198813, | |
| "grad_norm": 9.901351928710938, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8439, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 0.9703264094955489, | |
| "grad_norm": 8.528812408447266, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.922, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 0.9747774480712166, | |
| "grad_norm": 8.85007381439209, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8565, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 0.9792284866468842, | |
| "grad_norm": 7.137876510620117, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.886, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.983679525222552, | |
| "grad_norm": 12.115394592285156, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9001, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 0.9881305637982196, | |
| "grad_norm": 3.7681024074554443, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8983, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 0.9925816023738873, | |
| "grad_norm": 7.687930583953857, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8629, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 0.9970326409495549, | |
| "grad_norm": 5.996459007263184, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8284, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 2.075103521347046, | |
| "eval_runtime": 554.3952, | |
| "eval_samples_per_second": 2.781, | |
| "eval_steps_per_second": 0.348, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 1.0014836795252227, | |
| "grad_norm": 10.066774368286133, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8555, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.0059347181008902, | |
| "grad_norm": 6.481128692626953, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.838, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 1.010385756676558, | |
| "grad_norm": 12.874567985534668, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8869, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 1.0148367952522255, | |
| "grad_norm": 5.834901332855225, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8517, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 1.0192878338278932, | |
| "grad_norm": 15.476794242858887, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8777, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 1.0237388724035608, | |
| "grad_norm": 6.580478191375732, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.827, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.0281899109792285, | |
| "grad_norm": 9.56643009185791, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8886, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 1.032640949554896, | |
| "grad_norm": 4.428914546966553, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9552, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 1.0370919881305638, | |
| "grad_norm": 4.398239612579346, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.845, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 1.0415430267062316, | |
| "grad_norm": 5.540760040283203, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8776, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 1.045994065281899, | |
| "grad_norm": 15.209844589233398, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9569, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.0504451038575668, | |
| "grad_norm": 11.813831329345703, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8452, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 1.0548961424332344, | |
| "grad_norm": 8.536015510559082, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8486, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 1.0593471810089021, | |
| "grad_norm": 6.545129299163818, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8856, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 1.0637982195845697, | |
| "grad_norm": 8.14754581451416, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8801, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 1.0682492581602374, | |
| "grad_norm": 7.521109580993652, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8708, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.072700296735905, | |
| "grad_norm": 9.358808517456055, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8476, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 1.0771513353115727, | |
| "grad_norm": 6.190918922424316, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8605, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 1.0816023738872405, | |
| "grad_norm": 10.316658020019531, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8518, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 1.086053412462908, | |
| "grad_norm": 5.746811389923096, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8707, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 1.0905044510385757, | |
| "grad_norm": 7.9586663246154785, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8193, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.0949554896142433, | |
| "grad_norm": 6.76649808883667, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8491, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 1.099406528189911, | |
| "grad_norm": 7.164156436920166, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8137, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 1.1038575667655786, | |
| "grad_norm": 5.188474178314209, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8105, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 1.1083086053412463, | |
| "grad_norm": 11.81541633605957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8628, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 1.1127596439169138, | |
| "grad_norm": 6.901886940002441, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8279, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.1172106824925816, | |
| "grad_norm": 5.522708892822266, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.834, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 1.1216617210682494, | |
| "grad_norm": 10.341312408447266, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7452, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 1.1261127596439169, | |
| "grad_norm": 11.882563591003418, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8481, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 1.1305637982195846, | |
| "grad_norm": 4.872053146362305, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8284, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 1.1350148367952522, | |
| "grad_norm": 10.799345016479492, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8375, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.13946587537092, | |
| "grad_norm": 5.207538604736328, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8427, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 1.1439169139465875, | |
| "grad_norm": 12.862470626831055, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8248, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 1.1483679525222552, | |
| "grad_norm": 6.997878074645996, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8827, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 1.1528189910979227, | |
| "grad_norm": 5.541961669921875, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8391, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 1.1572700296735905, | |
| "grad_norm": 8.02748966217041, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8518, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.1617210682492582, | |
| "grad_norm": 10.839200973510742, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8715, | |
| "step": 783 | |
| }, | |
| { | |
| "epoch": 1.1661721068249258, | |
| "grad_norm": 6.69924259185791, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8505, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 1.1706231454005935, | |
| "grad_norm": 15.232388496398926, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8483, | |
| "step": 789 | |
| }, | |
| { | |
| "epoch": 1.175074183976261, | |
| "grad_norm": 7.842281341552734, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8434, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 1.1795252225519288, | |
| "grad_norm": 9.89548110961914, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8434, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 1.1839762611275964, | |
| "grad_norm": 12.349285125732422, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8522, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 1.188427299703264, | |
| "grad_norm": 6.074175834655762, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8639, | |
| "step": 801 | |
| }, | |
| { | |
| "epoch": 1.1928783382789319, | |
| "grad_norm": 12.941549301147461, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8215, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 1.1973293768545994, | |
| "grad_norm": 6.716182708740234, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8417, | |
| "step": 807 | |
| }, | |
| { | |
| "epoch": 1.2017804154302671, | |
| "grad_norm": 7.472183704376221, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8061, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.2062314540059347, | |
| "grad_norm": 9.26564884185791, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8834, | |
| "step": 813 | |
| }, | |
| { | |
| "epoch": 1.2106824925816024, | |
| "grad_norm": 12.621788024902344, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8299, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 1.21513353115727, | |
| "grad_norm": 8.867362976074219, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8374, | |
| "step": 819 | |
| }, | |
| { | |
| "epoch": 1.2195845697329377, | |
| "grad_norm": 12.072689056396484, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8223, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 1.2240356083086052, | |
| "grad_norm": 10.037847518920898, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7896, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 1.228486646884273, | |
| "grad_norm": 7.16823148727417, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7912, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 1.2329376854599405, | |
| "grad_norm": 13.862353324890137, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8545, | |
| "step": 831 | |
| }, | |
| { | |
| "epoch": 1.2373887240356083, | |
| "grad_norm": 6.668301582336426, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7566, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 1.241839762611276, | |
| "grad_norm": 10.224084854125977, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8056, | |
| "step": 837 | |
| }, | |
| { | |
| "epoch": 1.2462908011869436, | |
| "grad_norm": 6.452188014984131, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.82, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.2507418397626113, | |
| "grad_norm": 7.246963024139404, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8351, | |
| "step": 843 | |
| }, | |
| { | |
| "epoch": 1.2551928783382789, | |
| "grad_norm": 6.421880722045898, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8563, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 1.2596439169139466, | |
| "grad_norm": 6.952515602111816, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9159, | |
| "step": 849 | |
| }, | |
| { | |
| "epoch": 1.2640949554896141, | |
| "grad_norm": 4.98225212097168, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8609, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 1.268545994065282, | |
| "grad_norm": 7.60207462310791, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8765, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 1.2729970326409497, | |
| "grad_norm": 7.847710609436035, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8208, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 1.2774480712166172, | |
| "grad_norm": 8.309576988220215, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8579, | |
| "step": 861 | |
| }, | |
| { | |
| "epoch": 1.281899109792285, | |
| "grad_norm": 8.306654930114746, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8137, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 1.2863501483679525, | |
| "grad_norm": 7.458807945251465, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8503, | |
| "step": 867 | |
| }, | |
| { | |
| "epoch": 1.2908011869436202, | |
| "grad_norm": 5.15773344039917, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8074, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.2952522255192878, | |
| "grad_norm": 5.930022716522217, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7877, | |
| "step": 873 | |
| }, | |
| { | |
| "epoch": 1.2997032640949555, | |
| "grad_norm": 10.39821720123291, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7825, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 1.3041543026706233, | |
| "grad_norm": 10.691167831420898, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.84, | |
| "step": 879 | |
| }, | |
| { | |
| "epoch": 1.3086053412462908, | |
| "grad_norm": 11.49881362915039, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8858, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 1.3130563798219583, | |
| "grad_norm": 8.16782283782959, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8237, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 1.317507418397626, | |
| "grad_norm": 5.213159561157227, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.776, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 1.3219584569732938, | |
| "grad_norm": 7.029541969299316, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8448, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 1.3264094955489614, | |
| "grad_norm": 5.071165084838867, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.915, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 1.3308605341246291, | |
| "grad_norm": 8.019569396972656, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.873, | |
| "step": 897 | |
| }, | |
| { | |
| "epoch": 1.3353115727002967, | |
| "grad_norm": 8.721610069274902, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8049, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.3397626112759644, | |
| "grad_norm": 15.425809860229492, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8703, | |
| "step": 903 | |
| }, | |
| { | |
| "epoch": 1.344213649851632, | |
| "grad_norm": 10.104765892028809, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8404, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 1.3486646884272997, | |
| "grad_norm": 6.787662506103516, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8585, | |
| "step": 909 | |
| }, | |
| { | |
| "epoch": 1.3531157270029674, | |
| "grad_norm": 10.807848930358887, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8074, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 1.357566765578635, | |
| "grad_norm": 4.6103129386901855, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8287, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 1.3620178041543027, | |
| "grad_norm": 7.826140880584717, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8319, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 1.3664688427299703, | |
| "grad_norm": 4.535531044006348, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8546, | |
| "step": 921 | |
| }, | |
| { | |
| "epoch": 1.370919881305638, | |
| "grad_norm": 6.764124870300293, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8228, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 1.3753709198813056, | |
| "grad_norm": 9.835914611816406, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8356, | |
| "step": 927 | |
| }, | |
| { | |
| "epoch": 1.3798219584569733, | |
| "grad_norm": 10.747434616088867, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8648, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.384272997032641, | |
| "grad_norm": 7.541149139404297, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8486, | |
| "step": 933 | |
| }, | |
| { | |
| "epoch": 1.3887240356083086, | |
| "grad_norm": 6.978203296661377, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8054, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 1.3931750741839761, | |
| "grad_norm": 15.082099914550781, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8219, | |
| "step": 939 | |
| }, | |
| { | |
| "epoch": 1.3976261127596439, | |
| "grad_norm": 8.855502128601074, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8023, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 1.4020771513353116, | |
| "grad_norm": 4.794929027557373, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8232, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 1.4065281899109792, | |
| "grad_norm": 5.567296028137207, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7989, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 1.410979228486647, | |
| "grad_norm": 9.492593765258789, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8452, | |
| "step": 951 | |
| }, | |
| { | |
| "epoch": 1.4154302670623147, | |
| "grad_norm": 7.953827857971191, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8377, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 1.4198813056379822, | |
| "grad_norm": 9.085283279418945, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8983, | |
| "step": 957 | |
| }, | |
| { | |
| "epoch": 1.4243323442136497, | |
| "grad_norm": 8.406304359436035, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8422, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.4287833827893175, | |
| "grad_norm": 5.686973571777344, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8542, | |
| "step": 963 | |
| }, | |
| { | |
| "epoch": 1.4332344213649852, | |
| "grad_norm": 9.381924629211426, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8904, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 1.4376854599406528, | |
| "grad_norm": 4.451043128967285, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8172, | |
| "step": 969 | |
| }, | |
| { | |
| "epoch": 1.4421364985163205, | |
| "grad_norm": 7.336870193481445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8578, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 1.446587537091988, | |
| "grad_norm": 8.10446548461914, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8826, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 1.4510385756676558, | |
| "grad_norm": 8.376605033874512, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8302, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 1.4554896142433233, | |
| "grad_norm": 11.178180694580078, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8797, | |
| "step": 981 | |
| }, | |
| { | |
| "epoch": 1.459940652818991, | |
| "grad_norm": 10.056670188903809, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8314, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 1.4643916913946589, | |
| "grad_norm": 5.353207588195801, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.823, | |
| "step": 987 | |
| }, | |
| { | |
| "epoch": 1.4688427299703264, | |
| "grad_norm": 10.648890495300293, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8789, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.4732937685459941, | |
| "grad_norm": 5.265453338623047, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8392, | |
| "step": 993 | |
| }, | |
| { | |
| "epoch": 1.4777448071216617, | |
| "grad_norm": 4.404312610626221, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8659, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 1.4821958456973294, | |
| "grad_norm": 7.063133716583252, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8903, | |
| "step": 999 | |
| }, | |
| { | |
| "epoch": 1.486646884272997, | |
| "grad_norm": 12.400032043457031, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8696, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 1.4910979228486647, | |
| "grad_norm": 8.297316551208496, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9001, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 1.4955489614243325, | |
| "grad_norm": 11.91292667388916, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8941, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 6.494741916656494, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8534, | |
| "step": 1011 | |
| }, | |
| { | |
| "epoch": 1.5044510385756675, | |
| "grad_norm": 11.065376281738281, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8624, | |
| "step": 1014 | |
| }, | |
| { | |
| "epoch": 1.5089020771513353, | |
| "grad_norm": 7.178919315338135, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8742, | |
| "step": 1017 | |
| }, | |
| { | |
| "epoch": 1.513353115727003, | |
| "grad_norm": 5.641129493713379, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8493, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.5178041543026706, | |
| "grad_norm": 14.277860641479492, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8686, | |
| "step": 1023 | |
| }, | |
| { | |
| "epoch": 1.5222551928783383, | |
| "grad_norm": 9.708137512207031, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.83, | |
| "step": 1026 | |
| }, | |
| { | |
| "epoch": 1.526706231454006, | |
| "grad_norm": 7.91434383392334, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.841, | |
| "step": 1029 | |
| }, | |
| { | |
| "epoch": 1.5311572700296736, | |
| "grad_norm": 14.816337585449219, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9307, | |
| "step": 1032 | |
| }, | |
| { | |
| "epoch": 1.5356083086053411, | |
| "grad_norm": 12.463879585266113, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8844, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 1.540059347181009, | |
| "grad_norm": 7.6568217277526855, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9198, | |
| "step": 1038 | |
| }, | |
| { | |
| "epoch": 1.5445103857566767, | |
| "grad_norm": 11.649917602539062, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9174, | |
| "step": 1041 | |
| }, | |
| { | |
| "epoch": 1.5489614243323442, | |
| "grad_norm": 9.973616600036621, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9474, | |
| "step": 1044 | |
| }, | |
| { | |
| "epoch": 1.5534124629080117, | |
| "grad_norm": 11.569575309753418, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9166, | |
| "step": 1047 | |
| }, | |
| { | |
| "epoch": 1.5578635014836797, | |
| "grad_norm": 11.283512115478516, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8188, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.5623145400593472, | |
| "grad_norm": 6.829236030578613, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8318, | |
| "step": 1053 | |
| }, | |
| { | |
| "epoch": 1.5667655786350148, | |
| "grad_norm": 10.211257934570312, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8139, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 1.5712166172106825, | |
| "grad_norm": 6.259841442108154, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8444, | |
| "step": 1059 | |
| }, | |
| { | |
| "epoch": 1.5756676557863503, | |
| "grad_norm": 14.19024658203125, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8641, | |
| "step": 1062 | |
| }, | |
| { | |
| "epoch": 1.5801186943620178, | |
| "grad_norm": 6.3594231605529785, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9041, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 1.5845697329376853, | |
| "grad_norm": 9.81156063079834, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8703, | |
| "step": 1068 | |
| }, | |
| { | |
| "epoch": 1.589020771513353, | |
| "grad_norm": 6.122777938842773, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8309, | |
| "step": 1071 | |
| }, | |
| { | |
| "epoch": 1.5934718100890208, | |
| "grad_norm": 11.714445114135742, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8151, | |
| "step": 1074 | |
| }, | |
| { | |
| "epoch": 1.5979228486646884, | |
| "grad_norm": 12.073863983154297, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9148, | |
| "step": 1077 | |
| }, | |
| { | |
| "epoch": 1.6023738872403561, | |
| "grad_norm": 8.177748680114746, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.865, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.6068249258160239, | |
| "grad_norm": 13.913122177124023, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8409, | |
| "step": 1083 | |
| }, | |
| { | |
| "epoch": 1.6112759643916914, | |
| "grad_norm": 8.375801086425781, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7768, | |
| "step": 1086 | |
| }, | |
| { | |
| "epoch": 1.615727002967359, | |
| "grad_norm": 6.173603057861328, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8343, | |
| "step": 1089 | |
| }, | |
| { | |
| "epoch": 1.6201780415430267, | |
| "grad_norm": 10.390620231628418, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8338, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 1.6246290801186944, | |
| "grad_norm": 8.413612365722656, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8517, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 1.629080118694362, | |
| "grad_norm": 9.790428161621094, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8443, | |
| "step": 1098 | |
| }, | |
| { | |
| "epoch": 1.6335311572700295, | |
| "grad_norm": 13.228864669799805, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8481, | |
| "step": 1101 | |
| }, | |
| { | |
| "epoch": 1.6379821958456975, | |
| "grad_norm": 11.918046951293945, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8901, | |
| "step": 1104 | |
| }, | |
| { | |
| "epoch": 1.642433234421365, | |
| "grad_norm": 6.354975700378418, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8817, | |
| "step": 1107 | |
| }, | |
| { | |
| "epoch": 1.6468842729970326, | |
| "grad_norm": 10.373885154724121, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8443, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.6513353115727003, | |
| "grad_norm": 7.181490421295166, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.857, | |
| "step": 1113 | |
| }, | |
| { | |
| "epoch": 1.655786350148368, | |
| "grad_norm": 8.490324020385742, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8714, | |
| "step": 1116 | |
| }, | |
| { | |
| "epoch": 1.6602373887240356, | |
| "grad_norm": 5.962569236755371, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7782, | |
| "step": 1119 | |
| }, | |
| { | |
| "epoch": 1.6646884272997031, | |
| "grad_norm": 7.268184185028076, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8829, | |
| "step": 1122 | |
| }, | |
| { | |
| "epoch": 1.6691394658753709, | |
| "grad_norm": 9.73929500579834, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8069, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 1.6735905044510386, | |
| "grad_norm": 8.92696762084961, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8727, | |
| "step": 1128 | |
| }, | |
| { | |
| "epoch": 1.6780415430267062, | |
| "grad_norm": 7.317033767700195, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8104, | |
| "step": 1131 | |
| }, | |
| { | |
| "epoch": 1.682492581602374, | |
| "grad_norm": 6.796001434326172, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9314, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 1.6869436201780417, | |
| "grad_norm": 8.300507545471191, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8108, | |
| "step": 1137 | |
| }, | |
| { | |
| "epoch": 1.6913946587537092, | |
| "grad_norm": 14.353339195251465, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8735, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.6958456973293767, | |
| "grad_norm": 8.713440895080566, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8634, | |
| "step": 1143 | |
| }, | |
| { | |
| "epoch": 1.7002967359050445, | |
| "grad_norm": 12.015419960021973, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8491, | |
| "step": 1146 | |
| }, | |
| { | |
| "epoch": 1.7047477744807122, | |
| "grad_norm": 5.322451114654541, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8491, | |
| "step": 1149 | |
| }, | |
| { | |
| "epoch": 1.7091988130563798, | |
| "grad_norm": 7.663971900939941, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.841, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 1.7136498516320475, | |
| "grad_norm": 9.272565841674805, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9399, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 1.7181008902077153, | |
| "grad_norm": 6.013884544372559, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8239, | |
| "step": 1158 | |
| }, | |
| { | |
| "epoch": 1.7225519287833828, | |
| "grad_norm": 9.190864562988281, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8211, | |
| "step": 1161 | |
| }, | |
| { | |
| "epoch": 1.7270029673590503, | |
| "grad_norm": 9.801536560058594, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8571, | |
| "step": 1164 | |
| }, | |
| { | |
| "epoch": 1.731454005934718, | |
| "grad_norm": 13.254154205322266, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8762, | |
| "step": 1167 | |
| }, | |
| { | |
| "epoch": 1.7359050445103859, | |
| "grad_norm": 10.48544979095459, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7882, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.7403560830860534, | |
| "grad_norm": 9.48491382598877, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8939, | |
| "step": 1173 | |
| }, | |
| { | |
| "epoch": 1.744807121661721, | |
| "grad_norm": 8.662673950195312, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8562, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 1.7492581602373887, | |
| "grad_norm": 11.683974266052246, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8178, | |
| "step": 1179 | |
| }, | |
| { | |
| "epoch": 1.7537091988130564, | |
| "grad_norm": 17.12523078918457, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8192, | |
| "step": 1182 | |
| }, | |
| { | |
| "epoch": 1.758160237388724, | |
| "grad_norm": 4.6900835037231445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8683, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 1.7626112759643917, | |
| "grad_norm": 7.892794132232666, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8594, | |
| "step": 1188 | |
| }, | |
| { | |
| "epoch": 1.7670623145400595, | |
| "grad_norm": 9.247455596923828, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.858, | |
| "step": 1191 | |
| }, | |
| { | |
| "epoch": 1.771513353115727, | |
| "grad_norm": 7.50583028793335, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7865, | |
| "step": 1194 | |
| }, | |
| { | |
| "epoch": 1.7759643916913945, | |
| "grad_norm": 4.668313503265381, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8215, | |
| "step": 1197 | |
| }, | |
| { | |
| "epoch": 1.7804154302670623, | |
| "grad_norm": 10.414295196533203, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8199, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.78486646884273, | |
| "grad_norm": 4.297197341918945, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8129, | |
| "step": 1203 | |
| }, | |
| { | |
| "epoch": 1.7893175074183976, | |
| "grad_norm": 9.394143104553223, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8624, | |
| "step": 1206 | |
| }, | |
| { | |
| "epoch": 1.7937685459940653, | |
| "grad_norm": 8.61468505859375, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.797, | |
| "step": 1209 | |
| }, | |
| { | |
| "epoch": 1.798219584569733, | |
| "grad_norm": 8.216081619262695, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8217, | |
| "step": 1212 | |
| }, | |
| { | |
| "epoch": 1.8026706231454006, | |
| "grad_norm": 7.414550304412842, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8259, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 1.8071216617210681, | |
| "grad_norm": 7.1664042472839355, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8568, | |
| "step": 1218 | |
| }, | |
| { | |
| "epoch": 1.811572700296736, | |
| "grad_norm": 6.590891361236572, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8269, | |
| "step": 1221 | |
| }, | |
| { | |
| "epoch": 1.8160237388724036, | |
| "grad_norm": 8.408268928527832, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9052, | |
| "step": 1224 | |
| }, | |
| { | |
| "epoch": 1.8204747774480712, | |
| "grad_norm": 19.62491226196289, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8959, | |
| "step": 1227 | |
| }, | |
| { | |
| "epoch": 1.8249258160237387, | |
| "grad_norm": 11.636604309082031, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8714, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.8293768545994067, | |
| "grad_norm": 8.018316268920898, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8673, | |
| "step": 1233 | |
| }, | |
| { | |
| "epoch": 1.8338278931750742, | |
| "grad_norm": 4.958278179168701, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8551, | |
| "step": 1236 | |
| }, | |
| { | |
| "epoch": 1.8382789317507418, | |
| "grad_norm": 13.244430541992188, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8011, | |
| "step": 1239 | |
| }, | |
| { | |
| "epoch": 1.8427299703264095, | |
| "grad_norm": 7.185425281524658, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7873, | |
| "step": 1242 | |
| }, | |
| { | |
| "epoch": 1.8471810089020773, | |
| "grad_norm": 5.537222862243652, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8237, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 1.8516320474777448, | |
| "grad_norm": 5.888150215148926, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8278, | |
| "step": 1248 | |
| }, | |
| { | |
| "epoch": 1.8560830860534123, | |
| "grad_norm": 7.887198448181152, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8399, | |
| "step": 1251 | |
| }, | |
| { | |
| "epoch": 1.86053412462908, | |
| "grad_norm": 8.108527183532715, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8711, | |
| "step": 1254 | |
| }, | |
| { | |
| "epoch": 1.8649851632047478, | |
| "grad_norm": 4.459034442901611, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9171, | |
| "step": 1257 | |
| }, | |
| { | |
| "epoch": 1.8694362017804154, | |
| "grad_norm": 4.293658256530762, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.882, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.8738872403560831, | |
| "grad_norm": 6.042054176330566, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8623, | |
| "step": 1263 | |
| }, | |
| { | |
| "epoch": 1.8783382789317509, | |
| "grad_norm": 11.530425071716309, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9091, | |
| "step": 1266 | |
| }, | |
| { | |
| "epoch": 1.8827893175074184, | |
| "grad_norm": 7.389677047729492, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8579, | |
| "step": 1269 | |
| }, | |
| { | |
| "epoch": 1.887240356083086, | |
| "grad_norm": 10.24569034576416, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8281, | |
| "step": 1272 | |
| }, | |
| { | |
| "epoch": 1.8916913946587537, | |
| "grad_norm": 9.817954063415527, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8575, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 1.8961424332344214, | |
| "grad_norm": 11.875582695007324, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8375, | |
| "step": 1278 | |
| }, | |
| { | |
| "epoch": 1.900593471810089, | |
| "grad_norm": 7.8601837158203125, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.875, | |
| "step": 1281 | |
| }, | |
| { | |
| "epoch": 1.9050445103857567, | |
| "grad_norm": 6.628482341766357, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8568, | |
| "step": 1284 | |
| }, | |
| { | |
| "epoch": 1.9094955489614245, | |
| "grad_norm": 14.135390281677246, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7945, | |
| "step": 1287 | |
| }, | |
| { | |
| "epoch": 1.913946587537092, | |
| "grad_norm": 7.571518421173096, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8387, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.9183976261127595, | |
| "grad_norm": 4.84207010269165, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8573, | |
| "step": 1293 | |
| }, | |
| { | |
| "epoch": 1.9228486646884273, | |
| "grad_norm": 7.50210428237915, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8396, | |
| "step": 1296 | |
| }, | |
| { | |
| "epoch": 1.927299703264095, | |
| "grad_norm": 10.158517837524414, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8407, | |
| "step": 1299 | |
| }, | |
| { | |
| "epoch": 1.9317507418397626, | |
| "grad_norm": 4.945800304412842, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.849, | |
| "step": 1302 | |
| }, | |
| { | |
| "epoch": 1.9362017804154301, | |
| "grad_norm": 5.40016508102417, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7703, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 1.9406528189910979, | |
| "grad_norm": 7.8204665184021, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8625, | |
| "step": 1308 | |
| }, | |
| { | |
| "epoch": 1.9451038575667656, | |
| "grad_norm": 6.786766052246094, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8816, | |
| "step": 1311 | |
| }, | |
| { | |
| "epoch": 1.9495548961424332, | |
| "grad_norm": 6.751473903656006, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8586, | |
| "step": 1314 | |
| }, | |
| { | |
| "epoch": 1.954005934718101, | |
| "grad_norm": 9.781673431396484, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8348, | |
| "step": 1317 | |
| }, | |
| { | |
| "epoch": 1.9584569732937687, | |
| "grad_norm": 14.07801628112793, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9011, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.9629080118694362, | |
| "grad_norm": 10.769022941589355, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8547, | |
| "step": 1323 | |
| }, | |
| { | |
| "epoch": 1.9673590504451037, | |
| "grad_norm": 5.165210723876953, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8819, | |
| "step": 1326 | |
| }, | |
| { | |
| "epoch": 1.9718100890207715, | |
| "grad_norm": 6.151379108428955, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8767, | |
| "step": 1329 | |
| }, | |
| { | |
| "epoch": 1.9762611275964392, | |
| "grad_norm": 8.154912948608398, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8856, | |
| "step": 1332 | |
| }, | |
| { | |
| "epoch": 1.9807121661721068, | |
| "grad_norm": 7.511419773101807, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.835, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 1.9851632047477745, | |
| "grad_norm": 8.648750305175781, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8391, | |
| "step": 1338 | |
| }, | |
| { | |
| "epoch": 1.9896142433234423, | |
| "grad_norm": 5.8288984298706055, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8746, | |
| "step": 1341 | |
| }, | |
| { | |
| "epoch": 1.9940652818991098, | |
| "grad_norm": 7.342560768127441, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8591, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 1.9985163204747773, | |
| "grad_norm": 9.906895637512207, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8925, | |
| "step": 1347 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 2.071904420852661, | |
| "eval_runtime": 553.6606, | |
| "eval_samples_per_second": 2.785, | |
| "eval_steps_per_second": 0.349, | |
| "step": 1348 | |
| }, | |
| { | |
| "epoch": 2.0029673590504453, | |
| "grad_norm": 11.174464225769043, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8723, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.007418397626113, | |
| "grad_norm": 6.163029193878174, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8487, | |
| "step": 1353 | |
| }, | |
| { | |
| "epoch": 2.0118694362017804, | |
| "grad_norm": 11.33940601348877, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8795, | |
| "step": 1356 | |
| }, | |
| { | |
| "epoch": 2.016320474777448, | |
| "grad_norm": 15.676403999328613, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.91, | |
| "step": 1359 | |
| }, | |
| { | |
| "epoch": 2.020771513353116, | |
| "grad_norm": 13.067048072814941, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8249, | |
| "step": 1362 | |
| }, | |
| { | |
| "epoch": 2.0252225519287834, | |
| "grad_norm": 9.354158401489258, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8865, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 2.029673590504451, | |
| "grad_norm": 5.574648380279541, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8637, | |
| "step": 1368 | |
| }, | |
| { | |
| "epoch": 2.0341246290801185, | |
| "grad_norm": 15.917570114135742, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8854, | |
| "step": 1371 | |
| }, | |
| { | |
| "epoch": 2.0385756676557865, | |
| "grad_norm": 5.499011516571045, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8468, | |
| "step": 1374 | |
| }, | |
| { | |
| "epoch": 2.043026706231454, | |
| "grad_norm": 15.698616027832031, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.821, | |
| "step": 1377 | |
| }, | |
| { | |
| "epoch": 2.0474777448071215, | |
| "grad_norm": 6.169116497039795, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8785, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.0519287833827895, | |
| "grad_norm": 8.438339233398438, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8298, | |
| "step": 1383 | |
| }, | |
| { | |
| "epoch": 2.056379821958457, | |
| "grad_norm": 13.13904857635498, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8698, | |
| "step": 1386 | |
| }, | |
| { | |
| "epoch": 2.0608308605341246, | |
| "grad_norm": 5.194973468780518, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8581, | |
| "step": 1389 | |
| }, | |
| { | |
| "epoch": 2.065281899109792, | |
| "grad_norm": 6.09019660949707, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8442, | |
| "step": 1392 | |
| }, | |
| { | |
| "epoch": 2.06973293768546, | |
| "grad_norm": 12.113000869750977, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8349, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 2.0741839762611276, | |
| "grad_norm": 8.027348518371582, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8507, | |
| "step": 1398 | |
| }, | |
| { | |
| "epoch": 2.078635014836795, | |
| "grad_norm": 11.222186088562012, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8345, | |
| "step": 1401 | |
| }, | |
| { | |
| "epoch": 2.083086053412463, | |
| "grad_norm": 7.976278781890869, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8036, | |
| "step": 1404 | |
| }, | |
| { | |
| "epoch": 2.0875370919881306, | |
| "grad_norm": 9.854942321777344, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8363, | |
| "step": 1407 | |
| }, | |
| { | |
| "epoch": 2.091988130563798, | |
| "grad_norm": 11.801050186157227, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7906, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.0964391691394657, | |
| "grad_norm": 9.733396530151367, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8425, | |
| "step": 1413 | |
| }, | |
| { | |
| "epoch": 2.1008902077151337, | |
| "grad_norm": 11.16501235961914, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7729, | |
| "step": 1416 | |
| }, | |
| { | |
| "epoch": 2.105341246290801, | |
| "grad_norm": 10.145631790161133, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8017, | |
| "step": 1419 | |
| }, | |
| { | |
| "epoch": 2.1097922848664687, | |
| "grad_norm": 4.5289764404296875, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9057, | |
| "step": 1422 | |
| }, | |
| { | |
| "epoch": 2.1142433234421363, | |
| "grad_norm": 6.727800369262695, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8651, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 2.1186943620178043, | |
| "grad_norm": 11.357308387756348, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8358, | |
| "step": 1428 | |
| }, | |
| { | |
| "epoch": 2.123145400593472, | |
| "grad_norm": 6.047675609588623, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8265, | |
| "step": 1431 | |
| }, | |
| { | |
| "epoch": 2.1275964391691393, | |
| "grad_norm": 8.08861255645752, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8374, | |
| "step": 1434 | |
| }, | |
| { | |
| "epoch": 2.1320474777448073, | |
| "grad_norm": 7.7563958168029785, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8818, | |
| "step": 1437 | |
| }, | |
| { | |
| "epoch": 2.136498516320475, | |
| "grad_norm": 7.988875865936279, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8241, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.1409495548961424, | |
| "grad_norm": 12.524341583251953, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7918, | |
| "step": 1443 | |
| }, | |
| { | |
| "epoch": 2.14540059347181, | |
| "grad_norm": 6.229768753051758, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.862, | |
| "step": 1446 | |
| }, | |
| { | |
| "epoch": 2.149851632047478, | |
| "grad_norm": 8.271695137023926, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8522, | |
| "step": 1449 | |
| }, | |
| { | |
| "epoch": 2.1543026706231454, | |
| "grad_norm": 5.045875072479248, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8574, | |
| "step": 1452 | |
| }, | |
| { | |
| "epoch": 2.158753709198813, | |
| "grad_norm": 11.379587173461914, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8524, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 2.163204747774481, | |
| "grad_norm": 8.184687614440918, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.84, | |
| "step": 1458 | |
| }, | |
| { | |
| "epoch": 2.1676557863501484, | |
| "grad_norm": 9.615589141845703, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8334, | |
| "step": 1461 | |
| }, | |
| { | |
| "epoch": 2.172106824925816, | |
| "grad_norm": 18.80459213256836, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.805, | |
| "step": 1464 | |
| }, | |
| { | |
| "epoch": 2.1765578635014835, | |
| "grad_norm": 14.540130615234375, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8749, | |
| "step": 1467 | |
| }, | |
| { | |
| "epoch": 2.1810089020771515, | |
| "grad_norm": 6.465779781341553, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8481, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.185459940652819, | |
| "grad_norm": 9.467011451721191, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8818, | |
| "step": 1473 | |
| }, | |
| { | |
| "epoch": 2.1899109792284865, | |
| "grad_norm": 11.624500274658203, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8488, | |
| "step": 1476 | |
| }, | |
| { | |
| "epoch": 2.1943620178041545, | |
| "grad_norm": 4.053292751312256, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8816, | |
| "step": 1479 | |
| }, | |
| { | |
| "epoch": 2.198813056379822, | |
| "grad_norm": 11.990628242492676, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8632, | |
| "step": 1482 | |
| }, | |
| { | |
| "epoch": 2.2032640949554896, | |
| "grad_norm": 5.125602722167969, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8546, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 2.207715133531157, | |
| "grad_norm": 12.101594924926758, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7745, | |
| "step": 1488 | |
| }, | |
| { | |
| "epoch": 2.212166172106825, | |
| "grad_norm": 7.778988838195801, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8345, | |
| "step": 1491 | |
| }, | |
| { | |
| "epoch": 2.2166172106824926, | |
| "grad_norm": 9.549551010131836, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.788, | |
| "step": 1494 | |
| }, | |
| { | |
| "epoch": 2.22106824925816, | |
| "grad_norm": 9.322439193725586, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8578, | |
| "step": 1497 | |
| }, | |
| { | |
| "epoch": 2.2255192878338277, | |
| "grad_norm": 4.3148298263549805, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8553, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.2299703264094957, | |
| "grad_norm": 8.451520919799805, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8682, | |
| "step": 1503 | |
| }, | |
| { | |
| "epoch": 2.234421364985163, | |
| "grad_norm": 6.928389072418213, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8307, | |
| "step": 1506 | |
| }, | |
| { | |
| "epoch": 2.2388724035608307, | |
| "grad_norm": 6.243911266326904, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8033, | |
| "step": 1509 | |
| }, | |
| { | |
| "epoch": 2.2433234421364987, | |
| "grad_norm": 5.559226036071777, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8383, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 2.2477744807121662, | |
| "grad_norm": 4.369063854217529, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8009, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 2.2522255192878338, | |
| "grad_norm": 7.634733200073242, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7984, | |
| "step": 1518 | |
| }, | |
| { | |
| "epoch": 2.2566765578635013, | |
| "grad_norm": 6.254056453704834, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8802, | |
| "step": 1521 | |
| }, | |
| { | |
| "epoch": 2.2611275964391693, | |
| "grad_norm": 5.46887731552124, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8178, | |
| "step": 1524 | |
| }, | |
| { | |
| "epoch": 2.265578635014837, | |
| "grad_norm": 16.369176864624023, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8448, | |
| "step": 1527 | |
| }, | |
| { | |
| "epoch": 2.2700296735905043, | |
| "grad_norm": 7.460346221923828, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8313, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.274480712166172, | |
| "grad_norm": 11.850396156311035, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8652, | |
| "step": 1533 | |
| }, | |
| { | |
| "epoch": 2.27893175074184, | |
| "grad_norm": 7.525960445404053, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9073, | |
| "step": 1536 | |
| }, | |
| { | |
| "epoch": 2.2833827893175074, | |
| "grad_norm": 6.6893391609191895, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8186, | |
| "step": 1539 | |
| }, | |
| { | |
| "epoch": 2.287833827893175, | |
| "grad_norm": 8.127947807312012, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8062, | |
| "step": 1542 | |
| }, | |
| { | |
| "epoch": 2.292284866468843, | |
| "grad_norm": 4.763282299041748, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7815, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 2.2967359050445104, | |
| "grad_norm": 8.980463981628418, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8257, | |
| "step": 1548 | |
| }, | |
| { | |
| "epoch": 2.301186943620178, | |
| "grad_norm": 5.902709484100342, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8655, | |
| "step": 1551 | |
| }, | |
| { | |
| "epoch": 2.3056379821958455, | |
| "grad_norm": 9.1312255859375, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8125, | |
| "step": 1554 | |
| }, | |
| { | |
| "epoch": 2.3100890207715135, | |
| "grad_norm": 11.039669036865234, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8216, | |
| "step": 1557 | |
| }, | |
| { | |
| "epoch": 2.314540059347181, | |
| "grad_norm": 8.05490779876709, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8396, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.3189910979228485, | |
| "grad_norm": 5.826514720916748, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8757, | |
| "step": 1563 | |
| }, | |
| { | |
| "epoch": 2.3234421364985165, | |
| "grad_norm": 7.574896812438965, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7736, | |
| "step": 1566 | |
| }, | |
| { | |
| "epoch": 2.327893175074184, | |
| "grad_norm": 6.01354455947876, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8401, | |
| "step": 1569 | |
| }, | |
| { | |
| "epoch": 2.3323442136498516, | |
| "grad_norm": 6.542453289031982, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8068, | |
| "step": 1572 | |
| }, | |
| { | |
| "epoch": 2.336795252225519, | |
| "grad_norm": 9.089799880981445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8628, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 2.341246290801187, | |
| "grad_norm": 6.65020227432251, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8298, | |
| "step": 1578 | |
| }, | |
| { | |
| "epoch": 2.3456973293768546, | |
| "grad_norm": 6.966747760772705, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8445, | |
| "step": 1581 | |
| }, | |
| { | |
| "epoch": 2.350148367952522, | |
| "grad_norm": 8.938283920288086, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8404, | |
| "step": 1584 | |
| }, | |
| { | |
| "epoch": 2.35459940652819, | |
| "grad_norm": 7.403584957122803, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8531, | |
| "step": 1587 | |
| }, | |
| { | |
| "epoch": 2.3590504451038576, | |
| "grad_norm": 5.015456199645996, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8488, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.363501483679525, | |
| "grad_norm": 10.829426765441895, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8292, | |
| "step": 1593 | |
| }, | |
| { | |
| "epoch": 2.3679525222551927, | |
| "grad_norm": 10.542449951171875, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9097, | |
| "step": 1596 | |
| }, | |
| { | |
| "epoch": 2.3724035608308607, | |
| "grad_norm": 5.919280529022217, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.857, | |
| "step": 1599 | |
| }, | |
| { | |
| "epoch": 2.376854599406528, | |
| "grad_norm": 12.15097713470459, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8439, | |
| "step": 1602 | |
| }, | |
| { | |
| "epoch": 2.3813056379821957, | |
| "grad_norm": 12.634583473205566, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8838, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 2.3857566765578637, | |
| "grad_norm": 9.54806900024414, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9001, | |
| "step": 1608 | |
| }, | |
| { | |
| "epoch": 2.3902077151335313, | |
| "grad_norm": 5.300346851348877, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.84, | |
| "step": 1611 | |
| }, | |
| { | |
| "epoch": 2.394658753709199, | |
| "grad_norm": 6.94837760925293, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7643, | |
| "step": 1614 | |
| }, | |
| { | |
| "epoch": 2.3991097922848663, | |
| "grad_norm": 12.666196823120117, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8797, | |
| "step": 1617 | |
| }, | |
| { | |
| "epoch": 2.4035608308605343, | |
| "grad_norm": 11.400768280029297, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8755, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.408011869436202, | |
| "grad_norm": 9.61484146118164, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8662, | |
| "step": 1623 | |
| }, | |
| { | |
| "epoch": 2.4124629080118694, | |
| "grad_norm": 7.2894110679626465, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7903, | |
| "step": 1626 | |
| }, | |
| { | |
| "epoch": 2.4169139465875373, | |
| "grad_norm": 4.545930862426758, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.862, | |
| "step": 1629 | |
| }, | |
| { | |
| "epoch": 2.421364985163205, | |
| "grad_norm": 16.610261917114258, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8769, | |
| "step": 1632 | |
| }, | |
| { | |
| "epoch": 2.4258160237388724, | |
| "grad_norm": 14.895539283752441, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8884, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 2.43026706231454, | |
| "grad_norm": 6.956692218780518, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8112, | |
| "step": 1638 | |
| }, | |
| { | |
| "epoch": 2.434718100890208, | |
| "grad_norm": 8.233116149902344, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8461, | |
| "step": 1641 | |
| }, | |
| { | |
| "epoch": 2.4391691394658754, | |
| "grad_norm": 9.529879570007324, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8649, | |
| "step": 1644 | |
| }, | |
| { | |
| "epoch": 2.443620178041543, | |
| "grad_norm": 7.341912269592285, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8393, | |
| "step": 1647 | |
| }, | |
| { | |
| "epoch": 2.4480712166172105, | |
| "grad_norm": 7.184902667999268, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8309, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.4525222551928785, | |
| "grad_norm": 14.62401008605957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8549, | |
| "step": 1653 | |
| }, | |
| { | |
| "epoch": 2.456973293768546, | |
| "grad_norm": 5.0358662605285645, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8726, | |
| "step": 1656 | |
| }, | |
| { | |
| "epoch": 2.4614243323442135, | |
| "grad_norm": 3.3150253295898438, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8276, | |
| "step": 1659 | |
| }, | |
| { | |
| "epoch": 2.465875370919881, | |
| "grad_norm": 8.129389762878418, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8692, | |
| "step": 1662 | |
| }, | |
| { | |
| "epoch": 2.470326409495549, | |
| "grad_norm": 8.148307800292969, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8713, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 2.4747774480712166, | |
| "grad_norm": 5.398252487182617, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8615, | |
| "step": 1668 | |
| }, | |
| { | |
| "epoch": 2.479228486646884, | |
| "grad_norm": 4.667980194091797, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8324, | |
| "step": 1671 | |
| }, | |
| { | |
| "epoch": 2.483679525222552, | |
| "grad_norm": 6.927284240722656, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8589, | |
| "step": 1674 | |
| }, | |
| { | |
| "epoch": 2.4881305637982196, | |
| "grad_norm": 11.005992889404297, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8564, | |
| "step": 1677 | |
| }, | |
| { | |
| "epoch": 2.492581602373887, | |
| "grad_norm": 16.280454635620117, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8761, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.4970326409495547, | |
| "grad_norm": 8.563511848449707, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8643, | |
| "step": 1683 | |
| }, | |
| { | |
| "epoch": 2.5014836795252227, | |
| "grad_norm": 17.003629684448242, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8249, | |
| "step": 1686 | |
| }, | |
| { | |
| "epoch": 2.50593471810089, | |
| "grad_norm": 6.441048622131348, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9072, | |
| "step": 1689 | |
| }, | |
| { | |
| "epoch": 2.5103857566765577, | |
| "grad_norm": 6.359565734863281, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8495, | |
| "step": 1692 | |
| }, | |
| { | |
| "epoch": 2.5148367952522257, | |
| "grad_norm": 9.161234855651855, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8319, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 2.5192878338278932, | |
| "grad_norm": 10.241405487060547, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7881, | |
| "step": 1698 | |
| }, | |
| { | |
| "epoch": 2.5237388724035608, | |
| "grad_norm": 9.603667259216309, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8681, | |
| "step": 1701 | |
| }, | |
| { | |
| "epoch": 2.5281899109792283, | |
| "grad_norm": 8.364523887634277, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7961, | |
| "step": 1704 | |
| }, | |
| { | |
| "epoch": 2.5326409495548963, | |
| "grad_norm": 8.140654563903809, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8807, | |
| "step": 1707 | |
| }, | |
| { | |
| "epoch": 2.537091988130564, | |
| "grad_norm": 12.45283031463623, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8206, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.5415430267062313, | |
| "grad_norm": 7.65419864654541, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8989, | |
| "step": 1713 | |
| }, | |
| { | |
| "epoch": 2.5459940652818993, | |
| "grad_norm": 4.040281295776367, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8814, | |
| "step": 1716 | |
| }, | |
| { | |
| "epoch": 2.550445103857567, | |
| "grad_norm": 11.04344654083252, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8469, | |
| "step": 1719 | |
| }, | |
| { | |
| "epoch": 2.5548961424332344, | |
| "grad_norm": 12.735292434692383, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8847, | |
| "step": 1722 | |
| }, | |
| { | |
| "epoch": 2.559347181008902, | |
| "grad_norm": 7.6085686683654785, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.851, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 2.56379821958457, | |
| "grad_norm": 10.644798278808594, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.861, | |
| "step": 1728 | |
| }, | |
| { | |
| "epoch": 2.5682492581602374, | |
| "grad_norm": 7.817785263061523, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8574, | |
| "step": 1731 | |
| }, | |
| { | |
| "epoch": 2.572700296735905, | |
| "grad_norm": 14.533990859985352, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8883, | |
| "step": 1734 | |
| }, | |
| { | |
| "epoch": 2.577151335311573, | |
| "grad_norm": 9.98595905303955, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7992, | |
| "step": 1737 | |
| }, | |
| { | |
| "epoch": 2.5816023738872405, | |
| "grad_norm": 13.704192161560059, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8781, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.586053412462908, | |
| "grad_norm": 6.400760650634766, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8248, | |
| "step": 1743 | |
| }, | |
| { | |
| "epoch": 2.5905044510385755, | |
| "grad_norm": 3.9698002338409424, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7804, | |
| "step": 1746 | |
| }, | |
| { | |
| "epoch": 2.594955489614243, | |
| "grad_norm": 5.405271053314209, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8691, | |
| "step": 1749 | |
| }, | |
| { | |
| "epoch": 2.599406528189911, | |
| "grad_norm": 7.326030731201172, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7876, | |
| "step": 1752 | |
| }, | |
| { | |
| "epoch": 2.6038575667655786, | |
| "grad_norm": 12.94884967803955, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7877, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 2.6083086053412465, | |
| "grad_norm": 12.542633056640625, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8311, | |
| "step": 1758 | |
| }, | |
| { | |
| "epoch": 2.612759643916914, | |
| "grad_norm": 12.357892036437988, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8398, | |
| "step": 1761 | |
| }, | |
| { | |
| "epoch": 2.6172106824925816, | |
| "grad_norm": 10.735803604125977, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8557, | |
| "step": 1764 | |
| }, | |
| { | |
| "epoch": 2.621661721068249, | |
| "grad_norm": 7.849278450012207, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8559, | |
| "step": 1767 | |
| }, | |
| { | |
| "epoch": 2.6261127596439167, | |
| "grad_norm": 7.459741592407227, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8382, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.6305637982195846, | |
| "grad_norm": 9.422908782958984, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8586, | |
| "step": 1773 | |
| }, | |
| { | |
| "epoch": 2.635014836795252, | |
| "grad_norm": 6.327311038970947, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8232, | |
| "step": 1776 | |
| }, | |
| { | |
| "epoch": 2.63946587537092, | |
| "grad_norm": 10.571976661682129, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8694, | |
| "step": 1779 | |
| }, | |
| { | |
| "epoch": 2.6439169139465877, | |
| "grad_norm": 17.467416763305664, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8693, | |
| "step": 1782 | |
| }, | |
| { | |
| "epoch": 2.648367952522255, | |
| "grad_norm": 6.911043167114258, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8643, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 2.6528189910979227, | |
| "grad_norm": 10.180506706237793, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.832, | |
| "step": 1788 | |
| }, | |
| { | |
| "epoch": 2.6572700296735903, | |
| "grad_norm": 5.487372398376465, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8211, | |
| "step": 1791 | |
| }, | |
| { | |
| "epoch": 2.6617210682492582, | |
| "grad_norm": 8.488285064697266, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8174, | |
| "step": 1794 | |
| }, | |
| { | |
| "epoch": 2.666172106824926, | |
| "grad_norm": 14.654566764831543, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8443, | |
| "step": 1797 | |
| }, | |
| { | |
| "epoch": 2.6706231454005933, | |
| "grad_norm": 8.551965713500977, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8778, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.6750741839762613, | |
| "grad_norm": 6.797290802001953, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.859, | |
| "step": 1803 | |
| }, | |
| { | |
| "epoch": 2.679525222551929, | |
| "grad_norm": 4.413401126861572, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9004, | |
| "step": 1806 | |
| }, | |
| { | |
| "epoch": 2.6839762611275964, | |
| "grad_norm": 8.826961517333984, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8618, | |
| "step": 1809 | |
| }, | |
| { | |
| "epoch": 2.688427299703264, | |
| "grad_norm": 6.915543556213379, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8242, | |
| "step": 1812 | |
| }, | |
| { | |
| "epoch": 2.692878338278932, | |
| "grad_norm": 7.802698612213135, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8373, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 2.6973293768545994, | |
| "grad_norm": 4.345271587371826, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8212, | |
| "step": 1818 | |
| }, | |
| { | |
| "epoch": 2.701780415430267, | |
| "grad_norm": 8.312252044677734, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.807, | |
| "step": 1821 | |
| }, | |
| { | |
| "epoch": 2.706231454005935, | |
| "grad_norm": 5.19842004776001, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8513, | |
| "step": 1824 | |
| }, | |
| { | |
| "epoch": 2.7106824925816024, | |
| "grad_norm": 14.573792457580566, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8215, | |
| "step": 1827 | |
| }, | |
| { | |
| "epoch": 2.71513353115727, | |
| "grad_norm": 6.5800323486328125, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8826, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.7195845697329375, | |
| "grad_norm": 14.643542289733887, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7716, | |
| "step": 1833 | |
| }, | |
| { | |
| "epoch": 2.7240356083086055, | |
| "grad_norm": 12.744583129882812, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8621, | |
| "step": 1836 | |
| }, | |
| { | |
| "epoch": 2.728486646884273, | |
| "grad_norm": 12.435503005981445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8908, | |
| "step": 1839 | |
| }, | |
| { | |
| "epoch": 2.7329376854599405, | |
| "grad_norm": 6.115302562713623, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8921, | |
| "step": 1842 | |
| }, | |
| { | |
| "epoch": 2.7373887240356085, | |
| "grad_norm": 14.632364273071289, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8535, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 2.741839762611276, | |
| "grad_norm": 5.676476001739502, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8872, | |
| "step": 1848 | |
| }, | |
| { | |
| "epoch": 2.7462908011869436, | |
| "grad_norm": 12.727757453918457, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8836, | |
| "step": 1851 | |
| }, | |
| { | |
| "epoch": 2.750741839762611, | |
| "grad_norm": 5.729983329772949, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8453, | |
| "step": 1854 | |
| }, | |
| { | |
| "epoch": 2.755192878338279, | |
| "grad_norm": 8.607340812683105, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8795, | |
| "step": 1857 | |
| }, | |
| { | |
| "epoch": 2.7596439169139466, | |
| "grad_norm": 7.55084228515625, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8788, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.764094955489614, | |
| "grad_norm": 10.093510627746582, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8816, | |
| "step": 1863 | |
| }, | |
| { | |
| "epoch": 2.768545994065282, | |
| "grad_norm": 8.678201675415039, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.867, | |
| "step": 1866 | |
| }, | |
| { | |
| "epoch": 2.7729970326409497, | |
| "grad_norm": 6.614081859588623, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.887, | |
| "step": 1869 | |
| }, | |
| { | |
| "epoch": 2.777448071216617, | |
| "grad_norm": 6.593700408935547, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8929, | |
| "step": 1872 | |
| }, | |
| { | |
| "epoch": 2.7818991097922847, | |
| "grad_norm": 5.097481727600098, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8629, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 2.7863501483679523, | |
| "grad_norm": 5.016817569732666, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8413, | |
| "step": 1878 | |
| }, | |
| { | |
| "epoch": 2.7908011869436202, | |
| "grad_norm": 7.502362251281738, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8945, | |
| "step": 1881 | |
| }, | |
| { | |
| "epoch": 2.7952522255192878, | |
| "grad_norm": 4.612887859344482, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8811, | |
| "step": 1884 | |
| }, | |
| { | |
| "epoch": 2.7997032640949557, | |
| "grad_norm": 5.493846893310547, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8757, | |
| "step": 1887 | |
| }, | |
| { | |
| "epoch": 2.8041543026706233, | |
| "grad_norm": 8.605670928955078, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8047, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.808605341246291, | |
| "grad_norm": 12.178396224975586, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8905, | |
| "step": 1893 | |
| }, | |
| { | |
| "epoch": 2.8130563798219583, | |
| "grad_norm": 8.929186820983887, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8869, | |
| "step": 1896 | |
| }, | |
| { | |
| "epoch": 2.817507418397626, | |
| "grad_norm": 6.589859962463379, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8473, | |
| "step": 1899 | |
| }, | |
| { | |
| "epoch": 2.821958456973294, | |
| "grad_norm": 10.543880462646484, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8613, | |
| "step": 1902 | |
| }, | |
| { | |
| "epoch": 2.8264094955489614, | |
| "grad_norm": 8.176854133605957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8429, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 2.8308605341246293, | |
| "grad_norm": 5.652864456176758, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8754, | |
| "step": 1908 | |
| }, | |
| { | |
| "epoch": 2.835311572700297, | |
| "grad_norm": 8.587650299072266, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9134, | |
| "step": 1911 | |
| }, | |
| { | |
| "epoch": 2.8397626112759644, | |
| "grad_norm": 5.4106974601745605, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8334, | |
| "step": 1914 | |
| }, | |
| { | |
| "epoch": 2.844213649851632, | |
| "grad_norm": 6.253225803375244, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.901, | |
| "step": 1917 | |
| }, | |
| { | |
| "epoch": 2.8486646884272995, | |
| "grad_norm": 8.90531063079834, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8316, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.8531157270029674, | |
| "grad_norm": 4.412182807922363, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8384, | |
| "step": 1923 | |
| }, | |
| { | |
| "epoch": 2.857566765578635, | |
| "grad_norm": 6.357685565948486, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8903, | |
| "step": 1926 | |
| }, | |
| { | |
| "epoch": 2.8620178041543025, | |
| "grad_norm": 7.667703628540039, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8455, | |
| "step": 1929 | |
| }, | |
| { | |
| "epoch": 2.8664688427299705, | |
| "grad_norm": 10.909478187561035, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8399, | |
| "step": 1932 | |
| }, | |
| { | |
| "epoch": 2.870919881305638, | |
| "grad_norm": 7.347332954406738, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8198, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 2.8753709198813056, | |
| "grad_norm": 7.22322416305542, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9074, | |
| "step": 1938 | |
| }, | |
| { | |
| "epoch": 2.879821958456973, | |
| "grad_norm": 5.389438152313232, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.846, | |
| "step": 1941 | |
| }, | |
| { | |
| "epoch": 2.884272997032641, | |
| "grad_norm": 8.13633918762207, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8615, | |
| "step": 1944 | |
| }, | |
| { | |
| "epoch": 2.8887240356083086, | |
| "grad_norm": 7.694199085235596, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.7791, | |
| "step": 1947 | |
| }, | |
| { | |
| "epoch": 2.893175074183976, | |
| "grad_norm": 10.673176765441895, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8234, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.897626112759644, | |
| "grad_norm": 7.695837020874023, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8471, | |
| "step": 1953 | |
| }, | |
| { | |
| "epoch": 2.9020771513353116, | |
| "grad_norm": 11.210200309753418, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8561, | |
| "step": 1956 | |
| }, | |
| { | |
| "epoch": 2.906528189910979, | |
| "grad_norm": 13.856889724731445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8177, | |
| "step": 1959 | |
| }, | |
| { | |
| "epoch": 2.9109792284866467, | |
| "grad_norm": 6.0733489990234375, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8885, | |
| "step": 1962 | |
| }, | |
| { | |
| "epoch": 2.9154302670623147, | |
| "grad_norm": 7.38955545425415, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8502, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 2.919881305637982, | |
| "grad_norm": 13.866927146911621, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8622, | |
| "step": 1968 | |
| }, | |
| { | |
| "epoch": 2.9243323442136497, | |
| "grad_norm": 6.984748363494873, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9041, | |
| "step": 1971 | |
| }, | |
| { | |
| "epoch": 2.9287833827893177, | |
| "grad_norm": 13.53242301940918, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8367, | |
| "step": 1974 | |
| }, | |
| { | |
| "epoch": 2.9332344213649852, | |
| "grad_norm": 6.702526092529297, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8343, | |
| "step": 1977 | |
| }, | |
| { | |
| "epoch": 2.9376854599406528, | |
| "grad_norm": 5.4899678230285645, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8873, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.9421364985163203, | |
| "grad_norm": 8.483062744140625, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8078, | |
| "step": 1983 | |
| }, | |
| { | |
| "epoch": 2.9465875370919883, | |
| "grad_norm": 7.6923065185546875, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8316, | |
| "step": 1986 | |
| }, | |
| { | |
| "epoch": 2.951038575667656, | |
| "grad_norm": 4.571675777435303, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.882, | |
| "step": 1989 | |
| }, | |
| { | |
| "epoch": 2.9554896142433233, | |
| "grad_norm": 8.073565483093262, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8233, | |
| "step": 1992 | |
| }, | |
| { | |
| "epoch": 2.9599406528189913, | |
| "grad_norm": 13.49317455291748, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9236, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 2.964391691394659, | |
| "grad_norm": 6.603755474090576, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8692, | |
| "step": 1998 | |
| }, | |
| { | |
| "epoch": 2.9688427299703264, | |
| "grad_norm": 8.523149490356445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8333, | |
| "step": 2001 | |
| }, | |
| { | |
| "epoch": 2.973293768545994, | |
| "grad_norm": 9.513497352600098, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8879, | |
| "step": 2004 | |
| }, | |
| { | |
| "epoch": 2.9777448071216615, | |
| "grad_norm": 8.058304786682129, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8539, | |
| "step": 2007 | |
| }, | |
| { | |
| "epoch": 2.9821958456973294, | |
| "grad_norm": 6.942746162414551, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8532, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.986646884272997, | |
| "grad_norm": 7.1598639488220215, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8094, | |
| "step": 2013 | |
| }, | |
| { | |
| "epoch": 2.991097922848665, | |
| "grad_norm": 7.722570419311523, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.85, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 2.9955489614243325, | |
| "grad_norm": 7.609329700469971, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.9311, | |
| "step": 2019 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 6.5114426612854, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.8043, | |
| "step": 2022 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 2.056363105773926, | |
| "eval_runtime": 552.8503, | |
| "eval_samples_per_second": 2.789, | |
| "eval_steps_per_second": 0.349, | |
| "step": 2022 | |
| } | |
| ], | |
| "logging_steps": 3, | |
| "max_steps": 6740, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |