Instructions to use tkhangg0910/Dual-Explain-2_round with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use tkhangg0910/Dual-Explain-2_round with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("tkhangg0910/Merged-Dual-Explain-stage_full")
model = PeftModel.from_pretrained(base_model, "tkhangg0910/Dual-Explain-2_round")
- Transformers
How to use tkhangg0910/Dual-Explain-2_round with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="tkhangg0910/Dual-Explain-2_round")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("tkhangg0910/Dual-Explain-2_round", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use tkhangg0910/Dual-Explain-2_round with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "tkhangg0910/Dual-Explain-2_round"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "tkhangg0910/Dual-Explain-2_round",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker
docker model run hf.co/tkhangg0910/Dual-Explain-2_round
- SGLang
How to use tkhangg0910/Dual-Explain-2_round with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "tkhangg0910/Dual-Explain-2_round" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "tkhangg0910/Dual-Explain-2_round",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "tkhangg0910/Dual-Explain-2_round" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "tkhangg0910/Dual-Explain-2_round",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
- Docker Model Runner
How to use tkhangg0910/Dual-Explain-2_round with Docker Model Runner:
docker model run hf.co/tkhangg0910/Dual-Explain-2_round
| { | |
| "best_global_step": null, | |
| "best_metric": 0.6839648485183716, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 1011, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.002967359050445104, | |
| "grad_norm": 15.052371978759766, | |
| "learning_rate": 0.0, | |
| "loss": 0.3511, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008902077151335312, | |
| "grad_norm": 11.145326614379883, | |
| "learning_rate": 3.2123800577354604e-06, | |
| "loss": 0.3486, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.017804154302670624, | |
| "grad_norm": 7.514125823974609, | |
| "learning_rate": 5.239166215940359e-06, | |
| "loss": 0.3444, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.026706231454005934, | |
| "grad_norm": 8.797248840332031, | |
| "learning_rate": 6.424760115470921e-06, | |
| "loss": 0.3284, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.03560830860534125, | |
| "grad_norm": 3.7588727474212646, | |
| "learning_rate": 7.265952374145257e-06, | |
| "loss": 0.3375, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.04451038575667656, | |
| "grad_norm": 7.172510623931885, | |
| "learning_rate": 7.918431780800236e-06, | |
| "loss": 0.3271, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.05341246290801187, | |
| "grad_norm": 3.615478992462158, | |
| "learning_rate": 8.451546273675818e-06, | |
| "loss": 0.3277, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.06231454005934718, | |
| "grad_norm": 6.7531232833862305, | |
| "learning_rate": 8.902288154930203e-06, | |
| "loss": 0.3343, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.0712166172106825, | |
| "grad_norm": 6.248157501220703, | |
| "learning_rate": 9.292738532350157e-06, | |
| "loss": 0.3294, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.08011869436201781, | |
| "grad_norm": 6.436707019805908, | |
| "learning_rate": 9.637140173206382e-06, | |
| "loss": 0.3358, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.08902077151335312, | |
| "grad_norm": 9.06855583190918, | |
| "learning_rate": 9.945217939005136e-06, | |
| "loss": 0.3293, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09792284866468842, | |
| "grad_norm": 7.107985973358154, | |
| "learning_rate": 1.0223908177645902e-05, | |
| "loss": 0.3111, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.10682492581602374, | |
| "grad_norm": 6.3856520652771, | |
| "learning_rate": 1.0478332431880717e-05, | |
| "loss": 0.3251, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.11572700296735905, | |
| "grad_norm": 4.95862340927124, | |
| "learning_rate": 1.0712380057735461e-05, | |
| "loss": 0.3211, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.12462908011869436, | |
| "grad_norm": 4.809103012084961, | |
| "learning_rate": 1.0929074313135101e-05, | |
| "loss": 0.3244, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.13353115727002968, | |
| "grad_norm": 6.697277069091797, | |
| "learning_rate": 1.1130811838535696e-05, | |
| "loss": 0.3179, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.142433234421365, | |
| "grad_norm": 12.093936920166016, | |
| "learning_rate": 1.1319524690555053e-05, | |
| "loss": 0.3339, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.1513353115727003, | |
| "grad_norm": 8.427734375, | |
| "learning_rate": 1.1496793166558515e-05, | |
| "loss": 0.329, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.16023738872403562, | |
| "grad_norm": 11.345624923706055, | |
| "learning_rate": 1.1663926331411281e-05, | |
| "loss": 0.3212, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.16913946587537093, | |
| "grad_norm": 7.70440149307251, | |
| "learning_rate": 1.1822020743040672e-05, | |
| "loss": 0.3119, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.17804154302670624, | |
| "grad_norm": 4.138779163360596, | |
| "learning_rate": 1.1972004097210032e-05, | |
| "loss": 0.3186, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.18694362017804153, | |
| "grad_norm": 9.505159378051758, | |
| "learning_rate": 1.2114668212665663e-05, | |
| "loss": 0.3125, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.19584569732937684, | |
| "grad_norm": 5.866057395935059, | |
| "learning_rate": 1.2250694335850798e-05, | |
| "loss": 0.3217, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.20474777448071216, | |
| "grad_norm": 9.940937995910645, | |
| "learning_rate": 1.238067281605409e-05, | |
| "loss": 0.3295, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.21364985163204747, | |
| "grad_norm": 8.819473266601562, | |
| "learning_rate": 1.2505118590085615e-05, | |
| "loss": 0.3351, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.22255192878338279, | |
| "grad_norm": 9.78069019317627, | |
| "learning_rate": 1.262448350386501e-05, | |
| "loss": 0.3383, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.2314540059347181, | |
| "grad_norm": 5.660183429718018, | |
| "learning_rate": 1.2739166215940359e-05, | |
| "loss": 0.3252, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.2403560830860534, | |
| "grad_norm": 7.043887615203857, | |
| "learning_rate": 1.2849520230941842e-05, | |
| "loss": 0.3249, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.24925816023738873, | |
| "grad_norm": 7.998411178588867, | |
| "learning_rate": 1.295586047134e-05, | |
| "loss": 0.3359, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.258160237388724, | |
| "grad_norm": 11.106488227844238, | |
| "learning_rate": 1.3058468695482481e-05, | |
| "loss": 0.3211, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.26706231454005935, | |
| "grad_norm": 5.680809497833252, | |
| "learning_rate": 1.3157597996740594e-05, | |
| "loss": 0.3152, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.27596439169139464, | |
| "grad_norm": 6.006475925445557, | |
| "learning_rate": 1.3253476564657357e-05, | |
| "loss": 0.3171, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.28486646884273, | |
| "grad_norm": 4.647056579589844, | |
| "learning_rate": 1.3346310848759951e-05, | |
| "loss": 0.31, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.29376854599406527, | |
| "grad_norm": 4.605801582336426, | |
| "learning_rate": 1.343628823538136e-05, | |
| "loss": 0.3185, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.3026706231454006, | |
| "grad_norm": 17.812042236328125, | |
| "learning_rate": 1.3523579324763411e-05, | |
| "loss": 0.3161, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.3115727002967359, | |
| "grad_norm": 12.627123832702637, | |
| "learning_rate": 1.3608339877994978e-05, | |
| "loss": 0.329, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.32047477744807124, | |
| "grad_norm": 7.308374404907227, | |
| "learning_rate": 1.3690712489616179e-05, | |
| "loss": 0.3179, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.3293768545994065, | |
| "grad_norm": 5.522846698760986, | |
| "learning_rate": 1.3770828031006136e-05, | |
| "loss": 0.3245, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.33827893175074186, | |
| "grad_norm": 8.475018501281738, | |
| "learning_rate": 1.384880690124557e-05, | |
| "loss": 0.3105, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.34718100890207715, | |
| "grad_norm": 8.174044609069824, | |
| "learning_rate": 1.3924760115470921e-05, | |
| "loss": 0.3057, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.3560830860534125, | |
| "grad_norm": 6.196917533874512, | |
| "learning_rate": 1.399879025541493e-05, | |
| "loss": 0.336, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3649851632047478, | |
| "grad_norm": 9.92663860321045, | |
| "learning_rate": 1.4070992302558296e-05, | |
| "loss": 0.3253, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.37388724035608306, | |
| "grad_norm": 7.014742374420166, | |
| "learning_rate": 1.4141454370870561e-05, | |
| "loss": 0.3129, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.3827893175074184, | |
| "grad_norm": 8.182991027832031, | |
| "learning_rate": 1.421025835332077e-05, | |
| "loss": 0.3329, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.3916913946587537, | |
| "grad_norm": 7.044063091278076, | |
| "learning_rate": 1.4277480494055697e-05, | |
| "loss": 0.3265, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.40059347181008903, | |
| "grad_norm": 7.129275321960449, | |
| "learning_rate": 1.4343191896271158e-05, | |
| "loss": 0.3259, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.4094955489614243, | |
| "grad_norm": 6.028862476348877, | |
| "learning_rate": 1.4407458974258987e-05, | |
| "loss": 0.326, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.41839762611275966, | |
| "grad_norm": 9.75017261505127, | |
| "learning_rate": 1.4470343856834936e-05, | |
| "loss": 0.3242, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.42729970326409494, | |
| "grad_norm": 8.225783348083496, | |
| "learning_rate": 1.4531904748290513e-05, | |
| "loss": 0.3061, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.4362017804154303, | |
| "grad_norm": 7.090129852294922, | |
| "learning_rate": 1.4592196252124945e-05, | |
| "loss": 0.3152, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.44510385756676557, | |
| "grad_norm": 11.792762756347656, | |
| "learning_rate": 1.465126966206991e-05, | |
| "loss": 0.3258, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4540059347181009, | |
| "grad_norm": 8.907958984375, | |
| "learning_rate": 1.4709173224293973e-05, | |
| "loss": 0.3203, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.4629080118694362, | |
| "grad_norm": 4.160986423492432, | |
| "learning_rate": 1.4765952374145259e-05, | |
| "loss": 0.3156, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.47181008902077154, | |
| "grad_norm": 4.767416954040527, | |
| "learning_rate": 1.482164995034286e-05, | |
| "loss": 0.3238, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.4807121661721068, | |
| "grad_norm": 6.091423511505127, | |
| "learning_rate": 1.4876306389146738e-05, | |
| "loss": 0.3186, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.4896142433234421, | |
| "grad_norm": 9.760332107543945, | |
| "learning_rate": 1.4929959900710676e-05, | |
| "loss": 0.3111, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.49851632047477745, | |
| "grad_norm": 7.750044822692871, | |
| "learning_rate": 1.4982646629544899e-05, | |
| "loss": 0.3209, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.5074183976261127, | |
| "grad_norm": 10.224579811096191, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3262, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.516320474777448, | |
| "grad_norm": 8.861902236938477, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3121, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.5252225519287834, | |
| "grad_norm": 4.321209907531738, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3133, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.5341246290801187, | |
| "grad_norm": 4.978445529937744, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3071, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.543026706231454, | |
| "grad_norm": 10.364422798156738, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3178, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.5519287833827893, | |
| "grad_norm": 8.970362663269043, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3212, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.5608308605341247, | |
| "grad_norm": 6.0015645027160645, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3184, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.56973293768546, | |
| "grad_norm": 5.832813739776611, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3211, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.5786350148367952, | |
| "grad_norm": 7.478390693664551, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3432, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.5875370919881305, | |
| "grad_norm": 5.908259868621826, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3287, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.5964391691394659, | |
| "grad_norm": 8.518238067626953, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.313, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.6053412462908012, | |
| "grad_norm": 6.046856880187988, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3197, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.6142433234421365, | |
| "grad_norm": 9.07325553894043, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3105, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.6231454005934718, | |
| "grad_norm": 7.418500900268555, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3178, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6320474777448071, | |
| "grad_norm": 10.935755729675293, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3192, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.6409495548961425, | |
| "grad_norm": 8.953109741210938, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.336, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.6498516320474778, | |
| "grad_norm": 10.9089937210083, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3362, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.658753709198813, | |
| "grad_norm": 7.62611198425293, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3219, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.6676557863501483, | |
| "grad_norm": 8.877312660217285, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3242, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.6765578635014837, | |
| "grad_norm": 11.891584396362305, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3193, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.685459940652819, | |
| "grad_norm": 6.047501564025879, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3145, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.6943620178041543, | |
| "grad_norm": 11.06523609161377, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3205, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.7032640949554896, | |
| "grad_norm": 14.651629447937012, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3181, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.712166172106825, | |
| "grad_norm": 4.986928462982178, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3188, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7210682492581603, | |
| "grad_norm": 5.383213520050049, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3322, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.7299703264094956, | |
| "grad_norm": 7.467197418212891, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3231, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.7388724035608308, | |
| "grad_norm": 8.040964126586914, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3194, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.7477744807121661, | |
| "grad_norm": 9.442214012145996, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3045, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.7566765578635015, | |
| "grad_norm": 5.0572919845581055, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3198, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.7655786350148368, | |
| "grad_norm": 9.763797760009766, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3038, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.7744807121661721, | |
| "grad_norm": 4.699306488037109, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3071, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.7833827893175074, | |
| "grad_norm": 6.758116245269775, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3079, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.7922848664688428, | |
| "grad_norm": 9.004244804382324, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3114, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.8011869436201781, | |
| "grad_norm": 10.923787117004395, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3214, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8100890207715133, | |
| "grad_norm": 4.750248432159424, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3213, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.8189910979228486, | |
| "grad_norm": 6.5013346672058105, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3213, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.827893175074184, | |
| "grad_norm": 14.487788200378418, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3117, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.8367952522255193, | |
| "grad_norm": 4.58863639831543, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.325, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.8456973293768546, | |
| "grad_norm": 5.803460597991943, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3095, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.8545994065281899, | |
| "grad_norm": 6.8022871017456055, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3279, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.8635014836795252, | |
| "grad_norm": 11.592184066772461, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3211, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.8724035608308606, | |
| "grad_norm": 4.380642890930176, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3121, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.8813056379821959, | |
| "grad_norm": 10.14802360534668, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3147, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.8902077151335311, | |
| "grad_norm": 6.102616786956787, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.327, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8991097922848664, | |
| "grad_norm": 6.485515117645264, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3165, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.9080118694362018, | |
| "grad_norm": 3.8234357833862305, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.316, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.9169139465875371, | |
| "grad_norm": 9.781375885009766, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3147, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.9258160237388724, | |
| "grad_norm": 8.865571975708008, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3178, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.9347181008902077, | |
| "grad_norm": 6.27769660949707, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3152, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.9436201780415431, | |
| "grad_norm": 5.6717143058776855, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3062, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.9525222551928784, | |
| "grad_norm": 14.846003532409668, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3204, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.9614243323442137, | |
| "grad_norm": 7.501077651977539, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.307, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.9703264094955489, | |
| "grad_norm": 7.450833320617676, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.313, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.9792284866468842, | |
| "grad_norm": 4.422346591949463, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3067, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.9881305637982196, | |
| "grad_norm": 5.954162120819092, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3215, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.9970326409495549, | |
| "grad_norm": 4.766922950744629, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3139, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.6956011056900024, | |
| "eval_runtime": 297.7711, | |
| "eval_samples_per_second": 5.178, | |
| "eval_steps_per_second": 0.648, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.0059347181008902, | |
| "grad_norm": 7.0965895652771, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3104, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.0148367952522255, | |
| "grad_norm": 7.9634599685668945, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3082, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.0237388724035608, | |
| "grad_norm": 8.5128755569458, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3211, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.032640949554896, | |
| "grad_norm": 5.733129501342773, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3236, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.0415430267062316, | |
| "grad_norm": 8.197546005249023, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3136, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.0504451038575668, | |
| "grad_norm": 11.312963485717773, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3274, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.0593471810089021, | |
| "grad_norm": 4.885214328765869, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3268, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.0682492581602374, | |
| "grad_norm": 7.366455078125, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3214, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.0771513353115727, | |
| "grad_norm": 7.0693559646606445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3262, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.086053412462908, | |
| "grad_norm": 12.16964054107666, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3232, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.0949554896142433, | |
| "grad_norm": 6.702571868896484, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3049, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.1038575667655786, | |
| "grad_norm": 8.25865650177002, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3086, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.1127596439169138, | |
| "grad_norm": 10.963550567626953, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3187, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.1216617210682494, | |
| "grad_norm": 10.957636833190918, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3119, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.1305637982195846, | |
| "grad_norm": 4.481369495391846, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3153, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.13946587537092, | |
| "grad_norm": 7.9678120613098145, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3143, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.1483679525222552, | |
| "grad_norm": 10.013398170471191, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3146, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.1572700296735905, | |
| "grad_norm": 9.361319541931152, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3069, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.1661721068249258, | |
| "grad_norm": 7.185680866241455, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3198, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.175074183976261, | |
| "grad_norm": 9.780238151550293, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3173, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.1839762611275964, | |
| "grad_norm": 6.236032009124756, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3096, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.1928783382789319, | |
| "grad_norm": 6.732054710388184, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3086, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.2017804154302671, | |
| "grad_norm": 8.902305603027344, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3074, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.2106824925816024, | |
| "grad_norm": 8.529496192932129, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3208, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.2195845697329377, | |
| "grad_norm": 10.779397964477539, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2997, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.228486646884273, | |
| "grad_norm": 5.797762393951416, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2977, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.2373887240356083, | |
| "grad_norm": 10.339754104614258, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3099, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.2462908011869436, | |
| "grad_norm": 6.894352436065674, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3027, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.2551928783382789, | |
| "grad_norm": 10.406209945678711, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.318, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.2640949554896141, | |
| "grad_norm": 4.105279922485352, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3271, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.2729970326409497, | |
| "grad_norm": 9.26810073852539, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3052, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.281899109792285, | |
| "grad_norm": 11.131587028503418, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.315, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.2908011869436202, | |
| "grad_norm": 7.997912883758545, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3037, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.2997032640949555, | |
| "grad_norm": 4.264193058013916, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2967, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.3086053412462908, | |
| "grad_norm": 6.291212558746338, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3078, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.317507418397626, | |
| "grad_norm": 10.159900665283203, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2994, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.3264094955489614, | |
| "grad_norm": 10.216263771057129, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3244, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.3353115727002967, | |
| "grad_norm": 7.566501617431641, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.31, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.344213649851632, | |
| "grad_norm": 5.979765892028809, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3069, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.3531157270029674, | |
| "grad_norm": 6.646083354949951, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3026, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.3620178041543027, | |
| "grad_norm": 6.50187349319458, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3065, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.370919881305638, | |
| "grad_norm": 4.8104705810546875, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.307, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 1.3798219584569733, | |
| "grad_norm": 4.24050235748291, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3011, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.3887240356083086, | |
| "grad_norm": 7.853260040283203, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3051, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 1.3976261127596439, | |
| "grad_norm": 6.0949602127075195, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2931, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 1.4065281899109792, | |
| "grad_norm": 11.793612480163574, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2982, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 1.4154302670623147, | |
| "grad_norm": 8.261275291442871, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2966, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 1.4243323442136497, | |
| "grad_norm": 6.895599365234375, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3195, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.4332344213649852, | |
| "grad_norm": 5.414015293121338, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3063, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 1.4421364985163205, | |
| "grad_norm": 4.563000202178955, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3079, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 1.4510385756676558, | |
| "grad_norm": 7.205160617828369, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3204, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 1.459940652818991, | |
| "grad_norm": 7.146437168121338, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3049, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 1.4688427299703264, | |
| "grad_norm": 8.912725448608398, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3004, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.4777448071216617, | |
| "grad_norm": 5.934146881103516, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3053, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 1.486646884272997, | |
| "grad_norm": 7.54482889175415, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2962, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 1.4955489614243325, | |
| "grad_norm": 11.391508102416992, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3291, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 1.5044510385756675, | |
| "grad_norm": 9.863611221313477, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3068, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 1.513353115727003, | |
| "grad_norm": 7.5741376876831055, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3036, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.5222551928783383, | |
| "grad_norm": 11.626495361328125, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3131, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 1.5311572700296736, | |
| "grad_norm": 4.790311813354492, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3126, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 1.540059347181009, | |
| "grad_norm": 5.693728446960449, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3221, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 1.5489614243323442, | |
| "grad_norm": 9.541658401489258, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3186, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 1.5578635014836797, | |
| "grad_norm": 10.08277416229248, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3094, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.5667655786350148, | |
| "grad_norm": 10.004911422729492, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3081, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 1.5756676557863503, | |
| "grad_norm": 4.247671127319336, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3173, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 1.5845697329376853, | |
| "grad_norm": 6.010837078094482, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.314, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 1.5934718100890208, | |
| "grad_norm": 10.42171859741211, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3111, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 1.6023738872403561, | |
| "grad_norm": 11.672240257263184, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3263, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.6112759643916914, | |
| "grad_norm": 9.143010139465332, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2983, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 1.6201780415430267, | |
| "grad_norm": 7.786658763885498, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3089, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 1.629080118694362, | |
| "grad_norm": 5.973340034484863, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.303, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 1.6379821958456975, | |
| "grad_norm": 4.11182975769043, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3059, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 1.6468842729970326, | |
| "grad_norm": 6.210434913635254, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3133, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.655786350148368, | |
| "grad_norm": 11.501874923706055, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3078, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 1.6646884272997031, | |
| "grad_norm": 8.35253620147705, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3147, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 1.6735905044510386, | |
| "grad_norm": 6.669034957885742, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3104, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 1.682492581602374, | |
| "grad_norm": 13.310565948486328, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3172, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 1.6913946587537092, | |
| "grad_norm": 6.960197448730469, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3192, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.7002967359050445, | |
| "grad_norm": 10.452018737792969, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3072, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 1.7091988130563798, | |
| "grad_norm": 6.1864190101623535, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3178, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 1.7181008902077153, | |
| "grad_norm": 6.356491565704346, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3176, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 1.7270029673590503, | |
| "grad_norm": 5.232566833496094, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2963, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 1.7359050445103859, | |
| "grad_norm": 3.332583427429199, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2999, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.744807121661721, | |
| "grad_norm": 5.193176746368408, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3192, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 1.7537091988130564, | |
| "grad_norm": 6.814889907836914, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2958, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 1.7626112759643917, | |
| "grad_norm": 9.611870765686035, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3091, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 1.771513353115727, | |
| "grad_norm": 7.733308792114258, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3026, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 1.7804154302670623, | |
| "grad_norm": 5.742140769958496, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2962, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.7893175074183976, | |
| "grad_norm": 11.053295135498047, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2967, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 1.798219584569733, | |
| "grad_norm": 7.031610012054443, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2927, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 1.8071216617210681, | |
| "grad_norm": 6.521071910858154, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3139, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 1.8160237388724036, | |
| "grad_norm": 6.417489528656006, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3081, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 1.8249258160237387, | |
| "grad_norm": 9.378142356872559, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.298, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.8338278931750742, | |
| "grad_norm": 8.447271347045898, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3141, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 1.8427299703264095, | |
| "grad_norm": 10.930451393127441, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3021, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 1.8516320474777448, | |
| "grad_norm": 8.880478858947754, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3136, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 1.86053412462908, | |
| "grad_norm": 5.905041217803955, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3191, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 1.8694362017804154, | |
| "grad_norm": 6.188875675201416, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3283, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.8783382789317509, | |
| "grad_norm": 11.83849811553955, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3235, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 1.887240356083086, | |
| "grad_norm": 7.689598560333252, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3162, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 1.8961424332344214, | |
| "grad_norm": 3.9637110233306885, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3127, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 1.9050445103857567, | |
| "grad_norm": 13.587063789367676, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3268, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 1.913946587537092, | |
| "grad_norm": 7.881510257720947, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3038, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.9228486646884273, | |
| "grad_norm": 6.357386112213135, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3097, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 1.9317507418397626, | |
| "grad_norm": 6.852357387542725, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3056, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 1.9406528189910979, | |
| "grad_norm": 6.557038307189941, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3209, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 1.9495548961424332, | |
| "grad_norm": 7.013545036315918, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3237, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 1.9584569732937687, | |
| "grad_norm": 9.902325630187988, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3166, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.9673590504451037, | |
| "grad_norm": 6.723764896392822, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3198, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 1.9762611275964392, | |
| "grad_norm": 9.627095222473145, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.321, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 1.9851632047477745, | |
| "grad_norm": 8.035420417785645, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.31, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 1.9940652818991098, | |
| "grad_norm": 10.477612495422363, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2995, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.6920689940452576, | |
| "eval_runtime": 296.7344, | |
| "eval_samples_per_second": 5.197, | |
| "eval_steps_per_second": 0.65, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 2.0029673590504453, | |
| "grad_norm": 4.917605400085449, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3129, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 2.0118694362017804, | |
| "grad_norm": 14.471161842346191, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3121, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 2.020771513353116, | |
| "grad_norm": 10.123734474182129, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3136, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 2.029673590504451, | |
| "grad_norm": 7.0058794021606445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.314, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 2.0385756676557865, | |
| "grad_norm": 5.461868762969971, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.307, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 2.0474777448071215, | |
| "grad_norm": 5.689599514007568, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3053, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.056379821958457, | |
| "grad_norm": 8.585354804992676, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3041, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 2.065281899109792, | |
| "grad_norm": 4.620091915130615, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2921, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 2.0741839762611276, | |
| "grad_norm": 6.909940719604492, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3087, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 2.083086053412463, | |
| "grad_norm": 5.3829426765441895, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.294, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 2.091988130563798, | |
| "grad_norm": 10.095771789550781, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3027, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 2.1008902077151337, | |
| "grad_norm": 7.622206687927246, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2936, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 2.1097922848664687, | |
| "grad_norm": 9.839076042175293, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3093, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 2.1186943620178043, | |
| "grad_norm": 13.05020809173584, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3076, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 2.1275964391691393, | |
| "grad_norm": 4.418980598449707, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3043, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 2.136498516320475, | |
| "grad_norm": 3.569221019744873, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3083, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.14540059347181, | |
| "grad_norm": 6.468089580535889, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3016, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 2.1543026706231454, | |
| "grad_norm": 8.789352416992188, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3022, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 2.163204747774481, | |
| "grad_norm": 8.202059745788574, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.311, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 2.172106824925816, | |
| "grad_norm": 6.959595203399658, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2947, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 2.1810089020771515, | |
| "grad_norm": 11.653180122375488, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3094, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 2.1899109792284865, | |
| "grad_norm": 8.507452964782715, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3094, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 2.198813056379822, | |
| "grad_norm": 3.680802583694458, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3054, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 2.207715133531157, | |
| "grad_norm": 9.95173454284668, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2928, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 2.2166172106824926, | |
| "grad_norm": 10.835822105407715, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2882, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 2.2255192878338277, | |
| "grad_norm": 12.096845626831055, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.308, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.234421364985163, | |
| "grad_norm": 4.49980354309082, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3173, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 2.2433234421364987, | |
| "grad_norm": 9.042285919189453, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3073, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 2.2522255192878338, | |
| "grad_norm": 5.250131130218506, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2966, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 2.2611275964391693, | |
| "grad_norm": 9.235132217407227, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3168, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 2.2700296735905043, | |
| "grad_norm": 7.330996513366699, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3027, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 2.27893175074184, | |
| "grad_norm": 5.805144309997559, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3232, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 2.287833827893175, | |
| "grad_norm": 10.95457649230957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.307, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 2.2967359050445104, | |
| "grad_norm": 5.920906066894531, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3052, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 2.3056379821958455, | |
| "grad_norm": 7.4418511390686035, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3195, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 2.314540059347181, | |
| "grad_norm": 9.739228248596191, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3146, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.3234421364985165, | |
| "grad_norm": 11.025596618652344, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3061, | |
| "step": 783 | |
| }, | |
| { | |
| "epoch": 2.3323442136498516, | |
| "grad_norm": 5.031250953674316, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3128, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 2.341246290801187, | |
| "grad_norm": 9.482969284057617, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3067, | |
| "step": 789 | |
| }, | |
| { | |
| "epoch": 2.350148367952522, | |
| "grad_norm": 4.4395270347595215, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2972, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 2.3590504451038576, | |
| "grad_norm": 4.755709171295166, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3078, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 2.3679525222551927, | |
| "grad_norm": 6.278073310852051, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3107, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 2.376854599406528, | |
| "grad_norm": 7.922651767730713, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3043, | |
| "step": 801 | |
| }, | |
| { | |
| "epoch": 2.3857566765578637, | |
| "grad_norm": 9.521344184875488, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3158, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 2.394658753709199, | |
| "grad_norm": 12.499236106872559, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3086, | |
| "step": 807 | |
| }, | |
| { | |
| "epoch": 2.4035608308605343, | |
| "grad_norm": 6.426900863647461, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3126, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.4124629080118694, | |
| "grad_norm": 8.431981086730957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3, | |
| "step": 813 | |
| }, | |
| { | |
| "epoch": 2.421364985163205, | |
| "grad_norm": 12.86776351928711, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2995, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 2.43026706231454, | |
| "grad_norm": 6.822738170623779, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3115, | |
| "step": 819 | |
| }, | |
| { | |
| "epoch": 2.4391691394658754, | |
| "grad_norm": 6.153812408447266, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.297, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 2.4480712166172105, | |
| "grad_norm": 11.699315071105957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2951, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 2.456973293768546, | |
| "grad_norm": 5.795748710632324, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3062, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 2.465875370919881, | |
| "grad_norm": 6.4195756912231445, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2938, | |
| "step": 831 | |
| }, | |
| { | |
| "epoch": 2.4747774480712166, | |
| "grad_norm": 6.024349212646484, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2887, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 2.483679525222552, | |
| "grad_norm": 5.880214691162109, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2943, | |
| "step": 837 | |
| }, | |
| { | |
| "epoch": 2.492581602373887, | |
| "grad_norm": 18.98047637939453, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2903, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.5014836795252227, | |
| "grad_norm": 14.550153732299805, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2999, | |
| "step": 843 | |
| }, | |
| { | |
| "epoch": 2.5103857566765577, | |
| "grad_norm": 11.062093734741211, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3281, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 2.5192878338278932, | |
| "grad_norm": 6.1865644454956055, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3073, | |
| "step": 849 | |
| }, | |
| { | |
| "epoch": 2.5281899109792283, | |
| "grad_norm": 10.409070014953613, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3155, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 2.537091988130564, | |
| "grad_norm": 12.40860366821289, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3013, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 2.5459940652818993, | |
| "grad_norm": 6.20428466796875, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3141, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 2.5548961424332344, | |
| "grad_norm": 4.158163547515869, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.307, | |
| "step": 861 | |
| }, | |
| { | |
| "epoch": 2.56379821958457, | |
| "grad_norm": 7.828709602355957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3191, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 2.572700296735905, | |
| "grad_norm": 8.588981628417969, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3237, | |
| "step": 867 | |
| }, | |
| { | |
| "epoch": 2.5816023738872405, | |
| "grad_norm": 6.725210189819336, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.293, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.5905044510385755, | |
| "grad_norm": 8.876666069030762, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3042, | |
| "step": 873 | |
| }, | |
| { | |
| "epoch": 2.599406528189911, | |
| "grad_norm": 8.503588676452637, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3058, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 2.6083086053412465, | |
| "grad_norm": 7.051385402679443, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2856, | |
| "step": 879 | |
| }, | |
| { | |
| "epoch": 2.6172106824925816, | |
| "grad_norm": 11.214133262634277, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2899, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 2.6261127596439167, | |
| "grad_norm": 5.270874977111816, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2947, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 2.635014836795252, | |
| "grad_norm": 13.623291015625, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3001, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 2.6439169139465877, | |
| "grad_norm": 3.9485678672790527, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3027, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 2.6528189910979227, | |
| "grad_norm": 7.7399725914001465, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2988, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 2.6617210682492582, | |
| "grad_norm": 7.428469181060791, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2896, | |
| "step": 897 | |
| }, | |
| { | |
| "epoch": 2.6706231454005933, | |
| "grad_norm": 4.9085001945495605, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2955, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.679525222551929, | |
| "grad_norm": 7.616215705871582, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3143, | |
| "step": 903 | |
| }, | |
| { | |
| "epoch": 2.688427299703264, | |
| "grad_norm": 6.225953102111816, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3004, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 2.6973293768545994, | |
| "grad_norm": 5.675787448883057, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2946, | |
| "step": 909 | |
| }, | |
| { | |
| "epoch": 2.706231454005935, | |
| "grad_norm": 7.747137069702148, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2966, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 2.71513353115727, | |
| "grad_norm": 12.72786808013916, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3106, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 2.7240356083086055, | |
| "grad_norm": 7.423135280609131, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2838, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 2.7329376854599405, | |
| "grad_norm": 6.8378520011901855, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3165, | |
| "step": 921 | |
| }, | |
| { | |
| "epoch": 2.741839762611276, | |
| "grad_norm": 5.68455696105957, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3078, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 2.750741839762611, | |
| "grad_norm": 13.37850570678711, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3005, | |
| "step": 927 | |
| }, | |
| { | |
| "epoch": 2.7596439169139466, | |
| "grad_norm": 5.610422611236572, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2948, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.768545994065282, | |
| "grad_norm": 9.621097564697266, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.307, | |
| "step": 933 | |
| }, | |
| { | |
| "epoch": 2.777448071216617, | |
| "grad_norm": 4.709936141967773, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3011, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 2.7863501483679523, | |
| "grad_norm": 7.198949813842773, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2938, | |
| "step": 939 | |
| }, | |
| { | |
| "epoch": 2.7952522255192878, | |
| "grad_norm": 6.532808303833008, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3158, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 2.8041543026706233, | |
| "grad_norm": 10.170119285583496, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2862, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 2.8130563798219583, | |
| "grad_norm": 7.333060264587402, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2989, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 2.821958456973294, | |
| "grad_norm": 3.9618520736694336, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2759, | |
| "step": 951 | |
| }, | |
| { | |
| "epoch": 2.8308605341246293, | |
| "grad_norm": 5.956901550292969, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.294, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 2.8397626112759644, | |
| "grad_norm": 5.030998706817627, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3016, | |
| "step": 957 | |
| }, | |
| { | |
| "epoch": 2.8486646884272995, | |
| "grad_norm": 8.330857276916504, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3029, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.857566765578635, | |
| "grad_norm": 10.079005241394043, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2955, | |
| "step": 963 | |
| }, | |
| { | |
| "epoch": 2.8664688427299705, | |
| "grad_norm": 9.091019630432129, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2999, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 2.8753709198813056, | |
| "grad_norm": 7.372535705566406, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2949, | |
| "step": 969 | |
| }, | |
| { | |
| "epoch": 2.884272997032641, | |
| "grad_norm": 8.11223030090332, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2852, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 2.893175074183976, | |
| "grad_norm": 3.835611343383789, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2745, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 2.9020771513353116, | |
| "grad_norm": 11.748644828796387, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2875, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 2.9109792284866467, | |
| "grad_norm": 14.599609375, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2854, | |
| "step": 981 | |
| }, | |
| { | |
| "epoch": 2.919881305637982, | |
| "grad_norm": 8.011322021484375, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2924, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 2.9287833827893177, | |
| "grad_norm": 5.392467498779297, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.293, | |
| "step": 987 | |
| }, | |
| { | |
| "epoch": 2.9376854599406528, | |
| "grad_norm": 10.867618560791016, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3049, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.9465875370919883, | |
| "grad_norm": 11.08749771118164, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2943, | |
| "step": 993 | |
| }, | |
| { | |
| "epoch": 2.9554896142433233, | |
| "grad_norm": 7.80095100402832, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2984, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 2.964391691394659, | |
| "grad_norm": 6.650088310241699, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.3048, | |
| "step": 999 | |
| }, | |
| { | |
| "epoch": 2.973293768545994, | |
| "grad_norm": 9.152456283569336, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2985, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 2.9821958456973294, | |
| "grad_norm": 10.47088623046875, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2934, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 2.991097922848665, | |
| "grad_norm": 3.175657272338867, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2741, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 10.17156982421875, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.2926, | |
| "step": 1011 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.6839648485183716, | |
| "eval_runtime": 298.4807, | |
| "eval_samples_per_second": 5.166, | |
| "eval_steps_per_second": 0.647, | |
| "step": 1011 | |
| } | |
| ], | |
| "logging_steps": 3, | |
| "max_steps": 3370, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |