Jinwu01 committed on
Commit
c3dc34b
·
verified ·
1 Parent(s): af8b6ec

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +30 -0
  2. Llama-2-13b-chat-hf/DomainBench/Agriculture/README.md +57 -0
  3. Llama-2-13b-chat-hf/DomainBench/Agriculture/adapter_config.json +29 -0
  4. Llama-2-13b-chat-hf/DomainBench/Agriculture/adapter_model.safetensors +3 -0
  5. Llama-2-13b-chat-hf/DomainBench/Agriculture/all_results.json +8 -0
  6. Llama-2-13b-chat-hf/DomainBench/Agriculture/logfile.txt +0 -0
  7. Llama-2-13b-chat-hf/DomainBench/Agriculture/special_tokens_map.json +24 -0
  8. Llama-2-13b-chat-hf/DomainBench/Agriculture/tokenizer.json +0 -0
  9. Llama-2-13b-chat-hf/DomainBench/Agriculture/tokenizer.model +3 -0
  10. Llama-2-13b-chat-hf/DomainBench/Agriculture/tokenizer_config.json +44 -0
  11. Llama-2-13b-chat-hf/DomainBench/Agriculture/train_results.json +8 -0
  12. Llama-2-13b-chat-hf/DomainBench/Agriculture/trainer_log.jsonl +501 -0
  13. Llama-2-13b-chat-hf/DomainBench/Agriculture/trainer_state.json +3542 -0
  14. Llama-2-13b-chat-hf/DomainBench/Agriculture/training_args.bin +3 -0
  15. Llama-2-13b-chat-hf/DomainBench/Agriculture/training_loss.png +0 -0
  16. Llama-2-13b-chat-hf/DomainBench/Finance/README.md +57 -0
  17. Llama-2-13b-chat-hf/DomainBench/Finance/adapter_config.json +29 -0
  18. Llama-2-13b-chat-hf/DomainBench/Finance/adapter_model.safetensors +3 -0
  19. Llama-2-13b-chat-hf/DomainBench/Finance/all_results.json +8 -0
  20. Llama-2-13b-chat-hf/DomainBench/Finance/logfile.txt +0 -0
  21. Llama-2-13b-chat-hf/DomainBench/Finance/special_tokens_map.json +24 -0
  22. Llama-2-13b-chat-hf/DomainBench/Finance/tokenizer.json +0 -0
  23. Llama-2-13b-chat-hf/DomainBench/Finance/tokenizer.model +3 -0
  24. Llama-2-13b-chat-hf/DomainBench/Finance/tokenizer_config.json +44 -0
  25. Llama-2-13b-chat-hf/DomainBench/Finance/train_results.json +8 -0
  26. Llama-2-13b-chat-hf/DomainBench/Finance/trainer_log.jsonl +501 -0
  27. Llama-2-13b-chat-hf/DomainBench/Finance/trainer_state.json +3542 -0
  28. Llama-2-13b-chat-hf/DomainBench/Finance/training_args.bin +3 -0
  29. Llama-2-13b-chat-hf/DomainBench/Finance/training_loss.png +0 -0
  30. Llama-2-13b-chat-hf/DomainBench/Geography/README.md +57 -0
  31. Llama-2-13b-chat-hf/DomainBench/Geography/adapter_config.json +29 -0
  32. Llama-2-13b-chat-hf/DomainBench/Geography/adapter_model.safetensors +3 -0
  33. Llama-2-13b-chat-hf/DomainBench/Geography/all_results.json +8 -0
  34. Llama-2-13b-chat-hf/DomainBench/Geography/logfile.txt +0 -0
  35. Llama-2-13b-chat-hf/DomainBench/Geography/special_tokens_map.json +24 -0
  36. Llama-2-13b-chat-hf/DomainBench/Geography/tokenizer.json +0 -0
  37. Llama-2-13b-chat-hf/DomainBench/Geography/tokenizer.model +3 -0
  38. Llama-2-13b-chat-hf/DomainBench/Geography/tokenizer_config.json +44 -0
  39. Llama-2-13b-chat-hf/DomainBench/Geography/train_results.json +8 -0
  40. Llama-2-13b-chat-hf/DomainBench/Geography/trainer_log.jsonl +501 -0
  41. Llama-2-13b-chat-hf/DomainBench/Geography/trainer_state.json +3542 -0
  42. Llama-2-13b-chat-hf/DomainBench/Geography/training_args.bin +3 -0
  43. Llama-2-13b-chat-hf/DomainBench/Geography/training_loss.png +0 -0
  44. Llama-2-13b-chat-hf/DomainBench/Medicine/README.md +57 -0
  45. Llama-2-13b-chat-hf/DomainBench/Medicine/adapter_config.json +29 -0
  46. Llama-2-13b-chat-hf/DomainBench/Medicine/adapter_model.safetensors +3 -0
  47. Llama-2-13b-chat-hf/DomainBench/Medicine/all_results.json +8 -0
  48. Llama-2-13b-chat-hf/DomainBench/Medicine/logfile.txt +0 -0
  49. Llama-2-13b-chat-hf/DomainBench/Medicine/special_tokens_map.json +24 -0
  50. Llama-2-13b-chat-hf/DomainBench/Medicine/tokenizer.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,33 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Llama-3.2-3B-Instruct/DomainBench/Agriculture/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ Llama-3.2-3B-Instruct/DomainBench/Finance/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ Llama-3.2-3B-Instruct/DomainBench/Geography/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ Llama-3.2-3B-Instruct/DomainBench/Medicine/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ Llama-3.2-3B-Instruct/InstructionBench/Alpaca-GPT4/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ Llama-3.2-3B-Instruct/InstructionBench/Dolly/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ Llama-3.2-3B-Instruct/InstructionBench/InstructionWild/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ Llama-3.2-3B-Instruct/ReasoningBench/GSM8K/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ Llama-3.2-3B-Instruct/ReasoningBench/Logiqa/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
+ Llama-3.2-3B-Instruct/ReasoningBench/MetaMath/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ Meta-Llama-3-8B-Instruct/DomainBench/Agriculture/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
+ Meta-Llama-3-8B-Instruct/DomainBench/Finance/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
+ Meta-Llama-3-8B-Instruct/DomainBench/Geography/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ Meta-Llama-3-8B-Instruct/DomainBench/Medicine/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ Meta-Llama-3-8B-Instruct/InstructionBench/Alpaca-GPT4/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ Meta-Llama-3-8B-Instruct/InstructionBench/Dolly/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+ Meta-Llama-3-8B-Instruct/InstructionBench/InstructionWild/tokenizer.json filter=lfs diff=lfs merge=lfs -text
53
+ Meta-Llama-3-8B-Instruct/ReasoningBench/GSM8K/tokenizer.json filter=lfs diff=lfs merge=lfs -text
54
+ Meta-Llama-3-8B-Instruct/ReasoningBench/Logiqa/tokenizer.json filter=lfs diff=lfs merge=lfs -text
55
+ Meta-Llama-3-8B-Instruct/ReasoningBench/MetaMath/tokenizer.json filter=lfs diff=lfs merge=lfs -text
56
+ Qwen2.5-7B-Instruct/DomainBench/Agriculture/tokenizer.json filter=lfs diff=lfs merge=lfs -text
57
+ Qwen2.5-7B-Instruct/DomainBench/Finance/tokenizer.json filter=lfs diff=lfs merge=lfs -text
58
+ Qwen2.5-7B-Instruct/DomainBench/Geography/tokenizer.json filter=lfs diff=lfs merge=lfs -text
59
+ Qwen2.5-7B-Instruct/DomainBench/Medicine/tokenizer.json filter=lfs diff=lfs merge=lfs -text
60
+ Qwen2.5-7B-Instruct/InstructionBench/Alpaca-GPT4/tokenizer.json filter=lfs diff=lfs merge=lfs -text
61
+ Qwen2.5-7B-Instruct/InstructionBench/Dolly/tokenizer.json filter=lfs diff=lfs merge=lfs -text
62
+ Qwen2.5-7B-Instruct/InstructionBench/InstructionWild/tokenizer.json filter=lfs diff=lfs merge=lfs -text
63
+ Qwen2.5-7B-Instruct/ReasoningBench/GSM8K/tokenizer.json filter=lfs diff=lfs merge=lfs -text
64
+ Qwen2.5-7B-Instruct/ReasoningBench/Logiqa/tokenizer.json filter=lfs diff=lfs merge=lfs -text
65
+ Qwen2.5-7B-Instruct/ReasoningBench/MetaMath/tokenizer.json filter=lfs diff=lfs merge=lfs -text
Llama-2-13b-chat-hf/DomainBench/Agriculture/README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: /hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: threshold_3-lamb_0.1-lr_5e-5
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # threshold_3-lamb_0.1-lr_5e-5
18
+
19
+ This model is a fine-tuned version of the Llama-2-13b-chat-hf base model (loaded from the local path `/hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf`) on the agriculture dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-05
39
+ - train_batch_size: 1
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
43
+ - lr_scheduler_type: cosine
44
+ - lr_scheduler_warmup_ratio: 0.1
45
+ - num_epochs: 1.0
46
+
47
+ ### Training results
48
+
49
+
50
+
51
+ ### Framework versions
52
+
53
+ - PEFT 0.12.0
54
+ - Transformers 4.46.1
55
+ - Pytorch 2.5.1+cu124
56
+ - Datasets 3.1.0
57
+ - Tokenizers 0.20.3
Llama-2-13b-chat-hf/DomainBench/Agriculture/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
Llama-2-13b-chat-hf/DomainBench/Agriculture/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34823ac8f137597ed7fb597f267ca574184059387f87535c4f3c36cfc91c3fa6
3
+ size 26235704
Llama-2-13b-chat-hf/DomainBench/Agriculture/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 9478520693637120.0,
4
+ "train_loss": 2.8943692499160765,
5
+ "train_runtime": 1526.3896,
6
+ "train_samples_per_second": 3.276,
7
+ "train_steps_per_second": 3.276
8
+ }
Llama-2-13b-chat-hf/DomainBench/Agriculture/logfile.txt ADDED
The diff for this file is too large to render. See raw diff
 
Llama-2-13b-chat-hf/DomainBench/Agriculture/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
Llama-2-13b-chat-hf/DomainBench/Agriculture/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama-2-13b-chat-hf/DomainBench/Agriculture/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
Llama-2-13b-chat-hf/DomainBench/Agriculture/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": false,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "split_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
Llama-2-13b-chat-hf/DomainBench/Agriculture/train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 9478520693637120.0,
4
+ "train_loss": 2.8943692499160765,
5
+ "train_runtime": 1526.3896,
6
+ "train_samples_per_second": 3.276,
7
+ "train_steps_per_second": 3.276
8
+ }
Llama-2-13b-chat-hf/DomainBench/Agriculture/trainer_log.jsonl ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 5000, "loss": 10.8135, "lr": 1.0000000000000002e-06, "epoch": 0.002, "percentage": 0.2, "elapsed_time": "0:00:03", "remaining_time": "0:33:05"}
2
+ {"current_steps": 20, "total_steps": 5000, "loss": 8.4638, "lr": 2.0000000000000003e-06, "epoch": 0.004, "percentage": 0.4, "elapsed_time": "0:00:07", "remaining_time": "0:29:03"}
3
+ {"current_steps": 30, "total_steps": 5000, "loss": 15.4904, "lr": 3e-06, "epoch": 0.006, "percentage": 0.6, "elapsed_time": "0:00:10", "remaining_time": "0:27:39"}
4
+ {"current_steps": 40, "total_steps": 5000, "loss": 11.4875, "lr": 4.000000000000001e-06, "epoch": 0.008, "percentage": 0.8, "elapsed_time": "0:00:13", "remaining_time": "0:26:56"}
5
+ {"current_steps": 50, "total_steps": 5000, "loss": 14.2003, "lr": 5e-06, "epoch": 0.01, "percentage": 1.0, "elapsed_time": "0:00:16", "remaining_time": "0:26:26"}
6
+ {"current_steps": 60, "total_steps": 5000, "loss": 12.1374, "lr": 6e-06, "epoch": 0.012, "percentage": 1.2, "elapsed_time": "0:00:18", "remaining_time": "0:25:44"}
7
+ {"current_steps": 70, "total_steps": 5000, "loss": 11.6844, "lr": 7.000000000000001e-06, "epoch": 0.014, "percentage": 1.4, "elapsed_time": "0:00:21", "remaining_time": "0:25:36"}
8
+ {"current_steps": 80, "total_steps": 5000, "loss": 10.4387, "lr": 8.000000000000001e-06, "epoch": 0.016, "percentage": 1.6, "elapsed_time": "0:00:56", "remaining_time": "0:57:32"}
9
+ {"current_steps": 90, "total_steps": 5000, "loss": 5.5739, "lr": 9e-06, "epoch": 0.018, "percentage": 1.8, "elapsed_time": "0:00:59", "remaining_time": "0:53:49"}
10
+ {"current_steps": 100, "total_steps": 5000, "loss": 12.7118, "lr": 1e-05, "epoch": 0.02, "percentage": 2.0, "elapsed_time": "0:01:02", "remaining_time": "0:50:47"}
11
+ {"current_steps": 110, "total_steps": 5000, "loss": 15.3528, "lr": 1.1000000000000001e-05, "epoch": 0.022, "percentage": 2.2, "elapsed_time": "0:01:05", "remaining_time": "0:48:16"}
12
+ {"current_steps": 120, "total_steps": 5000, "loss": 14.2922, "lr": 1.2e-05, "epoch": 0.024, "percentage": 2.4, "elapsed_time": "0:01:08", "remaining_time": "0:46:11"}
13
+ {"current_steps": 130, "total_steps": 5000, "loss": 6.3563, "lr": 1.3000000000000001e-05, "epoch": 0.026, "percentage": 2.6, "elapsed_time": "0:01:11", "remaining_time": "0:44:26"}
14
+ {"current_steps": 140, "total_steps": 5000, "loss": 7.9494, "lr": 1.4000000000000001e-05, "epoch": 0.028, "percentage": 2.8, "elapsed_time": "0:01:14", "remaining_time": "0:42:51"}
15
+ {"current_steps": 150, "total_steps": 5000, "loss": 10.7366, "lr": 1.5e-05, "epoch": 0.03, "percentage": 3.0, "elapsed_time": "0:01:17", "remaining_time": "0:41:32"}
16
+ {"current_steps": 160, "total_steps": 5000, "loss": 4.7961, "lr": 1.6000000000000003e-05, "epoch": 0.032, "percentage": 3.2, "elapsed_time": "0:01:20", "remaining_time": "0:40:21"}
17
+ {"current_steps": 170, "total_steps": 5000, "loss": 2.8594, "lr": 1.7000000000000003e-05, "epoch": 0.034, "percentage": 3.4, "elapsed_time": "0:01:23", "remaining_time": "0:39:20"}
18
+ {"current_steps": 180, "total_steps": 5000, "loss": 4.2593, "lr": 1.8e-05, "epoch": 0.036, "percentage": 3.6, "elapsed_time": "0:01:26", "remaining_time": "0:38:24"}
19
+ {"current_steps": 190, "total_steps": 5000, "loss": 5.3202, "lr": 1.9e-05, "epoch": 0.038, "percentage": 3.8, "elapsed_time": "0:01:29", "remaining_time": "0:37:34"}
20
+ {"current_steps": 200, "total_steps": 5000, "loss": 8.7095, "lr": 2e-05, "epoch": 0.04, "percentage": 4.0, "elapsed_time": "0:01:31", "remaining_time": "0:36:47"}
21
+ {"current_steps": 210, "total_steps": 5000, "loss": 4.7786, "lr": 2.1e-05, "epoch": 0.042, "percentage": 4.2, "elapsed_time": "0:01:34", "remaining_time": "0:36:06"}
22
+ {"current_steps": 220, "total_steps": 5000, "loss": 2.5694, "lr": 2.2000000000000003e-05, "epoch": 0.044, "percentage": 4.4, "elapsed_time": "0:01:37", "remaining_time": "0:35:29"}
23
+ {"current_steps": 230, "total_steps": 5000, "loss": 4.3152, "lr": 2.3000000000000003e-05, "epoch": 0.046, "percentage": 4.6, "elapsed_time": "0:01:41", "remaining_time": "0:34:54"}
24
+ {"current_steps": 240, "total_steps": 5000, "loss": 4.0996, "lr": 2.4e-05, "epoch": 0.048, "percentage": 4.8, "elapsed_time": "0:01:43", "remaining_time": "0:34:22"}
25
+ {"current_steps": 250, "total_steps": 5000, "loss": 4.9146, "lr": 2.5e-05, "epoch": 0.05, "percentage": 5.0, "elapsed_time": "0:01:46", "remaining_time": "0:33:52"}
26
+ {"current_steps": 260, "total_steps": 5000, "loss": 1.8707, "lr": 2.6000000000000002e-05, "epoch": 0.052, "percentage": 5.2, "elapsed_time": "0:01:49", "remaining_time": "0:33:24"}
27
+ {"current_steps": 270, "total_steps": 5000, "loss": 3.1247, "lr": 2.7000000000000002e-05, "epoch": 0.054, "percentage": 5.4, "elapsed_time": "0:01:52", "remaining_time": "0:32:59"}
28
+ {"current_steps": 280, "total_steps": 5000, "loss": 3.8507, "lr": 2.8000000000000003e-05, "epoch": 0.056, "percentage": 5.6, "elapsed_time": "0:01:55", "remaining_time": "0:32:33"}
29
+ {"current_steps": 290, "total_steps": 5000, "loss": 2.8481, "lr": 2.9e-05, "epoch": 0.058, "percentage": 5.8, "elapsed_time": "0:01:58", "remaining_time": "0:32:10"}
30
+ {"current_steps": 300, "total_steps": 5000, "loss": 4.4567, "lr": 3e-05, "epoch": 0.06, "percentage": 6.0, "elapsed_time": "0:02:01", "remaining_time": "0:31:49"}
31
+ {"current_steps": 310, "total_steps": 5000, "loss": 3.544, "lr": 3.1e-05, "epoch": 0.062, "percentage": 6.2, "elapsed_time": "0:02:04", "remaining_time": "0:31:29"}
32
+ {"current_steps": 320, "total_steps": 5000, "loss": 2.028, "lr": 3.2000000000000005e-05, "epoch": 0.064, "percentage": 6.4, "elapsed_time": "0:02:07", "remaining_time": "0:31:09"}
33
+ {"current_steps": 330, "total_steps": 5000, "loss": 3.4244, "lr": 3.3e-05, "epoch": 0.066, "percentage": 6.6, "elapsed_time": "0:02:10", "remaining_time": "0:30:51"}
34
+ {"current_steps": 340, "total_steps": 5000, "loss": 5.216, "lr": 3.4000000000000007e-05, "epoch": 0.068, "percentage": 6.8, "elapsed_time": "0:02:13", "remaining_time": "0:30:33"}
35
+ {"current_steps": 350, "total_steps": 5000, "loss": 2.7441, "lr": 3.5e-05, "epoch": 0.07, "percentage": 7.0, "elapsed_time": "0:02:16", "remaining_time": "0:30:16"}
36
+ {"current_steps": 360, "total_steps": 5000, "loss": 2.6191, "lr": 3.6e-05, "epoch": 0.072, "percentage": 7.2, "elapsed_time": "0:02:19", "remaining_time": "0:30:01"}
37
+ {"current_steps": 370, "total_steps": 5000, "loss": 5.3131, "lr": 3.7e-05, "epoch": 0.074, "percentage": 7.4, "elapsed_time": "0:02:22", "remaining_time": "0:29:46"}
38
+ {"current_steps": 380, "total_steps": 5000, "loss": 5.2818, "lr": 3.8e-05, "epoch": 0.076, "percentage": 7.6, "elapsed_time": "0:02:25", "remaining_time": "0:29:31"}
39
+ {"current_steps": 390, "total_steps": 5000, "loss": 3.086, "lr": 3.9000000000000006e-05, "epoch": 0.078, "percentage": 7.8, "elapsed_time": "0:02:28", "remaining_time": "0:29:17"}
40
+ {"current_steps": 400, "total_steps": 5000, "loss": 2.6475, "lr": 4e-05, "epoch": 0.08, "percentage": 8.0, "elapsed_time": "0:02:31", "remaining_time": "0:29:03"}
41
+ {"current_steps": 410, "total_steps": 5000, "loss": 2.0594, "lr": 4.1e-05, "epoch": 0.082, "percentage": 8.2, "elapsed_time": "0:02:34", "remaining_time": "0:28:51"}
42
+ {"current_steps": 420, "total_steps": 5000, "loss": 3.5431, "lr": 4.2e-05, "epoch": 0.084, "percentage": 8.4, "elapsed_time": "0:02:37", "remaining_time": "0:28:38"}
43
+ {"current_steps": 430, "total_steps": 5000, "loss": 2.7867, "lr": 4.3e-05, "epoch": 0.086, "percentage": 8.6, "elapsed_time": "0:02:40", "remaining_time": "0:28:26"}
44
+ {"current_steps": 440, "total_steps": 5000, "loss": 6.2247, "lr": 4.4000000000000006e-05, "epoch": 0.088, "percentage": 8.8, "elapsed_time": "0:02:43", "remaining_time": "0:28:15"}
45
+ {"current_steps": 450, "total_steps": 5000, "loss": 1.8291, "lr": 4.5e-05, "epoch": 0.09, "percentage": 9.0, "elapsed_time": "0:02:46", "remaining_time": "0:28:04"}
46
+ {"current_steps": 460, "total_steps": 5000, "loss": 5.4416, "lr": 4.600000000000001e-05, "epoch": 0.092, "percentage": 9.2, "elapsed_time": "0:02:49", "remaining_time": "0:27:53"}
47
+ {"current_steps": 470, "total_steps": 5000, "loss": 6.0983, "lr": 4.7e-05, "epoch": 0.094, "percentage": 9.4, "elapsed_time": "0:02:52", "remaining_time": "0:27:43"}
48
+ {"current_steps": 480, "total_steps": 5000, "loss": 1.8501, "lr": 4.8e-05, "epoch": 0.096, "percentage": 9.6, "elapsed_time": "0:02:55", "remaining_time": "0:27:33"}
49
+ {"current_steps": 490, "total_steps": 5000, "loss": 1.5013, "lr": 4.9e-05, "epoch": 0.098, "percentage": 9.8, "elapsed_time": "0:02:58", "remaining_time": "0:27:23"}
50
+ {"current_steps": 500, "total_steps": 5000, "loss": 2.6987, "lr": 5e-05, "epoch": 0.1, "percentage": 10.0, "elapsed_time": "0:03:01", "remaining_time": "0:27:12"}
51
+ {"current_steps": 510, "total_steps": 5000, "loss": 2.268, "lr": 4.999939076763487e-05, "epoch": 0.102, "percentage": 10.2, "elapsed_time": "0:03:04", "remaining_time": "0:27:02"}
52
+ {"current_steps": 520, "total_steps": 5000, "loss": 2.1733, "lr": 4.999756310023261e-05, "epoch": 0.104, "percentage": 10.4, "elapsed_time": "0:03:07", "remaining_time": "0:26:53"}
53
+ {"current_steps": 530, "total_steps": 5000, "loss": 6.0941, "lr": 4.999451708687114e-05, "epoch": 0.106, "percentage": 10.6, "elapsed_time": "0:03:10", "remaining_time": "0:26:45"}
54
+ {"current_steps": 540, "total_steps": 5000, "loss": 5.3397, "lr": 4.999025287600886e-05, "epoch": 0.108, "percentage": 10.8, "elapsed_time": "0:03:13", "remaining_time": "0:26:36"}
55
+ {"current_steps": 550, "total_steps": 5000, "loss": 1.2973, "lr": 4.99847706754774e-05, "epoch": 0.11, "percentage": 11.0, "elapsed_time": "0:03:16", "remaining_time": "0:26:26"}
56
+ {"current_steps": 560, "total_steps": 5000, "loss": 2.9408, "lr": 4.997807075247146e-05, "epoch": 0.112, "percentage": 11.2, "elapsed_time": "0:03:19", "remaining_time": "0:26:18"}
57
+ {"current_steps": 570, "total_steps": 5000, "loss": 1.5613, "lr": 4.997015343353585e-05, "epoch": 0.114, "percentage": 11.4, "elapsed_time": "0:03:22", "remaining_time": "0:26:10"}
58
+ {"current_steps": 580, "total_steps": 5000, "loss": 1.5912, "lr": 4.996101910454953e-05, "epoch": 0.116, "percentage": 11.6, "elapsed_time": "0:03:25", "remaining_time": "0:26:02"}
59
+ {"current_steps": 590, "total_steps": 5000, "loss": 3.5306, "lr": 4.995066821070679e-05, "epoch": 0.118, "percentage": 11.8, "elapsed_time": "0:03:28", "remaining_time": "0:25:54"}
60
+ {"current_steps": 600, "total_steps": 5000, "loss": 2.1385, "lr": 4.993910125649561e-05, "epoch": 0.12, "percentage": 12.0, "elapsed_time": "0:03:31", "remaining_time": "0:25:47"}
61
+ {"current_steps": 610, "total_steps": 5000, "loss": 1.5452, "lr": 4.992631880567301e-05, "epoch": 0.122, "percentage": 12.2, "elapsed_time": "0:03:34", "remaining_time": "0:25:40"}
62
+ {"current_steps": 620, "total_steps": 5000, "loss": 3.0193, "lr": 4.991232148123761e-05, "epoch": 0.124, "percentage": 12.4, "elapsed_time": "0:03:37", "remaining_time": "0:25:33"}
63
+ {"current_steps": 630, "total_steps": 5000, "loss": 2.3063, "lr": 4.989710996539926e-05, "epoch": 0.126, "percentage": 12.6, "elapsed_time": "0:03:40", "remaining_time": "0:25:26"}
64
+ {"current_steps": 640, "total_steps": 5000, "loss": 2.2441, "lr": 4.988068499954578e-05, "epoch": 0.128, "percentage": 12.8, "elapsed_time": "0:03:42", "remaining_time": "0:25:19"}
65
+ {"current_steps": 650, "total_steps": 5000, "loss": 2.3075, "lr": 4.9863047384206835e-05, "epoch": 0.13, "percentage": 13.0, "elapsed_time": "0:03:45", "remaining_time": "0:25:12"}
66
+ {"current_steps": 660, "total_steps": 5000, "loss": 2.9721, "lr": 4.984419797901491e-05, "epoch": 0.132, "percentage": 13.2, "elapsed_time": "0:03:48", "remaining_time": "0:25:05"}
67
+ {"current_steps": 670, "total_steps": 5000, "loss": 2.7736, "lr": 4.982413770266342e-05, "epoch": 0.134, "percentage": 13.4, "elapsed_time": "0:03:51", "remaining_time": "0:24:59"}
68
+ {"current_steps": 680, "total_steps": 5000, "loss": 1.8991, "lr": 4.980286753286195e-05, "epoch": 0.136, "percentage": 13.6, "elapsed_time": "0:03:54", "remaining_time": "0:24:52"}
69
+ {"current_steps": 690, "total_steps": 5000, "loss": 2.2606, "lr": 4.978038850628854e-05, "epoch": 0.138, "percentage": 13.8, "elapsed_time": "0:03:58", "remaining_time": "0:24:46"}
70
+ {"current_steps": 700, "total_steps": 5000, "loss": 1.5491, "lr": 4.975670171853926e-05, "epoch": 0.14, "percentage": 14.0, "elapsed_time": "0:04:01", "remaining_time": "0:24:40"}
71
+ {"current_steps": 710, "total_steps": 5000, "loss": 2.2913, "lr": 4.9731808324074717e-05, "epoch": 0.142, "percentage": 14.2, "elapsed_time": "0:04:04", "remaining_time": "0:24:34"}
72
+ {"current_steps": 720, "total_steps": 5000, "loss": 2.6315, "lr": 4.9705709536163824e-05, "epoch": 0.144, "percentage": 14.4, "elapsed_time": "0:04:06", "remaining_time": "0:24:28"}
73
+ {"current_steps": 730, "total_steps": 5000, "loss": 2.5128, "lr": 4.96784066268247e-05, "epoch": 0.146, "percentage": 14.6, "elapsed_time": "0:04:09", "remaining_time": "0:24:22"}
74
+ {"current_steps": 740, "total_steps": 5000, "loss": 5.8145, "lr": 4.964990092676263e-05, "epoch": 0.148, "percentage": 14.8, "elapsed_time": "0:04:12", "remaining_time": "0:24:16"}
75
+ {"current_steps": 750, "total_steps": 5000, "loss": 2.3354, "lr": 4.962019382530521e-05, "epoch": 0.15, "percentage": 15.0, "elapsed_time": "0:04:15", "remaining_time": "0:24:10"}
76
+ {"current_steps": 760, "total_steps": 5000, "loss": 4.621, "lr": 4.9589286770334654e-05, "epoch": 0.152, "percentage": 15.2, "elapsed_time": "0:04:19", "remaining_time": "0:24:05"}
77
+ {"current_steps": 770, "total_steps": 5000, "loss": 2.7795, "lr": 4.9557181268217227e-05, "epoch": 0.154, "percentage": 15.4, "elapsed_time": "0:04:22", "remaining_time": "0:23:59"}
78
+ {"current_steps": 780, "total_steps": 5000, "loss": 3.0171, "lr": 4.952387888372979e-05, "epoch": 0.156, "percentage": 15.6, "elapsed_time": "0:04:25", "remaining_time": "0:23:54"}
79
+ {"current_steps": 790, "total_steps": 5000, "loss": 2.1028, "lr": 4.94893812399836e-05, "epoch": 0.158, "percentage": 15.8, "elapsed_time": "0:04:28", "remaining_time": "0:23:48"}
80
+ {"current_steps": 800, "total_steps": 5000, "loss": 13.4531, "lr": 4.9453690018345144e-05, "epoch": 0.16, "percentage": 16.0, "elapsed_time": "0:04:31", "remaining_time": "0:23:42"}
81
+ {"current_steps": 810, "total_steps": 5000, "loss": 2.0661, "lr": 4.94168069583542e-05, "epoch": 0.162, "percentage": 16.2, "elapsed_time": "0:04:33", "remaining_time": "0:23:37"}
82
+ {"current_steps": 820, "total_steps": 5000, "loss": 2.6598, "lr": 4.937873385763908e-05, "epoch": 0.164, "percentage": 16.4, "elapsed_time": "0:04:37", "remaining_time": "0:23:32"}
83
+ {"current_steps": 830, "total_steps": 5000, "loss": 2.58, "lr": 4.933947257182901e-05, "epoch": 0.166, "percentage": 16.6, "elapsed_time": "0:04:39", "remaining_time": "0:23:26"}
84
+ {"current_steps": 840, "total_steps": 5000, "loss": 2.9303, "lr": 4.929902501446366e-05, "epoch": 0.168, "percentage": 16.8, "elapsed_time": "0:04:42", "remaining_time": "0:23:21"}
85
+ {"current_steps": 850, "total_steps": 5000, "loss": 3.0212, "lr": 4.925739315689991e-05, "epoch": 0.17, "percentage": 17.0, "elapsed_time": "0:04:45", "remaining_time": "0:23:16"}
86
+ {"current_steps": 860, "total_steps": 5000, "loss": 4.8252, "lr": 4.9214579028215776e-05, "epoch": 0.172, "percentage": 17.2, "elapsed_time": "0:04:48", "remaining_time": "0:23:11"}
87
+ {"current_steps": 870, "total_steps": 5000, "loss": 2.5644, "lr": 4.917058471511149e-05, "epoch": 0.174, "percentage": 17.4, "elapsed_time": "0:04:51", "remaining_time": "0:23:05"}
88
+ {"current_steps": 880, "total_steps": 5000, "loss": 1.8755, "lr": 4.912541236180779e-05, "epoch": 0.176, "percentage": 17.6, "elapsed_time": "0:04:54", "remaining_time": "0:23:00"}
89
+ {"current_steps": 890, "total_steps": 5000, "loss": 0.7075, "lr": 4.907906416994146e-05, "epoch": 0.178, "percentage": 17.8, "elapsed_time": "0:04:58", "remaining_time": "0:22:56"}
90
+ {"current_steps": 900, "total_steps": 5000, "loss": 7.253, "lr": 4.9031542398457974e-05, "epoch": 0.18, "percentage": 18.0, "elapsed_time": "0:05:01", "remaining_time": "0:22:51"}
91
+ {"current_steps": 910, "total_steps": 5000, "loss": 3.3396, "lr": 4.898284936350144e-05, "epoch": 0.182, "percentage": 18.2, "elapsed_time": "0:05:04", "remaining_time": "0:22:47"}
92
+ {"current_steps": 920, "total_steps": 5000, "loss": 2.9798, "lr": 4.893298743830168e-05, "epoch": 0.184, "percentage": 18.4, "elapsed_time": "0:05:07", "remaining_time": "0:22:42"}
93
+ {"current_steps": 930, "total_steps": 5000, "loss": 1.153, "lr": 4.888195905305859e-05, "epoch": 0.186, "percentage": 18.6, "elapsed_time": "0:05:10", "remaining_time": "0:22:37"}
94
+ {"current_steps": 940, "total_steps": 5000, "loss": 3.1989, "lr": 4.882976669482367e-05, "epoch": 0.188, "percentage": 18.8, "elapsed_time": "0:05:13", "remaining_time": "0:22:32"}
95
+ {"current_steps": 950, "total_steps": 5000, "loss": 2.7478, "lr": 4.877641290737884e-05, "epoch": 0.19, "percentage": 19.0, "elapsed_time": "0:05:16", "remaining_time": "0:22:27"}
96
+ {"current_steps": 960, "total_steps": 5000, "loss": 2.09, "lr": 4.8721900291112415e-05, "epoch": 0.192, "percentage": 19.2, "elapsed_time": "0:05:19", "remaining_time": "0:22:23"}
97
+ {"current_steps": 970, "total_steps": 5000, "loss": 1.7634, "lr": 4.8666231502892415e-05, "epoch": 0.194, "percentage": 19.4, "elapsed_time": "0:05:22", "remaining_time": "0:22:18"}
98
+ {"current_steps": 980, "total_steps": 5000, "loss": 1.6288, "lr": 4.860940925593703e-05, "epoch": 0.196, "percentage": 19.6, "elapsed_time": "0:05:25", "remaining_time": "0:22:13"}
99
+ {"current_steps": 990, "total_steps": 5000, "loss": 2.9691, "lr": 4.855143631968242e-05, "epoch": 0.198, "percentage": 19.8, "elapsed_time": "0:05:28", "remaining_time": "0:22:09"}
100
+ {"current_steps": 1000, "total_steps": 5000, "loss": 3.5196, "lr": 4.849231551964771e-05, "epoch": 0.2, "percentage": 20.0, "elapsed_time": "0:05:31", "remaining_time": "0:22:04"}
101
+ {"current_steps": 1010, "total_steps": 5000, "loss": 3.1836, "lr": 4.843204973729729e-05, "epoch": 0.202, "percentage": 20.2, "elapsed_time": "0:05:34", "remaining_time": "0:21:59"}
102
+ {"current_steps": 1020, "total_steps": 5000, "loss": 3.1554, "lr": 4.837064190990036e-05, "epoch": 0.204, "percentage": 20.4, "elapsed_time": "0:05:37", "remaining_time": "0:21:55"}
103
+ {"current_steps": 1030, "total_steps": 5000, "loss": 3.0401, "lr": 4.830809503038781e-05, "epoch": 0.206, "percentage": 20.6, "elapsed_time": "0:05:39", "remaining_time": "0:21:49"}
104
+ {"current_steps": 1040, "total_steps": 5000, "loss": 4.8929, "lr": 4.8244412147206284e-05, "epoch": 0.208, "percentage": 20.8, "elapsed_time": "0:05:42", "remaining_time": "0:21:45"}
105
+ {"current_steps": 1050, "total_steps": 5000, "loss": 2.9868, "lr": 4.817959636416969e-05, "epoch": 0.21, "percentage": 21.0, "elapsed_time": "0:05:45", "remaining_time": "0:21:40"}
106
+ {"current_steps": 1060, "total_steps": 5000, "loss": 5.5685, "lr": 4.8113650840307834e-05, "epoch": 0.212, "percentage": 21.2, "elapsed_time": "0:05:48", "remaining_time": "0:21:36"}
107
+ {"current_steps": 1070, "total_steps": 5000, "loss": 2.2926, "lr": 4.8046578789712515e-05, "epoch": 0.214, "percentage": 21.4, "elapsed_time": "0:05:51", "remaining_time": "0:21:31"}
108
+ {"current_steps": 1080, "total_steps": 5000, "loss": 3.3832, "lr": 4.797838348138086e-05, "epoch": 0.216, "percentage": 21.6, "elapsed_time": "0:05:54", "remaining_time": "0:21:27"}
109
+ {"current_steps": 1090, "total_steps": 5000, "loss": 4.5766, "lr": 4.790906823905599e-05, "epoch": 0.218, "percentage": 21.8, "elapsed_time": "0:05:57", "remaining_time": "0:21:23"}
110
+ {"current_steps": 1100, "total_steps": 5000, "loss": 3.2696, "lr": 4.783863644106502e-05, "epoch": 0.22, "percentage": 22.0, "elapsed_time": "0:06:00", "remaining_time": "0:21:18"}
111
+ {"current_steps": 1110, "total_steps": 5000, "loss": 1.583, "lr": 4.776709152015443e-05, "epoch": 0.222, "percentage": 22.2, "elapsed_time": "0:06:03", "remaining_time": "0:21:14"}
112
+ {"current_steps": 1120, "total_steps": 5000, "loss": 2.2554, "lr": 4.769443696332272e-05, "epoch": 0.224, "percentage": 22.4, "elapsed_time": "0:06:06", "remaining_time": "0:21:10"}
113
+ {"current_steps": 1130, "total_steps": 5000, "loss": 1.6651, "lr": 4.762067631165049e-05, "epoch": 0.226, "percentage": 22.6, "elapsed_time": "0:06:09", "remaining_time": "0:21:06"}
114
+ {"current_steps": 1140, "total_steps": 5000, "loss": 3.1578, "lr": 4.754581316012785e-05, "epoch": 0.228, "percentage": 22.8, "elapsed_time": "0:06:12", "remaining_time": "0:21:01"}
115
+ {"current_steps": 1150, "total_steps": 5000, "loss": 2.4297, "lr": 4.7469851157479177e-05, "epoch": 0.23, "percentage": 23.0, "elapsed_time": "0:06:15", "remaining_time": "0:20:57"}
116
+ {"current_steps": 1160, "total_steps": 5000, "loss": 2.9167, "lr": 4.7392794005985326e-05, "epoch": 0.232, "percentage": 23.2, "elapsed_time": "0:06:18", "remaining_time": "0:20:53"}
117
+ {"current_steps": 1170, "total_steps": 5000, "loss": 4.5697, "lr": 4.731464546130314e-05, "epoch": 0.234, "percentage": 23.4, "elapsed_time": "0:06:21", "remaining_time": "0:20:49"}
118
+ {"current_steps": 1180, "total_steps": 5000, "loss": 2.1219, "lr": 4.723540933228244e-05, "epoch": 0.236, "percentage": 23.6, "elapsed_time": "0:06:24", "remaining_time": "0:20:45"}
119
+ {"current_steps": 1190, "total_steps": 5000, "loss": 2.4125, "lr": 4.715508948078037e-05, "epoch": 0.238, "percentage": 23.8, "elapsed_time": "0:06:27", "remaining_time": "0:20:41"}
120
+ {"current_steps": 1200, "total_steps": 5000, "loss": 4.1166, "lr": 4.707368982147318e-05, "epoch": 0.24, "percentage": 24.0, "elapsed_time": "0:06:30", "remaining_time": "0:20:37"}
121
+ {"current_steps": 1210, "total_steps": 5000, "loss": 3.1524, "lr": 4.6991214321665414e-05, "epoch": 0.242, "percentage": 24.2, "elapsed_time": "0:06:33", "remaining_time": "0:20:32"}
122
+ {"current_steps": 1220, "total_steps": 5000, "loss": 1.5237, "lr": 4.690766700109659e-05, "epoch": 0.244, "percentage": 24.4, "elapsed_time": "0:06:36", "remaining_time": "0:20:28"}
123
+ {"current_steps": 1230, "total_steps": 5000, "loss": 2.8092, "lr": 4.682305193174524e-05, "epoch": 0.246, "percentage": 24.6, "elapsed_time": "0:06:39", "remaining_time": "0:20:24"}
124
+ {"current_steps": 1240, "total_steps": 5000, "loss": 2.256, "lr": 4.6737373237630476e-05, "epoch": 0.248, "percentage": 24.8, "elapsed_time": "0:06:42", "remaining_time": "0:20:20"}
125
+ {"current_steps": 1250, "total_steps": 5000, "loss": 2.5601, "lr": 4.665063509461097e-05, "epoch": 0.25, "percentage": 25.0, "elapsed_time": "0:06:45", "remaining_time": "0:20:16"}
126
+ {"current_steps": 1260, "total_steps": 5000, "loss": 2.5502, "lr": 4.656284173018144e-05, "epoch": 0.252, "percentage": 25.2, "elapsed_time": "0:06:48", "remaining_time": "0:20:12"}
127
+ {"current_steps": 1270, "total_steps": 5000, "loss": 3.4447, "lr": 4.6473997423266614e-05, "epoch": 0.254, "percentage": 25.4, "elapsed_time": "0:06:51", "remaining_time": "0:20:07"}
128
+ {"current_steps": 1280, "total_steps": 5000, "loss": 1.6954, "lr": 4.638410650401267e-05, "epoch": 0.256, "percentage": 25.6, "elapsed_time": "0:06:54", "remaining_time": "0:20:03"}
129
+ {"current_steps": 1290, "total_steps": 5000, "loss": 2.353, "lr": 4.629317335357619e-05, "epoch": 0.258, "percentage": 25.8, "elapsed_time": "0:06:57", "remaining_time": "0:19:59"}
130
+ {"current_steps": 1300, "total_steps": 5000, "loss": 2.1544, "lr": 4.620120240391065e-05, "epoch": 0.26, "percentage": 26.0, "elapsed_time": "0:07:00", "remaining_time": "0:19:55"}
131
+ {"current_steps": 1310, "total_steps": 5000, "loss": 1.2159, "lr": 4.610819813755038e-05, "epoch": 0.262, "percentage": 26.2, "elapsed_time": "0:07:02", "remaining_time": "0:19:51"}
132
+ {"current_steps": 1320, "total_steps": 5000, "loss": 1.9003, "lr": 4.601416508739211e-05, "epoch": 0.264, "percentage": 26.4, "elapsed_time": "0:07:05", "remaining_time": "0:19:47"}
133
+ {"current_steps": 1330, "total_steps": 5000, "loss": 4.5354, "lr": 4.591910783647404e-05, "epoch": 0.266, "percentage": 26.6, "elapsed_time": "0:07:08", "remaining_time": "0:19:43"}
134
+ {"current_steps": 1340, "total_steps": 5000, "loss": 1.9724, "lr": 4.5823031017752485e-05, "epoch": 0.268, "percentage": 26.8, "elapsed_time": "0:07:11", "remaining_time": "0:19:39"}
135
+ {"current_steps": 1350, "total_steps": 5000, "loss": 2.4934, "lr": 4.572593931387604e-05, "epoch": 0.27, "percentage": 27.0, "elapsed_time": "0:07:14", "remaining_time": "0:19:35"}
136
+ {"current_steps": 1360, "total_steps": 5000, "loss": 2.1056, "lr": 4.562783745695738e-05, "epoch": 0.272, "percentage": 27.2, "elapsed_time": "0:07:17", "remaining_time": "0:19:31"}
137
+ {"current_steps": 1370, "total_steps": 5000, "loss": 1.5175, "lr": 4.5528730228342605e-05, "epoch": 0.274, "percentage": 27.4, "elapsed_time": "0:07:20", "remaining_time": "0:19:28"}
138
+ {"current_steps": 1380, "total_steps": 5000, "loss": 2.1733, "lr": 4.542862245837821e-05, "epoch": 0.276, "percentage": 27.6, "elapsed_time": "0:07:23", "remaining_time": "0:19:24"}
139
+ {"current_steps": 1390, "total_steps": 5000, "loss": 1.431, "lr": 4.532751902617569e-05, "epoch": 0.278, "percentage": 27.8, "elapsed_time": "0:07:26", "remaining_time": "0:19:20"}
140
+ {"current_steps": 1400, "total_steps": 5000, "loss": 1.5888, "lr": 4.522542485937369e-05, "epoch": 0.28, "percentage": 28.0, "elapsed_time": "0:07:29", "remaining_time": "0:19:16"}
141
+ {"current_steps": 1410, "total_steps": 5000, "loss": 2.84, "lr": 4.512234493389785e-05, "epoch": 0.282, "percentage": 28.2, "elapsed_time": "0:07:32", "remaining_time": "0:19:13"}
142
+ {"current_steps": 1420, "total_steps": 5000, "loss": 3.0874, "lr": 4.5018284273718336e-05, "epoch": 0.284, "percentage": 28.4, "elapsed_time": "0:07:35", "remaining_time": "0:19:09"}
143
+ {"current_steps": 1430, "total_steps": 5000, "loss": 1.6856, "lr": 4.491324795060491e-05, "epoch": 0.286, "percentage": 28.6, "elapsed_time": "0:07:39", "remaining_time": "0:19:05"}
144
+ {"current_steps": 1440, "total_steps": 5000, "loss": 3.0233, "lr": 4.480724108387977e-05, "epoch": 0.288, "percentage": 28.8, "elapsed_time": "0:07:42", "remaining_time": "0:19:02"}
145
+ {"current_steps": 1450, "total_steps": 5000, "loss": 2.6897, "lr": 4.4700268840168045e-05, "epoch": 0.29, "percentage": 29.0, "elapsed_time": "0:07:45", "remaining_time": "0:18:58"}
146
+ {"current_steps": 1460, "total_steps": 5000, "loss": 5.0716, "lr": 4.4592336433146e-05, "epoch": 0.292, "percentage": 29.2, "elapsed_time": "0:07:48", "remaining_time": "0:18:54"}
147
+ {"current_steps": 1470, "total_steps": 5000, "loss": 1.5289, "lr": 4.448344912328686e-05, "epoch": 0.294, "percentage": 29.4, "elapsed_time": "0:07:51", "remaining_time": "0:18:51"}
148
+ {"current_steps": 1480, "total_steps": 5000, "loss": 5.0471, "lr": 4.4373612217604496e-05, "epoch": 0.296, "percentage": 29.6, "elapsed_time": "0:07:54", "remaining_time": "0:18:47"}
149
+ {"current_steps": 1490, "total_steps": 5000, "loss": 1.6411, "lr": 4.426283106939474e-05, "epoch": 0.298, "percentage": 29.8, "elapsed_time": "0:07:57", "remaining_time": "0:18:43"}
150
+ {"current_steps": 1500, "total_steps": 5000, "loss": 3.0973, "lr": 4.415111107797445e-05, "epoch": 0.3, "percentage": 30.0, "elapsed_time": "0:08:00", "remaining_time": "0:18:40"}
151
+ {"current_steps": 1510, "total_steps": 5000, "loss": 2.0314, "lr": 4.403845768841842e-05, "epoch": 0.302, "percentage": 30.2, "elapsed_time": "0:08:03", "remaining_time": "0:18:36"}
152
+ {"current_steps": 1520, "total_steps": 5000, "loss": 5.9555, "lr": 4.3924876391293915e-05, "epoch": 0.304, "percentage": 30.4, "elapsed_time": "0:08:06", "remaining_time": "0:18:33"}
153
+ {"current_steps": 1530, "total_steps": 5000, "loss": 2.5155, "lr": 4.381037272239311e-05, "epoch": 0.306, "percentage": 30.6, "elapsed_time": "0:08:08", "remaining_time": "0:18:28"}
154
+ {"current_steps": 1540, "total_steps": 5000, "loss": 3.6459, "lr": 4.36949522624633e-05, "epoch": 0.308, "percentage": 30.8, "elapsed_time": "0:08:11", "remaining_time": "0:18:25"}
155
+ {"current_steps": 1550, "total_steps": 5000, "loss": 3.0688, "lr": 4.357862063693486e-05, "epoch": 0.31, "percentage": 31.0, "elapsed_time": "0:08:14", "remaining_time": "0:18:21"}
156
+ {"current_steps": 1560, "total_steps": 5000, "loss": 5.096, "lr": 4.3461383515647106e-05, "epoch": 0.312, "percentage": 31.2, "elapsed_time": "0:08:17", "remaining_time": "0:18:17"}
157
+ {"current_steps": 1570, "total_steps": 5000, "loss": 3.9776, "lr": 4.334324661257191e-05, "epoch": 0.314, "percentage": 31.4, "elapsed_time": "0:08:20", "remaining_time": "0:18:14"}
158
+ {"current_steps": 1580, "total_steps": 5000, "loss": 1.8869, "lr": 4.3224215685535294e-05, "epoch": 0.316, "percentage": 31.6, "elapsed_time": "0:08:23", "remaining_time": "0:18:10"}
159
+ {"current_steps": 1590, "total_steps": 5000, "loss": 2.1454, "lr": 4.3104296535936695e-05, "epoch": 0.318, "percentage": 31.8, "elapsed_time": "0:08:26", "remaining_time": "0:18:06"}
160
+ {"current_steps": 1600, "total_steps": 5000, "loss": 3.0165, "lr": 4.2983495008466276e-05, "epoch": 0.32, "percentage": 32.0, "elapsed_time": "0:08:29", "remaining_time": "0:18:03"}
161
+ {"current_steps": 1610, "total_steps": 5000, "loss": 2.9248, "lr": 4.2861816990820084e-05, "epoch": 0.322, "percentage": 32.2, "elapsed_time": "0:08:32", "remaining_time": "0:17:59"}
162
+ {"current_steps": 1620, "total_steps": 5000, "loss": 5.9823, "lr": 4.273926841341302e-05, "epoch": 0.324, "percentage": 32.4, "elapsed_time": "0:08:35", "remaining_time": "0:17:55"}
163
+ {"current_steps": 1630, "total_steps": 5000, "loss": 4.5845, "lr": 4.261585524908987e-05, "epoch": 0.326, "percentage": 32.6, "elapsed_time": "0:08:38", "remaining_time": "0:17:52"}
164
+ {"current_steps": 1640, "total_steps": 5000, "loss": 2.5418, "lr": 4.249158351283414e-05, "epoch": 0.328, "percentage": 32.8, "elapsed_time": "0:08:41", "remaining_time": "0:17:48"}
165
+ {"current_steps": 1650, "total_steps": 5000, "loss": 3.9026, "lr": 4.2366459261474933e-05, "epoch": 0.33, "percentage": 33.0, "elapsed_time": "0:08:44", "remaining_time": "0:17:45"}
166
+ {"current_steps": 1660, "total_steps": 5000, "loss": 3.7564, "lr": 4.224048859339175e-05, "epoch": 0.332, "percentage": 33.2, "elapsed_time": "0:08:47", "remaining_time": "0:17:41"}
167
+ {"current_steps": 1670, "total_steps": 5000, "loss": 1.7162, "lr": 4.211367764821722e-05, "epoch": 0.334, "percentage": 33.4, "elapsed_time": "0:08:50", "remaining_time": "0:17:37"}
168
+ {"current_steps": 1680, "total_steps": 5000, "loss": 3.2491, "lr": 4.198603260653792e-05, "epoch": 0.336, "percentage": 33.6, "elapsed_time": "0:08:53", "remaining_time": "0:17:34"}
169
+ {"current_steps": 1690, "total_steps": 5000, "loss": 1.595, "lr": 4.185755968959308e-05, "epoch": 0.338, "percentage": 33.8, "elapsed_time": "0:08:56", "remaining_time": "0:17:30"}
170
+ {"current_steps": 1700, "total_steps": 5000, "loss": 2.5093, "lr": 4.172826515897146e-05, "epoch": 0.34, "percentage": 34.0, "elapsed_time": "0:08:59", "remaining_time": "0:17:27"}
171
+ {"current_steps": 1710, "total_steps": 5000, "loss": 5.9477, "lr": 4.1598155316306044e-05, "epoch": 0.342, "percentage": 34.2, "elapsed_time": "0:09:02", "remaining_time": "0:17:23"}
172
+ {"current_steps": 1720, "total_steps": 5000, "loss": 2.6358, "lr": 4.146723650296701e-05, "epoch": 0.344, "percentage": 34.4, "elapsed_time": "0:09:05", "remaining_time": "0:17:20"}
173
+ {"current_steps": 1730, "total_steps": 5000, "loss": 1.4777, "lr": 4.133551509975264e-05, "epoch": 0.346, "percentage": 34.6, "elapsed_time": "0:09:08", "remaining_time": "0:17:16"}
174
+ {"current_steps": 1740, "total_steps": 5000, "loss": 2.132, "lr": 4.1202997526578276e-05, "epoch": 0.348, "percentage": 34.8, "elapsed_time": "0:09:11", "remaining_time": "0:17:13"}
175
+ {"current_steps": 1750, "total_steps": 5000, "loss": 1.4507, "lr": 4.1069690242163484e-05, "epoch": 0.35, "percentage": 35.0, "elapsed_time": "0:09:14", "remaining_time": "0:17:09"}
176
+ {"current_steps": 1760, "total_steps": 5000, "loss": 1.2368, "lr": 4.093559974371725e-05, "epoch": 0.352, "percentage": 35.2, "elapsed_time": "0:09:17", "remaining_time": "0:17:06"}
177
+ {"current_steps": 1770, "total_steps": 5000, "loss": 0.9967, "lr": 4.080073256662127e-05, "epoch": 0.354, "percentage": 35.4, "elapsed_time": "0:09:20", "remaining_time": "0:17:02"}
178
+ {"current_steps": 1780, "total_steps": 5000, "loss": 5.9177, "lr": 4.066509528411152e-05, "epoch": 0.356, "percentage": 35.6, "elapsed_time": "0:09:23", "remaining_time": "0:16:59"}
179
+ {"current_steps": 1790, "total_steps": 5000, "loss": 2.3273, "lr": 4.052869450695776e-05, "epoch": 0.358, "percentage": 35.8, "elapsed_time": "0:09:26", "remaining_time": "0:16:55"}
180
+ {"current_steps": 1800, "total_steps": 5000, "loss": 2.7299, "lr": 4.039153688314145e-05, "epoch": 0.36, "percentage": 36.0, "elapsed_time": "0:09:29", "remaining_time": "0:16:52"}
181
+ {"current_steps": 1810, "total_steps": 5000, "loss": 2.0494, "lr": 4.02536290975317e-05, "epoch": 0.362, "percentage": 36.2, "elapsed_time": "0:09:32", "remaining_time": "0:16:48"}
182
+ {"current_steps": 1820, "total_steps": 5000, "loss": 1.9595, "lr": 4.011497787155938e-05, "epoch": 0.364, "percentage": 36.4, "elapsed_time": "0:09:35", "remaining_time": "0:16:45"}
183
+ {"current_steps": 1830, "total_steps": 5000, "loss": 2.4506, "lr": 3.997558996288965e-05, "epoch": 0.366, "percentage": 36.6, "elapsed_time": "0:09:38", "remaining_time": "0:16:41"}
184
+ {"current_steps": 1840, "total_steps": 5000, "loss": 3.6176, "lr": 3.983547216509254e-05, "epoch": 0.368, "percentage": 36.8, "elapsed_time": "0:09:41", "remaining_time": "0:16:38"}
185
+ {"current_steps": 1850, "total_steps": 5000, "loss": 3.2954, "lr": 3.969463130731183e-05, "epoch": 0.37, "percentage": 37.0, "elapsed_time": "0:09:44", "remaining_time": "0:16:34"}
186
+ {"current_steps": 1860, "total_steps": 5000, "loss": 2.0271, "lr": 3.955307425393224e-05, "epoch": 0.372, "percentage": 37.2, "elapsed_time": "0:09:47", "remaining_time": "0:16:31"}
187
+ {"current_steps": 1870, "total_steps": 5000, "loss": 1.6445, "lr": 3.941080790424484e-05, "epoch": 0.374, "percentage": 37.4, "elapsed_time": "0:09:49", "remaining_time": "0:16:27"}
188
+ {"current_steps": 1880, "total_steps": 5000, "loss": 2.4891, "lr": 3.92678391921108e-05, "epoch": 0.376, "percentage": 37.6, "elapsed_time": "0:09:52", "remaining_time": "0:16:24"}
189
+ {"current_steps": 1890, "total_steps": 5000, "loss": 1.9795, "lr": 3.912417508562345e-05, "epoch": 0.378, "percentage": 37.8, "elapsed_time": "0:09:55", "remaining_time": "0:16:20"}
190
+ {"current_steps": 1900, "total_steps": 5000, "loss": 1.7931, "lr": 3.897982258676867e-05, "epoch": 0.38, "percentage": 38.0, "elapsed_time": "0:09:58", "remaining_time": "0:16:17"}
191
+ {"current_steps": 1910, "total_steps": 5000, "loss": 2.9522, "lr": 3.883478873108361e-05, "epoch": 0.382, "percentage": 38.2, "elapsed_time": "0:10:01", "remaining_time": "0:16:13"}
192
+ {"current_steps": 1920, "total_steps": 5000, "loss": 2.0274, "lr": 3.868908058731376e-05, "epoch": 0.384, "percentage": 38.4, "elapsed_time": "0:10:04", "remaining_time": "0:16:10"}
193
+ {"current_steps": 1930, "total_steps": 5000, "loss": 3.1124, "lr": 3.85427052570685e-05, "epoch": 0.386, "percentage": 38.6, "elapsed_time": "0:10:07", "remaining_time": "0:16:07"}
194
+ {"current_steps": 1940, "total_steps": 5000, "loss": 2.101, "lr": 3.8395669874474915e-05, "epoch": 0.388, "percentage": 38.8, "elapsed_time": "0:10:10", "remaining_time": "0:16:03"}
195
+ {"current_steps": 1950, "total_steps": 5000, "loss": 3.1344, "lr": 3.824798160583012e-05, "epoch": 0.39, "percentage": 39.0, "elapsed_time": "0:10:13", "remaining_time": "0:16:00"}
196
+ {"current_steps": 1960, "total_steps": 5000, "loss": 1.7248, "lr": 3.8099647649251986e-05, "epoch": 0.392, "percentage": 39.2, "elapsed_time": "0:10:16", "remaining_time": "0:15:56"}
197
+ {"current_steps": 1970, "total_steps": 5000, "loss": 1.9146, "lr": 3.795067523432826e-05, "epoch": 0.394, "percentage": 39.4, "elapsed_time": "0:10:19", "remaining_time": "0:15:53"}
198
+ {"current_steps": 1980, "total_steps": 5000, "loss": 2.3492, "lr": 3.780107162176429e-05, "epoch": 0.396, "percentage": 39.6, "elapsed_time": "0:10:22", "remaining_time": "0:15:49"}
199
+ {"current_steps": 1990, "total_steps": 5000, "loss": 1.5525, "lr": 3.765084410302909e-05, "epoch": 0.398, "percentage": 39.8, "elapsed_time": "0:10:25", "remaining_time": "0:15:46"}
200
+ {"current_steps": 2000, "total_steps": 5000, "loss": 2.8312, "lr": 3.7500000000000003e-05, "epoch": 0.4, "percentage": 40.0, "elapsed_time": "0:10:28", "remaining_time": "0:15:43"}
201
+ {"current_steps": 2010, "total_steps": 5000, "loss": 3.3568, "lr": 3.7348546664605777e-05, "epoch": 0.402, "percentage": 40.2, "elapsed_time": "0:10:31", "remaining_time": "0:15:39"}
202
+ {"current_steps": 2020, "total_steps": 5000, "loss": 1.1778, "lr": 3.719649147846832e-05, "epoch": 0.404, "percentage": 40.4, "elapsed_time": "0:10:34", "remaining_time": "0:15:36"}
203
+ {"current_steps": 2030, "total_steps": 5000, "loss": 2.3964, "lr": 3.704384185254288e-05, "epoch": 0.406, "percentage": 40.6, "elapsed_time": "0:10:37", "remaining_time": "0:15:33"}
204
+ {"current_steps": 2040, "total_steps": 5000, "loss": 2.0172, "lr": 3.689060522675689e-05, "epoch": 0.408, "percentage": 40.8, "elapsed_time": "0:10:40", "remaining_time": "0:15:29"}
205
+ {"current_steps": 2050, "total_steps": 5000, "loss": 2.1464, "lr": 3.673678906964727e-05, "epoch": 0.41, "percentage": 41.0, "elapsed_time": "0:10:43", "remaining_time": "0:15:26"}
206
+ {"current_steps": 2060, "total_steps": 5000, "loss": 2.1743, "lr": 3.6582400877996546e-05, "epoch": 0.412, "percentage": 41.2, "elapsed_time": "0:10:46", "remaining_time": "0:15:23"}
207
+ {"current_steps": 2070, "total_steps": 5000, "loss": 2.785, "lr": 3.642744817646736e-05, "epoch": 0.414, "percentage": 41.4, "elapsed_time": "0:10:49", "remaining_time": "0:15:19"}
208
+ {"current_steps": 2080, "total_steps": 5000, "loss": 1.5782, "lr": 3.627193851723577e-05, "epoch": 0.416, "percentage": 41.6, "elapsed_time": "0:10:52", "remaining_time": "0:15:16"}
209
+ {"current_steps": 2090, "total_steps": 5000, "loss": 3.6745, "lr": 3.611587947962319e-05, "epoch": 0.418, "percentage": 41.8, "elapsed_time": "0:10:55", "remaining_time": "0:15:13"}
210
+ {"current_steps": 2100, "total_steps": 5000, "loss": 1.7939, "lr": 3.5959278669726935e-05, "epoch": 0.42, "percentage": 42.0, "elapsed_time": "0:10:58", "remaining_time": "0:15:09"}
211
+ {"current_steps": 2110, "total_steps": 5000, "loss": 1.9243, "lr": 3.580214372004956e-05, "epoch": 0.422, "percentage": 42.2, "elapsed_time": "0:11:01", "remaining_time": "0:15:06"}
212
+ {"current_steps": 2120, "total_steps": 5000, "loss": 3.6028, "lr": 3.564448228912682e-05, "epoch": 0.424, "percentage": 42.4, "elapsed_time": "0:11:04", "remaining_time": "0:15:03"}
213
+ {"current_steps": 2130, "total_steps": 5000, "loss": 1.3939, "lr": 3.548630206115443e-05, "epoch": 0.426, "percentage": 42.6, "elapsed_time": "0:11:07", "remaining_time": "0:14:59"}
214
+ {"current_steps": 2140, "total_steps": 5000, "loss": 2.1193, "lr": 3.532761074561355e-05, "epoch": 0.428, "percentage": 42.8, "elapsed_time": "0:11:10", "remaining_time": "0:14:56"}
215
+ {"current_steps": 2150, "total_steps": 5000, "loss": 2.0047, "lr": 3.516841607689501e-05, "epoch": 0.43, "percentage": 43.0, "elapsed_time": "0:11:13", "remaining_time": "0:14:53"}
216
+ {"current_steps": 2160, "total_steps": 5000, "loss": 5.2666, "lr": 3.5008725813922386e-05, "epoch": 0.432, "percentage": 43.2, "elapsed_time": "0:11:16", "remaining_time": "0:14:49"}
217
+ {"current_steps": 2170, "total_steps": 5000, "loss": 1.8129, "lr": 3.484854773977378e-05, "epoch": 0.434, "percentage": 43.4, "elapsed_time": "0:11:19", "remaining_time": "0:14:46"}
218
+ {"current_steps": 2180, "total_steps": 5000, "loss": 1.3969, "lr": 3.4687889661302576e-05, "epoch": 0.436, "percentage": 43.6, "elapsed_time": "0:11:23", "remaining_time": "0:14:43"}
219
+ {"current_steps": 2190, "total_steps": 5000, "loss": 1.9788, "lr": 3.452675940875686e-05, "epoch": 0.438, "percentage": 43.8, "elapsed_time": "0:11:25", "remaining_time": "0:14:39"}
220
+ {"current_steps": 2200, "total_steps": 5000, "loss": 1.5522, "lr": 3.436516483539781e-05, "epoch": 0.44, "percentage": 44.0, "elapsed_time": "0:11:28", "remaining_time": "0:14:36"}
221
+ {"current_steps": 2210, "total_steps": 5000, "loss": 2.1951, "lr": 3.4203113817116957e-05, "epoch": 0.442, "percentage": 44.2, "elapsed_time": "0:11:31", "remaining_time": "0:14:33"}
222
+ {"current_steps": 2220, "total_steps": 5000, "loss": 5.2669, "lr": 3.4040614252052305e-05, "epoch": 0.444, "percentage": 44.4, "elapsed_time": "0:11:34", "remaining_time": "0:14:29"}
223
+ {"current_steps": 2230, "total_steps": 5000, "loss": 2.5184, "lr": 3.387767406020343e-05, "epoch": 0.446, "percentage": 44.6, "elapsed_time": "0:11:37", "remaining_time": "0:14:26"}
224
+ {"current_steps": 2240, "total_steps": 5000, "loss": 2.0463, "lr": 3.3714301183045385e-05, "epoch": 0.448, "percentage": 44.8, "elapsed_time": "0:11:40", "remaining_time": "0:14:23"}
225
+ {"current_steps": 2250, "total_steps": 5000, "loss": 2.9354, "lr": 3.355050358314172e-05, "epoch": 0.45, "percentage": 45.0, "elapsed_time": "0:11:43", "remaining_time": "0:14:20"}
226
+ {"current_steps": 2260, "total_steps": 5000, "loss": 2.1297, "lr": 3.338628924375638e-05, "epoch": 0.452, "percentage": 45.2, "elapsed_time": "0:11:46", "remaining_time": "0:14:16"}
227
+ {"current_steps": 2270, "total_steps": 5000, "loss": 3.2082, "lr": 3.322166616846458e-05, "epoch": 0.454, "percentage": 45.4, "elapsed_time": "0:11:49", "remaining_time": "0:14:13"}
228
+ {"current_steps": 2280, "total_steps": 5000, "loss": 2.0667, "lr": 3.305664238076278e-05, "epoch": 0.456, "percentage": 45.6, "elapsed_time": "0:11:52", "remaining_time": "0:14:10"}
229
+ {"current_steps": 2290, "total_steps": 5000, "loss": 2.4089, "lr": 3.289122592367757e-05, "epoch": 0.458, "percentage": 45.8, "elapsed_time": "0:11:55", "remaining_time": "0:14:06"}
230
+ {"current_steps": 2300, "total_steps": 5000, "loss": 2.0842, "lr": 3.272542485937369e-05, "epoch": 0.46, "percentage": 46.0, "elapsed_time": "0:11:58", "remaining_time": "0:14:03"}
231
+ {"current_steps": 2310, "total_steps": 5000, "loss": 3.9489, "lr": 3.2559247268761115e-05, "epoch": 0.462, "percentage": 46.2, "elapsed_time": "0:12:01", "remaining_time": "0:14:00"}
232
+ {"current_steps": 2320, "total_steps": 5000, "loss": 1.8036, "lr": 3.239270125110117e-05, "epoch": 0.464, "percentage": 46.4, "elapsed_time": "0:12:04", "remaining_time": "0:13:57"}
233
+ {"current_steps": 2330, "total_steps": 5000, "loss": 2.8004, "lr": 3.222579492361179e-05, "epoch": 0.466, "percentage": 46.6, "elapsed_time": "0:12:07", "remaining_time": "0:13:53"}
234
+ {"current_steps": 2340, "total_steps": 5000, "loss": 1.0807, "lr": 3.205853642107192e-05, "epoch": 0.468, "percentage": 46.8, "elapsed_time": "0:12:10", "remaining_time": "0:13:50"}
235
+ {"current_steps": 2350, "total_steps": 5000, "loss": 4.2218, "lr": 3.1890933895424976e-05, "epoch": 0.47, "percentage": 47.0, "elapsed_time": "0:12:13", "remaining_time": "0:13:47"}
236
+ {"current_steps": 2360, "total_steps": 5000, "loss": 1.9778, "lr": 3.172299551538164e-05, "epoch": 0.472, "percentage": 47.2, "elapsed_time": "0:12:16", "remaining_time": "0:13:44"}
237
+ {"current_steps": 2370, "total_steps": 5000, "loss": 2.7487, "lr": 3.155472946602162e-05, "epoch": 0.474, "percentage": 47.4, "elapsed_time": "0:12:19", "remaining_time": "0:13:40"}
238
+ {"current_steps": 2380, "total_steps": 5000, "loss": 2.199, "lr": 3.138614394839476e-05, "epoch": 0.476, "percentage": 47.6, "elapsed_time": "0:12:22", "remaining_time": "0:13:37"}
239
+ {"current_steps": 2390, "total_steps": 5000, "loss": 3.5763, "lr": 3.121724717912138e-05, "epoch": 0.478, "percentage": 47.8, "elapsed_time": "0:12:25", "remaining_time": "0:13:34"}
240
+ {"current_steps": 2400, "total_steps": 5000, "loss": 3.4331, "lr": 3.104804738999169e-05, "epoch": 0.48, "percentage": 48.0, "elapsed_time": "0:12:28", "remaining_time": "0:13:31"}
241
+ {"current_steps": 2410, "total_steps": 5000, "loss": 2.2419, "lr": 3.087855282756475e-05, "epoch": 0.482, "percentage": 48.2, "elapsed_time": "0:12:31", "remaining_time": "0:13:27"}
242
+ {"current_steps": 2420, "total_steps": 5000, "loss": 2.0476, "lr": 3.0708771752766394e-05, "epoch": 0.484, "percentage": 48.4, "elapsed_time": "0:12:34", "remaining_time": "0:13:24"}
243
+ {"current_steps": 2430, "total_steps": 5000, "loss": 1.3934, "lr": 3.053871244048669e-05, "epoch": 0.486, "percentage": 48.6, "elapsed_time": "0:12:37", "remaining_time": "0:13:21"}
244
+ {"current_steps": 2440, "total_steps": 5000, "loss": 2.7532, "lr": 3.0368383179176585e-05, "epoch": 0.488, "percentage": 48.8, "elapsed_time": "0:12:40", "remaining_time": "0:13:18"}
245
+ {"current_steps": 2450, "total_steps": 5000, "loss": 2.112, "lr": 3.0197792270443982e-05, "epoch": 0.49, "percentage": 49.0, "elapsed_time": "0:12:43", "remaining_time": "0:13:14"}
246
+ {"current_steps": 2460, "total_steps": 5000, "loss": 1.8119, "lr": 3.002694802864912e-05, "epoch": 0.492, "percentage": 49.2, "elapsed_time": "0:12:46", "remaining_time": "0:13:11"}
247
+ {"current_steps": 2470, "total_steps": 5000, "loss": 1.5163, "lr": 2.98558587804993e-05, "epoch": 0.494, "percentage": 49.4, "elapsed_time": "0:12:49", "remaining_time": "0:13:08"}
248
+ {"current_steps": 2480, "total_steps": 5000, "loss": 3.372, "lr": 2.9684532864643122e-05, "epoch": 0.496, "percentage": 49.6, "elapsed_time": "0:12:52", "remaining_time": "0:13:05"}
249
+ {"current_steps": 2490, "total_steps": 5000, "loss": 1.5534, "lr": 2.9512978631264006e-05, "epoch": 0.498, "percentage": 49.8, "elapsed_time": "0:12:55", "remaining_time": "0:13:01"}
250
+ {"current_steps": 2500, "total_steps": 5000, "loss": 1.8644, "lr": 2.9341204441673266e-05, "epoch": 0.5, "percentage": 50.0, "elapsed_time": "0:12:58", "remaining_time": "0:12:58"}
251
+ {"current_steps": 2510, "total_steps": 5000, "loss": 4.3985, "lr": 2.916921866790256e-05, "epoch": 0.502, "percentage": 50.2, "elapsed_time": "0:13:01", "remaining_time": "0:12:55"}
252
+ {"current_steps": 2520, "total_steps": 5000, "loss": 2.0158, "lr": 2.8997029692295874e-05, "epoch": 0.504, "percentage": 50.4, "elapsed_time": "0:13:04", "remaining_time": "0:12:51"}
253
+ {"current_steps": 2530, "total_steps": 5000, "loss": 1.3677, "lr": 2.8824645907100954e-05, "epoch": 0.506, "percentage": 50.6, "elapsed_time": "0:13:07", "remaining_time": "0:12:48"}
254
+ {"current_steps": 2540, "total_steps": 5000, "loss": 2.399, "lr": 2.8652075714060295e-05, "epoch": 0.508, "percentage": 50.8, "elapsed_time": "0:13:10", "remaining_time": "0:12:45"}
255
+ {"current_steps": 2550, "total_steps": 5000, "loss": 4.2765, "lr": 2.8479327524001636e-05, "epoch": 0.51, "percentage": 51.0, "elapsed_time": "0:13:13", "remaining_time": "0:12:42"}
256
+ {"current_steps": 2560, "total_steps": 5000, "loss": 3.2872, "lr": 2.8306409756428064e-05, "epoch": 0.512, "percentage": 51.2, "elapsed_time": "0:13:16", "remaining_time": "0:12:38"}
257
+ {"current_steps": 2570, "total_steps": 5000, "loss": 3.1682, "lr": 2.8133330839107608e-05, "epoch": 0.514, "percentage": 51.4, "elapsed_time": "0:13:19", "remaining_time": "0:12:35"}
258
+ {"current_steps": 2580, "total_steps": 5000, "loss": 3.3066, "lr": 2.7960099207662532e-05, "epoch": 0.516, "percentage": 51.6, "elapsed_time": "0:13:22", "remaining_time": "0:12:32"}
259
+ {"current_steps": 2590, "total_steps": 5000, "loss": 1.8911, "lr": 2.7786723305158136e-05, "epoch": 0.518, "percentage": 51.8, "elapsed_time": "0:13:25", "remaining_time": "0:12:29"}
260
+ {"current_steps": 2600, "total_steps": 5000, "loss": 1.7402, "lr": 2.761321158169134e-05, "epoch": 0.52, "percentage": 52.0, "elapsed_time": "0:13:28", "remaining_time": "0:12:26"}
261
+ {"current_steps": 2610, "total_steps": 5000, "loss": 1.1721, "lr": 2.7439572493978736e-05, "epoch": 0.522, "percentage": 52.2, "elapsed_time": "0:13:31", "remaining_time": "0:12:22"}
262
+ {"current_steps": 2620, "total_steps": 5000, "loss": 1.2482, "lr": 2.726581450494451e-05, "epoch": 0.524, "percentage": 52.4, "elapsed_time": "0:13:34", "remaining_time": "0:12:19"}
263
+ {"current_steps": 2630, "total_steps": 5000, "loss": 3.3641, "lr": 2.7091946083307896e-05, "epoch": 0.526, "percentage": 52.6, "elapsed_time": "0:13:37", "remaining_time": "0:12:16"}
264
+ {"current_steps": 2640, "total_steps": 5000, "loss": 1.9946, "lr": 2.6917975703170466e-05, "epoch": 0.528, "percentage": 52.8, "elapsed_time": "0:13:40", "remaining_time": "0:12:13"}
265
+ {"current_steps": 2650, "total_steps": 5000, "loss": 1.0218, "lr": 2.674391184360313e-05, "epoch": 0.53, "percentage": 53.0, "elapsed_time": "0:13:43", "remaining_time": "0:12:10"}
266
+ {"current_steps": 2660, "total_steps": 5000, "loss": 2.336, "lr": 2.656976298823284e-05, "epoch": 0.532, "percentage": 53.2, "elapsed_time": "0:13:46", "remaining_time": "0:12:06"}
267
+ {"current_steps": 2670, "total_steps": 5000, "loss": 4.6423, "lr": 2.6395537624829096e-05, "epoch": 0.534, "percentage": 53.4, "elapsed_time": "0:13:49", "remaining_time": "0:12:03"}
268
+ {"current_steps": 2680, "total_steps": 5000, "loss": 1.845, "lr": 2.6221244244890336e-05, "epoch": 0.536, "percentage": 53.6, "elapsed_time": "0:13:52", "remaining_time": "0:12:00"}
269
+ {"current_steps": 2690, "total_steps": 5000, "loss": 2.3117, "lr": 2.604689134322999e-05, "epoch": 0.538, "percentage": 53.8, "elapsed_time": "0:13:55", "remaining_time": "0:11:57"}
270
+ {"current_steps": 2700, "total_steps": 5000, "loss": 3.3991, "lr": 2.587248741756253e-05, "epoch": 0.54, "percentage": 54.0, "elapsed_time": "0:13:58", "remaining_time": "0:11:53"}
271
+ {"current_steps": 2710, "total_steps": 5000, "loss": 3.7802, "lr": 2.5698040968089225e-05, "epoch": 0.542, "percentage": 54.2, "elapsed_time": "0:14:01", "remaining_time": "0:11:50"}
272
+ {"current_steps": 2720, "total_steps": 5000, "loss": 1.4431, "lr": 2.5523560497083926e-05, "epoch": 0.544, "percentage": 54.4, "elapsed_time": "0:14:04", "remaining_time": "0:11:47"}
273
+ {"current_steps": 2730, "total_steps": 5000, "loss": 0.8369, "lr": 2.5349054508478637e-05, "epoch": 0.546, "percentage": 54.6, "elapsed_time": "0:14:07", "remaining_time": "0:11:44"}
274
+ {"current_steps": 2740, "total_steps": 5000, "loss": 2.658, "lr": 2.517453150744904e-05, "epoch": 0.548, "percentage": 54.8, "elapsed_time": "0:14:10", "remaining_time": "0:11:41"}
275
+ {"current_steps": 2750, "total_steps": 5000, "loss": 2.12, "lr": 2.5e-05, "epoch": 0.55, "percentage": 55.0, "elapsed_time": "0:14:13", "remaining_time": "0:11:37"}
276
+ {"current_steps": 2760, "total_steps": 5000, "loss": 2.1272, "lr": 2.4825468492550964e-05, "epoch": 0.552, "percentage": 55.2, "elapsed_time": "0:14:16", "remaining_time": "0:11:34"}
277
+ {"current_steps": 2770, "total_steps": 5000, "loss": 1.5502, "lr": 2.4650945491521372e-05, "epoch": 0.554, "percentage": 55.4, "elapsed_time": "0:14:18", "remaining_time": "0:11:31"}
278
+ {"current_steps": 2780, "total_steps": 5000, "loss": 1.4815, "lr": 2.447643950291608e-05, "epoch": 0.556, "percentage": 55.6, "elapsed_time": "0:14:21", "remaining_time": "0:11:28"}
279
+ {"current_steps": 2790, "total_steps": 5000, "loss": 1.3745, "lr": 2.4301959031910784e-05, "epoch": 0.558, "percentage": 55.8, "elapsed_time": "0:14:24", "remaining_time": "0:11:25"}
280
+ {"current_steps": 2800, "total_steps": 5000, "loss": 2.9765, "lr": 2.4127512582437485e-05, "epoch": 0.56, "percentage": 56.0, "elapsed_time": "0:14:28", "remaining_time": "0:11:22"}
281
+ {"current_steps": 2810, "total_steps": 5000, "loss": 1.2392, "lr": 2.3953108656770016e-05, "epoch": 0.562, "percentage": 56.2, "elapsed_time": "0:14:31", "remaining_time": "0:11:18"}
282
+ {"current_steps": 2820, "total_steps": 5000, "loss": 1.6202, "lr": 2.377875575510967e-05, "epoch": 0.564, "percentage": 56.4, "elapsed_time": "0:14:33", "remaining_time": "0:11:15"}
283
+ {"current_steps": 2830, "total_steps": 5000, "loss": 4.6898, "lr": 2.3604462375170906e-05, "epoch": 0.566, "percentage": 56.6, "elapsed_time": "0:14:36", "remaining_time": "0:11:12"}
284
+ {"current_steps": 2840, "total_steps": 5000, "loss": 1.3387, "lr": 2.3430237011767167e-05, "epoch": 0.568, "percentage": 56.8, "elapsed_time": "0:14:39", "remaining_time": "0:11:09"}
285
+ {"current_steps": 2850, "total_steps": 5000, "loss": 3.6984, "lr": 2.3256088156396868e-05, "epoch": 0.57, "percentage": 57.0, "elapsed_time": "0:14:42", "remaining_time": "0:11:06"}
286
+ {"current_steps": 2860, "total_steps": 5000, "loss": 1.874, "lr": 2.3082024296829536e-05, "epoch": 0.572, "percentage": 57.2, "elapsed_time": "0:14:45", "remaining_time": "0:11:02"}
287
+ {"current_steps": 2870, "total_steps": 5000, "loss": 1.7756, "lr": 2.2908053916692117e-05, "epoch": 0.574, "percentage": 57.4, "elapsed_time": "0:14:48", "remaining_time": "0:10:59"}
288
+ {"current_steps": 2880, "total_steps": 5000, "loss": 8.2453, "lr": 2.2734185495055503e-05, "epoch": 0.576, "percentage": 57.6, "elapsed_time": "0:14:51", "remaining_time": "0:10:56"}
289
+ {"current_steps": 2890, "total_steps": 5000, "loss": 1.6711, "lr": 2.2560427506021266e-05, "epoch": 0.578, "percentage": 57.8, "elapsed_time": "0:14:54", "remaining_time": "0:10:53"}
290
+ {"current_steps": 2900, "total_steps": 5000, "loss": 2.9633, "lr": 2.238678841830867e-05, "epoch": 0.58, "percentage": 58.0, "elapsed_time": "0:14:57", "remaining_time": "0:10:50"}
291
+ {"current_steps": 2910, "total_steps": 5000, "loss": 1.882, "lr": 2.2213276694841866e-05, "epoch": 0.582, "percentage": 58.2, "elapsed_time": "0:15:00", "remaining_time": "0:10:47"}
292
+ {"current_steps": 2920, "total_steps": 5000, "loss": 2.1105, "lr": 2.2039900792337474e-05, "epoch": 0.584, "percentage": 58.4, "elapsed_time": "0:15:03", "remaining_time": "0:10:43"}
293
+ {"current_steps": 2930, "total_steps": 5000, "loss": 2.285, "lr": 2.186666916089239e-05, "epoch": 0.586, "percentage": 58.6, "elapsed_time": "0:15:06", "remaining_time": "0:10:40"}
294
+ {"current_steps": 2940, "total_steps": 5000, "loss": 2.7095, "lr": 2.1693590243571938e-05, "epoch": 0.588, "percentage": 58.8, "elapsed_time": "0:15:09", "remaining_time": "0:10:37"}
295
+ {"current_steps": 2950, "total_steps": 5000, "loss": 1.4936, "lr": 2.1520672475998373e-05, "epoch": 0.59, "percentage": 59.0, "elapsed_time": "0:15:12", "remaining_time": "0:10:34"}
296
+ {"current_steps": 2960, "total_steps": 5000, "loss": 2.9807, "lr": 2.1347924285939714e-05, "epoch": 0.592, "percentage": 59.2, "elapsed_time": "0:15:15", "remaining_time": "0:10:31"}
297
+ {"current_steps": 2970, "total_steps": 5000, "loss": 1.8499, "lr": 2.117535409289905e-05, "epoch": 0.594, "percentage": 59.4, "elapsed_time": "0:15:18", "remaining_time": "0:10:28"}
298
+ {"current_steps": 2980, "total_steps": 5000, "loss": 2.6104, "lr": 2.1002970307704132e-05, "epoch": 0.596, "percentage": 59.6, "elapsed_time": "0:15:21", "remaining_time": "0:10:24"}
299
+ {"current_steps": 2990, "total_steps": 5000, "loss": 4.632, "lr": 2.0830781332097446e-05, "epoch": 0.598, "percentage": 59.8, "elapsed_time": "0:15:24", "remaining_time": "0:10:21"}
300
+ {"current_steps": 3000, "total_steps": 5000, "loss": 2.1349, "lr": 2.0658795558326743e-05, "epoch": 0.6, "percentage": 60.0, "elapsed_time": "0:15:27", "remaining_time": "0:10:18"}
301
+ {"current_steps": 3010, "total_steps": 5000, "loss": 2.881, "lr": 2.0487021368736003e-05, "epoch": 0.602, "percentage": 60.2, "elapsed_time": "0:15:30", "remaining_time": "0:10:15"}
302
+ {"current_steps": 3020, "total_steps": 5000, "loss": 2.999, "lr": 2.031546713535688e-05, "epoch": 0.604, "percentage": 60.4, "elapsed_time": "0:15:33", "remaining_time": "0:10:12"}
303
+ {"current_steps": 3030, "total_steps": 5000, "loss": 3.0427, "lr": 2.0144141219500705e-05, "epoch": 0.606, "percentage": 60.6, "elapsed_time": "0:15:36", "remaining_time": "0:10:09"}
304
+ {"current_steps": 3040, "total_steps": 5000, "loss": 2.5017, "lr": 1.9973051971350888e-05, "epoch": 0.608, "percentage": 60.8, "elapsed_time": "0:15:39", "remaining_time": "0:10:05"}
305
+ {"current_steps": 3050, "total_steps": 5000, "loss": 1.5811, "lr": 1.980220772955602e-05, "epoch": 0.61, "percentage": 61.0, "elapsed_time": "0:15:42", "remaining_time": "0:10:02"}
306
+ {"current_steps": 3060, "total_steps": 5000, "loss": 1.6409, "lr": 1.963161682082342e-05, "epoch": 0.612, "percentage": 61.2, "elapsed_time": "0:15:45", "remaining_time": "0:09:59"}
307
+ {"current_steps": 3070, "total_steps": 5000, "loss": 3.8443, "lr": 1.946128755951332e-05, "epoch": 0.614, "percentage": 61.4, "elapsed_time": "0:15:48", "remaining_time": "0:09:56"}
308
+ {"current_steps": 3080, "total_steps": 5000, "loss": 2.3784, "lr": 1.9291228247233605e-05, "epoch": 0.616, "percentage": 61.6, "elapsed_time": "0:15:51", "remaining_time": "0:09:53"}
309
+ {"current_steps": 3090, "total_steps": 5000, "loss": 1.9809, "lr": 1.912144717243525e-05, "epoch": 0.618, "percentage": 61.8, "elapsed_time": "0:15:54", "remaining_time": "0:09:50"}
310
+ {"current_steps": 3100, "total_steps": 5000, "loss": 1.5875, "lr": 1.895195261000831e-05, "epoch": 0.62, "percentage": 62.0, "elapsed_time": "0:15:57", "remaining_time": "0:09:46"}
311
+ {"current_steps": 3110, "total_steps": 5000, "loss": 2.2161, "lr": 1.8782752820878634e-05, "epoch": 0.622, "percentage": 62.2, "elapsed_time": "0:16:00", "remaining_time": "0:09:43"}
312
+ {"current_steps": 3120, "total_steps": 5000, "loss": 6.9032, "lr": 1.8613856051605243e-05, "epoch": 0.624, "percentage": 62.4, "elapsed_time": "0:16:03", "remaining_time": "0:09:40"}
313
+ {"current_steps": 3130, "total_steps": 5000, "loss": 2.9789, "lr": 1.8445270533978388e-05, "epoch": 0.626, "percentage": 62.6, "elapsed_time": "0:16:06", "remaining_time": "0:09:37"}
314
+ {"current_steps": 3140, "total_steps": 5000, "loss": 2.0478, "lr": 1.827700448461836e-05, "epoch": 0.628, "percentage": 62.8, "elapsed_time": "0:16:09", "remaining_time": "0:09:34"}
315
+ {"current_steps": 3150, "total_steps": 5000, "loss": 1.2295, "lr": 1.8109066104575023e-05, "epoch": 0.63, "percentage": 63.0, "elapsed_time": "0:16:12", "remaining_time": "0:09:31"}
316
+ {"current_steps": 3160, "total_steps": 5000, "loss": 2.6306, "lr": 1.7941463578928086e-05, "epoch": 0.632, "percentage": 63.2, "elapsed_time": "0:16:15", "remaining_time": "0:09:28"}
317
+ {"current_steps": 3170, "total_steps": 5000, "loss": 1.2663, "lr": 1.7774205076388206e-05, "epoch": 0.634, "percentage": 63.4, "elapsed_time": "0:16:18", "remaining_time": "0:09:24"}
318
+ {"current_steps": 3180, "total_steps": 5000, "loss": 3.5995, "lr": 1.7607298748898842e-05, "epoch": 0.636, "percentage": 63.6, "elapsed_time": "0:16:21", "remaining_time": "0:09:21"}
319
+ {"current_steps": 3190, "total_steps": 5000, "loss": 3.4226, "lr": 1.744075273123889e-05, "epoch": 0.638, "percentage": 63.8, "elapsed_time": "0:16:24", "remaining_time": "0:09:18"}
320
+ {"current_steps": 3200, "total_steps": 5000, "loss": 0.9354, "lr": 1.7274575140626318e-05, "epoch": 0.64, "percentage": 64.0, "elapsed_time": "0:16:27", "remaining_time": "0:09:15"}
321
+ {"current_steps": 3210, "total_steps": 5000, "loss": 4.6267, "lr": 1.7108774076322443e-05, "epoch": 0.642, "percentage": 64.2, "elapsed_time": "0:16:30", "remaining_time": "0:09:12"}
322
+ {"current_steps": 3220, "total_steps": 5000, "loss": 0.9114, "lr": 1.6943357619237226e-05, "epoch": 0.644, "percentage": 64.4, "elapsed_time": "0:16:33", "remaining_time": "0:09:09"}
323
+ {"current_steps": 3230, "total_steps": 5000, "loss": 2.5474, "lr": 1.677833383153542e-05, "epoch": 0.646, "percentage": 64.6, "elapsed_time": "0:16:36", "remaining_time": "0:09:06"}
324
+ {"current_steps": 3240, "total_steps": 5000, "loss": 2.7762, "lr": 1.6613710756243626e-05, "epoch": 0.648, "percentage": 64.8, "elapsed_time": "0:16:39", "remaining_time": "0:09:02"}
325
+ {"current_steps": 3250, "total_steps": 5000, "loss": 1.6468, "lr": 1.6449496416858284e-05, "epoch": 0.65, "percentage": 65.0, "elapsed_time": "0:16:42", "remaining_time": "0:08:59"}
326
+ {"current_steps": 3260, "total_steps": 5000, "loss": 1.2256, "lr": 1.6285698816954624e-05, "epoch": 0.652, "percentage": 65.2, "elapsed_time": "0:16:45", "remaining_time": "0:08:56"}
327
+ {"current_steps": 3270, "total_steps": 5000, "loss": 1.5563, "lr": 1.612232593979658e-05, "epoch": 0.654, "percentage": 65.4, "elapsed_time": "0:16:48", "remaining_time": "0:08:53"}
328
+ {"current_steps": 3280, "total_steps": 5000, "loss": 3.3846, "lr": 1.5959385747947698e-05, "epoch": 0.656, "percentage": 65.6, "elapsed_time": "0:16:51", "remaining_time": "0:08:50"}
329
+ {"current_steps": 3290, "total_steps": 5000, "loss": 2.2551, "lr": 1.5796886182883053e-05, "epoch": 0.658, "percentage": 65.8, "elapsed_time": "0:16:54", "remaining_time": "0:08:47"}
330
+ {"current_steps": 3300, "total_steps": 5000, "loss": 0.8103, "lr": 1.56348351646022e-05, "epoch": 0.66, "percentage": 66.0, "elapsed_time": "0:16:57", "remaining_time": "0:08:44"}
331
+ {"current_steps": 3310, "total_steps": 5000, "loss": 1.7983, "lr": 1.547324059124315e-05, "epoch": 0.662, "percentage": 66.2, "elapsed_time": "0:17:00", "remaining_time": "0:08:41"}
332
+ {"current_steps": 3320, "total_steps": 5000, "loss": 4.4483, "lr": 1.5312110338697426e-05, "epoch": 0.664, "percentage": 66.4, "elapsed_time": "0:17:03", "remaining_time": "0:08:37"}
333
+ {"current_steps": 3330, "total_steps": 5000, "loss": 6.7593, "lr": 1.5151452260226224e-05, "epoch": 0.666, "percentage": 66.6, "elapsed_time": "0:17:06", "remaining_time": "0:08:34"}
334
+ {"current_steps": 3340, "total_steps": 5000, "loss": 3.4548, "lr": 1.4991274186077632e-05, "epoch": 0.668, "percentage": 66.8, "elapsed_time": "0:17:09", "remaining_time": "0:08:31"}
335
+ {"current_steps": 3350, "total_steps": 5000, "loss": 2.0242, "lr": 1.4831583923104999e-05, "epoch": 0.67, "percentage": 67.0, "elapsed_time": "0:17:12", "remaining_time": "0:08:28"}
336
+ {"current_steps": 3360, "total_steps": 5000, "loss": 2.3468, "lr": 1.467238925438646e-05, "epoch": 0.672, "percentage": 67.2, "elapsed_time": "0:17:15", "remaining_time": "0:08:25"}
337
+ {"current_steps": 3370, "total_steps": 5000, "loss": 1.3924, "lr": 1.4513697938845572e-05, "epoch": 0.674, "percentage": 67.4, "elapsed_time": "0:17:18", "remaining_time": "0:08:22"}
338
+ {"current_steps": 3380, "total_steps": 5000, "loss": 2.3896, "lr": 1.4355517710873184e-05, "epoch": 0.676, "percentage": 67.6, "elapsed_time": "0:17:21", "remaining_time": "0:08:19"}
339
+ {"current_steps": 3390, "total_steps": 5000, "loss": 2.2237, "lr": 1.4197856279950438e-05, "epoch": 0.678, "percentage": 67.8, "elapsed_time": "0:17:24", "remaining_time": "0:08:16"}
340
+ {"current_steps": 3400, "total_steps": 5000, "loss": 6.2669, "lr": 1.4040721330273062e-05, "epoch": 0.68, "percentage": 68.0, "elapsed_time": "0:17:27", "remaining_time": "0:08:12"}
341
+ {"current_steps": 3410, "total_steps": 5000, "loss": 2.271, "lr": 1.388412052037682e-05, "epoch": 0.682, "percentage": 68.2, "elapsed_time": "0:17:30", "remaining_time": "0:08:09"}
342
+ {"current_steps": 3420, "total_steps": 5000, "loss": 1.7512, "lr": 1.3728061482764238e-05, "epoch": 0.684, "percentage": 68.4, "elapsed_time": "0:17:33", "remaining_time": "0:08:06"}
343
+ {"current_steps": 3430, "total_steps": 5000, "loss": 1.3673, "lr": 1.3572551823532654e-05, "epoch": 0.686, "percentage": 68.6, "elapsed_time": "0:17:36", "remaining_time": "0:08:03"}
344
+ {"current_steps": 3440, "total_steps": 5000, "loss": 3.8359, "lr": 1.3417599122003464e-05, "epoch": 0.688, "percentage": 68.8, "elapsed_time": "0:17:39", "remaining_time": "0:08:00"}
345
+ {"current_steps": 3450, "total_steps": 5000, "loss": 1.6534, "lr": 1.3263210930352737e-05, "epoch": 0.69, "percentage": 69.0, "elapsed_time": "0:17:42", "remaining_time": "0:07:57"}
346
+ {"current_steps": 3460, "total_steps": 5000, "loss": 0.955, "lr": 1.3109394773243117e-05, "epoch": 0.692, "percentage": 69.2, "elapsed_time": "0:17:45", "remaining_time": "0:07:54"}
347
+ {"current_steps": 3470, "total_steps": 5000, "loss": 1.9489, "lr": 1.2956158147457115e-05, "epoch": 0.694, "percentage": 69.4, "elapsed_time": "0:17:48", "remaining_time": "0:07:51"}
348
+ {"current_steps": 3480, "total_steps": 5000, "loss": 2.5898, "lr": 1.280350852153168e-05, "epoch": 0.696, "percentage": 69.6, "elapsed_time": "0:17:51", "remaining_time": "0:07:47"}
349
+ {"current_steps": 3490, "total_steps": 5000, "loss": 2.9695, "lr": 1.2651453335394231e-05, "epoch": 0.698, "percentage": 69.8, "elapsed_time": "0:17:54", "remaining_time": "0:07:44"}
350
+ {"current_steps": 3500, "total_steps": 5000, "loss": 1.8141, "lr": 1.2500000000000006e-05, "epoch": 0.7, "percentage": 70.0, "elapsed_time": "0:17:57", "remaining_time": "0:07:41"}
351
+ {"current_steps": 3510, "total_steps": 5000, "loss": 1.6577, "lr": 1.234915589697091e-05, "epoch": 0.702, "percentage": 70.2, "elapsed_time": "0:18:00", "remaining_time": "0:07:38"}
352
+ {"current_steps": 3520, "total_steps": 5000, "loss": 1.0784, "lr": 1.2198928378235716e-05, "epoch": 0.704, "percentage": 70.4, "elapsed_time": "0:18:03", "remaining_time": "0:07:35"}
353
+ {"current_steps": 3530, "total_steps": 5000, "loss": 1.4946, "lr": 1.2049324765671749e-05, "epoch": 0.706, "percentage": 70.6, "elapsed_time": "0:18:06", "remaining_time": "0:07:32"}
354
+ {"current_steps": 3540, "total_steps": 5000, "loss": 2.9905, "lr": 1.1900352350748026e-05, "epoch": 0.708, "percentage": 70.8, "elapsed_time": "0:18:09", "remaining_time": "0:07:29"}
355
+ {"current_steps": 3550, "total_steps": 5000, "loss": 2.2346, "lr": 1.175201839416988e-05, "epoch": 0.71, "percentage": 71.0, "elapsed_time": "0:18:12", "remaining_time": "0:07:26"}
356
+ {"current_steps": 3560, "total_steps": 5000, "loss": 1.6511, "lr": 1.1604330125525079e-05, "epoch": 0.712, "percentage": 71.2, "elapsed_time": "0:18:15", "remaining_time": "0:07:22"}
357
+ {"current_steps": 3570, "total_steps": 5000, "loss": 3.0413, "lr": 1.1457294742931507e-05, "epoch": 0.714, "percentage": 71.4, "elapsed_time": "0:18:18", "remaining_time": "0:07:19"}
358
+ {"current_steps": 3580, "total_steps": 5000, "loss": 2.2105, "lr": 1.1310919412686247e-05, "epoch": 0.716, "percentage": 71.6, "elapsed_time": "0:18:21", "remaining_time": "0:07:16"}
359
+ {"current_steps": 3590, "total_steps": 5000, "loss": 2.0821, "lr": 1.11652112689164e-05, "epoch": 0.718, "percentage": 71.8, "elapsed_time": "0:18:24", "remaining_time": "0:07:13"}
360
+ {"current_steps": 3600, "total_steps": 5000, "loss": 3.4402, "lr": 1.1020177413231334e-05, "epoch": 0.72, "percentage": 72.0, "elapsed_time": "0:18:27", "remaining_time": "0:07:10"}
361
+ {"current_steps": 3610, "total_steps": 5000, "loss": 1.5246, "lr": 1.0875824914376553e-05, "epoch": 0.722, "percentage": 72.2, "elapsed_time": "0:18:30", "remaining_time": "0:07:07"}
362
+ {"current_steps": 3620, "total_steps": 5000, "loss": 4.4784, "lr": 1.0732160807889211e-05, "epoch": 0.724, "percentage": 72.4, "elapsed_time": "0:18:33", "remaining_time": "0:07:04"}
363
+ {"current_steps": 3630, "total_steps": 5000, "loss": 2.8935, "lr": 1.058919209575517e-05, "epoch": 0.726, "percentage": 72.6, "elapsed_time": "0:18:36", "remaining_time": "0:07:01"}
364
+ {"current_steps": 3640, "total_steps": 5000, "loss": 1.4045, "lr": 1.0446925746067768e-05, "epoch": 0.728, "percentage": 72.8, "elapsed_time": "0:18:39", "remaining_time": "0:06:58"}
365
+ {"current_steps": 3650, "total_steps": 5000, "loss": 2.0619, "lr": 1.0305368692688174e-05, "epoch": 0.73, "percentage": 73.0, "elapsed_time": "0:18:42", "remaining_time": "0:06:55"}
366
+ {"current_steps": 3660, "total_steps": 5000, "loss": 1.8641, "lr": 1.0164527834907467e-05, "epoch": 0.732, "percentage": 73.2, "elapsed_time": "0:18:45", "remaining_time": "0:06:51"}
367
+ {"current_steps": 3670, "total_steps": 5000, "loss": 3.3477, "lr": 1.0024410037110357e-05, "epoch": 0.734, "percentage": 73.4, "elapsed_time": "0:18:48", "remaining_time": "0:06:48"}
368
+ {"current_steps": 3680, "total_steps": 5000, "loss": 1.6977, "lr": 9.88502212844063e-06, "epoch": 0.736, "percentage": 73.6, "elapsed_time": "0:18:50", "remaining_time": "0:06:45"}
369
+ {"current_steps": 3690, "total_steps": 5000, "loss": 2.4541, "lr": 9.746370902468311e-06, "epoch": 0.738, "percentage": 73.8, "elapsed_time": "0:18:53", "remaining_time": "0:06:42"}
370
+ {"current_steps": 3700, "total_steps": 5000, "loss": 1.0491, "lr": 9.608463116858542e-06, "epoch": 0.74, "percentage": 74.0, "elapsed_time": "0:18:56", "remaining_time": "0:06:39"}
371
+ {"current_steps": 3710, "total_steps": 5000, "loss": 1.7615, "lr": 9.471305493042243e-06, "epoch": 0.742, "percentage": 74.2, "elapsed_time": "0:18:59", "remaining_time": "0:06:36"}
372
+ {"current_steps": 3720, "total_steps": 5000, "loss": 1.2825, "lr": 9.334904715888495e-06, "epoch": 0.744, "percentage": 74.4, "elapsed_time": "0:19:02", "remaining_time": "0:06:33"}
373
+ {"current_steps": 3730, "total_steps": 5000, "loss": 2.0753, "lr": 9.199267433378727e-06, "epoch": 0.746, "percentage": 74.6, "elapsed_time": "0:19:05", "remaining_time": "0:06:30"}
374
+ {"current_steps": 3740, "total_steps": 5000, "loss": 2.5678, "lr": 9.064400256282757e-06, "epoch": 0.748, "percentage": 74.8, "elapsed_time": "0:19:08", "remaining_time": "0:06:27"}
375
+ {"current_steps": 3750, "total_steps": 5000, "loss": 1.0403, "lr": 8.930309757836517e-06, "epoch": 0.75, "percentage": 75.0, "elapsed_time": "0:19:11", "remaining_time": "0:06:23"}
376
+ {"current_steps": 3760, "total_steps": 5000, "loss": 2.1364, "lr": 8.797002473421728e-06, "epoch": 0.752, "percentage": 75.2, "elapsed_time": "0:19:14", "remaining_time": "0:06:20"}
377
+ {"current_steps": 3770, "total_steps": 5000, "loss": 1.4156, "lr": 8.664484900247363e-06, "epoch": 0.754, "percentage": 75.4, "elapsed_time": "0:19:17", "remaining_time": "0:06:17"}
378
+ {"current_steps": 3780, "total_steps": 5000, "loss": 2.463, "lr": 8.532763497032987e-06, "epoch": 0.756, "percentage": 75.6, "elapsed_time": "0:19:20", "remaining_time": "0:06:14"}
379
+ {"current_steps": 3790, "total_steps": 5000, "loss": 1.8851, "lr": 8.40184468369396e-06, "epoch": 0.758, "percentage": 75.8, "elapsed_time": "0:19:23", "remaining_time": "0:06:11"}
380
+ {"current_steps": 3800, "total_steps": 5000, "loss": 2.3422, "lr": 8.271734841028553e-06, "epoch": 0.76, "percentage": 76.0, "elapsed_time": "0:19:26", "remaining_time": "0:06:08"}
381
+ {"current_steps": 3810, "total_steps": 5000, "loss": 2.0351, "lr": 8.142440310406924e-06, "epoch": 0.762, "percentage": 76.2, "elapsed_time": "0:19:29", "remaining_time": "0:06:05"}
382
+ {"current_steps": 3820, "total_steps": 5000, "loss": 2.6205, "lr": 8.013967393462094e-06, "epoch": 0.764, "percentage": 76.4, "elapsed_time": "0:19:32", "remaining_time": "0:06:02"}
383
+ {"current_steps": 3830, "total_steps": 5000, "loss": 1.4258, "lr": 7.886322351782783e-06, "epoch": 0.766, "percentage": 76.6, "elapsed_time": "0:19:35", "remaining_time": "0:05:59"}
384
+ {"current_steps": 3840, "total_steps": 5000, "loss": 2.0398, "lr": 7.759511406608255e-06, "epoch": 0.768, "percentage": 76.8, "elapsed_time": "0:19:38", "remaining_time": "0:05:56"}
385
+ {"current_steps": 3850, "total_steps": 5000, "loss": 2.7082, "lr": 7.633540738525066e-06, "epoch": 0.77, "percentage": 77.0, "elapsed_time": "0:19:41", "remaining_time": "0:05:52"}
386
+ {"current_steps": 3860, "total_steps": 5000, "loss": 2.2596, "lr": 7.508416487165862e-06, "epoch": 0.772, "percentage": 77.2, "elapsed_time": "0:19:44", "remaining_time": "0:05:49"}
387
+ {"current_steps": 3870, "total_steps": 5000, "loss": 1.665, "lr": 7.384144750910133e-06, "epoch": 0.774, "percentage": 77.4, "elapsed_time": "0:19:47", "remaining_time": "0:05:46"}
388
+ {"current_steps": 3880, "total_steps": 5000, "loss": 3.3368, "lr": 7.260731586586983e-06, "epoch": 0.776, "percentage": 77.6, "elapsed_time": "0:19:50", "remaining_time": "0:05:43"}
389
+ {"current_steps": 3890, "total_steps": 5000, "loss": 2.165, "lr": 7.138183009179922e-06, "epoch": 0.778, "percentage": 77.8, "elapsed_time": "0:19:53", "remaining_time": "0:05:40"}
390
+ {"current_steps": 3900, "total_steps": 5000, "loss": 0.9795, "lr": 7.016504991533726e-06, "epoch": 0.78, "percentage": 78.0, "elapsed_time": "0:19:56", "remaining_time": "0:05:37"}
391
+ {"current_steps": 3910, "total_steps": 5000, "loss": 4.0774, "lr": 6.895703464063319e-06, "epoch": 0.782, "percentage": 78.2, "elapsed_time": "0:19:59", "remaining_time": "0:05:34"}
392
+ {"current_steps": 3920, "total_steps": 5000, "loss": 1.838, "lr": 6.775784314464717e-06, "epoch": 0.784, "percentage": 78.4, "elapsed_time": "0:20:02", "remaining_time": "0:05:31"}
393
+ {"current_steps": 3930, "total_steps": 5000, "loss": 2.2859, "lr": 6.656753387428089e-06, "epoch": 0.786, "percentage": 78.6, "elapsed_time": "0:20:05", "remaining_time": "0:05:28"}
394
+ {"current_steps": 3940, "total_steps": 5000, "loss": 4.2013, "lr": 6.538616484352902e-06, "epoch": 0.788, "percentage": 78.8, "elapsed_time": "0:20:08", "remaining_time": "0:05:25"}
395
+ {"current_steps": 3950, "total_steps": 5000, "loss": 3.8002, "lr": 6.421379363065142e-06, "epoch": 0.79, "percentage": 79.0, "elapsed_time": "0:20:11", "remaining_time": "0:05:22"}
396
+ {"current_steps": 3960, "total_steps": 5000, "loss": 2.5436, "lr": 6.305047737536707e-06, "epoch": 0.792, "percentage": 79.2, "elapsed_time": "0:20:14", "remaining_time": "0:05:18"}
397
+ {"current_steps": 3970, "total_steps": 5000, "loss": 2.4645, "lr": 6.189627277606894e-06, "epoch": 0.794, "percentage": 79.4, "elapsed_time": "0:20:17", "remaining_time": "0:05:15"}
398
+ {"current_steps": 3980, "total_steps": 5000, "loss": 1.1404, "lr": 6.075123608706093e-06, "epoch": 0.796, "percentage": 79.6, "elapsed_time": "0:20:20", "remaining_time": "0:05:12"}
399
+ {"current_steps": 3990, "total_steps": 5000, "loss": 4.2125, "lr": 5.961542311581586e-06, "epoch": 0.798, "percentage": 79.8, "elapsed_time": "0:20:23", "remaining_time": "0:05:09"}
400
+ {"current_steps": 4000, "total_steps": 5000, "loss": 1.4396, "lr": 5.848888922025553e-06, "epoch": 0.8, "percentage": 80.0, "elapsed_time": "0:20:26", "remaining_time": "0:05:06"}
401
+ {"current_steps": 4010, "total_steps": 5000, "loss": 2.321, "lr": 5.737168930605272e-06, "epoch": 0.802, "percentage": 80.2, "elapsed_time": "0:20:29", "remaining_time": "0:05:03"}
402
+ {"current_steps": 4020, "total_steps": 5000, "loss": 2.803, "lr": 5.626387782395512e-06, "epoch": 0.804, "percentage": 80.4, "elapsed_time": "0:20:32", "remaining_time": "0:05:00"}
403
+ {"current_steps": 4030, "total_steps": 5000, "loss": 1.024, "lr": 5.5165508767131415e-06, "epoch": 0.806, "percentage": 80.6, "elapsed_time": "0:20:35", "remaining_time": "0:04:57"}
404
+ {"current_steps": 4040, "total_steps": 5000, "loss": 2.4534, "lr": 5.4076635668540075e-06, "epoch": 0.808, "percentage": 80.8, "elapsed_time": "0:20:38", "remaining_time": "0:04:54"}
405
+ {"current_steps": 4050, "total_steps": 5000, "loss": 2.1075, "lr": 5.299731159831953e-06, "epoch": 0.81, "percentage": 81.0, "elapsed_time": "0:20:41", "remaining_time": "0:04:51"}
406
+ {"current_steps": 4060, "total_steps": 5000, "loss": 3.3703, "lr": 5.192758916120236e-06, "epoch": 0.812, "percentage": 81.2, "elapsed_time": "0:20:44", "remaining_time": "0:04:48"}
407
+ {"current_steps": 4070, "total_steps": 5000, "loss": 2.9377, "lr": 5.086752049395094e-06, "epoch": 0.814, "percentage": 81.4, "elapsed_time": "0:20:47", "remaining_time": "0:04:45"}
408
+ {"current_steps": 4080, "total_steps": 5000, "loss": 4.4126, "lr": 4.981715726281666e-06, "epoch": 0.816, "percentage": 81.6, "elapsed_time": "0:20:50", "remaining_time": "0:04:42"}
409
+ {"current_steps": 4090, "total_steps": 5000, "loss": 2.5896, "lr": 4.877655066102149e-06, "epoch": 0.818, "percentage": 81.8, "elapsed_time": "0:20:53", "remaining_time": "0:04:38"}
410
+ {"current_steps": 4100, "total_steps": 5000, "loss": 1.4516, "lr": 4.7745751406263165e-06, "epoch": 0.82, "percentage": 82.0, "elapsed_time": "0:20:56", "remaining_time": "0:04:35"}
411
+ {"current_steps": 4110, "total_steps": 5000, "loss": 1.3904, "lr": 4.672480973824311e-06, "epoch": 0.822, "percentage": 82.2, "elapsed_time": "0:20:59", "remaining_time": "0:04:32"}
412
+ {"current_steps": 4120, "total_steps": 5000, "loss": 1.6246, "lr": 4.571377541621788e-06, "epoch": 0.824, "percentage": 82.4, "elapsed_time": "0:21:02", "remaining_time": "0:04:29"}
413
+ {"current_steps": 4130, "total_steps": 5000, "loss": 3.125, "lr": 4.4712697716574e-06, "epoch": 0.826, "percentage": 82.6, "elapsed_time": "0:21:05", "remaining_time": "0:04:26"}
414
+ {"current_steps": 4140, "total_steps": 5000, "loss": 1.1379, "lr": 4.372162543042624e-06, "epoch": 0.828, "percentage": 82.8, "elapsed_time": "0:21:08", "remaining_time": "0:04:23"}
415
+ {"current_steps": 4150, "total_steps": 5000, "loss": 1.723, "lr": 4.274060686123959e-06, "epoch": 0.83, "percentage": 83.0, "elapsed_time": "0:21:11", "remaining_time": "0:04:20"}
416
+ {"current_steps": 4160, "total_steps": 5000, "loss": 2.2388, "lr": 4.176968982247514e-06, "epoch": 0.832, "percentage": 83.2, "elapsed_time": "0:21:14", "remaining_time": "0:04:17"}
417
+ {"current_steps": 4170, "total_steps": 5000, "loss": 1.7591, "lr": 4.08089216352596e-06, "epoch": 0.834, "percentage": 83.4, "elapsed_time": "0:21:17", "remaining_time": "0:04:14"}
418
+ {"current_steps": 4180, "total_steps": 5000, "loss": 0.9926, "lr": 3.985834912607894e-06, "epoch": 0.836, "percentage": 83.6, "elapsed_time": "0:21:20", "remaining_time": "0:04:11"}
419
+ {"current_steps": 4190, "total_steps": 5000, "loss": 2.8662, "lr": 3.891801862449629e-06, "epoch": 0.838, "percentage": 83.8, "elapsed_time": "0:21:23", "remaining_time": "0:04:08"}
420
+ {"current_steps": 4200, "total_steps": 5000, "loss": 2.1153, "lr": 3.798797596089351e-06, "epoch": 0.84, "percentage": 84.0, "elapsed_time": "0:21:26", "remaining_time": "0:04:05"}
421
+ {"current_steps": 4210, "total_steps": 5000, "loss": 1.9178, "lr": 3.7068266464238084e-06, "epoch": 0.842, "percentage": 84.2, "elapsed_time": "0:21:29", "remaining_time": "0:04:01"}
422
+ {"current_steps": 4220, "total_steps": 5000, "loss": 1.9438, "lr": 3.6158934959873353e-06, "epoch": 0.844, "percentage": 84.4, "elapsed_time": "0:21:32", "remaining_time": "0:03:58"}
423
+ {"current_steps": 4230, "total_steps": 5000, "loss": 2.5589, "lr": 3.5260025767333893e-06, "epoch": 0.846, "percentage": 84.6, "elapsed_time": "0:21:35", "remaining_time": "0:03:55"}
424
+ {"current_steps": 4240, "total_steps": 5000, "loss": 2.031, "lr": 3.4371582698185633e-06, "epoch": 0.848, "percentage": 84.8, "elapsed_time": "0:21:38", "remaining_time": "0:03:52"}
425
+ {"current_steps": 4250, "total_steps": 5000, "loss": 1.8701, "lr": 3.3493649053890326e-06, "epoch": 0.85, "percentage": 85.0, "elapsed_time": "0:21:41", "remaining_time": "0:03:49"}
426
+ {"current_steps": 4260, "total_steps": 5000, "loss": 2.4573, "lr": 3.262626762369525e-06, "epoch": 0.852, "percentage": 85.2, "elapsed_time": "0:21:44", "remaining_time": "0:03:46"}
427
+ {"current_steps": 4270, "total_steps": 5000, "loss": 1.5529, "lr": 3.176948068254762e-06, "epoch": 0.854, "percentage": 85.4, "elapsed_time": "0:21:47", "remaining_time": "0:03:43"}
428
+ {"current_steps": 4280, "total_steps": 5000, "loss": 4.1672, "lr": 3.092332998903416e-06, "epoch": 0.856, "percentage": 85.6, "elapsed_time": "0:21:50", "remaining_time": "0:03:40"}
429
+ {"current_steps": 4290, "total_steps": 5000, "loss": 2.5184, "lr": 3.0087856783345914e-06, "epoch": 0.858, "percentage": 85.8, "elapsed_time": "0:21:53", "remaining_time": "0:03:37"}
430
+ {"current_steps": 4300, "total_steps": 5000, "loss": 1.5187, "lr": 2.9263101785268254e-06, "epoch": 0.86, "percentage": 86.0, "elapsed_time": "0:21:56", "remaining_time": "0:03:34"}
431
+ {"current_steps": 4310, "total_steps": 5000, "loss": 2.3649, "lr": 2.8449105192196316e-06, "epoch": 0.862, "percentage": 86.2, "elapsed_time": "0:21:59", "remaining_time": "0:03:31"}
432
+ {"current_steps": 4320, "total_steps": 5000, "loss": 1.8013, "lr": 2.764590667717562e-06, "epoch": 0.864, "percentage": 86.4, "elapsed_time": "0:22:02", "remaining_time": "0:03:28"}
433
+ {"current_steps": 4330, "total_steps": 5000, "loss": 1.5805, "lr": 2.6853545386968606e-06, "epoch": 0.866, "percentage": 86.6, "elapsed_time": "0:22:05", "remaining_time": "0:03:25"}
434
+ {"current_steps": 4340, "total_steps": 5000, "loss": 1.8378, "lr": 2.6072059940146775e-06, "epoch": 0.868, "percentage": 86.8, "elapsed_time": "0:22:08", "remaining_time": "0:03:22"}
435
+ {"current_steps": 4350, "total_steps": 5000, "loss": 1.502, "lr": 2.5301488425208296e-06, "epoch": 0.87, "percentage": 87.0, "elapsed_time": "0:22:11", "remaining_time": "0:03:18"}
436
+ {"current_steps": 4360, "total_steps": 5000, "loss": 3.0615, "lr": 2.454186839872158e-06, "epoch": 0.872, "percentage": 87.2, "elapsed_time": "0:22:14", "remaining_time": "0:03:15"}
437
+ {"current_steps": 4370, "total_steps": 5000, "loss": 2.0572, "lr": 2.379323688349516e-06, "epoch": 0.874, "percentage": 87.4, "elapsed_time": "0:22:17", "remaining_time": "0:03:12"}
438
+ {"current_steps": 4380, "total_steps": 5000, "loss": 1.5945, "lr": 2.3055630366772856e-06, "epoch": 0.876, "percentage": 87.6, "elapsed_time": "0:22:20", "remaining_time": "0:03:09"}
439
+ {"current_steps": 4390, "total_steps": 5000, "loss": 1.6811, "lr": 2.2329084798455746e-06, "epoch": 0.878, "percentage": 87.8, "elapsed_time": "0:22:23", "remaining_time": "0:03:06"}
440
+ {"current_steps": 4400, "total_steps": 5000, "loss": 1.5655, "lr": 2.1613635589349756e-06, "epoch": 0.88, "percentage": 88.0, "elapsed_time": "0:22:26", "remaining_time": "0:03:03"}
441
+ {"current_steps": 4410, "total_steps": 5000, "loss": 2.1422, "lr": 2.0909317609440095e-06, "epoch": 0.882, "percentage": 88.2, "elapsed_time": "0:22:29", "remaining_time": "0:03:00"}
442
+ {"current_steps": 4420, "total_steps": 5000, "loss": 1.403, "lr": 2.0216165186191407e-06, "epoch": 0.884, "percentage": 88.4, "elapsed_time": "0:22:32", "remaining_time": "0:02:57"}
443
+ {"current_steps": 4430, "total_steps": 5000, "loss": 2.6121, "lr": 1.95342121028749e-06, "epoch": 0.886, "percentage": 88.6, "elapsed_time": "0:22:35", "remaining_time": "0:02:54"}
444
+ {"current_steps": 4440, "total_steps": 5000, "loss": 5.1337, "lr": 1.8863491596921745e-06, "epoch": 0.888, "percentage": 88.8, "elapsed_time": "0:22:38", "remaining_time": "0:02:51"}
445
+ {"current_steps": 4450, "total_steps": 5000, "loss": 2.1225, "lr": 1.8204036358303173e-06, "epoch": 0.89, "percentage": 89.0, "elapsed_time": "0:22:41", "remaining_time": "0:02:48"}
446
+ {"current_steps": 4460, "total_steps": 5000, "loss": 2.8199, "lr": 1.7555878527937164e-06, "epoch": 0.892, "percentage": 89.2, "elapsed_time": "0:22:44", "remaining_time": "0:02:45"}
447
+ {"current_steps": 4470, "total_steps": 5000, "loss": 2.1345, "lr": 1.6919049696121958e-06, "epoch": 0.894, "percentage": 89.4, "elapsed_time": "0:22:47", "remaining_time": "0:02:42"}
448
+ {"current_steps": 4480, "total_steps": 5000, "loss": 1.671, "lr": 1.629358090099639e-06, "epoch": 0.896, "percentage": 89.6, "elapsed_time": "0:22:50", "remaining_time": "0:02:39"}
449
+ {"current_steps": 4490, "total_steps": 5000, "loss": 2.7573, "lr": 1.5679502627027136e-06, "epoch": 0.898, "percentage": 89.8, "elapsed_time": "0:22:53", "remaining_time": "0:02:36"}
450
+ {"current_steps": 4500, "total_steps": 5000, "loss": 1.8508, "lr": 1.5076844803522922e-06, "epoch": 0.9, "percentage": 90.0, "elapsed_time": "0:22:56", "remaining_time": "0:02:32"}
451
+ {"current_steps": 4510, "total_steps": 5000, "loss": 4.521, "lr": 1.4485636803175829e-06, "epoch": 0.902, "percentage": 90.2, "elapsed_time": "0:22:59", "remaining_time": "0:02:29"}
452
+ {"current_steps": 4520, "total_steps": 5000, "loss": 3.2342, "lr": 1.3905907440629752e-06, "epoch": 0.904, "percentage": 90.4, "elapsed_time": "0:23:02", "remaining_time": "0:02:26"}
453
+ {"current_steps": 4530, "total_steps": 5000, "loss": 3.0516, "lr": 1.333768497107593e-06, "epoch": 0.906, "percentage": 90.6, "elapsed_time": "0:23:05", "remaining_time": "0:02:23"}
454
+ {"current_steps": 4540, "total_steps": 5000, "loss": 2.0561, "lr": 1.2780997088875869e-06, "epoch": 0.908, "percentage": 90.8, "elapsed_time": "0:23:08", "remaining_time": "0:02:20"}
455
+ {"current_steps": 4550, "total_steps": 5000, "loss": 1.5619, "lr": 1.2235870926211619e-06, "epoch": 0.91, "percentage": 91.0, "elapsed_time": "0:23:11", "remaining_time": "0:02:17"}
456
+ {"current_steps": 4560, "total_steps": 5000, "loss": 2.0134, "lr": 1.170233305176327e-06, "epoch": 0.912, "percentage": 91.2, "elapsed_time": "0:23:14", "remaining_time": "0:02:14"}
457
+ {"current_steps": 4570, "total_steps": 5000, "loss": 0.8128, "lr": 1.1180409469414094e-06, "epoch": 0.914, "percentage": 91.4, "elapsed_time": "0:23:17", "remaining_time": "0:02:11"}
458
+ {"current_steps": 4580, "total_steps": 5000, "loss": 2.5157, "lr": 1.067012561698319e-06, "epoch": 0.916, "percentage": 91.6, "elapsed_time": "0:23:20", "remaining_time": "0:02:08"}
459
+ {"current_steps": 4590, "total_steps": 5000, "loss": 1.7379, "lr": 1.0171506364985622e-06, "epoch": 0.918, "percentage": 91.8, "elapsed_time": "0:23:23", "remaining_time": "0:02:05"}
460
+ {"current_steps": 4600, "total_steps": 5000, "loss": 1.3243, "lr": 9.684576015420278e-07, "epoch": 0.92, "percentage": 92.0, "elapsed_time": "0:23:26", "remaining_time": "0:02:02"}
461
+ {"current_steps": 4610, "total_steps": 5000, "loss": 4.207, "lr": 9.209358300585474e-07, "epoch": 0.922, "percentage": 92.2, "elapsed_time": "0:23:29", "remaining_time": "0:01:59"}
462
+ {"current_steps": 4620, "total_steps": 5000, "loss": 2.8713, "lr": 8.745876381922147e-07, "epoch": 0.924, "percentage": 92.4, "elapsed_time": "0:23:32", "remaining_time": "0:01:56"}
463
+ {"current_steps": 4630, "total_steps": 5000, "loss": 2.2642, "lr": 8.294152848885157e-07, "epoch": 0.926, "percentage": 92.6, "elapsed_time": "0:23:35", "remaining_time": "0:01:53"}
464
+ {"current_steps": 4640, "total_steps": 5000, "loss": 3.0498, "lr": 7.854209717842231e-07, "epoch": 0.928, "percentage": 92.8, "elapsed_time": "0:23:38", "remaining_time": "0:01:50"}
465
+ {"current_steps": 4650, "total_steps": 5000, "loss": 3.1851, "lr": 7.426068431000882e-07, "epoch": 0.93, "percentage": 93.0, "elapsed_time": "0:23:41", "remaining_time": "0:01:47"}
466
+ {"current_steps": 4660, "total_steps": 5000, "loss": 1.5354, "lr": 7.009749855363456e-07, "epoch": 0.932, "percentage": 93.2, "elapsed_time": "0:23:44", "remaining_time": "0:01:43"}
467
+ {"current_steps": 4670, "total_steps": 5000, "loss": 1.7694, "lr": 6.605274281709928e-07, "epoch": 0.934, "percentage": 93.4, "elapsed_time": "0:23:47", "remaining_time": "0:01:40"}
468
+ {"current_steps": 4680, "total_steps": 5000, "loss": 1.4321, "lr": 6.212661423609184e-07, "epoch": 0.936, "percentage": 93.6, "elapsed_time": "0:23:50", "remaining_time": "0:01:37"}
469
+ {"current_steps": 4690, "total_steps": 5000, "loss": 1.4432, "lr": 5.83193041645802e-07, "epoch": 0.938, "percentage": 93.8, "elapsed_time": "0:23:53", "remaining_time": "0:01:34"}
470
+ {"current_steps": 4700, "total_steps": 5000, "loss": 1.7524, "lr": 5.463099816548579e-07, "epoch": 0.94, "percentage": 94.0, "elapsed_time": "0:23:56", "remaining_time": "0:01:31"}
471
+ {"current_steps": 4710, "total_steps": 5000, "loss": 3.8866, "lr": 5.106187600163987e-07, "epoch": 0.942, "percentage": 94.2, "elapsed_time": "0:23:59", "remaining_time": "0:01:28"}
472
+ {"current_steps": 4720, "total_steps": 5000, "loss": 1.618, "lr": 4.7612111627021175e-07, "epoch": 0.944, "percentage": 94.4, "elapsed_time": "0:24:02", "remaining_time": "0:01:25"}
473
+ {"current_steps": 4730, "total_steps": 5000, "loss": 2.4191, "lr": 4.4281873178278475e-07, "epoch": 0.946, "percentage": 94.6, "elapsed_time": "0:24:05", "remaining_time": "0:01:22"}
474
+ {"current_steps": 4740, "total_steps": 5000, "loss": 1.4241, "lr": 4.107132296653549e-07, "epoch": 0.948, "percentage": 94.8, "elapsed_time": "0:24:08", "remaining_time": "0:01:19"}
475
+ {"current_steps": 4750, "total_steps": 5000, "loss": 1.8877, "lr": 3.7980617469479953e-07, "epoch": 0.95, "percentage": 95.0, "elapsed_time": "0:24:11", "remaining_time": "0:01:16"}
476
+ {"current_steps": 4760, "total_steps": 5000, "loss": 1.4828, "lr": 3.5009907323737825e-07, "epoch": 0.952, "percentage": 95.2, "elapsed_time": "0:24:14", "remaining_time": "0:01:13"}
477
+ {"current_steps": 4770, "total_steps": 5000, "loss": 2.1333, "lr": 3.215933731753024e-07, "epoch": 0.954, "percentage": 95.4, "elapsed_time": "0:24:17", "remaining_time": "0:01:10"}
478
+ {"current_steps": 4780, "total_steps": 5000, "loss": 2.0255, "lr": 2.942904638361804e-07, "epoch": 0.956, "percentage": 95.6, "elapsed_time": "0:24:20", "remaining_time": "0:01:07"}
479
+ {"current_steps": 4790, "total_steps": 5000, "loss": 3.1469, "lr": 2.681916759252917e-07, "epoch": 0.958, "percentage": 95.8, "elapsed_time": "0:24:23", "remaining_time": "0:01:04"}
480
+ {"current_steps": 4800, "total_steps": 5000, "loss": 2.5756, "lr": 2.4329828146074095e-07, "epoch": 0.96, "percentage": 96.0, "elapsed_time": "0:24:26", "remaining_time": "0:01:01"}
481
+ {"current_steps": 4810, "total_steps": 5000, "loss": 2.2082, "lr": 2.1961149371145795e-07, "epoch": 0.962, "percentage": 96.2, "elapsed_time": "0:24:29", "remaining_time": "0:00:58"}
482
+ {"current_steps": 4820, "total_steps": 5000, "loss": 1.2087, "lr": 1.9713246713805588e-07, "epoch": 0.964, "percentage": 96.4, "elapsed_time": "0:24:32", "remaining_time": "0:00:54"}
483
+ {"current_steps": 4830, "total_steps": 5000, "loss": 3.4375, "lr": 1.7586229733657644e-07, "epoch": 0.966, "percentage": 96.6, "elapsed_time": "0:24:35", "remaining_time": "0:00:51"}
484
+ {"current_steps": 4840, "total_steps": 5000, "loss": 1.2341, "lr": 1.5580202098509077e-07, "epoch": 0.968, "percentage": 96.8, "elapsed_time": "0:24:38", "remaining_time": "0:00:48"}
485
+ {"current_steps": 4850, "total_steps": 5000, "loss": 1.7228, "lr": 1.3695261579316777e-07, "epoch": 0.97, "percentage": 97.0, "elapsed_time": "0:24:41", "remaining_time": "0:00:45"}
486
+ {"current_steps": 4860, "total_steps": 5000, "loss": 1.4849, "lr": 1.193150004542204e-07, "epoch": 0.972, "percentage": 97.2, "elapsed_time": "0:24:44", "remaining_time": "0:00:42"}
487
+ {"current_steps": 4870, "total_steps": 5000, "loss": 1.5513, "lr": 1.0289003460074165e-07, "epoch": 0.974, "percentage": 97.4, "elapsed_time": "0:24:47", "remaining_time": "0:00:39"}
488
+ {"current_steps": 4880, "total_steps": 5000, "loss": 1.3479, "lr": 8.767851876239074e-08, "epoch": 0.976, "percentage": 97.6, "elapsed_time": "0:24:50", "remaining_time": "0:00:36"}
489
+ {"current_steps": 4890, "total_steps": 5000, "loss": 2.6461, "lr": 7.368119432699383e-08, "epoch": 0.978, "percentage": 97.8, "elapsed_time": "0:24:53", "remaining_time": "0:00:33"}
490
+ {"current_steps": 4900, "total_steps": 5000, "loss": 1.5137, "lr": 6.089874350439506e-08, "epoch": 0.98, "percentage": 98.0, "elapsed_time": "0:24:56", "remaining_time": "0:00:30"}
491
+ {"current_steps": 4910, "total_steps": 5000, "loss": 2.9775, "lr": 4.9331789293211026e-08, "epoch": 0.982, "percentage": 98.2, "elapsed_time": "0:24:59", "remaining_time": "0:00:27"}
492
+ {"current_steps": 4920, "total_steps": 5000, "loss": 1.0868, "lr": 3.8980895450474455e-08, "epoch": 0.984, "percentage": 98.4, "elapsed_time": "0:25:02", "remaining_time": "0:00:24"}
493
+ {"current_steps": 4930, "total_steps": 5000, "loss": 2.3111, "lr": 2.9846566464150626e-08, "epoch": 0.986, "percentage": 98.6, "elapsed_time": "0:25:05", "remaining_time": "0:00:21"}
494
+ {"current_steps": 4940, "total_steps": 5000, "loss": 2.8528, "lr": 2.192924752854042e-08, "epoch": 0.988, "percentage": 98.8, "elapsed_time": "0:25:08", "remaining_time": "0:00:18"}
495
+ {"current_steps": 4950, "total_steps": 5000, "loss": 7.4089, "lr": 1.522932452260595e-08, "epoch": 0.99, "percentage": 99.0, "elapsed_time": "0:25:11", "remaining_time": "0:00:15"}
496
+ {"current_steps": 4960, "total_steps": 5000, "loss": 5.3353, "lr": 9.747123991141194e-09, "epoch": 0.992, "percentage": 99.2, "elapsed_time": "0:25:14", "remaining_time": "0:00:12"}
497
+ {"current_steps": 4970, "total_steps": 5000, "loss": 1.8168, "lr": 5.48291312886251e-09, "epoch": 0.994, "percentage": 99.4, "elapsed_time": "0:25:17", "remaining_time": "0:00:09"}
498
+ {"current_steps": 4980, "total_steps": 5000, "loss": 2.34, "lr": 2.4368997673940297e-09, "epoch": 0.996, "percentage": 99.6, "elapsed_time": "0:25:20", "remaining_time": "0:00:06"}
499
+ {"current_steps": 4990, "total_steps": 5000, "loss": 1.8474, "lr": 6.092323651313292e-10, "epoch": 0.998, "percentage": 99.8, "elapsed_time": "0:25:23", "remaining_time": "0:00:03"}
500
+ {"current_steps": 5000, "total_steps": 5000, "loss": 5.5777, "lr": 0.0, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:25:26", "remaining_time": "0:00:00"}
501
+ {"current_steps": 5000, "total_steps": 5000, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:25:26", "remaining_time": "0:00:00"}
Llama-2-13b-chat-hf/DomainBench/Agriculture/trainer_state.json ADDED
@@ -0,0 +1,3542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.002,
13
+ "grad_norm": 1.010804533958435,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 10.8135,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.004,
20
+ "grad_norm": 5.864879131317139,
21
+ "learning_rate": 2.0000000000000003e-06,
22
+ "loss": 8.4638,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.006,
27
+ "grad_norm": 2.968351364135742,
28
+ "learning_rate": 3e-06,
29
+ "loss": 15.4904,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.008,
34
+ "grad_norm": 7.154600143432617,
35
+ "learning_rate": 4.000000000000001e-06,
36
+ "loss": 11.4875,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.01,
41
+ "grad_norm": 2.5448992252349854,
42
+ "learning_rate": 5e-06,
43
+ "loss": 14.2003,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.012,
48
+ "grad_norm": 3.4361155033111572,
49
+ "learning_rate": 6e-06,
50
+ "loss": 12.1374,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.014,
55
+ "grad_norm": 6.9021172523498535,
56
+ "learning_rate": 7.000000000000001e-06,
57
+ "loss": 11.6844,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.016,
62
+ "grad_norm": 0.4121188521385193,
63
+ "learning_rate": 8.000000000000001e-06,
64
+ "loss": 10.4387,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.018,
69
+ "grad_norm": 2.346395492553711,
70
+ "learning_rate": 9e-06,
71
+ "loss": 5.5739,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.02,
76
+ "grad_norm": 7.073041915893555,
77
+ "learning_rate": 1e-05,
78
+ "loss": 12.7118,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.022,
83
+ "grad_norm": 3.0463881492614746,
84
+ "learning_rate": 1.1000000000000001e-05,
85
+ "loss": 15.3528,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.024,
90
+ "grad_norm": 1.735839605331421,
91
+ "learning_rate": 1.2e-05,
92
+ "loss": 14.2922,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.026,
97
+ "grad_norm": 0.4800063371658325,
98
+ "learning_rate": 1.3000000000000001e-05,
99
+ "loss": 6.3563,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.028,
104
+ "grad_norm": 25.26578140258789,
105
+ "learning_rate": 1.4000000000000001e-05,
106
+ "loss": 7.9494,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.03,
111
+ "grad_norm": 1.725184440612793,
112
+ "learning_rate": 1.5e-05,
113
+ "loss": 10.7366,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.032,
118
+ "grad_norm": 5.55291223526001,
119
+ "learning_rate": 1.6000000000000003e-05,
120
+ "loss": 4.7961,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.034,
125
+ "grad_norm": 0.3070022761821747,
126
+ "learning_rate": 1.7000000000000003e-05,
127
+ "loss": 2.8594,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.036,
132
+ "grad_norm": 4.781795501708984,
133
+ "learning_rate": 1.8e-05,
134
+ "loss": 4.2593,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.038,
139
+ "grad_norm": 7.462250232696533,
140
+ "learning_rate": 1.9e-05,
141
+ "loss": 5.3202,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.04,
146
+ "grad_norm": 5.073458194732666,
147
+ "learning_rate": 2e-05,
148
+ "loss": 8.7095,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.042,
153
+ "grad_norm": 10.127187728881836,
154
+ "learning_rate": 2.1e-05,
155
+ "loss": 4.7786,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.044,
160
+ "grad_norm": 6.5535125732421875,
161
+ "learning_rate": 2.2000000000000003e-05,
162
+ "loss": 2.5694,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.046,
167
+ "grad_norm": 5.452541828155518,
168
+ "learning_rate": 2.3000000000000003e-05,
169
+ "loss": 4.3152,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.048,
174
+ "grad_norm": 1.630622386932373,
175
+ "learning_rate": 2.4e-05,
176
+ "loss": 4.0996,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.05,
181
+ "grad_norm": 3.042468547821045,
182
+ "learning_rate": 2.5e-05,
183
+ "loss": 4.9146,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.052,
188
+ "grad_norm": 2.458193063735962,
189
+ "learning_rate": 2.6000000000000002e-05,
190
+ "loss": 1.8707,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.054,
195
+ "grad_norm": 1.6752468347549438,
196
+ "learning_rate": 2.7000000000000002e-05,
197
+ "loss": 3.1247,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.056,
202
+ "grad_norm": 4.470595359802246,
203
+ "learning_rate": 2.8000000000000003e-05,
204
+ "loss": 3.8507,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.058,
209
+ "grad_norm": 6.865239143371582,
210
+ "learning_rate": 2.9e-05,
211
+ "loss": 2.8481,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.06,
216
+ "grad_norm": 20.699951171875,
217
+ "learning_rate": 3e-05,
218
+ "loss": 4.4567,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.062,
223
+ "grad_norm": 8.376248359680176,
224
+ "learning_rate": 3.1e-05,
225
+ "loss": 3.544,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.064,
230
+ "grad_norm": 0.6330814361572266,
231
+ "learning_rate": 3.2000000000000005e-05,
232
+ "loss": 2.028,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.066,
237
+ "grad_norm": 1.892885684967041,
238
+ "learning_rate": 3.3e-05,
239
+ "loss": 3.4244,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.068,
244
+ "grad_norm": 6.533657073974609,
245
+ "learning_rate": 3.4000000000000007e-05,
246
+ "loss": 5.216,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.07,
251
+ "grad_norm": 1.1567013263702393,
252
+ "learning_rate": 3.5e-05,
253
+ "loss": 2.7441,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.072,
258
+ "grad_norm": 5.744589328765869,
259
+ "learning_rate": 3.6e-05,
260
+ "loss": 2.6191,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.074,
265
+ "grad_norm": 7.04871129989624,
266
+ "learning_rate": 3.7e-05,
267
+ "loss": 5.3131,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.076,
272
+ "grad_norm": 13.293014526367188,
273
+ "learning_rate": 3.8e-05,
274
+ "loss": 5.2818,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.078,
279
+ "grad_norm": 3.2866837978363037,
280
+ "learning_rate": 3.9000000000000006e-05,
281
+ "loss": 3.086,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.08,
286
+ "grad_norm": 2.1428515911102295,
287
+ "learning_rate": 4e-05,
288
+ "loss": 2.6475,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.082,
293
+ "grad_norm": 46.64889907836914,
294
+ "learning_rate": 4.1e-05,
295
+ "loss": 2.0594,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.084,
300
+ "grad_norm": 2.84214448928833,
301
+ "learning_rate": 4.2e-05,
302
+ "loss": 3.5431,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.086,
307
+ "grad_norm": 5.909653186798096,
308
+ "learning_rate": 4.3e-05,
309
+ "loss": 2.7867,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.088,
314
+ "grad_norm": 4.650357246398926,
315
+ "learning_rate": 4.4000000000000006e-05,
316
+ "loss": 6.2247,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.09,
321
+ "grad_norm": 5.519321918487549,
322
+ "learning_rate": 4.5e-05,
323
+ "loss": 1.8291,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.092,
328
+ "grad_norm": 135.69471740722656,
329
+ "learning_rate": 4.600000000000001e-05,
330
+ "loss": 5.4416,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.094,
335
+ "grad_norm": 4.472829341888428,
336
+ "learning_rate": 4.7e-05,
337
+ "loss": 6.0983,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.096,
342
+ "grad_norm": 3.7479896545410156,
343
+ "learning_rate": 4.8e-05,
344
+ "loss": 1.8501,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.098,
349
+ "grad_norm": 5.382004261016846,
350
+ "learning_rate": 4.9e-05,
351
+ "loss": 1.5013,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 0.1,
356
+ "grad_norm": 16.679519653320312,
357
+ "learning_rate": 5e-05,
358
+ "loss": 2.6987,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 0.102,
363
+ "grad_norm": 2.9823508262634277,
364
+ "learning_rate": 4.999939076763487e-05,
365
+ "loss": 2.268,
366
+ "step": 510
367
+ },
368
+ {
369
+ "epoch": 0.104,
370
+ "grad_norm": 5.970380783081055,
371
+ "learning_rate": 4.999756310023261e-05,
372
+ "loss": 2.1733,
373
+ "step": 520
374
+ },
375
+ {
376
+ "epoch": 0.106,
377
+ "grad_norm": 0.8665391206741333,
378
+ "learning_rate": 4.999451708687114e-05,
379
+ "loss": 6.0941,
380
+ "step": 530
381
+ },
382
+ {
383
+ "epoch": 0.108,
384
+ "grad_norm": 12.031403541564941,
385
+ "learning_rate": 4.999025287600886e-05,
386
+ "loss": 5.3397,
387
+ "step": 540
388
+ },
389
+ {
390
+ "epoch": 0.11,
391
+ "grad_norm": 6.884283542633057,
392
+ "learning_rate": 4.99847706754774e-05,
393
+ "loss": 1.2973,
394
+ "step": 550
395
+ },
396
+ {
397
+ "epoch": 0.112,
398
+ "grad_norm": 6.688343524932861,
399
+ "learning_rate": 4.997807075247146e-05,
400
+ "loss": 2.9408,
401
+ "step": 560
402
+ },
403
+ {
404
+ "epoch": 0.114,
405
+ "grad_norm": 2.265263319015503,
406
+ "learning_rate": 4.997015343353585e-05,
407
+ "loss": 1.5613,
408
+ "step": 570
409
+ },
410
+ {
411
+ "epoch": 0.116,
412
+ "grad_norm": 0.2830611765384674,
413
+ "learning_rate": 4.996101910454953e-05,
414
+ "loss": 1.5912,
415
+ "step": 580
416
+ },
417
+ {
418
+ "epoch": 0.118,
419
+ "grad_norm": 6.979793071746826,
420
+ "learning_rate": 4.995066821070679e-05,
421
+ "loss": 3.5306,
422
+ "step": 590
423
+ },
424
+ {
425
+ "epoch": 0.12,
426
+ "grad_norm": 17.920307159423828,
427
+ "learning_rate": 4.993910125649561e-05,
428
+ "loss": 2.1385,
429
+ "step": 600
430
+ },
431
+ {
432
+ "epoch": 0.122,
433
+ "grad_norm": 3.7927424907684326,
434
+ "learning_rate": 4.992631880567301e-05,
435
+ "loss": 1.5452,
436
+ "step": 610
437
+ },
438
+ {
439
+ "epoch": 0.124,
440
+ "grad_norm": 6.910824298858643,
441
+ "learning_rate": 4.991232148123761e-05,
442
+ "loss": 3.0193,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 0.126,
447
+ "grad_norm": 6.665999412536621,
448
+ "learning_rate": 4.989710996539926e-05,
449
+ "loss": 2.3063,
450
+ "step": 630
451
+ },
452
+ {
453
+ "epoch": 0.128,
454
+ "grad_norm": 1.253774642944336,
455
+ "learning_rate": 4.988068499954578e-05,
456
+ "loss": 2.2441,
457
+ "step": 640
458
+ },
459
+ {
460
+ "epoch": 0.13,
461
+ "grad_norm": 1.61366605758667,
462
+ "learning_rate": 4.9863047384206835e-05,
463
+ "loss": 2.3075,
464
+ "step": 650
465
+ },
466
+ {
467
+ "epoch": 0.132,
468
+ "grad_norm": 0.7466872334480286,
469
+ "learning_rate": 4.984419797901491e-05,
470
+ "loss": 2.9721,
471
+ "step": 660
472
+ },
473
+ {
474
+ "epoch": 0.134,
475
+ "grad_norm": 27.337018966674805,
476
+ "learning_rate": 4.982413770266342e-05,
477
+ "loss": 2.7736,
478
+ "step": 670
479
+ },
480
+ {
481
+ "epoch": 0.136,
482
+ "grad_norm": 7.90925407409668,
483
+ "learning_rate": 4.980286753286195e-05,
484
+ "loss": 1.8991,
485
+ "step": 680
486
+ },
487
+ {
488
+ "epoch": 0.138,
489
+ "grad_norm": 1.1065661907196045,
490
+ "learning_rate": 4.978038850628854e-05,
491
+ "loss": 2.2606,
492
+ "step": 690
493
+ },
494
+ {
495
+ "epoch": 0.14,
496
+ "grad_norm": 1.1839981079101562,
497
+ "learning_rate": 4.975670171853926e-05,
498
+ "loss": 1.5491,
499
+ "step": 700
500
+ },
501
+ {
502
+ "epoch": 0.142,
503
+ "grad_norm": 4.375758171081543,
504
+ "learning_rate": 4.9731808324074717e-05,
505
+ "loss": 2.2913,
506
+ "step": 710
507
+ },
508
+ {
509
+ "epoch": 0.144,
510
+ "grad_norm": 6.079190731048584,
511
+ "learning_rate": 4.9705709536163824e-05,
512
+ "loss": 2.6315,
513
+ "step": 720
514
+ },
515
+ {
516
+ "epoch": 0.146,
517
+ "grad_norm": 9.416289329528809,
518
+ "learning_rate": 4.96784066268247e-05,
519
+ "loss": 2.5128,
520
+ "step": 730
521
+ },
522
+ {
523
+ "epoch": 0.148,
524
+ "grad_norm": 4.381228446960449,
525
+ "learning_rate": 4.964990092676263e-05,
526
+ "loss": 5.8145,
527
+ "step": 740
528
+ },
529
+ {
530
+ "epoch": 0.15,
531
+ "grad_norm": 1.1350055932998657,
532
+ "learning_rate": 4.962019382530521e-05,
533
+ "loss": 2.3354,
534
+ "step": 750
535
+ },
536
+ {
537
+ "epoch": 0.152,
538
+ "grad_norm": 3.0520100593566895,
539
+ "learning_rate": 4.9589286770334654e-05,
540
+ "loss": 4.621,
541
+ "step": 760
542
+ },
543
+ {
544
+ "epoch": 0.154,
545
+ "grad_norm": 10.430975914001465,
546
+ "learning_rate": 4.9557181268217227e-05,
547
+ "loss": 2.7795,
548
+ "step": 770
549
+ },
550
+ {
551
+ "epoch": 0.156,
552
+ "grad_norm": 1.6189144849777222,
553
+ "learning_rate": 4.952387888372979e-05,
554
+ "loss": 3.0171,
555
+ "step": 780
556
+ },
557
+ {
558
+ "epoch": 0.158,
559
+ "grad_norm": 14.97889518737793,
560
+ "learning_rate": 4.94893812399836e-05,
561
+ "loss": 2.1028,
562
+ "step": 790
563
+ },
564
+ {
565
+ "epoch": 0.16,
566
+ "grad_norm": 47.81957244873047,
567
+ "learning_rate": 4.9453690018345144e-05,
568
+ "loss": 13.4531,
569
+ "step": 800
570
+ },
571
+ {
572
+ "epoch": 0.162,
573
+ "grad_norm": 1.6978071928024292,
574
+ "learning_rate": 4.94168069583542e-05,
575
+ "loss": 2.0661,
576
+ "step": 810
577
+ },
578
+ {
579
+ "epoch": 0.164,
580
+ "grad_norm": 14.654315948486328,
581
+ "learning_rate": 4.937873385763908e-05,
582
+ "loss": 2.6598,
583
+ "step": 820
584
+ },
585
+ {
586
+ "epoch": 0.166,
587
+ "grad_norm": 9.5645112991333,
588
+ "learning_rate": 4.933947257182901e-05,
589
+ "loss": 2.58,
590
+ "step": 830
591
+ },
592
+ {
593
+ "epoch": 0.168,
594
+ "grad_norm": 6.204939365386963,
595
+ "learning_rate": 4.929902501446366e-05,
596
+ "loss": 2.9303,
597
+ "step": 840
598
+ },
599
+ {
600
+ "epoch": 0.17,
601
+ "grad_norm": 3.7468295097351074,
602
+ "learning_rate": 4.925739315689991e-05,
603
+ "loss": 3.0212,
604
+ "step": 850
605
+ },
606
+ {
607
+ "epoch": 0.172,
608
+ "grad_norm": 6.3915276527404785,
609
+ "learning_rate": 4.9214579028215776e-05,
610
+ "loss": 4.8252,
611
+ "step": 860
612
+ },
613
+ {
614
+ "epoch": 0.174,
615
+ "grad_norm": 4.012444019317627,
616
+ "learning_rate": 4.917058471511149e-05,
617
+ "loss": 2.5644,
618
+ "step": 870
619
+ },
620
+ {
621
+ "epoch": 0.176,
622
+ "grad_norm": 5.753321647644043,
623
+ "learning_rate": 4.912541236180779e-05,
624
+ "loss": 1.8755,
625
+ "step": 880
626
+ },
627
+ {
628
+ "epoch": 0.178,
629
+ "grad_norm": 1.3646568059921265,
630
+ "learning_rate": 4.907906416994146e-05,
631
+ "loss": 0.7075,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 0.18,
636
+ "grad_norm": 7.185975551605225,
637
+ "learning_rate": 4.9031542398457974e-05,
638
+ "loss": 7.253,
639
+ "step": 900
640
+ },
641
+ {
642
+ "epoch": 0.182,
643
+ "grad_norm": 9.094167709350586,
644
+ "learning_rate": 4.898284936350144e-05,
645
+ "loss": 3.3396,
646
+ "step": 910
647
+ },
648
+ {
649
+ "epoch": 0.184,
650
+ "grad_norm": 2.7633776664733887,
651
+ "learning_rate": 4.893298743830168e-05,
652
+ "loss": 2.9798,
653
+ "step": 920
654
+ },
655
+ {
656
+ "epoch": 0.186,
657
+ "grad_norm": 6.744080543518066,
658
+ "learning_rate": 4.888195905305859e-05,
659
+ "loss": 1.153,
660
+ "step": 930
661
+ },
662
+ {
663
+ "epoch": 0.188,
664
+ "grad_norm": 55.569053649902344,
665
+ "learning_rate": 4.882976669482367e-05,
666
+ "loss": 3.1989,
667
+ "step": 940
668
+ },
669
+ {
670
+ "epoch": 0.19,
671
+ "grad_norm": 20.480836868286133,
672
+ "learning_rate": 4.877641290737884e-05,
673
+ "loss": 2.7478,
674
+ "step": 950
675
+ },
676
+ {
677
+ "epoch": 0.192,
678
+ "grad_norm": 3.698868751525879,
679
+ "learning_rate": 4.8721900291112415e-05,
680
+ "loss": 2.09,
681
+ "step": 960
682
+ },
683
+ {
684
+ "epoch": 0.194,
685
+ "grad_norm": 5.127801418304443,
686
+ "learning_rate": 4.8666231502892415e-05,
687
+ "loss": 1.7634,
688
+ "step": 970
689
+ },
690
+ {
691
+ "epoch": 0.196,
692
+ "grad_norm": 3.1002347469329834,
693
+ "learning_rate": 4.860940925593703e-05,
694
+ "loss": 1.6288,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 0.198,
699
+ "grad_norm": 2.849256753921509,
700
+ "learning_rate": 4.855143631968242e-05,
701
+ "loss": 2.9691,
702
+ "step": 990
703
+ },
704
+ {
705
+ "epoch": 0.2,
706
+ "grad_norm": 15.38534164428711,
707
+ "learning_rate": 4.849231551964771e-05,
708
+ "loss": 3.5196,
709
+ "step": 1000
710
+ },
711
+ {
712
+ "epoch": 0.202,
713
+ "grad_norm": 6.950368881225586,
714
+ "learning_rate": 4.843204973729729e-05,
715
+ "loss": 3.1836,
716
+ "step": 1010
717
+ },
718
+ {
719
+ "epoch": 0.204,
720
+ "grad_norm": 19.31244659423828,
721
+ "learning_rate": 4.837064190990036e-05,
722
+ "loss": 3.1554,
723
+ "step": 1020
724
+ },
725
+ {
726
+ "epoch": 0.206,
727
+ "grad_norm": 52.91188430786133,
728
+ "learning_rate": 4.830809503038781e-05,
729
+ "loss": 3.0401,
730
+ "step": 1030
731
+ },
732
+ {
733
+ "epoch": 0.208,
734
+ "grad_norm": 3.394774913787842,
735
+ "learning_rate": 4.8244412147206284e-05,
736
+ "loss": 4.8929,
737
+ "step": 1040
738
+ },
739
+ {
740
+ "epoch": 0.21,
741
+ "grad_norm": 6.458067417144775,
742
+ "learning_rate": 4.817959636416969e-05,
743
+ "loss": 2.9868,
744
+ "step": 1050
745
+ },
746
+ {
747
+ "epoch": 0.212,
748
+ "grad_norm": 13.139763832092285,
749
+ "learning_rate": 4.8113650840307834e-05,
750
+ "loss": 5.5685,
751
+ "step": 1060
752
+ },
753
+ {
754
+ "epoch": 0.214,
755
+ "grad_norm": 7.955199718475342,
756
+ "learning_rate": 4.8046578789712515e-05,
757
+ "loss": 2.2926,
758
+ "step": 1070
759
+ },
760
+ {
761
+ "epoch": 0.216,
762
+ "grad_norm": 3.7902987003326416,
763
+ "learning_rate": 4.797838348138086e-05,
764
+ "loss": 3.3832,
765
+ "step": 1080
766
+ },
767
+ {
768
+ "epoch": 0.218,
769
+ "grad_norm": 7.483827590942383,
770
+ "learning_rate": 4.790906823905599e-05,
771
+ "loss": 4.5766,
772
+ "step": 1090
773
+ },
774
+ {
775
+ "epoch": 0.22,
776
+ "grad_norm": 2.907489538192749,
777
+ "learning_rate": 4.783863644106502e-05,
778
+ "loss": 3.2696,
779
+ "step": 1100
780
+ },
781
+ {
782
+ "epoch": 0.222,
783
+ "grad_norm": 2.127671003341675,
784
+ "learning_rate": 4.776709152015443e-05,
785
+ "loss": 1.583,
786
+ "step": 1110
787
+ },
788
+ {
789
+ "epoch": 0.224,
790
+ "grad_norm": 9.034016609191895,
791
+ "learning_rate": 4.769443696332272e-05,
792
+ "loss": 2.2554,
793
+ "step": 1120
794
+ },
795
+ {
796
+ "epoch": 0.226,
797
+ "grad_norm": 0.7481829524040222,
798
+ "learning_rate": 4.762067631165049e-05,
799
+ "loss": 1.6651,
800
+ "step": 1130
801
+ },
802
+ {
803
+ "epoch": 0.228,
804
+ "grad_norm": 19.638362884521484,
805
+ "learning_rate": 4.754581316012785e-05,
806
+ "loss": 3.1578,
807
+ "step": 1140
808
+ },
809
+ {
810
+ "epoch": 0.23,
811
+ "grad_norm": 2.9221441745758057,
812
+ "learning_rate": 4.7469851157479177e-05,
813
+ "loss": 2.4297,
814
+ "step": 1150
815
+ },
816
+ {
817
+ "epoch": 0.232,
818
+ "grad_norm": 13.748416900634766,
819
+ "learning_rate": 4.7392794005985326e-05,
820
+ "loss": 2.9167,
821
+ "step": 1160
822
+ },
823
+ {
824
+ "epoch": 0.234,
825
+ "grad_norm": 2.525977611541748,
826
+ "learning_rate": 4.731464546130314e-05,
827
+ "loss": 4.5697,
828
+ "step": 1170
829
+ },
830
+ {
831
+ "epoch": 0.236,
832
+ "grad_norm": 1.523427963256836,
833
+ "learning_rate": 4.723540933228244e-05,
834
+ "loss": 2.1219,
835
+ "step": 1180
836
+ },
837
+ {
838
+ "epoch": 0.238,
839
+ "grad_norm": 7.397847652435303,
840
+ "learning_rate": 4.715508948078037e-05,
841
+ "loss": 2.4125,
842
+ "step": 1190
843
+ },
844
+ {
845
+ "epoch": 0.24,
846
+ "grad_norm": 1.8037457466125488,
847
+ "learning_rate": 4.707368982147318e-05,
848
+ "loss": 4.1166,
849
+ "step": 1200
850
+ },
851
+ {
852
+ "epoch": 0.242,
853
+ "grad_norm": 2.4662930965423584,
854
+ "learning_rate": 4.6991214321665414e-05,
855
+ "loss": 3.1524,
856
+ "step": 1210
857
+ },
858
+ {
859
+ "epoch": 0.244,
860
+ "grad_norm": 5.675219535827637,
861
+ "learning_rate": 4.690766700109659e-05,
862
+ "loss": 1.5237,
863
+ "step": 1220
864
+ },
865
+ {
866
+ "epoch": 0.246,
867
+ "grad_norm": 11.182186126708984,
868
+ "learning_rate": 4.682305193174524e-05,
869
+ "loss": 2.8092,
870
+ "step": 1230
871
+ },
872
+ {
873
+ "epoch": 0.248,
874
+ "grad_norm": 13.100793838500977,
875
+ "learning_rate": 4.6737373237630476e-05,
876
+ "loss": 2.256,
877
+ "step": 1240
878
+ },
879
+ {
880
+ "epoch": 0.25,
881
+ "grad_norm": 0.881151556968689,
882
+ "learning_rate": 4.665063509461097e-05,
883
+ "loss": 2.5601,
884
+ "step": 1250
885
+ },
886
+ {
887
+ "epoch": 0.252,
888
+ "grad_norm": 1.001516342163086,
889
+ "learning_rate": 4.656284173018144e-05,
890
+ "loss": 2.5502,
891
+ "step": 1260
892
+ },
893
+ {
894
+ "epoch": 0.254,
895
+ "grad_norm": 2.2227511405944824,
896
+ "learning_rate": 4.6473997423266614e-05,
897
+ "loss": 3.4447,
898
+ "step": 1270
899
+ },
900
+ {
901
+ "epoch": 0.256,
902
+ "grad_norm": 3.1253716945648193,
903
+ "learning_rate": 4.638410650401267e-05,
904
+ "loss": 1.6954,
905
+ "step": 1280
906
+ },
907
+ {
908
+ "epoch": 0.258,
909
+ "grad_norm": 4.01163387298584,
910
+ "learning_rate": 4.629317335357619e-05,
911
+ "loss": 2.353,
912
+ "step": 1290
913
+ },
914
+ {
915
+ "epoch": 0.26,
916
+ "grad_norm": 14.949892044067383,
917
+ "learning_rate": 4.620120240391065e-05,
918
+ "loss": 2.1544,
919
+ "step": 1300
920
+ },
921
+ {
922
+ "epoch": 0.262,
923
+ "grad_norm": 14.187519073486328,
924
+ "learning_rate": 4.610819813755038e-05,
925
+ "loss": 1.2159,
926
+ "step": 1310
927
+ },
928
+ {
929
+ "epoch": 0.264,
930
+ "grad_norm": 0.4033137559890747,
931
+ "learning_rate": 4.601416508739211e-05,
932
+ "loss": 1.9003,
933
+ "step": 1320
934
+ },
935
+ {
936
+ "epoch": 0.266,
937
+ "grad_norm": 97.27405548095703,
938
+ "learning_rate": 4.591910783647404e-05,
939
+ "loss": 4.5354,
940
+ "step": 1330
941
+ },
942
+ {
943
+ "epoch": 0.268,
944
+ "grad_norm": 1.0602211952209473,
945
+ "learning_rate": 4.5823031017752485e-05,
946
+ "loss": 1.9724,
947
+ "step": 1340
948
+ },
949
+ {
950
+ "epoch": 0.27,
951
+ "grad_norm": 13.463448524475098,
952
+ "learning_rate": 4.572593931387604e-05,
953
+ "loss": 2.4934,
954
+ "step": 1350
955
+ },
956
+ {
957
+ "epoch": 0.272,
958
+ "grad_norm": 21.87057876586914,
959
+ "learning_rate": 4.562783745695738e-05,
960
+ "loss": 2.1056,
961
+ "step": 1360
962
+ },
963
+ {
964
+ "epoch": 0.274,
965
+ "grad_norm": 1.0790810585021973,
966
+ "learning_rate": 4.5528730228342605e-05,
967
+ "loss": 1.5175,
968
+ "step": 1370
969
+ },
970
+ {
971
+ "epoch": 0.276,
972
+ "grad_norm": 20.827247619628906,
973
+ "learning_rate": 4.542862245837821e-05,
974
+ "loss": 2.1733,
975
+ "step": 1380
976
+ },
977
+ {
978
+ "epoch": 0.278,
979
+ "grad_norm": 3.0733835697174072,
980
+ "learning_rate": 4.532751902617569e-05,
981
+ "loss": 1.431,
982
+ "step": 1390
983
+ },
984
+ {
985
+ "epoch": 0.28,
986
+ "grad_norm": 5.851264476776123,
987
+ "learning_rate": 4.522542485937369e-05,
988
+ "loss": 1.5888,
989
+ "step": 1400
990
+ },
991
+ {
992
+ "epoch": 0.282,
993
+ "grad_norm": 5.039645195007324,
994
+ "learning_rate": 4.512234493389785e-05,
995
+ "loss": 2.84,
996
+ "step": 1410
997
+ },
998
+ {
999
+ "epoch": 0.284,
1000
+ "grad_norm": 2.501115322113037,
1001
+ "learning_rate": 4.5018284273718336e-05,
1002
+ "loss": 3.0874,
1003
+ "step": 1420
1004
+ },
1005
+ {
1006
+ "epoch": 0.286,
1007
+ "grad_norm": 10.413125038146973,
1008
+ "learning_rate": 4.491324795060491e-05,
1009
+ "loss": 1.6856,
1010
+ "step": 1430
1011
+ },
1012
+ {
1013
+ "epoch": 0.288,
1014
+ "grad_norm": 4.448335647583008,
1015
+ "learning_rate": 4.480724108387977e-05,
1016
+ "loss": 3.0233,
1017
+ "step": 1440
1018
+ },
1019
+ {
1020
+ "epoch": 0.29,
1021
+ "grad_norm": 4.848165988922119,
1022
+ "learning_rate": 4.4700268840168045e-05,
1023
+ "loss": 2.6897,
1024
+ "step": 1450
1025
+ },
1026
+ {
1027
+ "epoch": 0.292,
1028
+ "grad_norm": 1.7471359968185425,
1029
+ "learning_rate": 4.4592336433146e-05,
1030
+ "loss": 5.0716,
1031
+ "step": 1460
1032
+ },
1033
+ {
1034
+ "epoch": 0.294,
1035
+ "grad_norm": 0.7191880345344543,
1036
+ "learning_rate": 4.448344912328686e-05,
1037
+ "loss": 1.5289,
1038
+ "step": 1470
1039
+ },
1040
+ {
1041
+ "epoch": 0.296,
1042
+ "grad_norm": 29.567447662353516,
1043
+ "learning_rate": 4.4373612217604496e-05,
1044
+ "loss": 5.0471,
1045
+ "step": 1480
1046
+ },
1047
+ {
1048
+ "epoch": 0.298,
1049
+ "grad_norm": 5.8198723793029785,
1050
+ "learning_rate": 4.426283106939474e-05,
1051
+ "loss": 1.6411,
1052
+ "step": 1490
1053
+ },
1054
+ {
1055
+ "epoch": 0.3,
1056
+ "grad_norm": 8.184475898742676,
1057
+ "learning_rate": 4.415111107797445e-05,
1058
+ "loss": 3.0973,
1059
+ "step": 1500
1060
+ },
1061
+ {
1062
+ "epoch": 0.302,
1063
+ "grad_norm": 14.548653602600098,
1064
+ "learning_rate": 4.403845768841842e-05,
1065
+ "loss": 2.0314,
1066
+ "step": 1510
1067
+ },
1068
+ {
1069
+ "epoch": 0.304,
1070
+ "grad_norm": 20.58685302734375,
1071
+ "learning_rate": 4.3924876391293915e-05,
1072
+ "loss": 5.9555,
1073
+ "step": 1520
1074
+ },
1075
+ {
1076
+ "epoch": 0.306,
1077
+ "grad_norm": 4.702794551849365,
1078
+ "learning_rate": 4.381037272239311e-05,
1079
+ "loss": 2.5155,
1080
+ "step": 1530
1081
+ },
1082
+ {
1083
+ "epoch": 0.308,
1084
+ "grad_norm": 2.8011796474456787,
1085
+ "learning_rate": 4.36949522624633e-05,
1086
+ "loss": 3.6459,
1087
+ "step": 1540
1088
+ },
1089
+ {
1090
+ "epoch": 0.31,
1091
+ "grad_norm": 2.2072603702545166,
1092
+ "learning_rate": 4.357862063693486e-05,
1093
+ "loss": 3.0688,
1094
+ "step": 1550
1095
+ },
1096
+ {
1097
+ "epoch": 0.312,
1098
+ "grad_norm": 8.792641639709473,
1099
+ "learning_rate": 4.3461383515647106e-05,
1100
+ "loss": 5.096,
1101
+ "step": 1560
1102
+ },
1103
+ {
1104
+ "epoch": 0.314,
1105
+ "grad_norm": 23.349695205688477,
1106
+ "learning_rate": 4.334324661257191e-05,
1107
+ "loss": 3.9776,
1108
+ "step": 1570
1109
+ },
1110
+ {
1111
+ "epoch": 0.316,
1112
+ "grad_norm": 4.662014484405518,
1113
+ "learning_rate": 4.3224215685535294e-05,
1114
+ "loss": 1.8869,
1115
+ "step": 1580
1116
+ },
1117
+ {
1118
+ "epoch": 0.318,
1119
+ "grad_norm": 5.489886283874512,
1120
+ "learning_rate": 4.3104296535936695e-05,
1121
+ "loss": 2.1454,
1122
+ "step": 1590
1123
+ },
1124
+ {
1125
+ "epoch": 0.32,
1126
+ "grad_norm": 8.950507164001465,
1127
+ "learning_rate": 4.2983495008466276e-05,
1128
+ "loss": 3.0165,
1129
+ "step": 1600
1130
+ },
1131
+ {
1132
+ "epoch": 0.322,
1133
+ "grad_norm": 39.95512390136719,
1134
+ "learning_rate": 4.2861816990820084e-05,
1135
+ "loss": 2.9248,
1136
+ "step": 1610
1137
+ },
1138
+ {
1139
+ "epoch": 0.324,
1140
+ "grad_norm": 101.46671295166016,
1141
+ "learning_rate": 4.273926841341302e-05,
1142
+ "loss": 5.9823,
1143
+ "step": 1620
1144
+ },
1145
+ {
1146
+ "epoch": 0.326,
1147
+ "grad_norm": 2.456148862838745,
1148
+ "learning_rate": 4.261585524908987e-05,
1149
+ "loss": 4.5845,
1150
+ "step": 1630
1151
+ },
1152
+ {
1153
+ "epoch": 0.328,
1154
+ "grad_norm": 11.807568550109863,
1155
+ "learning_rate": 4.249158351283414e-05,
1156
+ "loss": 2.5418,
1157
+ "step": 1640
1158
+ },
1159
+ {
1160
+ "epoch": 0.33,
1161
+ "grad_norm": 2.619940996170044,
1162
+ "learning_rate": 4.2366459261474933e-05,
1163
+ "loss": 3.9026,
1164
+ "step": 1650
1165
+ },
1166
+ {
1167
+ "epoch": 0.332,
1168
+ "grad_norm": 6.112611770629883,
1169
+ "learning_rate": 4.224048859339175e-05,
1170
+ "loss": 3.7564,
1171
+ "step": 1660
1172
+ },
1173
+ {
1174
+ "epoch": 0.334,
1175
+ "grad_norm": 5.871501922607422,
1176
+ "learning_rate": 4.211367764821722e-05,
1177
+ "loss": 1.7162,
1178
+ "step": 1670
1179
+ },
1180
+ {
1181
+ "epoch": 0.336,
1182
+ "grad_norm": 2.189948081970215,
1183
+ "learning_rate": 4.198603260653792e-05,
1184
+ "loss": 3.2491,
1185
+ "step": 1680
1186
+ },
1187
+ {
1188
+ "epoch": 0.338,
1189
+ "grad_norm": 22.6879825592041,
1190
+ "learning_rate": 4.185755968959308e-05,
1191
+ "loss": 1.595,
1192
+ "step": 1690
1193
+ },
1194
+ {
1195
+ "epoch": 0.34,
1196
+ "grad_norm": 15.22309684753418,
1197
+ "learning_rate": 4.172826515897146e-05,
1198
+ "loss": 2.5093,
1199
+ "step": 1700
1200
+ },
1201
+ {
1202
+ "epoch": 0.342,
1203
+ "grad_norm": 5.424519062042236,
1204
+ "learning_rate": 4.1598155316306044e-05,
1205
+ "loss": 5.9477,
1206
+ "step": 1710
1207
+ },
1208
+ {
1209
+ "epoch": 0.344,
1210
+ "grad_norm": 10.794046401977539,
1211
+ "learning_rate": 4.146723650296701e-05,
1212
+ "loss": 2.6358,
1213
+ "step": 1720
1214
+ },
1215
+ {
1216
+ "epoch": 0.346,
1217
+ "grad_norm": 3.659132719039917,
1218
+ "learning_rate": 4.133551509975264e-05,
1219
+ "loss": 1.4777,
1220
+ "step": 1730
1221
+ },
1222
+ {
1223
+ "epoch": 0.348,
1224
+ "grad_norm": 3.7974021434783936,
1225
+ "learning_rate": 4.1202997526578276e-05,
1226
+ "loss": 2.132,
1227
+ "step": 1740
1228
+ },
1229
+ {
1230
+ "epoch": 0.35,
1231
+ "grad_norm": 5.216747283935547,
1232
+ "learning_rate": 4.1069690242163484e-05,
1233
+ "loss": 1.4507,
1234
+ "step": 1750
1235
+ },
1236
+ {
1237
+ "epoch": 0.352,
1238
+ "grad_norm": 5.669449329376221,
1239
+ "learning_rate": 4.093559974371725e-05,
1240
+ "loss": 1.2368,
1241
+ "step": 1760
1242
+ },
1243
+ {
1244
+ "epoch": 0.354,
1245
+ "grad_norm": 3.354207992553711,
1246
+ "learning_rate": 4.080073256662127e-05,
1247
+ "loss": 0.9967,
1248
+ "step": 1770
1249
+ },
1250
+ {
1251
+ "epoch": 0.356,
1252
+ "grad_norm": 10.8993558883667,
1253
+ "learning_rate": 4.066509528411152e-05,
1254
+ "loss": 5.9177,
1255
+ "step": 1780
1256
+ },
1257
+ {
1258
+ "epoch": 0.358,
1259
+ "grad_norm": 13.498696327209473,
1260
+ "learning_rate": 4.052869450695776e-05,
1261
+ "loss": 2.3273,
1262
+ "step": 1790
1263
+ },
1264
+ {
1265
+ "epoch": 0.36,
1266
+ "grad_norm": 1.2119014263153076,
1267
+ "learning_rate": 4.039153688314145e-05,
1268
+ "loss": 2.7299,
1269
+ "step": 1800
1270
+ },
1271
+ {
1272
+ "epoch": 0.362,
1273
+ "grad_norm": 1.2092421054840088,
1274
+ "learning_rate": 4.02536290975317e-05,
1275
+ "loss": 2.0494,
1276
+ "step": 1810
1277
+ },
1278
+ {
1279
+ "epoch": 0.364,
1280
+ "grad_norm": 15.161354064941406,
1281
+ "learning_rate": 4.011497787155938e-05,
1282
+ "loss": 1.9595,
1283
+ "step": 1820
1284
+ },
1285
+ {
1286
+ "epoch": 0.366,
1287
+ "grad_norm": 13.400138854980469,
1288
+ "learning_rate": 3.997558996288965e-05,
1289
+ "loss": 2.4506,
1290
+ "step": 1830
1291
+ },
1292
+ {
1293
+ "epoch": 0.368,
1294
+ "grad_norm": 0.7680931687355042,
1295
+ "learning_rate": 3.983547216509254e-05,
1296
+ "loss": 3.6176,
1297
+ "step": 1840
1298
+ },
1299
+ {
1300
+ "epoch": 0.37,
1301
+ "grad_norm": 2.35080623626709,
1302
+ "learning_rate": 3.969463130731183e-05,
1303
+ "loss": 3.2954,
1304
+ "step": 1850
1305
+ },
1306
+ {
1307
+ "epoch": 0.372,
1308
+ "grad_norm": 0.0,
1309
+ "learning_rate": 3.955307425393224e-05,
1310
+ "loss": 2.0271,
1311
+ "step": 1860
1312
+ },
1313
+ {
1314
+ "epoch": 0.374,
1315
+ "grad_norm": 9.143167495727539,
1316
+ "learning_rate": 3.941080790424484e-05,
1317
+ "loss": 1.6445,
1318
+ "step": 1870
1319
+ },
1320
+ {
1321
+ "epoch": 0.376,
1322
+ "grad_norm": 9.658303260803223,
1323
+ "learning_rate": 3.92678391921108e-05,
1324
+ "loss": 2.4891,
1325
+ "step": 1880
1326
+ },
1327
+ {
1328
+ "epoch": 0.378,
1329
+ "grad_norm": 6.326284885406494,
1330
+ "learning_rate": 3.912417508562345e-05,
1331
+ "loss": 1.9795,
1332
+ "step": 1890
1333
+ },
1334
+ {
1335
+ "epoch": 0.38,
1336
+ "grad_norm": 1.721880555152893,
1337
+ "learning_rate": 3.897982258676867e-05,
1338
+ "loss": 1.7931,
1339
+ "step": 1900
1340
+ },
1341
+ {
1342
+ "epoch": 0.382,
1343
+ "grad_norm": 14.906951904296875,
1344
+ "learning_rate": 3.883478873108361e-05,
1345
+ "loss": 2.9522,
1346
+ "step": 1910
1347
+ },
1348
+ {
1349
+ "epoch": 0.384,
1350
+ "grad_norm": 3.197312593460083,
1351
+ "learning_rate": 3.868908058731376e-05,
1352
+ "loss": 2.0274,
1353
+ "step": 1920
1354
+ },
1355
+ {
1356
+ "epoch": 0.386,
1357
+ "grad_norm": 12.083334922790527,
1358
+ "learning_rate": 3.85427052570685e-05,
1359
+ "loss": 3.1124,
1360
+ "step": 1930
1361
+ },
1362
+ {
1363
+ "epoch": 0.388,
1364
+ "grad_norm": 7.249095439910889,
1365
+ "learning_rate": 3.8395669874474915e-05,
1366
+ "loss": 2.101,
1367
+ "step": 1940
1368
+ },
1369
+ {
1370
+ "epoch": 0.39,
1371
+ "grad_norm": 1.5133755207061768,
1372
+ "learning_rate": 3.824798160583012e-05,
1373
+ "loss": 3.1344,
1374
+ "step": 1950
1375
+ },
1376
+ {
1377
+ "epoch": 0.392,
1378
+ "grad_norm": 1.0909286737442017,
1379
+ "learning_rate": 3.8099647649251986e-05,
1380
+ "loss": 1.7248,
1381
+ "step": 1960
1382
+ },
1383
+ {
1384
+ "epoch": 0.394,
1385
+ "grad_norm": 1.1263455152511597,
1386
+ "learning_rate": 3.795067523432826e-05,
1387
+ "loss": 1.9146,
1388
+ "step": 1970
1389
+ },
1390
+ {
1391
+ "epoch": 0.396,
1392
+ "grad_norm": 5.309441089630127,
1393
+ "learning_rate": 3.780107162176429e-05,
1394
+ "loss": 2.3492,
1395
+ "step": 1980
1396
+ },
1397
+ {
1398
+ "epoch": 0.398,
1399
+ "grad_norm": 5.851804733276367,
1400
+ "learning_rate": 3.765084410302909e-05,
1401
+ "loss": 1.5525,
1402
+ "step": 1990
1403
+ },
1404
+ {
1405
+ "epoch": 0.4,
1406
+ "grad_norm": 6.578305244445801,
1407
+ "learning_rate": 3.7500000000000003e-05,
1408
+ "loss": 2.8312,
1409
+ "step": 2000
1410
+ },
1411
+ {
1412
+ "epoch": 0.402,
1413
+ "grad_norm": 8.410406112670898,
1414
+ "learning_rate": 3.7348546664605777e-05,
1415
+ "loss": 3.3568,
1416
+ "step": 2010
1417
+ },
1418
+ {
1419
+ "epoch": 0.404,
1420
+ "grad_norm": 1.6206369400024414,
1421
+ "learning_rate": 3.719649147846832e-05,
1422
+ "loss": 1.1778,
1423
+ "step": 2020
1424
+ },
1425
+ {
1426
+ "epoch": 0.406,
1427
+ "grad_norm": 3.7382287979125977,
1428
+ "learning_rate": 3.704384185254288e-05,
1429
+ "loss": 2.3964,
1430
+ "step": 2030
1431
+ },
1432
+ {
1433
+ "epoch": 0.408,
1434
+ "grad_norm": 2.0517022609710693,
1435
+ "learning_rate": 3.689060522675689e-05,
1436
+ "loss": 2.0172,
1437
+ "step": 2040
1438
+ },
1439
+ {
1440
+ "epoch": 0.41,
1441
+ "grad_norm": 2.8966128826141357,
1442
+ "learning_rate": 3.673678906964727e-05,
1443
+ "loss": 2.1464,
1444
+ "step": 2050
1445
+ },
1446
+ {
1447
+ "epoch": 0.412,
1448
+ "grad_norm": 0.9577096104621887,
1449
+ "learning_rate": 3.6582400877996546e-05,
1450
+ "loss": 2.1743,
1451
+ "step": 2060
1452
+ },
1453
+ {
1454
+ "epoch": 0.414,
1455
+ "grad_norm": 2.7587995529174805,
1456
+ "learning_rate": 3.642744817646736e-05,
1457
+ "loss": 2.785,
1458
+ "step": 2070
1459
+ },
1460
+ {
1461
+ "epoch": 0.416,
1462
+ "grad_norm": 6.39242696762085,
1463
+ "learning_rate": 3.627193851723577e-05,
1464
+ "loss": 1.5782,
1465
+ "step": 2080
1466
+ },
1467
+ {
1468
+ "epoch": 0.418,
1469
+ "grad_norm": 19.898326873779297,
1470
+ "learning_rate": 3.611587947962319e-05,
1471
+ "loss": 3.6745,
1472
+ "step": 2090
1473
+ },
1474
+ {
1475
+ "epoch": 0.42,
1476
+ "grad_norm": 10.48559856414795,
1477
+ "learning_rate": 3.5959278669726935e-05,
1478
+ "loss": 1.7939,
1479
+ "step": 2100
1480
+ },
1481
+ {
1482
+ "epoch": 0.422,
1483
+ "grad_norm": 7.97528076171875,
1484
+ "learning_rate": 3.580214372004956e-05,
1485
+ "loss": 1.9243,
1486
+ "step": 2110
1487
+ },
1488
+ {
1489
+ "epoch": 0.424,
1490
+ "grad_norm": 4.453253269195557,
1491
+ "learning_rate": 3.564448228912682e-05,
1492
+ "loss": 3.6028,
1493
+ "step": 2120
1494
+ },
1495
+ {
1496
+ "epoch": 0.426,
1497
+ "grad_norm": 3.2717721462249756,
1498
+ "learning_rate": 3.548630206115443e-05,
1499
+ "loss": 1.3939,
1500
+ "step": 2130
1501
+ },
1502
+ {
1503
+ "epoch": 0.428,
1504
+ "grad_norm": 5.499331951141357,
1505
+ "learning_rate": 3.532761074561355e-05,
1506
+ "loss": 2.1193,
1507
+ "step": 2140
1508
+ },
1509
+ {
1510
+ "epoch": 0.43,
1511
+ "grad_norm": 2.989884853363037,
1512
+ "learning_rate": 3.516841607689501e-05,
1513
+ "loss": 2.0047,
1514
+ "step": 2150
1515
+ },
1516
+ {
1517
+ "epoch": 0.432,
1518
+ "grad_norm": 25.216196060180664,
1519
+ "learning_rate": 3.5008725813922386e-05,
1520
+ "loss": 5.2666,
1521
+ "step": 2160
1522
+ },
1523
+ {
1524
+ "epoch": 0.434,
1525
+ "grad_norm": 2.533149003982544,
1526
+ "learning_rate": 3.484854773977378e-05,
1527
+ "loss": 1.8129,
1528
+ "step": 2170
1529
+ },
1530
+ {
1531
+ "epoch": 0.436,
1532
+ "grad_norm": 1.6373289823532104,
1533
+ "learning_rate": 3.4687889661302576e-05,
1534
+ "loss": 1.3969,
1535
+ "step": 2180
1536
+ },
1537
+ {
1538
+ "epoch": 0.438,
1539
+ "grad_norm": 10.936837196350098,
1540
+ "learning_rate": 3.452675940875686e-05,
1541
+ "loss": 1.9788,
1542
+ "step": 2190
1543
+ },
1544
+ {
1545
+ "epoch": 0.44,
1546
+ "grad_norm": 5.747529983520508,
1547
+ "learning_rate": 3.436516483539781e-05,
1548
+ "loss": 1.5522,
1549
+ "step": 2200
1550
+ },
1551
+ {
1552
+ "epoch": 0.442,
1553
+ "grad_norm": 22.60832405090332,
1554
+ "learning_rate": 3.4203113817116957e-05,
1555
+ "loss": 2.1951,
1556
+ "step": 2210
1557
+ },
1558
+ {
1559
+ "epoch": 0.444,
1560
+ "grad_norm": 6.1106953620910645,
1561
+ "learning_rate": 3.4040614252052305e-05,
1562
+ "loss": 5.2669,
1563
+ "step": 2220
1564
+ },
1565
+ {
1566
+ "epoch": 0.446,
1567
+ "grad_norm": 3.707664728164673,
1568
+ "learning_rate": 3.387767406020343e-05,
1569
+ "loss": 2.5184,
1570
+ "step": 2230
1571
+ },
1572
+ {
1573
+ "epoch": 0.448,
1574
+ "grad_norm": 5.510468006134033,
1575
+ "learning_rate": 3.3714301183045385e-05,
1576
+ "loss": 2.0463,
1577
+ "step": 2240
1578
+ },
1579
+ {
1580
+ "epoch": 0.45,
1581
+ "grad_norm": 43.166866302490234,
1582
+ "learning_rate": 3.355050358314172e-05,
1583
+ "loss": 2.9354,
1584
+ "step": 2250
1585
+ },
1586
+ {
1587
+ "epoch": 0.452,
1588
+ "grad_norm": 0.45324602723121643,
1589
+ "learning_rate": 3.338628924375638e-05,
1590
+ "loss": 2.1297,
1591
+ "step": 2260
1592
+ },
1593
+ {
1594
+ "epoch": 0.454,
1595
+ "grad_norm": 8.361921310424805,
1596
+ "learning_rate": 3.322166616846458e-05,
1597
+ "loss": 3.2082,
1598
+ "step": 2270
1599
+ },
1600
+ {
1601
+ "epoch": 0.456,
1602
+ "grad_norm": 1.730605125427246,
1603
+ "learning_rate": 3.305664238076278e-05,
1604
+ "loss": 2.0667,
1605
+ "step": 2280
1606
+ },
1607
+ {
1608
+ "epoch": 0.458,
1609
+ "grad_norm": 13.287188529968262,
1610
+ "learning_rate": 3.289122592367757e-05,
1611
+ "loss": 2.4089,
1612
+ "step": 2290
1613
+ },
1614
+ {
1615
+ "epoch": 0.46,
1616
+ "grad_norm": 10.110196113586426,
1617
+ "learning_rate": 3.272542485937369e-05,
1618
+ "loss": 2.0842,
1619
+ "step": 2300
1620
+ },
1621
+ {
1622
+ "epoch": 0.462,
1623
+ "grad_norm": 13.76339340209961,
1624
+ "learning_rate": 3.2559247268761115e-05,
1625
+ "loss": 3.9489,
1626
+ "step": 2310
1627
+ },
1628
+ {
1629
+ "epoch": 0.464,
1630
+ "grad_norm": 8.27099895477295,
1631
+ "learning_rate": 3.239270125110117e-05,
1632
+ "loss": 1.8036,
1633
+ "step": 2320
1634
+ },
1635
+ {
1636
+ "epoch": 0.466,
1637
+ "grad_norm": 1.385971188545227,
1638
+ "learning_rate": 3.222579492361179e-05,
1639
+ "loss": 2.8004,
1640
+ "step": 2330
1641
+ },
1642
+ {
1643
+ "epoch": 0.468,
1644
+ "grad_norm": 10.99479866027832,
1645
+ "learning_rate": 3.205853642107192e-05,
1646
+ "loss": 1.0807,
1647
+ "step": 2340
1648
+ },
1649
+ {
1650
+ "epoch": 0.47,
1651
+ "grad_norm": 7.162081718444824,
1652
+ "learning_rate": 3.1890933895424976e-05,
1653
+ "loss": 4.2218,
1654
+ "step": 2350
1655
+ },
1656
+ {
1657
+ "epoch": 0.472,
1658
+ "grad_norm": 3.080836057662964,
1659
+ "learning_rate": 3.172299551538164e-05,
1660
+ "loss": 1.9778,
1661
+ "step": 2360
1662
+ },
1663
+ {
1664
+ "epoch": 0.474,
1665
+ "grad_norm": 1.8797277212142944,
1666
+ "learning_rate": 3.155472946602162e-05,
1667
+ "loss": 2.7487,
1668
+ "step": 2370
1669
+ },
1670
+ {
1671
+ "epoch": 0.476,
1672
+ "grad_norm": 7.540584087371826,
1673
+ "learning_rate": 3.138614394839476e-05,
1674
+ "loss": 2.199,
1675
+ "step": 2380
1676
+ },
1677
+ {
1678
+ "epoch": 0.478,
1679
+ "grad_norm": 5.368736743927002,
1680
+ "learning_rate": 3.121724717912138e-05,
1681
+ "loss": 3.5763,
1682
+ "step": 2390
1683
+ },
1684
+ {
1685
+ "epoch": 0.48,
1686
+ "grad_norm": 8.255654335021973,
1687
+ "learning_rate": 3.104804738999169e-05,
1688
+ "loss": 3.4331,
1689
+ "step": 2400
1690
+ },
1691
+ {
1692
+ "epoch": 0.482,
1693
+ "grad_norm": 7.196650981903076,
1694
+ "learning_rate": 3.087855282756475e-05,
1695
+ "loss": 2.2419,
1696
+ "step": 2410
1697
+ },
1698
+ {
1699
+ "epoch": 0.484,
1700
+ "grad_norm": 3.529343843460083,
1701
+ "learning_rate": 3.0708771752766394e-05,
1702
+ "loss": 2.0476,
1703
+ "step": 2420
1704
+ },
1705
+ {
1706
+ "epoch": 0.486,
1707
+ "grad_norm": 5.323751926422119,
1708
+ "learning_rate": 3.053871244048669e-05,
1709
+ "loss": 1.3934,
1710
+ "step": 2430
1711
+ },
1712
+ {
1713
+ "epoch": 0.488,
1714
+ "grad_norm": 53.67280197143555,
1715
+ "learning_rate": 3.0368383179176585e-05,
1716
+ "loss": 2.7532,
1717
+ "step": 2440
1718
+ },
1719
+ {
1720
+ "epoch": 0.49,
1721
+ "grad_norm": 21.385303497314453,
1722
+ "learning_rate": 3.0197792270443982e-05,
1723
+ "loss": 2.112,
1724
+ "step": 2450
1725
+ },
1726
+ {
1727
+ "epoch": 0.492,
1728
+ "grad_norm": 5.010990142822266,
1729
+ "learning_rate": 3.002694802864912e-05,
1730
+ "loss": 1.8119,
1731
+ "step": 2460
1732
+ },
1733
+ {
1734
+ "epoch": 0.494,
1735
+ "grad_norm": 7.916762351989746,
1736
+ "learning_rate": 2.98558587804993e-05,
1737
+ "loss": 1.5163,
1738
+ "step": 2470
1739
+ },
1740
+ {
1741
+ "epoch": 0.496,
1742
+ "grad_norm": 34.273319244384766,
1743
+ "learning_rate": 2.9684532864643122e-05,
1744
+ "loss": 3.372,
1745
+ "step": 2480
1746
+ },
1747
+ {
1748
+ "epoch": 0.498,
1749
+ "grad_norm": 3.292635440826416,
1750
+ "learning_rate": 2.9512978631264006e-05,
1751
+ "loss": 1.5534,
1752
+ "step": 2490
1753
+ },
1754
+ {
1755
+ "epoch": 0.5,
1756
+ "grad_norm": 9.055399894714355,
1757
+ "learning_rate": 2.9341204441673266e-05,
1758
+ "loss": 1.8644,
1759
+ "step": 2500
1760
+ },
1761
+ {
1762
+ "epoch": 0.502,
1763
+ "grad_norm": 51.29086685180664,
1764
+ "learning_rate": 2.916921866790256e-05,
1765
+ "loss": 4.3985,
1766
+ "step": 2510
1767
+ },
1768
+ {
1769
+ "epoch": 0.504,
1770
+ "grad_norm": 9.632088661193848,
1771
+ "learning_rate": 2.8997029692295874e-05,
1772
+ "loss": 2.0158,
1773
+ "step": 2520
1774
+ },
1775
+ {
1776
+ "epoch": 0.506,
1777
+ "grad_norm": 31.112043380737305,
1778
+ "learning_rate": 2.8824645907100954e-05,
1779
+ "loss": 1.3677,
1780
+ "step": 2530
1781
+ },
1782
+ {
1783
+ "epoch": 0.508,
1784
+ "grad_norm": 21.64225959777832,
1785
+ "learning_rate": 2.8652075714060295e-05,
1786
+ "loss": 2.399,
1787
+ "step": 2540
1788
+ },
1789
+ {
1790
+ "epoch": 0.51,
1791
+ "grad_norm": 1.216705322265625,
1792
+ "learning_rate": 2.8479327524001636e-05,
1793
+ "loss": 4.2765,
1794
+ "step": 2550
1795
+ },
1796
+ {
1797
+ "epoch": 0.512,
1798
+ "grad_norm": 12.95290470123291,
1799
+ "learning_rate": 2.8306409756428064e-05,
1800
+ "loss": 3.2872,
1801
+ "step": 2560
1802
+ },
1803
+ {
1804
+ "epoch": 0.514,
1805
+ "grad_norm": 7.114419460296631,
1806
+ "learning_rate": 2.8133330839107608e-05,
1807
+ "loss": 3.1682,
1808
+ "step": 2570
1809
+ },
1810
+ {
1811
+ "epoch": 0.516,
1812
+ "grad_norm": 111.07962036132812,
1813
+ "learning_rate": 2.7960099207662532e-05,
1814
+ "loss": 3.3066,
1815
+ "step": 2580
1816
+ },
1817
+ {
1818
+ "epoch": 0.518,
1819
+ "grad_norm": 10.673909187316895,
1820
+ "learning_rate": 2.7786723305158136e-05,
1821
+ "loss": 1.8911,
1822
+ "step": 2590
1823
+ },
1824
+ {
1825
+ "epoch": 0.52,
1826
+ "grad_norm": 1.493356466293335,
1827
+ "learning_rate": 2.761321158169134e-05,
1828
+ "loss": 1.7402,
1829
+ "step": 2600
1830
+ },
1831
+ {
1832
+ "epoch": 0.522,
1833
+ "grad_norm": 1.4890353679656982,
1834
+ "learning_rate": 2.7439572493978736e-05,
1835
+ "loss": 1.1721,
1836
+ "step": 2610
1837
+ },
1838
+ {
1839
+ "epoch": 0.524,
1840
+ "grad_norm": 1.4277186393737793,
1841
+ "learning_rate": 2.726581450494451e-05,
1842
+ "loss": 1.2482,
1843
+ "step": 2620
1844
+ },
1845
+ {
1846
+ "epoch": 0.526,
1847
+ "grad_norm": 2.4278600215911865,
1848
+ "learning_rate": 2.7091946083307896e-05,
1849
+ "loss": 3.3641,
1850
+ "step": 2630
1851
+ },
1852
+ {
1853
+ "epoch": 0.528,
1854
+ "grad_norm": 3.5468785762786865,
1855
+ "learning_rate": 2.6917975703170466e-05,
1856
+ "loss": 1.9946,
1857
+ "step": 2640
1858
+ },
1859
+ {
1860
+ "epoch": 0.53,
1861
+ "grad_norm": 1.9078953266143799,
1862
+ "learning_rate": 2.674391184360313e-05,
1863
+ "loss": 1.0218,
1864
+ "step": 2650
1865
+ },
1866
+ {
1867
+ "epoch": 0.532,
1868
+ "grad_norm": 5.883788108825684,
1869
+ "learning_rate": 2.656976298823284e-05,
1870
+ "loss": 2.336,
1871
+ "step": 2660
1872
+ },
1873
+ {
1874
+ "epoch": 0.534,
1875
+ "grad_norm": 58.67000961303711,
1876
+ "learning_rate": 2.6395537624829096e-05,
1877
+ "loss": 4.6423,
1878
+ "step": 2670
1879
+ },
1880
+ {
1881
+ "epoch": 0.536,
1882
+ "grad_norm": 8.353065490722656,
1883
+ "learning_rate": 2.6221244244890336e-05,
1884
+ "loss": 1.845,
1885
+ "step": 2680
1886
+ },
1887
+ {
1888
+ "epoch": 0.538,
1889
+ "grad_norm": 2.636930227279663,
1890
+ "learning_rate": 2.604689134322999e-05,
1891
+ "loss": 2.3117,
1892
+ "step": 2690
1893
+ },
1894
+ {
1895
+ "epoch": 0.54,
1896
+ "grad_norm": 21.744890213012695,
1897
+ "learning_rate": 2.587248741756253e-05,
1898
+ "loss": 3.3991,
1899
+ "step": 2700
1900
+ },
1901
+ {
1902
+ "epoch": 0.542,
1903
+ "grad_norm": 203.36412048339844,
1904
+ "learning_rate": 2.5698040968089225e-05,
1905
+ "loss": 3.7802,
1906
+ "step": 2710
1907
+ },
1908
+ {
1909
+ "epoch": 0.544,
1910
+ "grad_norm": 1.4143257141113281,
1911
+ "learning_rate": 2.5523560497083926e-05,
1912
+ "loss": 1.4431,
1913
+ "step": 2720
1914
+ },
1915
+ {
1916
+ "epoch": 0.546,
1917
+ "grad_norm": 0.6071110367774963,
1918
+ "learning_rate": 2.5349054508478637e-05,
1919
+ "loss": 0.8369,
1920
+ "step": 2730
1921
+ },
1922
+ {
1923
+ "epoch": 0.548,
1924
+ "grad_norm": 10.470783233642578,
1925
+ "learning_rate": 2.517453150744904e-05,
1926
+ "loss": 2.658,
1927
+ "step": 2740
1928
+ },
1929
+ {
1930
+ "epoch": 0.55,
1931
+ "grad_norm": 14.130455017089844,
1932
+ "learning_rate": 2.5e-05,
1933
+ "loss": 2.12,
1934
+ "step": 2750
1935
+ },
1936
+ {
1937
+ "epoch": 0.552,
1938
+ "grad_norm": 1.0785049200057983,
1939
+ "learning_rate": 2.4825468492550964e-05,
1940
+ "loss": 2.1272,
1941
+ "step": 2760
1942
+ },
1943
+ {
1944
+ "epoch": 0.554,
1945
+ "grad_norm": 3.327045440673828,
1946
+ "learning_rate": 2.4650945491521372e-05,
1947
+ "loss": 1.5502,
1948
+ "step": 2770
1949
+ },
1950
+ {
1951
+ "epoch": 0.556,
1952
+ "grad_norm": 27.231857299804688,
1953
+ "learning_rate": 2.447643950291608e-05,
1954
+ "loss": 1.4815,
1955
+ "step": 2780
1956
+ },
1957
+ {
1958
+ "epoch": 0.558,
1959
+ "grad_norm": 22.837984085083008,
1960
+ "learning_rate": 2.4301959031910784e-05,
1961
+ "loss": 1.3745,
1962
+ "step": 2790
1963
+ },
1964
+ {
1965
+ "epoch": 0.56,
1966
+ "grad_norm": 5.593969821929932,
1967
+ "learning_rate": 2.4127512582437485e-05,
1968
+ "loss": 2.9765,
1969
+ "step": 2800
1970
+ },
1971
+ {
1972
+ "epoch": 0.562,
1973
+ "grad_norm": 6.5615644454956055,
1974
+ "learning_rate": 2.3953108656770016e-05,
1975
+ "loss": 1.2392,
1976
+ "step": 2810
1977
+ },
1978
+ {
1979
+ "epoch": 0.564,
1980
+ "grad_norm": 8.840109825134277,
1981
+ "learning_rate": 2.377875575510967e-05,
1982
+ "loss": 1.6202,
1983
+ "step": 2820
1984
+ },
1985
+ {
1986
+ "epoch": 0.566,
1987
+ "grad_norm": 18.031835556030273,
1988
+ "learning_rate": 2.3604462375170906e-05,
1989
+ "loss": 4.6898,
1990
+ "step": 2830
1991
+ },
1992
+ {
1993
+ "epoch": 0.568,
1994
+ "grad_norm": 5.239192008972168,
1995
+ "learning_rate": 2.3430237011767167e-05,
1996
+ "loss": 1.3387,
1997
+ "step": 2840
1998
+ },
1999
+ {
2000
+ "epoch": 0.57,
2001
+ "grad_norm": 5.165606498718262,
2002
+ "learning_rate": 2.3256088156396868e-05,
2003
+ "loss": 3.6984,
2004
+ "step": 2850
2005
+ },
2006
+ {
2007
+ "epoch": 0.572,
2008
+ "grad_norm": 11.281644821166992,
2009
+ "learning_rate": 2.3082024296829536e-05,
2010
+ "loss": 1.874,
2011
+ "step": 2860
2012
+ },
2013
+ {
2014
+ "epoch": 0.574,
2015
+ "grad_norm": 31.151283264160156,
2016
+ "learning_rate": 2.2908053916692117e-05,
2017
+ "loss": 1.7756,
2018
+ "step": 2870
2019
+ },
2020
+ {
2021
+ "epoch": 0.576,
2022
+ "grad_norm": 18.31266212463379,
2023
+ "learning_rate": 2.2734185495055503e-05,
2024
+ "loss": 8.2453,
2025
+ "step": 2880
2026
+ },
2027
+ {
2028
+ "epoch": 0.578,
2029
+ "grad_norm": 19.712526321411133,
2030
+ "learning_rate": 2.2560427506021266e-05,
2031
+ "loss": 1.6711,
2032
+ "step": 2890
2033
+ },
2034
+ {
2035
+ "epoch": 0.58,
2036
+ "grad_norm": 12.909187316894531,
2037
+ "learning_rate": 2.238678841830867e-05,
2038
+ "loss": 2.9633,
2039
+ "step": 2900
2040
+ },
2041
+ {
2042
+ "epoch": 0.582,
2043
+ "grad_norm": 3.159407615661621,
2044
+ "learning_rate": 2.2213276694841866e-05,
2045
+ "loss": 1.882,
2046
+ "step": 2910
2047
+ },
2048
+ {
2049
+ "epoch": 0.584,
2050
+ "grad_norm": 20.577537536621094,
2051
+ "learning_rate": 2.2039900792337474e-05,
2052
+ "loss": 2.1105,
2053
+ "step": 2920
2054
+ },
2055
+ {
2056
+ "epoch": 0.586,
2057
+ "grad_norm": 43.844261169433594,
2058
+ "learning_rate": 2.186666916089239e-05,
2059
+ "loss": 2.285,
2060
+ "step": 2930
2061
+ },
2062
+ {
2063
+ "epoch": 0.588,
2064
+ "grad_norm": 10.760743141174316,
2065
+ "learning_rate": 2.1693590243571938e-05,
2066
+ "loss": 2.7095,
2067
+ "step": 2940
2068
+ },
2069
+ {
2070
+ "epoch": 0.59,
2071
+ "grad_norm": 8.60970401763916,
2072
+ "learning_rate": 2.1520672475998373e-05,
2073
+ "loss": 1.4936,
2074
+ "step": 2950
2075
+ },
2076
+ {
2077
+ "epoch": 0.592,
2078
+ "grad_norm": 4.47432804107666,
2079
+ "learning_rate": 2.1347924285939714e-05,
2080
+ "loss": 2.9807,
2081
+ "step": 2960
2082
+ },
2083
+ {
2084
+ "epoch": 0.594,
2085
+ "grad_norm": 14.855293273925781,
2086
+ "learning_rate": 2.117535409289905e-05,
2087
+ "loss": 1.8499,
2088
+ "step": 2970
2089
+ },
2090
+ {
2091
+ "epoch": 0.596,
2092
+ "grad_norm": 5.818737506866455,
2093
+ "learning_rate": 2.1002970307704132e-05,
2094
+ "loss": 2.6104,
2095
+ "step": 2980
2096
+ },
2097
+ {
2098
+ "epoch": 0.598,
2099
+ "grad_norm": 56.345890045166016,
2100
+ "learning_rate": 2.0830781332097446e-05,
2101
+ "loss": 4.632,
2102
+ "step": 2990
2103
+ },
2104
+ {
2105
+ "epoch": 0.6,
2106
+ "grad_norm": 0.7071281671524048,
2107
+ "learning_rate": 2.0658795558326743e-05,
2108
+ "loss": 2.1349,
2109
+ "step": 3000
2110
+ },
2111
+ {
2112
+ "epoch": 0.602,
2113
+ "grad_norm": 15.646844863891602,
2114
+ "learning_rate": 2.0487021368736003e-05,
2115
+ "loss": 2.881,
2116
+ "step": 3010
2117
+ },
2118
+ {
2119
+ "epoch": 0.604,
2120
+ "grad_norm": 54.35161209106445,
2121
+ "learning_rate": 2.031546713535688e-05,
2122
+ "loss": 2.999,
2123
+ "step": 3020
2124
+ },
2125
+ {
2126
+ "epoch": 0.606,
2127
+ "grad_norm": 4.890777587890625,
2128
+ "learning_rate": 2.0144141219500705e-05,
2129
+ "loss": 3.0427,
2130
+ "step": 3030
2131
+ },
2132
+ {
2133
+ "epoch": 0.608,
2134
+ "grad_norm": 2.275003433227539,
2135
+ "learning_rate": 1.9973051971350888e-05,
2136
+ "loss": 2.5017,
2137
+ "step": 3040
2138
+ },
2139
+ {
2140
+ "epoch": 0.61,
2141
+ "grad_norm": 3.7734315395355225,
2142
+ "learning_rate": 1.980220772955602e-05,
2143
+ "loss": 1.5811,
2144
+ "step": 3050
2145
+ },
2146
+ {
2147
+ "epoch": 0.612,
2148
+ "grad_norm": 4.304708957672119,
2149
+ "learning_rate": 1.963161682082342e-05,
2150
+ "loss": 1.6409,
2151
+ "step": 3060
2152
+ },
2153
+ {
2154
+ "epoch": 0.614,
2155
+ "grad_norm": 4.046875953674316,
2156
+ "learning_rate": 1.946128755951332e-05,
2157
+ "loss": 3.8443,
2158
+ "step": 3070
2159
+ },
2160
+ {
2161
+ "epoch": 0.616,
2162
+ "grad_norm": 33.650970458984375,
2163
+ "learning_rate": 1.9291228247233605e-05,
2164
+ "loss": 2.3784,
2165
+ "step": 3080
2166
+ },
2167
+ {
2168
+ "epoch": 0.618,
2169
+ "grad_norm": 40.188297271728516,
2170
+ "learning_rate": 1.912144717243525e-05,
2171
+ "loss": 1.9809,
2172
+ "step": 3090
2173
+ },
2174
+ {
2175
+ "epoch": 0.62,
2176
+ "grad_norm": 16.514888763427734,
2177
+ "learning_rate": 1.895195261000831e-05,
2178
+ "loss": 1.5875,
2179
+ "step": 3100
2180
+ },
2181
+ {
2182
+ "epoch": 0.622,
2183
+ "grad_norm": 8.940461158752441,
2184
+ "learning_rate": 1.8782752820878634e-05,
2185
+ "loss": 2.2161,
2186
+ "step": 3110
2187
+ },
2188
+ {
2189
+ "epoch": 0.624,
2190
+ "grad_norm": 12.641722679138184,
2191
+ "learning_rate": 1.8613856051605243e-05,
2192
+ "loss": 6.9032,
2193
+ "step": 3120
2194
+ },
2195
+ {
2196
+ "epoch": 0.626,
2197
+ "grad_norm": 77.18262481689453,
2198
+ "learning_rate": 1.8445270533978388e-05,
2199
+ "loss": 2.9789,
2200
+ "step": 3130
2201
+ },
2202
+ {
2203
+ "epoch": 0.628,
2204
+ "grad_norm": 9.005846977233887,
2205
+ "learning_rate": 1.827700448461836e-05,
2206
+ "loss": 2.0478,
2207
+ "step": 3140
2208
+ },
2209
+ {
2210
+ "epoch": 0.63,
2211
+ "grad_norm": 14.015524864196777,
2212
+ "learning_rate": 1.8109066104575023e-05,
2213
+ "loss": 1.2295,
2214
+ "step": 3150
2215
+ },
2216
+ {
2217
+ "epoch": 0.632,
2218
+ "grad_norm": 28.657386779785156,
2219
+ "learning_rate": 1.7941463578928086e-05,
2220
+ "loss": 2.6306,
2221
+ "step": 3160
2222
+ },
2223
+ {
2224
+ "epoch": 0.634,
2225
+ "grad_norm": 3.889622926712036,
2226
+ "learning_rate": 1.7774205076388206e-05,
2227
+ "loss": 1.2663,
2228
+ "step": 3170
2229
+ },
2230
+ {
2231
+ "epoch": 0.636,
2232
+ "grad_norm": 5.531691551208496,
2233
+ "learning_rate": 1.7607298748898842e-05,
2234
+ "loss": 3.5995,
2235
+ "step": 3180
2236
+ },
2237
+ {
2238
+ "epoch": 0.638,
2239
+ "grad_norm": 2.6832261085510254,
2240
+ "learning_rate": 1.744075273123889e-05,
2241
+ "loss": 3.4226,
2242
+ "step": 3190
2243
+ },
2244
+ {
2245
+ "epoch": 0.64,
2246
+ "grad_norm": 4.261538982391357,
2247
+ "learning_rate": 1.7274575140626318e-05,
2248
+ "loss": 0.9354,
2249
+ "step": 3200
2250
+ },
2251
+ {
2252
+ "epoch": 0.642,
2253
+ "grad_norm": 8.686302185058594,
2254
+ "learning_rate": 1.7108774076322443e-05,
2255
+ "loss": 4.6267,
2256
+ "step": 3210
2257
+ },
2258
+ {
2259
+ "epoch": 0.644,
2260
+ "grad_norm": 13.585872650146484,
2261
+ "learning_rate": 1.6943357619237226e-05,
2262
+ "loss": 0.9114,
2263
+ "step": 3220
2264
+ },
2265
+ {
2266
+ "epoch": 0.646,
2267
+ "grad_norm": 1.3698618412017822,
2268
+ "learning_rate": 1.677833383153542e-05,
2269
+ "loss": 2.5474,
2270
+ "step": 3230
2271
+ },
2272
+ {
2273
+ "epoch": 0.648,
2274
+ "grad_norm": 19.42194175720215,
2275
+ "learning_rate": 1.6613710756243626e-05,
2276
+ "loss": 2.7762,
2277
+ "step": 3240
2278
+ },
2279
+ {
2280
+ "epoch": 0.65,
2281
+ "grad_norm": 13.380922317504883,
2282
+ "learning_rate": 1.6449496416858284e-05,
2283
+ "loss": 1.6468,
2284
+ "step": 3250
2285
+ },
2286
+ {
2287
+ "epoch": 0.652,
2288
+ "grad_norm": 10.786489486694336,
2289
+ "learning_rate": 1.6285698816954624e-05,
2290
+ "loss": 1.2256,
2291
+ "step": 3260
2292
+ },
2293
+ {
2294
+ "epoch": 0.654,
2295
+ "grad_norm": 5.827413558959961,
2296
+ "learning_rate": 1.612232593979658e-05,
2297
+ "loss": 1.5563,
2298
+ "step": 3270
2299
+ },
2300
+ {
2301
+ "epoch": 0.656,
2302
+ "grad_norm": 13.82107162475586,
2303
+ "learning_rate": 1.5959385747947698e-05,
2304
+ "loss": 3.3846,
2305
+ "step": 3280
2306
+ },
2307
+ {
2308
+ "epoch": 0.658,
2309
+ "grad_norm": 22.237295150756836,
2310
+ "learning_rate": 1.5796886182883053e-05,
2311
+ "loss": 2.2551,
2312
+ "step": 3290
2313
+ },
2314
+ {
2315
+ "epoch": 0.66,
2316
+ "grad_norm": 5.554327011108398,
2317
+ "learning_rate": 1.56348351646022e-05,
2318
+ "loss": 0.8103,
2319
+ "step": 3300
2320
+ },
2321
+ {
2322
+ "epoch": 0.662,
2323
+ "grad_norm": 20.308881759643555,
2324
+ "learning_rate": 1.547324059124315e-05,
2325
+ "loss": 1.7983,
2326
+ "step": 3310
2327
+ },
2328
+ {
2329
+ "epoch": 0.664,
2330
+ "grad_norm": 10.945833206176758,
2331
+ "learning_rate": 1.5312110338697426e-05,
2332
+ "loss": 4.4483,
2333
+ "step": 3320
2334
+ },
2335
+ {
2336
+ "epoch": 0.666,
2337
+ "grad_norm": 2.27091121673584,
2338
+ "learning_rate": 1.5151452260226224e-05,
2339
+ "loss": 6.7593,
2340
+ "step": 3330
2341
+ },
2342
+ {
2343
+ "epoch": 0.668,
2344
+ "grad_norm": 74.13166809082031,
2345
+ "learning_rate": 1.4991274186077632e-05,
2346
+ "loss": 3.4548,
2347
+ "step": 3340
2348
+ },
2349
+ {
2350
+ "epoch": 0.67,
2351
+ "grad_norm": 39.529685974121094,
2352
+ "learning_rate": 1.4831583923104999e-05,
2353
+ "loss": 2.0242,
2354
+ "step": 3350
2355
+ },
2356
+ {
2357
+ "epoch": 0.672,
2358
+ "grad_norm": 11.965998649597168,
2359
+ "learning_rate": 1.467238925438646e-05,
2360
+ "loss": 2.3468,
2361
+ "step": 3360
2362
+ },
2363
+ {
2364
+ "epoch": 0.674,
2365
+ "grad_norm": 4.220920085906982,
2366
+ "learning_rate": 1.4513697938845572e-05,
2367
+ "loss": 1.3924,
2368
+ "step": 3370
2369
+ },
2370
+ {
2371
+ "epoch": 0.676,
2372
+ "grad_norm": 48.23358154296875,
2373
+ "learning_rate": 1.4355517710873184e-05,
2374
+ "loss": 2.3896,
2375
+ "step": 3380
2376
+ },
2377
+ {
2378
+ "epoch": 0.678,
2379
+ "grad_norm": 13.424652099609375,
2380
+ "learning_rate": 1.4197856279950438e-05,
2381
+ "loss": 2.2237,
2382
+ "step": 3390
2383
+ },
2384
+ {
2385
+ "epoch": 0.68,
2386
+ "grad_norm": 137.2168426513672,
2387
+ "learning_rate": 1.4040721330273062e-05,
2388
+ "loss": 6.2669,
2389
+ "step": 3400
2390
+ },
2391
+ {
2392
+ "epoch": 0.682,
2393
+ "grad_norm": 28.708620071411133,
2394
+ "learning_rate": 1.388412052037682e-05,
2395
+ "loss": 2.271,
2396
+ "step": 3410
2397
+ },
2398
+ {
2399
+ "epoch": 0.684,
2400
+ "grad_norm": 29.713703155517578,
2401
+ "learning_rate": 1.3728061482764238e-05,
2402
+ "loss": 1.7512,
2403
+ "step": 3420
2404
+ },
2405
+ {
2406
+ "epoch": 0.686,
2407
+ "grad_norm": 8.746112823486328,
2408
+ "learning_rate": 1.3572551823532654e-05,
2409
+ "loss": 1.3673,
2410
+ "step": 3430
2411
+ },
2412
+ {
2413
+ "epoch": 0.688,
2414
+ "grad_norm": 4.915830612182617,
2415
+ "learning_rate": 1.3417599122003464e-05,
2416
+ "loss": 3.8359,
2417
+ "step": 3440
2418
+ },
2419
+ {
2420
+ "epoch": 0.69,
2421
+ "grad_norm": 36.02322769165039,
2422
+ "learning_rate": 1.3263210930352737e-05,
2423
+ "loss": 1.6534,
2424
+ "step": 3450
2425
+ },
2426
+ {
2427
+ "epoch": 0.692,
2428
+ "grad_norm": 4.923043727874756,
2429
+ "learning_rate": 1.3109394773243117e-05,
2430
+ "loss": 0.955,
2431
+ "step": 3460
2432
+ },
2433
+ {
2434
+ "epoch": 0.694,
2435
+ "grad_norm": 8.513254165649414,
2436
+ "learning_rate": 1.2956158147457115e-05,
2437
+ "loss": 1.9489,
2438
+ "step": 3470
2439
+ },
2440
+ {
2441
+ "epoch": 0.696,
2442
+ "grad_norm": 5.320379257202148,
2443
+ "learning_rate": 1.280350852153168e-05,
2444
+ "loss": 2.5898,
2445
+ "step": 3480
2446
+ },
2447
+ {
2448
+ "epoch": 0.698,
2449
+ "grad_norm": 17.299158096313477,
2450
+ "learning_rate": 1.2651453335394231e-05,
2451
+ "loss": 2.9695,
2452
+ "step": 3490
2453
+ },
2454
+ {
2455
+ "epoch": 0.7,
2456
+ "grad_norm": 5.057638168334961,
2457
+ "learning_rate": 1.2500000000000006e-05,
2458
+ "loss": 1.8141,
2459
+ "step": 3500
2460
+ },
2461
+ {
2462
+ "epoch": 0.702,
2463
+ "grad_norm": 11.23135757446289,
2464
+ "learning_rate": 1.234915589697091e-05,
2465
+ "loss": 1.6577,
2466
+ "step": 3510
2467
+ },
2468
+ {
2469
+ "epoch": 0.704,
2470
+ "grad_norm": 13.365480422973633,
2471
+ "learning_rate": 1.2198928378235716e-05,
2472
+ "loss": 1.0784,
2473
+ "step": 3520
2474
+ },
2475
+ {
2476
+ "epoch": 0.706,
2477
+ "grad_norm": 4.888202667236328,
2478
+ "learning_rate": 1.2049324765671749e-05,
2479
+ "loss": 1.4946,
2480
+ "step": 3530
2481
+ },
2482
+ {
2483
+ "epoch": 0.708,
2484
+ "grad_norm": 11.178568840026855,
2485
+ "learning_rate": 1.1900352350748026e-05,
2486
+ "loss": 2.9905,
2487
+ "step": 3540
2488
+ },
2489
+ {
2490
+ "epoch": 0.71,
2491
+ "grad_norm": 1.779919981956482,
2492
+ "learning_rate": 1.175201839416988e-05,
2493
+ "loss": 2.2346,
2494
+ "step": 3550
2495
+ },
2496
+ {
2497
+ "epoch": 0.712,
2498
+ "grad_norm": 11.946904182434082,
2499
+ "learning_rate": 1.1604330125525079e-05,
2500
+ "loss": 1.6511,
2501
+ "step": 3560
2502
+ },
2503
+ {
2504
+ "epoch": 0.714,
2505
+ "grad_norm": 27.550771713256836,
2506
+ "learning_rate": 1.1457294742931507e-05,
2507
+ "loss": 3.0413,
2508
+ "step": 3570
2509
+ },
2510
+ {
2511
+ "epoch": 0.716,
2512
+ "grad_norm": 41.15061950683594,
2513
+ "learning_rate": 1.1310919412686247e-05,
2514
+ "loss": 2.2105,
2515
+ "step": 3580
2516
+ },
2517
+ {
2518
+ "epoch": 0.718,
2519
+ "grad_norm": 0.8456729650497437,
2520
+ "learning_rate": 1.11652112689164e-05,
2521
+ "loss": 2.0821,
2522
+ "step": 3590
2523
+ },
2524
+ {
2525
+ "epoch": 0.72,
2526
+ "grad_norm": 19.061296463012695,
2527
+ "learning_rate": 1.1020177413231334e-05,
2528
+ "loss": 3.4402,
2529
+ "step": 3600
2530
+ },
2531
+ {
2532
+ "epoch": 0.722,
2533
+ "grad_norm": 11.271340370178223,
2534
+ "learning_rate": 1.0875824914376553e-05,
2535
+ "loss": 1.5246,
2536
+ "step": 3610
2537
+ },
2538
+ {
2539
+ "epoch": 0.724,
2540
+ "grad_norm": 63.74707794189453,
2541
+ "learning_rate": 1.0732160807889211e-05,
2542
+ "loss": 4.4784,
2543
+ "step": 3620
2544
+ },
2545
+ {
2546
+ "epoch": 0.726,
2547
+ "grad_norm": 0.7974810600280762,
2548
+ "learning_rate": 1.058919209575517e-05,
2549
+ "loss": 2.8935,
2550
+ "step": 3630
2551
+ },
2552
+ {
2553
+ "epoch": 0.728,
2554
+ "grad_norm": 17.613325119018555,
2555
+ "learning_rate": 1.0446925746067768e-05,
2556
+ "loss": 1.4045,
2557
+ "step": 3640
2558
+ },
2559
+ {
2560
+ "epoch": 0.73,
2561
+ "grad_norm": 16.592836380004883,
2562
+ "learning_rate": 1.0305368692688174e-05,
2563
+ "loss": 2.0619,
2564
+ "step": 3650
2565
+ },
2566
+ {
2567
+ "epoch": 0.732,
2568
+ "grad_norm": 8.460766792297363,
2569
+ "learning_rate": 1.0164527834907467e-05,
2570
+ "loss": 1.8641,
2571
+ "step": 3660
2572
+ },
2573
+ {
2574
+ "epoch": 0.734,
2575
+ "grad_norm": 1.6616703271865845,
2576
+ "learning_rate": 1.0024410037110357e-05,
2577
+ "loss": 3.3477,
2578
+ "step": 3670
2579
+ },
2580
+ {
2581
+ "epoch": 0.736,
2582
+ "grad_norm": 14.18490982055664,
2583
+ "learning_rate": 9.88502212844063e-06,
2584
+ "loss": 1.6977,
2585
+ "step": 3680
2586
+ },
2587
+ {
2588
+ "epoch": 0.738,
2589
+ "grad_norm": 19.63976287841797,
2590
+ "learning_rate": 9.746370902468311e-06,
2591
+ "loss": 2.4541,
2592
+ "step": 3690
2593
+ },
2594
+ {
2595
+ "epoch": 0.74,
2596
+ "grad_norm": 5.758197784423828,
2597
+ "learning_rate": 9.608463116858542e-06,
2598
+ "loss": 1.0491,
2599
+ "step": 3700
2600
+ },
2601
+ {
2602
+ "epoch": 0.742,
2603
+ "grad_norm": 11.879212379455566,
2604
+ "learning_rate": 9.471305493042243e-06,
2605
+ "loss": 1.7615,
2606
+ "step": 3710
2607
+ },
2608
+ {
2609
+ "epoch": 0.744,
2610
+ "grad_norm": 14.958152770996094,
2611
+ "learning_rate": 9.334904715888495e-06,
2612
+ "loss": 1.2825,
2613
+ "step": 3720
2614
+ },
2615
+ {
2616
+ "epoch": 0.746,
2617
+ "grad_norm": 10.86933422088623,
2618
+ "learning_rate": 9.199267433378727e-06,
2619
+ "loss": 2.0753,
2620
+ "step": 3730
2621
+ },
2622
+ {
2623
+ "epoch": 0.748,
2624
+ "grad_norm": 3.0762341022491455,
2625
+ "learning_rate": 9.064400256282757e-06,
2626
+ "loss": 2.5678,
2627
+ "step": 3740
2628
+ },
2629
+ {
2630
+ "epoch": 0.75,
2631
+ "grad_norm": 3.98964262008667,
2632
+ "learning_rate": 8.930309757836517e-06,
2633
+ "loss": 1.0403,
2634
+ "step": 3750
2635
+ },
2636
+ {
2637
+ "epoch": 0.752,
2638
+ "grad_norm": 32.30311584472656,
2639
+ "learning_rate": 8.797002473421728e-06,
2640
+ "loss": 2.1364,
2641
+ "step": 3760
2642
+ },
2643
+ {
2644
+ "epoch": 0.754,
2645
+ "grad_norm": 15.094679832458496,
2646
+ "learning_rate": 8.664484900247363e-06,
2647
+ "loss": 1.4156,
2648
+ "step": 3770
2649
+ },
2650
+ {
2651
+ "epoch": 0.756,
2652
+ "grad_norm": 20.249778747558594,
2653
+ "learning_rate": 8.532763497032987e-06,
2654
+ "loss": 2.463,
2655
+ "step": 3780
2656
+ },
2657
+ {
2658
+ "epoch": 0.758,
2659
+ "grad_norm": 8.843160629272461,
2660
+ "learning_rate": 8.40184468369396e-06,
2661
+ "loss": 1.8851,
2662
+ "step": 3790
2663
+ },
2664
+ {
2665
+ "epoch": 0.76,
2666
+ "grad_norm": 13.01088809967041,
2667
+ "learning_rate": 8.271734841028553e-06,
2668
+ "loss": 2.3422,
2669
+ "step": 3800
2670
+ },
2671
+ {
2672
+ "epoch": 0.762,
2673
+ "grad_norm": 30.352092742919922,
2674
+ "learning_rate": 8.142440310406924e-06,
2675
+ "loss": 2.0351,
2676
+ "step": 3810
2677
+ },
2678
+ {
2679
+ "epoch": 0.764,
2680
+ "grad_norm": 17.028221130371094,
2681
+ "learning_rate": 8.013967393462094e-06,
2682
+ "loss": 2.6205,
2683
+ "step": 3820
2684
+ },
2685
+ {
2686
+ "epoch": 0.766,
2687
+ "grad_norm": 1.9819483757019043,
2688
+ "learning_rate": 7.886322351782783e-06,
2689
+ "loss": 1.4258,
2690
+ "step": 3830
2691
+ },
2692
+ {
2693
+ "epoch": 0.768,
2694
+ "grad_norm": 2.398263931274414,
2695
+ "learning_rate": 7.759511406608255e-06,
2696
+ "loss": 2.0398,
2697
+ "step": 3840
2698
+ },
2699
+ {
2700
+ "epoch": 0.77,
2701
+ "grad_norm": 6.930059432983398,
2702
+ "learning_rate": 7.633540738525066e-06,
2703
+ "loss": 2.7082,
2704
+ "step": 3850
2705
+ },
2706
+ {
2707
+ "epoch": 0.772,
2708
+ "grad_norm": 56.60074996948242,
2709
+ "learning_rate": 7.508416487165862e-06,
2710
+ "loss": 2.2596,
2711
+ "step": 3860
2712
+ },
2713
+ {
2714
+ "epoch": 0.774,
2715
+ "grad_norm": 6.612997531890869,
2716
+ "learning_rate": 7.384144750910133e-06,
2717
+ "loss": 1.665,
2718
+ "step": 3870
2719
+ },
2720
+ {
2721
+ "epoch": 0.776,
2722
+ "grad_norm": 1.0754458904266357,
2723
+ "learning_rate": 7.260731586586983e-06,
2724
+ "loss": 3.3368,
2725
+ "step": 3880
2726
+ },
2727
+ {
2728
+ "epoch": 0.778,
2729
+ "grad_norm": 7.085332870483398,
2730
+ "learning_rate": 7.138183009179922e-06,
2731
+ "loss": 2.165,
2732
+ "step": 3890
2733
+ },
2734
+ {
2735
+ "epoch": 0.78,
2736
+ "grad_norm": 5.756954669952393,
2737
+ "learning_rate": 7.016504991533726e-06,
2738
+ "loss": 0.9795,
2739
+ "step": 3900
2740
+ },
2741
+ {
2742
+ "epoch": 0.782,
2743
+ "grad_norm": 290.0068664550781,
2744
+ "learning_rate": 6.895703464063319e-06,
2745
+ "loss": 4.0774,
2746
+ "step": 3910
2747
+ },
2748
+ {
2749
+ "epoch": 0.784,
2750
+ "grad_norm": 18.028898239135742,
2751
+ "learning_rate": 6.775784314464717e-06,
2752
+ "loss": 1.838,
2753
+ "step": 3920
2754
+ },
2755
+ {
2756
+ "epoch": 0.786,
2757
+ "grad_norm": 30.23217010498047,
2758
+ "learning_rate": 6.656753387428089e-06,
2759
+ "loss": 2.2859,
2760
+ "step": 3930
2761
+ },
2762
+ {
2763
+ "epoch": 0.788,
2764
+ "grad_norm": 6.802043437957764,
2765
+ "learning_rate": 6.538616484352902e-06,
2766
+ "loss": 4.2013,
2767
+ "step": 3940
2768
+ },
2769
+ {
2770
+ "epoch": 0.79,
2771
+ "grad_norm": 21.070383071899414,
2772
+ "learning_rate": 6.421379363065142e-06,
2773
+ "loss": 3.8002,
2774
+ "step": 3950
2775
+ },
2776
+ {
2777
+ "epoch": 0.792,
2778
+ "grad_norm": 59.72300720214844,
2779
+ "learning_rate": 6.305047737536707e-06,
2780
+ "loss": 2.5436,
2781
+ "step": 3960
2782
+ },
2783
+ {
2784
+ "epoch": 0.794,
2785
+ "grad_norm": 3.4971981048583984,
2786
+ "learning_rate": 6.189627277606894e-06,
2787
+ "loss": 2.4645,
2788
+ "step": 3970
2789
+ },
2790
+ {
2791
+ "epoch": 0.796,
2792
+ "grad_norm": 10.910285949707031,
2793
+ "learning_rate": 6.075123608706093e-06,
2794
+ "loss": 1.1404,
2795
+ "step": 3980
2796
+ },
2797
+ {
2798
+ "epoch": 0.798,
2799
+ "grad_norm": 223.6168212890625,
2800
+ "learning_rate": 5.961542311581586e-06,
2801
+ "loss": 4.2125,
2802
+ "step": 3990
2803
+ },
2804
+ {
2805
+ "epoch": 0.8,
2806
+ "grad_norm": 15.95744514465332,
2807
+ "learning_rate": 5.848888922025553e-06,
2808
+ "loss": 1.4396,
2809
+ "step": 4000
2810
+ },
2811
+ {
2812
+ "epoch": 0.802,
2813
+ "grad_norm": 1.5669488906860352,
2814
+ "learning_rate": 5.737168930605272e-06,
2815
+ "loss": 2.321,
2816
+ "step": 4010
2817
+ },
2818
+ {
2819
+ "epoch": 0.804,
2820
+ "grad_norm": 20.125688552856445,
2821
+ "learning_rate": 5.626387782395512e-06,
2822
+ "loss": 2.803,
2823
+ "step": 4020
2824
+ },
2825
+ {
2826
+ "epoch": 0.806,
2827
+ "grad_norm": 9.791519165039062,
2828
+ "learning_rate": 5.5165508767131415e-06,
2829
+ "loss": 1.024,
2830
+ "step": 4030
2831
+ },
2832
+ {
2833
+ "epoch": 0.808,
2834
+ "grad_norm": 4.828093528747559,
2835
+ "learning_rate": 5.4076635668540075e-06,
2836
+ "loss": 2.4534,
2837
+ "step": 4040
2838
+ },
2839
+ {
2840
+ "epoch": 0.81,
2841
+ "grad_norm": 5.493026256561279,
2842
+ "learning_rate": 5.299731159831953e-06,
2843
+ "loss": 2.1075,
2844
+ "step": 4050
2845
+ },
2846
+ {
2847
+ "epoch": 0.812,
2848
+ "grad_norm": 8.821996688842773,
2849
+ "learning_rate": 5.192758916120236e-06,
2850
+ "loss": 3.3703,
2851
+ "step": 4060
2852
+ },
2853
+ {
2854
+ "epoch": 0.814,
2855
+ "grad_norm": 8.212584495544434,
2856
+ "learning_rate": 5.086752049395094e-06,
2857
+ "loss": 2.9377,
2858
+ "step": 4070
2859
+ },
2860
+ {
2861
+ "epoch": 0.816,
2862
+ "grad_norm": 14.955241203308105,
2863
+ "learning_rate": 4.981715726281666e-06,
2864
+ "loss": 4.4126,
2865
+ "step": 4080
2866
+ },
2867
+ {
2868
+ "epoch": 0.818,
2869
+ "grad_norm": 8.878729820251465,
2870
+ "learning_rate": 4.877655066102149e-06,
2871
+ "loss": 2.5896,
2872
+ "step": 4090
2873
+ },
2874
+ {
2875
+ "epoch": 0.82,
2876
+ "grad_norm": 11.291478157043457,
2877
+ "learning_rate": 4.7745751406263165e-06,
2878
+ "loss": 1.4516,
2879
+ "step": 4100
2880
+ },
2881
+ {
2882
+ "epoch": 0.822,
2883
+ "grad_norm": 3.0645530223846436,
2884
+ "learning_rate": 4.672480973824311e-06,
2885
+ "loss": 1.3904,
2886
+ "step": 4110
2887
+ },
2888
+ {
2889
+ "epoch": 0.824,
2890
+ "grad_norm": 10.652810096740723,
2891
+ "learning_rate": 4.571377541621788e-06,
2892
+ "loss": 1.6246,
2893
+ "step": 4120
2894
+ },
2895
+ {
2896
+ "epoch": 0.826,
2897
+ "grad_norm": 50.55589294433594,
2898
+ "learning_rate": 4.4712697716574e-06,
2899
+ "loss": 3.125,
2900
+ "step": 4130
2901
+ },
2902
+ {
2903
+ "epoch": 0.828,
2904
+ "grad_norm": 10.360865592956543,
2905
+ "learning_rate": 4.372162543042624e-06,
2906
+ "loss": 1.1379,
2907
+ "step": 4140
2908
+ },
2909
+ {
2910
+ "epoch": 0.83,
2911
+ "grad_norm": 11.648319244384766,
2912
+ "learning_rate": 4.274060686123959e-06,
2913
+ "loss": 1.723,
2914
+ "step": 4150
2915
+ },
2916
+ {
2917
+ "epoch": 0.832,
2918
+ "grad_norm": 5.083891868591309,
2919
+ "learning_rate": 4.176968982247514e-06,
2920
+ "loss": 2.2388,
2921
+ "step": 4160
2922
+ },
2923
+ {
2924
+ "epoch": 0.834,
2925
+ "grad_norm": 39.73847961425781,
2926
+ "learning_rate": 4.08089216352596e-06,
2927
+ "loss": 1.7591,
2928
+ "step": 4170
2929
+ },
2930
+ {
2931
+ "epoch": 0.836,
2932
+ "grad_norm": 2.994002342224121,
2933
+ "learning_rate": 3.985834912607894e-06,
2934
+ "loss": 0.9926,
2935
+ "step": 4180
2936
+ },
2937
+ {
2938
+ "epoch": 0.838,
2939
+ "grad_norm": 21.860036849975586,
2940
+ "learning_rate": 3.891801862449629e-06,
2941
+ "loss": 2.8662,
2942
+ "step": 4190
2943
+ },
2944
+ {
2945
+ "epoch": 0.84,
2946
+ "grad_norm": 23.886770248413086,
2947
+ "learning_rate": 3.798797596089351e-06,
2948
+ "loss": 2.1153,
2949
+ "step": 4200
2950
+ },
2951
+ {
2952
+ "epoch": 0.842,
2953
+ "grad_norm": 29.538490295410156,
2954
+ "learning_rate": 3.7068266464238084e-06,
2955
+ "loss": 1.9178,
2956
+ "step": 4210
2957
+ },
2958
+ {
2959
+ "epoch": 0.844,
2960
+ "grad_norm": 1.1373424530029297,
2961
+ "learning_rate": 3.6158934959873353e-06,
2962
+ "loss": 1.9438,
2963
+ "step": 4220
2964
+ },
2965
+ {
2966
+ "epoch": 0.846,
2967
+ "grad_norm": 27.889780044555664,
2968
+ "learning_rate": 3.5260025767333893e-06,
2969
+ "loss": 2.5589,
2970
+ "step": 4230
2971
+ },
2972
+ {
2973
+ "epoch": 0.848,
2974
+ "grad_norm": 12.20418930053711,
2975
+ "learning_rate": 3.4371582698185633e-06,
2976
+ "loss": 2.031,
2977
+ "step": 4240
2978
+ },
2979
+ {
2980
+ "epoch": 0.85,
2981
+ "grad_norm": 9.347545623779297,
2982
+ "learning_rate": 3.3493649053890326e-06,
2983
+ "loss": 1.8701,
2984
+ "step": 4250
2985
+ },
2986
+ {
2987
+ "epoch": 0.852,
2988
+ "grad_norm": 39.1044921875,
2989
+ "learning_rate": 3.262626762369525e-06,
2990
+ "loss": 2.4573,
2991
+ "step": 4260
2992
+ },
2993
+ {
2994
+ "epoch": 0.854,
2995
+ "grad_norm": 8.244641304016113,
2996
+ "learning_rate": 3.176948068254762e-06,
2997
+ "loss": 1.5529,
2998
+ "step": 4270
2999
+ },
3000
+ {
3001
+ "epoch": 0.856,
3002
+ "grad_norm": 9.417642593383789,
3003
+ "learning_rate": 3.092332998903416e-06,
3004
+ "loss": 4.1672,
3005
+ "step": 4280
3006
+ },
3007
+ {
3008
+ "epoch": 0.858,
3009
+ "grad_norm": 4.196715354919434,
3010
+ "learning_rate": 3.0087856783345914e-06,
3011
+ "loss": 2.5184,
3012
+ "step": 4290
3013
+ },
3014
+ {
3015
+ "epoch": 0.86,
3016
+ "grad_norm": 5.61329460144043,
3017
+ "learning_rate": 2.9263101785268254e-06,
3018
+ "loss": 1.5187,
3019
+ "step": 4300
3020
+ },
3021
+ {
3022
+ "epoch": 0.862,
3023
+ "grad_norm": 1.2130039930343628,
3024
+ "learning_rate": 2.8449105192196316e-06,
3025
+ "loss": 2.3649,
3026
+ "step": 4310
3027
+ },
3028
+ {
3029
+ "epoch": 0.864,
3030
+ "grad_norm": 26.6439151763916,
3031
+ "learning_rate": 2.764590667717562e-06,
3032
+ "loss": 1.8013,
3033
+ "step": 4320
3034
+ },
3035
+ {
3036
+ "epoch": 0.866,
3037
+ "grad_norm": 10.56583309173584,
3038
+ "learning_rate": 2.6853545386968606e-06,
3039
+ "loss": 1.5805,
3040
+ "step": 4330
3041
+ },
3042
+ {
3043
+ "epoch": 0.868,
3044
+ "grad_norm": 17.812593460083008,
3045
+ "learning_rate": 2.6072059940146775e-06,
3046
+ "loss": 1.8378,
3047
+ "step": 4340
3048
+ },
3049
+ {
3050
+ "epoch": 0.87,
3051
+ "grad_norm": 11.021294593811035,
3052
+ "learning_rate": 2.5301488425208296e-06,
3053
+ "loss": 1.502,
3054
+ "step": 4350
3055
+ },
3056
+ {
3057
+ "epoch": 0.872,
3058
+ "grad_norm": 7.252746105194092,
3059
+ "learning_rate": 2.454186839872158e-06,
3060
+ "loss": 3.0615,
3061
+ "step": 4360
3062
+ },
3063
+ {
3064
+ "epoch": 0.874,
3065
+ "grad_norm": 6.52194356918335,
3066
+ "learning_rate": 2.379323688349516e-06,
3067
+ "loss": 2.0572,
3068
+ "step": 4370
3069
+ },
3070
+ {
3071
+ "epoch": 0.876,
3072
+ "grad_norm": 25.462242126464844,
3073
+ "learning_rate": 2.3055630366772856e-06,
3074
+ "loss": 1.5945,
3075
+ "step": 4380
3076
+ },
3077
+ {
3078
+ "epoch": 0.878,
3079
+ "grad_norm": 1.2246159315109253,
3080
+ "learning_rate": 2.2329084798455746e-06,
3081
+ "loss": 1.6811,
3082
+ "step": 4390
3083
+ },
3084
+ {
3085
+ "epoch": 0.88,
3086
+ "grad_norm": 27.265029907226562,
3087
+ "learning_rate": 2.1613635589349756e-06,
3088
+ "loss": 1.5655,
3089
+ "step": 4400
3090
+ },
3091
+ {
3092
+ "epoch": 0.882,
3093
+ "grad_norm": 19.661325454711914,
3094
+ "learning_rate": 2.0909317609440095e-06,
3095
+ "loss": 2.1422,
3096
+ "step": 4410
3097
+ },
3098
+ {
3099
+ "epoch": 0.884,
3100
+ "grad_norm": 4.112105846405029,
3101
+ "learning_rate": 2.0216165186191407e-06,
3102
+ "loss": 1.403,
3103
+ "step": 4420
3104
+ },
3105
+ {
3106
+ "epoch": 0.886,
3107
+ "grad_norm": 22.394611358642578,
3108
+ "learning_rate": 1.95342121028749e-06,
3109
+ "loss": 2.6121,
3110
+ "step": 4430
3111
+ },
3112
+ {
3113
+ "epoch": 0.888,
3114
+ "grad_norm": 6.7292561531066895,
3115
+ "learning_rate": 1.8863491596921745e-06,
3116
+ "loss": 5.1337,
3117
+ "step": 4440
3118
+ },
3119
+ {
3120
+ "epoch": 0.89,
3121
+ "grad_norm": 9.215736389160156,
3122
+ "learning_rate": 1.8204036358303173e-06,
3123
+ "loss": 2.1225,
3124
+ "step": 4450
3125
+ },
3126
+ {
3127
+ "epoch": 0.892,
3128
+ "grad_norm": 44.11046600341797,
3129
+ "learning_rate": 1.7555878527937164e-06,
3130
+ "loss": 2.8199,
3131
+ "step": 4460
3132
+ },
3133
+ {
3134
+ "epoch": 0.894,
3135
+ "grad_norm": 45.898719787597656,
3136
+ "learning_rate": 1.6919049696121958e-06,
3137
+ "loss": 2.1345,
3138
+ "step": 4470
3139
+ },
3140
+ {
3141
+ "epoch": 0.896,
3142
+ "grad_norm": 6.135681629180908,
3143
+ "learning_rate": 1.629358090099639e-06,
3144
+ "loss": 1.671,
3145
+ "step": 4480
3146
+ },
3147
+ {
3148
+ "epoch": 0.898,
3149
+ "grad_norm": 88.45669555664062,
3150
+ "learning_rate": 1.5679502627027136e-06,
3151
+ "loss": 2.7573,
3152
+ "step": 4490
3153
+ },
3154
+ {
3155
+ "epoch": 0.9,
3156
+ "grad_norm": 8.329882621765137,
3157
+ "learning_rate": 1.5076844803522922e-06,
3158
+ "loss": 1.8508,
3159
+ "step": 4500
3160
+ },
3161
+ {
3162
+ "epoch": 0.902,
3163
+ "grad_norm": 19.587743759155273,
3164
+ "learning_rate": 1.4485636803175829e-06,
3165
+ "loss": 4.521,
3166
+ "step": 4510
3167
+ },
3168
+ {
3169
+ "epoch": 0.904,
3170
+ "grad_norm": 30.500099182128906,
3171
+ "learning_rate": 1.3905907440629752e-06,
3172
+ "loss": 3.2342,
3173
+ "step": 4520
3174
+ },
3175
+ {
3176
+ "epoch": 0.906,
3177
+ "grad_norm": 13.65441608428955,
3178
+ "learning_rate": 1.333768497107593e-06,
3179
+ "loss": 3.0516,
3180
+ "step": 4530
3181
+ },
3182
+ {
3183
+ "epoch": 0.908,
3184
+ "grad_norm": 39.75031280517578,
3185
+ "learning_rate": 1.2780997088875869e-06,
3186
+ "loss": 2.0561,
3187
+ "step": 4540
3188
+ },
3189
+ {
3190
+ "epoch": 0.91,
3191
+ "grad_norm": 45.91644287109375,
3192
+ "learning_rate": 1.2235870926211619e-06,
3193
+ "loss": 1.5619,
3194
+ "step": 4550
3195
+ },
3196
+ {
3197
+ "epoch": 0.912,
3198
+ "grad_norm": 5.88566255569458,
3199
+ "learning_rate": 1.170233305176327e-06,
3200
+ "loss": 2.0134,
3201
+ "step": 4560
3202
+ },
3203
+ {
3204
+ "epoch": 0.914,
3205
+ "grad_norm": 5.401540279388428,
3206
+ "learning_rate": 1.1180409469414094e-06,
3207
+ "loss": 0.8128,
3208
+ "step": 4570
3209
+ },
3210
+ {
3211
+ "epoch": 0.916,
3212
+ "grad_norm": 30.60863494873047,
3213
+ "learning_rate": 1.067012561698319e-06,
3214
+ "loss": 2.5157,
3215
+ "step": 4580
3216
+ },
3217
+ {
3218
+ "epoch": 0.918,
3219
+ "grad_norm": 2.0508158206939697,
3220
+ "learning_rate": 1.0171506364985622e-06,
3221
+ "loss": 1.7379,
3222
+ "step": 4590
3223
+ },
3224
+ {
3225
+ "epoch": 0.92,
3226
+ "grad_norm": 11.00378704071045,
3227
+ "learning_rate": 9.684576015420278e-07,
3228
+ "loss": 1.3243,
3229
+ "step": 4600
3230
+ },
3231
+ {
3232
+ "epoch": 0.922,
3233
+ "grad_norm": 68.18540954589844,
3234
+ "learning_rate": 9.209358300585474e-07,
3235
+ "loss": 4.207,
3236
+ "step": 4610
3237
+ },
3238
+ {
3239
+ "epoch": 0.924,
3240
+ "grad_norm": 11.006647109985352,
3241
+ "learning_rate": 8.745876381922147e-07,
3242
+ "loss": 2.8713,
3243
+ "step": 4620
3244
+ },
3245
+ {
3246
+ "epoch": 0.926,
3247
+ "grad_norm": 3.3466615676879883,
3248
+ "learning_rate": 8.294152848885157e-07,
3249
+ "loss": 2.2642,
3250
+ "step": 4630
3251
+ },
3252
+ {
3253
+ "epoch": 0.928,
3254
+ "grad_norm": 1.4981013536453247,
3255
+ "learning_rate": 7.854209717842231e-07,
3256
+ "loss": 3.0498,
3257
+ "step": 4640
3258
+ },
3259
+ {
3260
+ "epoch": 0.93,
3261
+ "grad_norm": 9.377291679382324,
3262
+ "learning_rate": 7.426068431000882e-07,
3263
+ "loss": 3.1851,
3264
+ "step": 4650
3265
+ },
3266
+ {
3267
+ "epoch": 0.932,
3268
+ "grad_norm": 32.81710433959961,
3269
+ "learning_rate": 7.009749855363456e-07,
3270
+ "loss": 1.5354,
3271
+ "step": 4660
3272
+ },
3273
+ {
3274
+ "epoch": 0.934,
3275
+ "grad_norm": 13.120279312133789,
3276
+ "learning_rate": 6.605274281709928e-07,
3277
+ "loss": 1.7694,
3278
+ "step": 4670
3279
+ },
3280
+ {
3281
+ "epoch": 0.936,
3282
+ "grad_norm": 3.324674606323242,
3283
+ "learning_rate": 6.212661423609184e-07,
3284
+ "loss": 1.4321,
3285
+ "step": 4680
3286
+ },
3287
+ {
3288
+ "epoch": 0.938,
3289
+ "grad_norm": 2.2350542545318604,
3290
+ "learning_rate": 5.83193041645802e-07,
3291
+ "loss": 1.4432,
3292
+ "step": 4690
3293
+ },
3294
+ {
3295
+ "epoch": 0.94,
3296
+ "grad_norm": 51.64103317260742,
3297
+ "learning_rate": 5.463099816548579e-07,
3298
+ "loss": 1.7524,
3299
+ "step": 4700
3300
+ },
3301
+ {
3302
+ "epoch": 0.942,
3303
+ "grad_norm": 20.108779907226562,
3304
+ "learning_rate": 5.106187600163987e-07,
3305
+ "loss": 3.8866,
3306
+ "step": 4710
3307
+ },
3308
+ {
3309
+ "epoch": 0.944,
3310
+ "grad_norm": 25.551082611083984,
3311
+ "learning_rate": 4.7612111627021175e-07,
3312
+ "loss": 1.618,
3313
+ "step": 4720
3314
+ },
3315
+ {
3316
+ "epoch": 0.946,
3317
+ "grad_norm": 5.8722639083862305,
3318
+ "learning_rate": 4.4281873178278475e-07,
3319
+ "loss": 2.4191,
3320
+ "step": 4730
3321
+ },
3322
+ {
3323
+ "epoch": 0.948,
3324
+ "grad_norm": 5.5632500648498535,
3325
+ "learning_rate": 4.107132296653549e-07,
3326
+ "loss": 1.4241,
3327
+ "step": 4740
3328
+ },
3329
+ {
3330
+ "epoch": 0.95,
3331
+ "grad_norm": 66.90338897705078,
3332
+ "learning_rate": 3.7980617469479953e-07,
3333
+ "loss": 1.8877,
3334
+ "step": 4750
3335
+ },
3336
+ {
3337
+ "epoch": 0.952,
3338
+ "grad_norm": 27.994674682617188,
3339
+ "learning_rate": 3.5009907323737825e-07,
3340
+ "loss": 1.4828,
3341
+ "step": 4760
3342
+ },
3343
+ {
3344
+ "epoch": 0.954,
3345
+ "grad_norm": 16.3344669342041,
3346
+ "learning_rate": 3.215933731753024e-07,
3347
+ "loss": 2.1333,
3348
+ "step": 4770
3349
+ },
3350
+ {
3351
+ "epoch": 0.956,
3352
+ "grad_norm": 2.000822067260742,
3353
+ "learning_rate": 2.942904638361804e-07,
3354
+ "loss": 2.0255,
3355
+ "step": 4780
3356
+ },
3357
+ {
3358
+ "epoch": 0.958,
3359
+ "grad_norm": 84.14417266845703,
3360
+ "learning_rate": 2.681916759252917e-07,
3361
+ "loss": 3.1469,
3362
+ "step": 4790
3363
+ },
3364
+ {
3365
+ "epoch": 0.96,
3366
+ "grad_norm": 2.5054540634155273,
3367
+ "learning_rate": 2.4329828146074095e-07,
3368
+ "loss": 2.5756,
3369
+ "step": 4800
3370
+ },
3371
+ {
3372
+ "epoch": 0.962,
3373
+ "grad_norm": 18.779565811157227,
3374
+ "learning_rate": 2.1961149371145795e-07,
3375
+ "loss": 2.2082,
3376
+ "step": 4810
3377
+ },
3378
+ {
3379
+ "epoch": 0.964,
3380
+ "grad_norm": 10.171392440795898,
3381
+ "learning_rate": 1.9713246713805588e-07,
3382
+ "loss": 1.2087,
3383
+ "step": 4820
3384
+ },
3385
+ {
3386
+ "epoch": 0.966,
3387
+ "grad_norm": 12.274552345275879,
3388
+ "learning_rate": 1.7586229733657644e-07,
3389
+ "loss": 3.4375,
3390
+ "step": 4830
3391
+ },
3392
+ {
3393
+ "epoch": 0.968,
3394
+ "grad_norm": 13.97036075592041,
3395
+ "learning_rate": 1.5580202098509077e-07,
3396
+ "loss": 1.2341,
3397
+ "step": 4840
3398
+ },
3399
+ {
3400
+ "epoch": 0.97,
3401
+ "grad_norm": 4.010193824768066,
3402
+ "learning_rate": 1.3695261579316777e-07,
3403
+ "loss": 1.7228,
3404
+ "step": 4850
3405
+ },
3406
+ {
3407
+ "epoch": 0.972,
3408
+ "grad_norm": 19.15084457397461,
3409
+ "learning_rate": 1.193150004542204e-07,
3410
+ "loss": 1.4849,
3411
+ "step": 4860
3412
+ },
3413
+ {
3414
+ "epoch": 0.974,
3415
+ "grad_norm": 4.279737949371338,
3416
+ "learning_rate": 1.0289003460074165e-07,
3417
+ "loss": 1.5513,
3418
+ "step": 4870
3419
+ },
3420
+ {
3421
+ "epoch": 0.976,
3422
+ "grad_norm": 3.084926128387451,
3423
+ "learning_rate": 8.767851876239074e-08,
3424
+ "loss": 1.3479,
3425
+ "step": 4880
3426
+ },
3427
+ {
3428
+ "epoch": 0.978,
3429
+ "grad_norm": 5.715677738189697,
3430
+ "learning_rate": 7.368119432699383e-08,
3431
+ "loss": 2.6461,
3432
+ "step": 4890
3433
+ },
3434
+ {
3435
+ "epoch": 0.98,
3436
+ "grad_norm": 72.04022216796875,
3437
+ "learning_rate": 6.089874350439506e-08,
3438
+ "loss": 1.5137,
3439
+ "step": 4900
3440
+ },
3441
+ {
3442
+ "epoch": 0.982,
3443
+ "grad_norm": 1.6261266469955444,
3444
+ "learning_rate": 4.9331789293211026e-08,
3445
+ "loss": 2.9775,
3446
+ "step": 4910
3447
+ },
3448
+ {
3449
+ "epoch": 0.984,
3450
+ "grad_norm": 3.387927532196045,
3451
+ "learning_rate": 3.8980895450474455e-08,
3452
+ "loss": 1.0868,
3453
+ "step": 4920
3454
+ },
3455
+ {
3456
+ "epoch": 0.986,
3457
+ "grad_norm": 73.5595932006836,
3458
+ "learning_rate": 2.9846566464150626e-08,
3459
+ "loss": 2.3111,
3460
+ "step": 4930
3461
+ },
3462
+ {
3463
+ "epoch": 0.988,
3464
+ "grad_norm": 8.196198463439941,
3465
+ "learning_rate": 2.192924752854042e-08,
3466
+ "loss": 2.8528,
3467
+ "step": 4940
3468
+ },
3469
+ {
3470
+ "epoch": 0.99,
3471
+ "grad_norm": 10.610214233398438,
3472
+ "learning_rate": 1.522932452260595e-08,
3473
+ "loss": 7.4089,
3474
+ "step": 4950
3475
+ },
3476
+ {
3477
+ "epoch": 0.992,
3478
+ "grad_norm": 2.894430160522461,
3479
+ "learning_rate": 9.747123991141194e-09,
3480
+ "loss": 5.3353,
3481
+ "step": 4960
3482
+ },
3483
+ {
3484
+ "epoch": 0.994,
3485
+ "grad_norm": 8.657221794128418,
3486
+ "learning_rate": 5.48291312886251e-09,
3487
+ "loss": 1.8168,
3488
+ "step": 4970
3489
+ },
3490
+ {
3491
+ "epoch": 0.996,
3492
+ "grad_norm": 2.366279363632202,
3493
+ "learning_rate": 2.4368997673940297e-09,
3494
+ "loss": 2.34,
3495
+ "step": 4980
3496
+ },
3497
+ {
3498
+ "epoch": 0.998,
3499
+ "grad_norm": 5.894517421722412,
3500
+ "learning_rate": 6.092323651313292e-10,
3501
+ "loss": 1.8474,
3502
+ "step": 4990
3503
+ },
3504
+ {
3505
+ "epoch": 1.0,
3506
+ "grad_norm": 27.153657913208008,
3507
+ "learning_rate": 0.0,
3508
+ "loss": 5.5777,
3509
+ "step": 5000
3510
+ },
3511
+ {
3512
+ "epoch": 1.0,
3513
+ "step": 5000,
3514
+ "total_flos": 9478520693637120.0,
3515
+ "train_loss": 2.8943692499160765,
3516
+ "train_runtime": 1526.3896,
3517
+ "train_samples_per_second": 3.276,
3518
+ "train_steps_per_second": 3.276
3519
+ }
3520
+ ],
3521
+ "logging_steps": 10,
3522
+ "max_steps": 5000,
3523
+ "num_input_tokens_seen": 0,
3524
+ "num_train_epochs": 1,
3525
+ "save_steps": 4000,
3526
+ "stateful_callbacks": {
3527
+ "TrainerControl": {
3528
+ "args": {
3529
+ "should_epoch_stop": false,
3530
+ "should_evaluate": false,
3531
+ "should_log": false,
3532
+ "should_save": true,
3533
+ "should_training_stop": true
3534
+ },
3535
+ "attributes": {}
3536
+ }
3537
+ },
3538
+ "total_flos": 9478520693637120.0,
3539
+ "train_batch_size": 1,
3540
+ "trial_name": null,
3541
+ "trial_params": null
3542
+ }
Llama-2-13b-chat-hf/DomainBench/Agriculture/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23a0f0ef63025bf3610f24845041b3ec2e067c7bef4a95b8c5cb40cf4406b62d
3
+ size 5432
Llama-2-13b-chat-hf/DomainBench/Agriculture/training_loss.png ADDED
Llama-2-13b-chat-hf/DomainBench/Finance/README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: /hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: threshold_3-lamb_0.1-lr_5e-5
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # threshold_3-lamb_0.1-lr_5e-5
18
+
19
+ This model is a fine-tuned version of [/hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf](https://huggingface.co//hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf) on the wealth dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-05
39
+ - train_batch_size: 1
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
43
+ - lr_scheduler_type: cosine
44
+ - lr_scheduler_warmup_ratio: 0.1
45
+ - num_epochs: 1.0
46
+
47
+ ### Training results
48
+
49
+
50
+
51
+ ### Framework versions
52
+
53
+ - PEFT 0.12.0
54
+ - Transformers 4.46.1
55
+ - Pytorch 2.5.1+cu124
56
+ - Datasets 3.1.0
57
+ - Tokenizers 0.20.3
Llama-2-13b-chat-hf/DomainBench/Finance/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
Llama-2-13b-chat-hf/DomainBench/Finance/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa42a9d633b1cd3d1ecf795c7e679526bed7c191f6e0a95154fca04b2d6fe75d
3
+ size 26235704
Llama-2-13b-chat-hf/DomainBench/Finance/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 1.151346780094464e+16,
4
+ "train_loss": 1.6451873833656312,
5
+ "train_runtime": 1464.1065,
6
+ "train_samples_per_second": 3.415,
7
+ "train_steps_per_second": 3.415
8
+ }
Llama-2-13b-chat-hf/DomainBench/Finance/logfile.txt ADDED
The diff for this file is too large to render. See raw diff
 
Llama-2-13b-chat-hf/DomainBench/Finance/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
Llama-2-13b-chat-hf/DomainBench/Finance/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama-2-13b-chat-hf/DomainBench/Finance/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
Llama-2-13b-chat-hf/DomainBench/Finance/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": false,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "split_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
Llama-2-13b-chat-hf/DomainBench/Finance/train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 1.151346780094464e+16,
4
+ "train_loss": 1.6451873833656312,
5
+ "train_runtime": 1464.1065,
6
+ "train_samples_per_second": 3.415,
7
+ "train_steps_per_second": 3.415
8
+ }
Llama-2-13b-chat-hf/DomainBench/Finance/trainer_log.jsonl ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 5000, "loss": 3.5335, "lr": 1.0000000000000002e-06, "epoch": 0.002, "percentage": 0.2, "elapsed_time": "0:00:04", "remaining_time": "0:33:19"}
2
+ {"current_steps": 20, "total_steps": 5000, "loss": 5.5042, "lr": 2.0000000000000003e-06, "epoch": 0.004, "percentage": 0.4, "elapsed_time": "0:00:06", "remaining_time": "0:28:52"}
3
+ {"current_steps": 30, "total_steps": 5000, "loss": 2.9337, "lr": 3e-06, "epoch": 0.006, "percentage": 0.6, "elapsed_time": "0:00:09", "remaining_time": "0:27:30"}
4
+ {"current_steps": 40, "total_steps": 5000, "loss": 8.5725, "lr": 4.000000000000001e-06, "epoch": 0.008, "percentage": 0.8, "elapsed_time": "0:00:12", "remaining_time": "0:26:44"}
5
+ {"current_steps": 50, "total_steps": 5000, "loss": 2.2742, "lr": 5e-06, "epoch": 0.01, "percentage": 1.0, "elapsed_time": "0:00:15", "remaining_time": "0:25:30"}
6
+ {"current_steps": 60, "total_steps": 5000, "loss": 4.5745, "lr": 6e-06, "epoch": 0.012, "percentage": 1.2, "elapsed_time": "0:00:17", "remaining_time": "0:24:16"}
7
+ {"current_steps": 70, "total_steps": 5000, "loss": 3.9419, "lr": 7.000000000000001e-06, "epoch": 0.014, "percentage": 1.4, "elapsed_time": "0:00:20", "remaining_time": "0:24:13"}
8
+ {"current_steps": 80, "total_steps": 5000, "loss": 2.2781, "lr": 8.000000000000001e-06, "epoch": 0.016, "percentage": 1.6, "elapsed_time": "0:00:23", "remaining_time": "0:23:59"}
9
+ {"current_steps": 90, "total_steps": 5000, "loss": 6.3897, "lr": 9e-06, "epoch": 0.018, "percentage": 1.8, "elapsed_time": "0:00:53", "remaining_time": "0:48:21"}
10
+ {"current_steps": 100, "total_steps": 5000, "loss": 3.6837, "lr": 1e-05, "epoch": 0.02, "percentage": 2.0, "elapsed_time": "0:00:55", "remaining_time": "0:45:42"}
11
+ {"current_steps": 110, "total_steps": 5000, "loss": 1.3266, "lr": 1.1000000000000001e-05, "epoch": 0.022, "percentage": 2.2, "elapsed_time": "0:00:58", "remaining_time": "0:43:20"}
12
+ {"current_steps": 120, "total_steps": 5000, "loss": 6.1833, "lr": 1.2e-05, "epoch": 0.024, "percentage": 2.4, "elapsed_time": "0:01:01", "remaining_time": "0:41:40"}
13
+ {"current_steps": 130, "total_steps": 5000, "loss": 2.6712, "lr": 1.3000000000000001e-05, "epoch": 0.026, "percentage": 2.6, "elapsed_time": "0:01:04", "remaining_time": "0:40:16"}
14
+ {"current_steps": 140, "total_steps": 5000, "loss": 2.5445, "lr": 1.4000000000000001e-05, "epoch": 0.028, "percentage": 2.8, "elapsed_time": "0:01:07", "remaining_time": "0:39:03"}
15
+ {"current_steps": 150, "total_steps": 5000, "loss": 2.7276, "lr": 1.5e-05, "epoch": 0.03, "percentage": 3.0, "elapsed_time": "0:01:10", "remaining_time": "0:37:59"}
16
+ {"current_steps": 160, "total_steps": 5000, "loss": 1.7329, "lr": 1.6000000000000003e-05, "epoch": 0.032, "percentage": 3.2, "elapsed_time": "0:01:12", "remaining_time": "0:36:48"}
17
+ {"current_steps": 170, "total_steps": 5000, "loss": 1.916, "lr": 1.7000000000000003e-05, "epoch": 0.034, "percentage": 3.4, "elapsed_time": "0:01:15", "remaining_time": "0:35:52"}
18
+ {"current_steps": 180, "total_steps": 5000, "loss": 6.3988, "lr": 1.8e-05, "epoch": 0.036, "percentage": 3.6, "elapsed_time": "0:01:18", "remaining_time": "0:35:01"}
19
+ {"current_steps": 190, "total_steps": 5000, "loss": 2.1053, "lr": 1.9e-05, "epoch": 0.038, "percentage": 3.8, "elapsed_time": "0:01:21", "remaining_time": "0:34:23"}
20
+ {"current_steps": 200, "total_steps": 5000, "loss": 0.798, "lr": 2e-05, "epoch": 0.04, "percentage": 4.0, "elapsed_time": "0:01:24", "remaining_time": "0:33:37"}
21
+ {"current_steps": 210, "total_steps": 5000, "loss": 2.0661, "lr": 2.1e-05, "epoch": 0.042, "percentage": 4.2, "elapsed_time": "0:01:26", "remaining_time": "0:32:59"}
22
+ {"current_steps": 220, "total_steps": 5000, "loss": 2.4783, "lr": 2.2000000000000003e-05, "epoch": 0.044, "percentage": 4.4, "elapsed_time": "0:01:29", "remaining_time": "0:32:26"}
23
+ {"current_steps": 230, "total_steps": 5000, "loss": 0.7402, "lr": 2.3000000000000003e-05, "epoch": 0.046, "percentage": 4.6, "elapsed_time": "0:01:32", "remaining_time": "0:31:58"}
24
+ {"current_steps": 240, "total_steps": 5000, "loss": 2.5115, "lr": 2.4e-05, "epoch": 0.048, "percentage": 4.8, "elapsed_time": "0:01:35", "remaining_time": "0:31:36"}
25
+ {"current_steps": 250, "total_steps": 5000, "loss": 2.13, "lr": 2.5e-05, "epoch": 0.05, "percentage": 5.0, "elapsed_time": "0:01:38", "remaining_time": "0:31:11"}
26
+ {"current_steps": 260, "total_steps": 5000, "loss": 1.6962, "lr": 2.6000000000000002e-05, "epoch": 0.052, "percentage": 5.2, "elapsed_time": "0:01:41", "remaining_time": "0:30:50"}
27
+ {"current_steps": 270, "total_steps": 5000, "loss": 2.8705, "lr": 2.7000000000000002e-05, "epoch": 0.054, "percentage": 5.4, "elapsed_time": "0:01:44", "remaining_time": "0:30:31"}
28
+ {"current_steps": 280, "total_steps": 5000, "loss": 2.9734, "lr": 2.8000000000000003e-05, "epoch": 0.056, "percentage": 5.6, "elapsed_time": "0:01:46", "remaining_time": "0:30:02"}
29
+ {"current_steps": 290, "total_steps": 5000, "loss": 1.7669, "lr": 2.9e-05, "epoch": 0.058, "percentage": 5.8, "elapsed_time": "0:01:49", "remaining_time": "0:29:46"}
30
+ {"current_steps": 300, "total_steps": 5000, "loss": 2.1499, "lr": 3e-05, "epoch": 0.06, "percentage": 6.0, "elapsed_time": "0:01:53", "remaining_time": "0:29:30"}
31
+ {"current_steps": 310, "total_steps": 5000, "loss": 1.6354, "lr": 3.1e-05, "epoch": 0.062, "percentage": 6.2, "elapsed_time": "0:01:55", "remaining_time": "0:29:11"}
32
+ {"current_steps": 320, "total_steps": 5000, "loss": 2.1457, "lr": 3.2000000000000005e-05, "epoch": 0.064, "percentage": 6.4, "elapsed_time": "0:01:58", "remaining_time": "0:28:53"}
33
+ {"current_steps": 330, "total_steps": 5000, "loss": 3.0738, "lr": 3.3e-05, "epoch": 0.066, "percentage": 6.6, "elapsed_time": "0:02:01", "remaining_time": "0:28:39"}
34
+ {"current_steps": 340, "total_steps": 5000, "loss": 2.4357, "lr": 3.4000000000000007e-05, "epoch": 0.068, "percentage": 6.8, "elapsed_time": "0:02:04", "remaining_time": "0:28:26"}
35
+ {"current_steps": 350, "total_steps": 5000, "loss": 1.7431, "lr": 3.5e-05, "epoch": 0.07, "percentage": 7.0, "elapsed_time": "0:02:07", "remaining_time": "0:28:10"}
36
+ {"current_steps": 360, "total_steps": 5000, "loss": 1.7527, "lr": 3.6e-05, "epoch": 0.072, "percentage": 7.2, "elapsed_time": "0:02:10", "remaining_time": "0:27:59"}
37
+ {"current_steps": 370, "total_steps": 5000, "loss": 6.1666, "lr": 3.7e-05, "epoch": 0.074, "percentage": 7.4, "elapsed_time": "0:02:13", "remaining_time": "0:27:48"}
38
+ {"current_steps": 380, "total_steps": 5000, "loss": 0.5917, "lr": 3.8e-05, "epoch": 0.076, "percentage": 7.6, "elapsed_time": "0:02:16", "remaining_time": "0:27:35"}
39
+ {"current_steps": 390, "total_steps": 5000, "loss": 1.5061, "lr": 3.9000000000000006e-05, "epoch": 0.078, "percentage": 7.8, "elapsed_time": "0:02:18", "remaining_time": "0:27:18"}
40
+ {"current_steps": 400, "total_steps": 5000, "loss": 1.5694, "lr": 4e-05, "epoch": 0.08, "percentage": 8.0, "elapsed_time": "0:02:21", "remaining_time": "0:27:08"}
41
+ {"current_steps": 410, "total_steps": 5000, "loss": 1.4762, "lr": 4.1e-05, "epoch": 0.082, "percentage": 8.2, "elapsed_time": "0:02:24", "remaining_time": "0:26:59"}
42
+ {"current_steps": 420, "total_steps": 5000, "loss": 1.0468, "lr": 4.2e-05, "epoch": 0.084, "percentage": 8.4, "elapsed_time": "0:02:27", "remaining_time": "0:26:47"}
43
+ {"current_steps": 430, "total_steps": 5000, "loss": 7.1942, "lr": 4.3e-05, "epoch": 0.086, "percentage": 8.6, "elapsed_time": "0:02:30", "remaining_time": "0:26:35"}
44
+ {"current_steps": 440, "total_steps": 5000, "loss": 2.706, "lr": 4.4000000000000006e-05, "epoch": 0.088, "percentage": 8.8, "elapsed_time": "0:02:33", "remaining_time": "0:26:26"}
45
+ {"current_steps": 450, "total_steps": 5000, "loss": 1.8559, "lr": 4.5e-05, "epoch": 0.09, "percentage": 9.0, "elapsed_time": "0:02:35", "remaining_time": "0:26:15"}
46
+ {"current_steps": 460, "total_steps": 5000, "loss": 1.2105, "lr": 4.600000000000001e-05, "epoch": 0.092, "percentage": 9.2, "elapsed_time": "0:02:38", "remaining_time": "0:26:07"}
47
+ {"current_steps": 470, "total_steps": 5000, "loss": 2.8524, "lr": 4.7e-05, "epoch": 0.094, "percentage": 9.4, "elapsed_time": "0:02:41", "remaining_time": "0:25:57"}
48
+ {"current_steps": 480, "total_steps": 5000, "loss": 2.0881, "lr": 4.8e-05, "epoch": 0.096, "percentage": 9.6, "elapsed_time": "0:02:44", "remaining_time": "0:25:47"}
49
+ {"current_steps": 490, "total_steps": 5000, "loss": 2.4156, "lr": 4.9e-05, "epoch": 0.098, "percentage": 9.8, "elapsed_time": "0:02:47", "remaining_time": "0:25:40"}
50
+ {"current_steps": 500, "total_steps": 5000, "loss": 0.9531, "lr": 5e-05, "epoch": 0.1, "percentage": 10.0, "elapsed_time": "0:02:50", "remaining_time": "0:25:31"}
51
+ {"current_steps": 510, "total_steps": 5000, "loss": 3.9691, "lr": 4.999939076763487e-05, "epoch": 0.102, "percentage": 10.2, "elapsed_time": "0:02:52", "remaining_time": "0:25:22"}
52
+ {"current_steps": 520, "total_steps": 5000, "loss": 0.8777, "lr": 4.999756310023261e-05, "epoch": 0.104, "percentage": 10.4, "elapsed_time": "0:02:55", "remaining_time": "0:25:11"}
53
+ {"current_steps": 530, "total_steps": 5000, "loss": 0.9434, "lr": 4.999451708687114e-05, "epoch": 0.106, "percentage": 10.6, "elapsed_time": "0:02:58", "remaining_time": "0:25:03"}
54
+ {"current_steps": 540, "total_steps": 5000, "loss": 1.5592, "lr": 4.999025287600886e-05, "epoch": 0.108, "percentage": 10.8, "elapsed_time": "0:03:01", "remaining_time": "0:24:55"}
55
+ {"current_steps": 550, "total_steps": 5000, "loss": 0.6868, "lr": 4.99847706754774e-05, "epoch": 0.11, "percentage": 11.0, "elapsed_time": "0:03:04", "remaining_time": "0:24:50"}
56
+ {"current_steps": 560, "total_steps": 5000, "loss": 0.956, "lr": 4.997807075247146e-05, "epoch": 0.112, "percentage": 11.2, "elapsed_time": "0:03:07", "remaining_time": "0:24:44"}
57
+ {"current_steps": 570, "total_steps": 5000, "loss": 0.8766, "lr": 4.997015343353585e-05, "epoch": 0.114, "percentage": 11.4, "elapsed_time": "0:03:10", "remaining_time": "0:24:36"}
58
+ {"current_steps": 580, "total_steps": 5000, "loss": 0.5423, "lr": 4.996101910454953e-05, "epoch": 0.116, "percentage": 11.6, "elapsed_time": "0:03:13", "remaining_time": "0:24:30"}
59
+ {"current_steps": 590, "total_steps": 5000, "loss": 1.7762, "lr": 4.995066821070679e-05, "epoch": 0.118, "percentage": 11.8, "elapsed_time": "0:03:15", "remaining_time": "0:24:22"}
60
+ {"current_steps": 600, "total_steps": 5000, "loss": 2.294, "lr": 4.993910125649561e-05, "epoch": 0.12, "percentage": 12.0, "elapsed_time": "0:03:18", "remaining_time": "0:24:16"}
61
+ {"current_steps": 610, "total_steps": 5000, "loss": 0.4881, "lr": 4.992631880567301e-05, "epoch": 0.122, "percentage": 12.2, "elapsed_time": "0:03:21", "remaining_time": "0:24:11"}
62
+ {"current_steps": 620, "total_steps": 5000, "loss": 3.3744, "lr": 4.991232148123761e-05, "epoch": 0.124, "percentage": 12.4, "elapsed_time": "0:03:24", "remaining_time": "0:24:05"}
63
+ {"current_steps": 630, "total_steps": 5000, "loss": 1.2847, "lr": 4.989710996539926e-05, "epoch": 0.126, "percentage": 12.6, "elapsed_time": "0:03:27", "remaining_time": "0:24:00"}
64
+ {"current_steps": 640, "total_steps": 5000, "loss": 0.9029, "lr": 4.988068499954578e-05, "epoch": 0.128, "percentage": 12.8, "elapsed_time": "0:03:30", "remaining_time": "0:23:53"}
65
+ {"current_steps": 650, "total_steps": 5000, "loss": 1.3799, "lr": 4.9863047384206835e-05, "epoch": 0.13, "percentage": 13.0, "elapsed_time": "0:03:33", "remaining_time": "0:23:46"}
66
+ {"current_steps": 660, "total_steps": 5000, "loss": 3.7841, "lr": 4.984419797901491e-05, "epoch": 0.132, "percentage": 13.2, "elapsed_time": "0:03:36", "remaining_time": "0:23:41"}
67
+ {"current_steps": 670, "total_steps": 5000, "loss": 0.9186, "lr": 4.982413770266342e-05, "epoch": 0.134, "percentage": 13.4, "elapsed_time": "0:03:39", "remaining_time": "0:23:36"}
68
+ {"current_steps": 680, "total_steps": 5000, "loss": 1.4738, "lr": 4.980286753286195e-05, "epoch": 0.136, "percentage": 13.6, "elapsed_time": "0:03:41", "remaining_time": "0:23:28"}
69
+ {"current_steps": 690, "total_steps": 5000, "loss": 3.8651, "lr": 4.978038850628854e-05, "epoch": 0.138, "percentage": 13.8, "elapsed_time": "0:03:44", "remaining_time": "0:23:22"}
70
+ {"current_steps": 700, "total_steps": 5000, "loss": 1.1881, "lr": 4.975670171853926e-05, "epoch": 0.14, "percentage": 14.0, "elapsed_time": "0:03:47", "remaining_time": "0:23:16"}
71
+ {"current_steps": 710, "total_steps": 5000, "loss": 0.8245, "lr": 4.9731808324074717e-05, "epoch": 0.142, "percentage": 14.2, "elapsed_time": "0:03:50", "remaining_time": "0:23:10"}
72
+ {"current_steps": 720, "total_steps": 5000, "loss": 1.2642, "lr": 4.9705709536163824e-05, "epoch": 0.144, "percentage": 14.4, "elapsed_time": "0:03:53", "remaining_time": "0:23:05"}
73
+ {"current_steps": 730, "total_steps": 5000, "loss": 1.6256, "lr": 4.96784066268247e-05, "epoch": 0.146, "percentage": 14.6, "elapsed_time": "0:03:56", "remaining_time": "0:23:00"}
74
+ {"current_steps": 740, "total_steps": 5000, "loss": 9.4718, "lr": 4.964990092676263e-05, "epoch": 0.148, "percentage": 14.8, "elapsed_time": "0:03:59", "remaining_time": "0:22:56"}
75
+ {"current_steps": 750, "total_steps": 5000, "loss": 1.6593, "lr": 4.962019382530521e-05, "epoch": 0.15, "percentage": 15.0, "elapsed_time": "0:04:02", "remaining_time": "0:22:51"}
76
+ {"current_steps": 760, "total_steps": 5000, "loss": 1.4136, "lr": 4.9589286770334654e-05, "epoch": 0.152, "percentage": 15.2, "elapsed_time": "0:04:05", "remaining_time": "0:22:46"}
77
+ {"current_steps": 770, "total_steps": 5000, "loss": 2.5935, "lr": 4.9557181268217227e-05, "epoch": 0.154, "percentage": 15.4, "elapsed_time": "0:04:07", "remaining_time": "0:22:41"}
78
+ {"current_steps": 780, "total_steps": 5000, "loss": 2.0708, "lr": 4.952387888372979e-05, "epoch": 0.156, "percentage": 15.6, "elapsed_time": "0:04:10", "remaining_time": "0:22:34"}
79
+ {"current_steps": 790, "total_steps": 5000, "loss": 0.8737, "lr": 4.94893812399836e-05, "epoch": 0.158, "percentage": 15.8, "elapsed_time": "0:04:13", "remaining_time": "0:22:28"}
80
+ {"current_steps": 800, "total_steps": 5000, "loss": 2.0691, "lr": 4.9453690018345144e-05, "epoch": 0.16, "percentage": 16.0, "elapsed_time": "0:04:15", "remaining_time": "0:22:23"}
81
+ {"current_steps": 810, "total_steps": 5000, "loss": 0.7324, "lr": 4.94168069583542e-05, "epoch": 0.162, "percentage": 16.2, "elapsed_time": "0:04:18", "remaining_time": "0:22:19"}
82
+ {"current_steps": 820, "total_steps": 5000, "loss": 0.577, "lr": 4.937873385763908e-05, "epoch": 0.164, "percentage": 16.4, "elapsed_time": "0:04:21", "remaining_time": "0:22:11"}
83
+ {"current_steps": 830, "total_steps": 5000, "loss": 0.6428, "lr": 4.933947257182901e-05, "epoch": 0.166, "percentage": 16.6, "elapsed_time": "0:04:23", "remaining_time": "0:22:04"}
84
+ {"current_steps": 840, "total_steps": 5000, "loss": 1.4208, "lr": 4.929902501446366e-05, "epoch": 0.168, "percentage": 16.8, "elapsed_time": "0:04:26", "remaining_time": "0:22:00"}
85
+ {"current_steps": 850, "total_steps": 5000, "loss": 1.1485, "lr": 4.925739315689991e-05, "epoch": 0.17, "percentage": 17.0, "elapsed_time": "0:04:29", "remaining_time": "0:21:55"}
86
+ {"current_steps": 860, "total_steps": 5000, "loss": 0.8729, "lr": 4.9214579028215776e-05, "epoch": 0.172, "percentage": 17.2, "elapsed_time": "0:04:32", "remaining_time": "0:21:51"}
87
+ {"current_steps": 870, "total_steps": 5000, "loss": 2.3371, "lr": 4.917058471511149e-05, "epoch": 0.174, "percentage": 17.4, "elapsed_time": "0:04:35", "remaining_time": "0:21:46"}
88
+ {"current_steps": 880, "total_steps": 5000, "loss": 0.6887, "lr": 4.912541236180779e-05, "epoch": 0.176, "percentage": 17.6, "elapsed_time": "0:04:38", "remaining_time": "0:21:42"}
89
+ {"current_steps": 890, "total_steps": 5000, "loss": 0.8769, "lr": 4.907906416994146e-05, "epoch": 0.178, "percentage": 17.8, "elapsed_time": "0:04:40", "remaining_time": "0:21:37"}
90
+ {"current_steps": 900, "total_steps": 5000, "loss": 2.4827, "lr": 4.9031542398457974e-05, "epoch": 0.18, "percentage": 18.0, "elapsed_time": "0:04:43", "remaining_time": "0:21:33"}
91
+ {"current_steps": 910, "total_steps": 5000, "loss": 0.5722, "lr": 4.898284936350144e-05, "epoch": 0.182, "percentage": 18.2, "elapsed_time": "0:04:46", "remaining_time": "0:21:28"}
92
+ {"current_steps": 920, "total_steps": 5000, "loss": 1.3822, "lr": 4.893298743830168e-05, "epoch": 0.184, "percentage": 18.4, "elapsed_time": "0:04:49", "remaining_time": "0:21:24"}
93
+ {"current_steps": 930, "total_steps": 5000, "loss": 0.7233, "lr": 4.888195905305859e-05, "epoch": 0.186, "percentage": 18.6, "elapsed_time": "0:04:52", "remaining_time": "0:21:20"}
94
+ {"current_steps": 940, "total_steps": 5000, "loss": 9.3579, "lr": 4.882976669482367e-05, "epoch": 0.188, "percentage": 18.8, "elapsed_time": "0:04:55", "remaining_time": "0:21:15"}
95
+ {"current_steps": 950, "total_steps": 5000, "loss": 2.2926, "lr": 4.877641290737884e-05, "epoch": 0.19, "percentage": 19.0, "elapsed_time": "0:04:58", "remaining_time": "0:21:10"}
96
+ {"current_steps": 960, "total_steps": 5000, "loss": 0.8563, "lr": 4.8721900291112415e-05, "epoch": 0.192, "percentage": 19.2, "elapsed_time": "0:05:00", "remaining_time": "0:21:06"}
97
+ {"current_steps": 970, "total_steps": 5000, "loss": 2.229, "lr": 4.8666231502892415e-05, "epoch": 0.194, "percentage": 19.4, "elapsed_time": "0:05:03", "remaining_time": "0:21:02"}
98
+ {"current_steps": 980, "total_steps": 5000, "loss": 1.7804, "lr": 4.860940925593703e-05, "epoch": 0.196, "percentage": 19.6, "elapsed_time": "0:05:06", "remaining_time": "0:20:58"}
99
+ {"current_steps": 990, "total_steps": 5000, "loss": 0.826, "lr": 4.855143631968242e-05, "epoch": 0.198, "percentage": 19.8, "elapsed_time": "0:05:09", "remaining_time": "0:20:55"}
100
+ {"current_steps": 1000, "total_steps": 5000, "loss": 1.746, "lr": 4.849231551964771e-05, "epoch": 0.2, "percentage": 20.0, "elapsed_time": "0:05:12", "remaining_time": "0:20:51"}
101
+ {"current_steps": 1010, "total_steps": 5000, "loss": 2.6019, "lr": 4.843204973729729e-05, "epoch": 0.202, "percentage": 20.2, "elapsed_time": "0:05:15", "remaining_time": "0:20:47"}
102
+ {"current_steps": 1020, "total_steps": 5000, "loss": 1.5425, "lr": 4.837064190990036e-05, "epoch": 0.204, "percentage": 20.4, "elapsed_time": "0:05:18", "remaining_time": "0:20:43"}
103
+ {"current_steps": 1030, "total_steps": 5000, "loss": 1.2792, "lr": 4.830809503038781e-05, "epoch": 0.206, "percentage": 20.6, "elapsed_time": "0:05:21", "remaining_time": "0:20:39"}
104
+ {"current_steps": 1040, "total_steps": 5000, "loss": 0.9529, "lr": 4.8244412147206284e-05, "epoch": 0.208, "percentage": 20.8, "elapsed_time": "0:05:24", "remaining_time": "0:20:35"}
105
+ {"current_steps": 1050, "total_steps": 5000, "loss": 1.3505, "lr": 4.817959636416969e-05, "epoch": 0.21, "percentage": 21.0, "elapsed_time": "0:05:26", "remaining_time": "0:20:29"}
106
+ {"current_steps": 1060, "total_steps": 5000, "loss": 0.9163, "lr": 4.8113650840307834e-05, "epoch": 0.212, "percentage": 21.2, "elapsed_time": "0:05:29", "remaining_time": "0:20:26"}
107
+ {"current_steps": 1070, "total_steps": 5000, "loss": 1.228, "lr": 4.8046578789712515e-05, "epoch": 0.214, "percentage": 21.4, "elapsed_time": "0:05:32", "remaining_time": "0:20:22"}
108
+ {"current_steps": 1080, "total_steps": 5000, "loss": 1.1117, "lr": 4.797838348138086e-05, "epoch": 0.216, "percentage": 21.6, "elapsed_time": "0:05:35", "remaining_time": "0:20:19"}
109
+ {"current_steps": 1090, "total_steps": 5000, "loss": 1.7063, "lr": 4.790906823905599e-05, "epoch": 0.218, "percentage": 21.8, "elapsed_time": "0:05:38", "remaining_time": "0:20:14"}
110
+ {"current_steps": 1100, "total_steps": 5000, "loss": 0.9846, "lr": 4.783863644106502e-05, "epoch": 0.22, "percentage": 22.0, "elapsed_time": "0:05:41", "remaining_time": "0:20:10"}
111
+ {"current_steps": 1110, "total_steps": 5000, "loss": 0.9996, "lr": 4.776709152015443e-05, "epoch": 0.222, "percentage": 22.2, "elapsed_time": "0:05:44", "remaining_time": "0:20:07"}
112
+ {"current_steps": 1120, "total_steps": 5000, "loss": 1.5269, "lr": 4.769443696332272e-05, "epoch": 0.224, "percentage": 22.4, "elapsed_time": "0:05:47", "remaining_time": "0:20:03"}
113
+ {"current_steps": 1130, "total_steps": 5000, "loss": 1.7736, "lr": 4.762067631165049e-05, "epoch": 0.226, "percentage": 22.6, "elapsed_time": "0:05:50", "remaining_time": "0:19:59"}
114
+ {"current_steps": 1140, "total_steps": 5000, "loss": 1.0233, "lr": 4.754581316012785e-05, "epoch": 0.228, "percentage": 22.8, "elapsed_time": "0:05:53", "remaining_time": "0:19:56"}
115
+ {"current_steps": 1150, "total_steps": 5000, "loss": 1.3813, "lr": 4.7469851157479177e-05, "epoch": 0.23, "percentage": 23.0, "elapsed_time": "0:05:56", "remaining_time": "0:19:53"}
116
+ {"current_steps": 1160, "total_steps": 5000, "loss": 0.6272, "lr": 4.7392794005985326e-05, "epoch": 0.232, "percentage": 23.2, "elapsed_time": "0:05:59", "remaining_time": "0:19:48"}
117
+ {"current_steps": 1170, "total_steps": 5000, "loss": 2.541, "lr": 4.731464546130314e-05, "epoch": 0.234, "percentage": 23.4, "elapsed_time": "0:06:02", "remaining_time": "0:19:45"}
118
+ {"current_steps": 1180, "total_steps": 5000, "loss": 1.0791, "lr": 4.723540933228244e-05, "epoch": 0.236, "percentage": 23.6, "elapsed_time": "0:06:05", "remaining_time": "0:19:41"}
119
+ {"current_steps": 1190, "total_steps": 5000, "loss": 1.5918, "lr": 4.715508948078037e-05, "epoch": 0.238, "percentage": 23.8, "elapsed_time": "0:06:08", "remaining_time": "0:19:38"}
120
+ {"current_steps": 1200, "total_steps": 5000, "loss": 1.3247, "lr": 4.707368982147318e-05, "epoch": 0.24, "percentage": 24.0, "elapsed_time": "0:06:10", "remaining_time": "0:19:33"}
121
+ {"current_steps": 1210, "total_steps": 5000, "loss": 1.0801, "lr": 4.6991214321665414e-05, "epoch": 0.242, "percentage": 24.2, "elapsed_time": "0:06:13", "remaining_time": "0:19:29"}
122
+ {"current_steps": 1220, "total_steps": 5000, "loss": 2.2492, "lr": 4.690766700109659e-05, "epoch": 0.244, "percentage": 24.4, "elapsed_time": "0:06:16", "remaining_time": "0:19:25"}
123
+ {"current_steps": 1230, "total_steps": 5000, "loss": 0.9898, "lr": 4.682305193174524e-05, "epoch": 0.246, "percentage": 24.6, "elapsed_time": "0:06:19", "remaining_time": "0:19:22"}
124
+ {"current_steps": 1240, "total_steps": 5000, "loss": 0.8907, "lr": 4.6737373237630476e-05, "epoch": 0.248, "percentage": 24.8, "elapsed_time": "0:06:22", "remaining_time": "0:19:18"}
125
+ {"current_steps": 1250, "total_steps": 5000, "loss": 0.7756, "lr": 4.665063509461097e-05, "epoch": 0.25, "percentage": 25.0, "elapsed_time": "0:06:24", "remaining_time": "0:19:13"}
126
+ {"current_steps": 1260, "total_steps": 5000, "loss": 0.6585, "lr": 4.656284173018144e-05, "epoch": 0.252, "percentage": 25.2, "elapsed_time": "0:06:27", "remaining_time": "0:19:09"}
127
+ {"current_steps": 1270, "total_steps": 5000, "loss": 1.2712, "lr": 4.6473997423266614e-05, "epoch": 0.254, "percentage": 25.4, "elapsed_time": "0:06:30", "remaining_time": "0:19:06"}
128
+ {"current_steps": 1280, "total_steps": 5000, "loss": 1.2611, "lr": 4.638410650401267e-05, "epoch": 0.256, "percentage": 25.6, "elapsed_time": "0:06:33", "remaining_time": "0:19:03"}
129
+ {"current_steps": 1290, "total_steps": 5000, "loss": 1.0479, "lr": 4.629317335357619e-05, "epoch": 0.258, "percentage": 25.8, "elapsed_time": "0:06:36", "remaining_time": "0:19:00"}
130
+ {"current_steps": 1300, "total_steps": 5000, "loss": 1.6104, "lr": 4.620120240391065e-05, "epoch": 0.26, "percentage": 26.0, "elapsed_time": "0:06:39", "remaining_time": "0:18:56"}
131
+ {"current_steps": 1310, "total_steps": 5000, "loss": 1.3264, "lr": 4.610819813755038e-05, "epoch": 0.262, "percentage": 26.2, "elapsed_time": "0:06:42", "remaining_time": "0:18:52"}
132
+ {"current_steps": 1320, "total_steps": 5000, "loss": 1.1883, "lr": 4.601416508739211e-05, "epoch": 0.264, "percentage": 26.4, "elapsed_time": "0:06:45", "remaining_time": "0:18:49"}
133
+ {"current_steps": 1330, "total_steps": 5000, "loss": 1.4327, "lr": 4.591910783647404e-05, "epoch": 0.266, "percentage": 26.6, "elapsed_time": "0:06:48", "remaining_time": "0:18:46"}
134
+ {"current_steps": 1340, "total_steps": 5000, "loss": 1.3558, "lr": 4.5823031017752485e-05, "epoch": 0.268, "percentage": 26.8, "elapsed_time": "0:06:50", "remaining_time": "0:18:42"}
135
+ {"current_steps": 1350, "total_steps": 5000, "loss": 2.6668, "lr": 4.572593931387604e-05, "epoch": 0.27, "percentage": 27.0, "elapsed_time": "0:06:53", "remaining_time": "0:18:38"}
136
+ {"current_steps": 1360, "total_steps": 5000, "loss": 0.4225, "lr": 4.562783745695738e-05, "epoch": 0.272, "percentage": 27.2, "elapsed_time": "0:06:56", "remaining_time": "0:18:33"}
137
+ {"current_steps": 1370, "total_steps": 5000, "loss": 1.1548, "lr": 4.5528730228342605e-05, "epoch": 0.274, "percentage": 27.4, "elapsed_time": "0:06:59", "remaining_time": "0:18:30"}
138
+ {"current_steps": 1380, "total_steps": 5000, "loss": 0.9096, "lr": 4.542862245837821e-05, "epoch": 0.276, "percentage": 27.6, "elapsed_time": "0:07:01", "remaining_time": "0:18:26"}
139
+ {"current_steps": 1390, "total_steps": 5000, "loss": 1.3681, "lr": 4.532751902617569e-05, "epoch": 0.278, "percentage": 27.8, "elapsed_time": "0:07:04", "remaining_time": "0:18:23"}
140
+ {"current_steps": 1400, "total_steps": 5000, "loss": 1.286, "lr": 4.522542485937369e-05, "epoch": 0.28, "percentage": 28.0, "elapsed_time": "0:07:07", "remaining_time": "0:18:19"}
141
+ {"current_steps": 1410, "total_steps": 5000, "loss": 1.0204, "lr": 4.512234493389785e-05, "epoch": 0.282, "percentage": 28.2, "elapsed_time": "0:07:10", "remaining_time": "0:18:15"}
142
+ {"current_steps": 1420, "total_steps": 5000, "loss": 1.083, "lr": 4.5018284273718336e-05, "epoch": 0.284, "percentage": 28.4, "elapsed_time": "0:07:12", "remaining_time": "0:18:11"}
143
+ {"current_steps": 1430, "total_steps": 5000, "loss": 0.5139, "lr": 4.491324795060491e-05, "epoch": 0.286, "percentage": 28.6, "elapsed_time": "0:07:16", "remaining_time": "0:18:08"}
144
+ {"current_steps": 1440, "total_steps": 5000, "loss": 1.591, "lr": 4.480724108387977e-05, "epoch": 0.288, "percentage": 28.8, "elapsed_time": "0:07:18", "remaining_time": "0:18:04"}
145
+ {"current_steps": 1450, "total_steps": 5000, "loss": 0.4862, "lr": 4.4700268840168045e-05, "epoch": 0.29, "percentage": 29.0, "elapsed_time": "0:07:21", "remaining_time": "0:18:01"}
146
+ {"current_steps": 1460, "total_steps": 5000, "loss": 0.7162, "lr": 4.4592336433146e-05, "epoch": 0.292, "percentage": 29.2, "elapsed_time": "0:07:24", "remaining_time": "0:17:58"}
147
+ {"current_steps": 1470, "total_steps": 5000, "loss": 1.6884, "lr": 4.448344912328686e-05, "epoch": 0.294, "percentage": 29.4, "elapsed_time": "0:07:27", "remaining_time": "0:17:54"}
148
+ {"current_steps": 1480, "total_steps": 5000, "loss": 0.9289, "lr": 4.4373612217604496e-05, "epoch": 0.296, "percentage": 29.6, "elapsed_time": "0:07:30", "remaining_time": "0:17:51"}
149
+ {"current_steps": 1490, "total_steps": 5000, "loss": 1.546, "lr": 4.426283106939474e-05, "epoch": 0.298, "percentage": 29.8, "elapsed_time": "0:07:33", "remaining_time": "0:17:47"}
150
+ {"current_steps": 1500, "total_steps": 5000, "loss": 0.9777, "lr": 4.415111107797445e-05, "epoch": 0.3, "percentage": 30.0, "elapsed_time": "0:07:36", "remaining_time": "0:17:44"}
151
+ {"current_steps": 1510, "total_steps": 5000, "loss": 0.6637, "lr": 4.403845768841842e-05, "epoch": 0.302, "percentage": 30.2, "elapsed_time": "0:07:39", "remaining_time": "0:17:41"}
152
+ {"current_steps": 1520, "total_steps": 5000, "loss": 1.7305, "lr": 4.3924876391293915e-05, "epoch": 0.304, "percentage": 30.4, "elapsed_time": "0:07:42", "remaining_time": "0:17:38"}
153
+ {"current_steps": 1530, "total_steps": 5000, "loss": 0.5603, "lr": 4.381037272239311e-05, "epoch": 0.306, "percentage": 30.6, "elapsed_time": "0:07:44", "remaining_time": "0:17:34"}
154
+ {"current_steps": 1540, "total_steps": 5000, "loss": 3.2643, "lr": 4.36949522624633e-05, "epoch": 0.308, "percentage": 30.8, "elapsed_time": "0:07:47", "remaining_time": "0:17:31"}
155
+ {"current_steps": 1550, "total_steps": 5000, "loss": 3.7458, "lr": 4.357862063693486e-05, "epoch": 0.31, "percentage": 31.0, "elapsed_time": "0:07:50", "remaining_time": "0:17:27"}
156
+ {"current_steps": 1560, "total_steps": 5000, "loss": 1.9472, "lr": 4.3461383515647106e-05, "epoch": 0.312, "percentage": 31.2, "elapsed_time": "0:07:53", "remaining_time": "0:17:23"}
157
+ {"current_steps": 1570, "total_steps": 5000, "loss": 1.0152, "lr": 4.334324661257191e-05, "epoch": 0.314, "percentage": 31.4, "elapsed_time": "0:07:56", "remaining_time": "0:17:20"}
158
+ {"current_steps": 1580, "total_steps": 5000, "loss": 0.9588, "lr": 4.3224215685535294e-05, "epoch": 0.316, "percentage": 31.6, "elapsed_time": "0:07:59", "remaining_time": "0:17:17"}
159
+ {"current_steps": 1590, "total_steps": 5000, "loss": 1.1131, "lr": 4.3104296535936695e-05, "epoch": 0.318, "percentage": 31.8, "elapsed_time": "0:08:01", "remaining_time": "0:17:13"}
160
+ {"current_steps": 1600, "total_steps": 5000, "loss": 2.4659, "lr": 4.2983495008466276e-05, "epoch": 0.32, "percentage": 32.0, "elapsed_time": "0:08:04", "remaining_time": "0:17:09"}
161
+ {"current_steps": 1610, "total_steps": 5000, "loss": 0.8816, "lr": 4.2861816990820084e-05, "epoch": 0.322, "percentage": 32.2, "elapsed_time": "0:08:07", "remaining_time": "0:17:06"}
162
+ {"current_steps": 1620, "total_steps": 5000, "loss": 0.519, "lr": 4.273926841341302e-05, "epoch": 0.324, "percentage": 32.4, "elapsed_time": "0:08:10", "remaining_time": "0:17:03"}
163
+ {"current_steps": 1630, "total_steps": 5000, "loss": 2.3195, "lr": 4.261585524908987e-05, "epoch": 0.326, "percentage": 32.6, "elapsed_time": "0:08:13", "remaining_time": "0:16:59"}
164
+ {"current_steps": 1640, "total_steps": 5000, "loss": 3.6418, "lr": 4.249158351283414e-05, "epoch": 0.328, "percentage": 32.8, "elapsed_time": "0:08:16", "remaining_time": "0:16:56"}
165
+ {"current_steps": 1650, "total_steps": 5000, "loss": 2.1609, "lr": 4.2366459261474933e-05, "epoch": 0.33, "percentage": 33.0, "elapsed_time": "0:08:19", "remaining_time": "0:16:53"}
166
+ {"current_steps": 1660, "total_steps": 5000, "loss": 1.4675, "lr": 4.224048859339175e-05, "epoch": 0.332, "percentage": 33.2, "elapsed_time": "0:08:21", "remaining_time": "0:16:49"}
167
+ {"current_steps": 1670, "total_steps": 5000, "loss": 1.3251, "lr": 4.211367764821722e-05, "epoch": 0.334, "percentage": 33.4, "elapsed_time": "0:08:24", "remaining_time": "0:16:46"}
168
+ {"current_steps": 1680, "total_steps": 5000, "loss": 1.0805, "lr": 4.198603260653792e-05, "epoch": 0.336, "percentage": 33.6, "elapsed_time": "0:08:27", "remaining_time": "0:16:43"}
169
+ {"current_steps": 1690, "total_steps": 5000, "loss": 0.5501, "lr": 4.185755968959308e-05, "epoch": 0.338, "percentage": 33.8, "elapsed_time": "0:08:30", "remaining_time": "0:16:39"}
170
+ {"current_steps": 1700, "total_steps": 5000, "loss": 0.7312, "lr": 4.172826515897146e-05, "epoch": 0.34, "percentage": 34.0, "elapsed_time": "0:08:33", "remaining_time": "0:16:36"}
171
+ {"current_steps": 1710, "total_steps": 5000, "loss": 0.5805, "lr": 4.1598155316306044e-05, "epoch": 0.342, "percentage": 34.2, "elapsed_time": "0:08:36", "remaining_time": "0:16:33"}
172
+ {"current_steps": 1720, "total_steps": 5000, "loss": 1.9512, "lr": 4.146723650296701e-05, "epoch": 0.344, "percentage": 34.4, "elapsed_time": "0:08:39", "remaining_time": "0:16:30"}
173
+ {"current_steps": 1730, "total_steps": 5000, "loss": 1.5697, "lr": 4.133551509975264e-05, "epoch": 0.346, "percentage": 34.6, "elapsed_time": "0:08:42", "remaining_time": "0:16:27"}
174
+ {"current_steps": 1740, "total_steps": 5000, "loss": 0.8957, "lr": 4.1202997526578276e-05, "epoch": 0.348, "percentage": 34.8, "elapsed_time": "0:08:45", "remaining_time": "0:16:23"}
175
+ {"current_steps": 1750, "total_steps": 5000, "loss": 0.489, "lr": 4.1069690242163484e-05, "epoch": 0.35, "percentage": 35.0, "elapsed_time": "0:08:47", "remaining_time": "0:16:20"}
176
+ {"current_steps": 1760, "total_steps": 5000, "loss": 2.1314, "lr": 4.093559974371725e-05, "epoch": 0.352, "percentage": 35.2, "elapsed_time": "0:08:50", "remaining_time": "0:16:16"}
177
+ {"current_steps": 1770, "total_steps": 5000, "loss": 3.0047, "lr": 4.080073256662127e-05, "epoch": 0.354, "percentage": 35.4, "elapsed_time": "0:08:53", "remaining_time": "0:16:13"}
178
+ {"current_steps": 1780, "total_steps": 5000, "loss": 1.2667, "lr": 4.066509528411152e-05, "epoch": 0.356, "percentage": 35.6, "elapsed_time": "0:08:56", "remaining_time": "0:16:10"}
179
+ {"current_steps": 1790, "total_steps": 5000, "loss": 0.9325, "lr": 4.052869450695776e-05, "epoch": 0.358, "percentage": 35.8, "elapsed_time": "0:08:59", "remaining_time": "0:16:06"}
180
+ {"current_steps": 1800, "total_steps": 5000, "loss": 2.1198, "lr": 4.039153688314145e-05, "epoch": 0.36, "percentage": 36.0, "elapsed_time": "0:09:02", "remaining_time": "0:16:03"}
181
+ {"current_steps": 1810, "total_steps": 5000, "loss": 0.809, "lr": 4.02536290975317e-05, "epoch": 0.362, "percentage": 36.2, "elapsed_time": "0:09:04", "remaining_time": "0:16:00"}
182
+ {"current_steps": 1820, "total_steps": 5000, "loss": 0.9603, "lr": 4.011497787155938e-05, "epoch": 0.364, "percentage": 36.4, "elapsed_time": "0:09:07", "remaining_time": "0:15:56"}
183
+ {"current_steps": 1830, "total_steps": 5000, "loss": 2.463, "lr": 3.997558996288965e-05, "epoch": 0.366, "percentage": 36.6, "elapsed_time": "0:09:10", "remaining_time": "0:15:53"}
184
+ {"current_steps": 1840, "total_steps": 5000, "loss": 0.6611, "lr": 3.983547216509254e-05, "epoch": 0.368, "percentage": 36.8, "elapsed_time": "0:09:13", "remaining_time": "0:15:50"}
185
+ {"current_steps": 1850, "total_steps": 5000, "loss": 0.8496, "lr": 3.969463130731183e-05, "epoch": 0.37, "percentage": 37.0, "elapsed_time": "0:09:15", "remaining_time": "0:15:46"}
186
+ {"current_steps": 1860, "total_steps": 5000, "loss": 2.0151, "lr": 3.955307425393224e-05, "epoch": 0.372, "percentage": 37.2, "elapsed_time": "0:09:18", "remaining_time": "0:15:43"}
187
+ {"current_steps": 1870, "total_steps": 5000, "loss": 1.1714, "lr": 3.941080790424484e-05, "epoch": 0.374, "percentage": 37.4, "elapsed_time": "0:09:21", "remaining_time": "0:15:40"}
188
+ {"current_steps": 1880, "total_steps": 5000, "loss": 2.1127, "lr": 3.92678391921108e-05, "epoch": 0.376, "percentage": 37.6, "elapsed_time": "0:09:24", "remaining_time": "0:15:37"}
189
+ {"current_steps": 1890, "total_steps": 5000, "loss": 3.0312, "lr": 3.912417508562345e-05, "epoch": 0.378, "percentage": 37.8, "elapsed_time": "0:09:27", "remaining_time": "0:15:34"}
190
+ {"current_steps": 1900, "total_steps": 5000, "loss": 1.052, "lr": 3.897982258676867e-05, "epoch": 0.38, "percentage": 38.0, "elapsed_time": "0:09:30", "remaining_time": "0:15:30"}
191
+ {"current_steps": 1910, "total_steps": 5000, "loss": 1.2737, "lr": 3.883478873108361e-05, "epoch": 0.382, "percentage": 38.2, "elapsed_time": "0:09:33", "remaining_time": "0:15:27"}
192
+ {"current_steps": 1920, "total_steps": 5000, "loss": 0.487, "lr": 3.868908058731376e-05, "epoch": 0.384, "percentage": 38.4, "elapsed_time": "0:09:36", "remaining_time": "0:15:24"}
193
+ {"current_steps": 1930, "total_steps": 5000, "loss": 1.1498, "lr": 3.85427052570685e-05, "epoch": 0.386, "percentage": 38.6, "elapsed_time": "0:09:39", "remaining_time": "0:15:21"}
194
+ {"current_steps": 1940, "total_steps": 5000, "loss": 1.3501, "lr": 3.8395669874474915e-05, "epoch": 0.388, "percentage": 38.8, "elapsed_time": "0:09:42", "remaining_time": "0:15:18"}
195
+ {"current_steps": 1950, "total_steps": 5000, "loss": 0.695, "lr": 3.824798160583012e-05, "epoch": 0.39, "percentage": 39.0, "elapsed_time": "0:09:45", "remaining_time": "0:15:15"}
196
+ {"current_steps": 1960, "total_steps": 5000, "loss": 1.1411, "lr": 3.8099647649251986e-05, "epoch": 0.392, "percentage": 39.2, "elapsed_time": "0:09:48", "remaining_time": "0:15:12"}
197
+ {"current_steps": 1970, "total_steps": 5000, "loss": 1.3583, "lr": 3.795067523432826e-05, "epoch": 0.394, "percentage": 39.4, "elapsed_time": "0:09:51", "remaining_time": "0:15:09"}
198
+ {"current_steps": 1980, "total_steps": 5000, "loss": 1.43, "lr": 3.780107162176429e-05, "epoch": 0.396, "percentage": 39.6, "elapsed_time": "0:09:54", "remaining_time": "0:15:06"}
199
+ {"current_steps": 1990, "total_steps": 5000, "loss": 2.6459, "lr": 3.765084410302909e-05, "epoch": 0.398, "percentage": 39.8, "elapsed_time": "0:09:56", "remaining_time": "0:15:02"}
200
+ {"current_steps": 2000, "total_steps": 5000, "loss": 2.1941, "lr": 3.7500000000000003e-05, "epoch": 0.4, "percentage": 40.0, "elapsed_time": "0:09:59", "remaining_time": "0:14:59"}
201
+ {"current_steps": 2010, "total_steps": 5000, "loss": 1.1788, "lr": 3.7348546664605777e-05, "epoch": 0.402, "percentage": 40.2, "elapsed_time": "0:10:02", "remaining_time": "0:14:56"}
202
+ {"current_steps": 2020, "total_steps": 5000, "loss": 0.6889, "lr": 3.719649147846832e-05, "epoch": 0.404, "percentage": 40.4, "elapsed_time": "0:10:05", "remaining_time": "0:14:53"}
203
+ {"current_steps": 2030, "total_steps": 5000, "loss": 1.0726, "lr": 3.704384185254288e-05, "epoch": 0.406, "percentage": 40.6, "elapsed_time": "0:10:08", "remaining_time": "0:14:50"}
204
+ {"current_steps": 2040, "total_steps": 5000, "loss": 1.1677, "lr": 3.689060522675689e-05, "epoch": 0.408, "percentage": 40.8, "elapsed_time": "0:10:11", "remaining_time": "0:14:47"}
205
+ {"current_steps": 2050, "total_steps": 5000, "loss": 1.1147, "lr": 3.673678906964727e-05, "epoch": 0.41, "percentage": 41.0, "elapsed_time": "0:10:14", "remaining_time": "0:14:44"}
206
+ {"current_steps": 2060, "total_steps": 5000, "loss": 0.9126, "lr": 3.6582400877996546e-05, "epoch": 0.412, "percentage": 41.2, "elapsed_time": "0:10:17", "remaining_time": "0:14:41"}
207
+ {"current_steps": 2070, "total_steps": 5000, "loss": 2.0398, "lr": 3.642744817646736e-05, "epoch": 0.414, "percentage": 41.4, "elapsed_time": "0:10:20", "remaining_time": "0:14:38"}
208
+ {"current_steps": 2080, "total_steps": 5000, "loss": 1.3157, "lr": 3.627193851723577e-05, "epoch": 0.416, "percentage": 41.6, "elapsed_time": "0:10:23", "remaining_time": "0:14:35"}
209
+ {"current_steps": 2090, "total_steps": 5000, "loss": 0.8404, "lr": 3.611587947962319e-05, "epoch": 0.418, "percentage": 41.8, "elapsed_time": "0:10:26", "remaining_time": "0:14:31"}
210
+ {"current_steps": 2100, "total_steps": 5000, "loss": 1.571, "lr": 3.5959278669726935e-05, "epoch": 0.42, "percentage": 42.0, "elapsed_time": "0:10:28", "remaining_time": "0:14:28"}
211
+ {"current_steps": 2110, "total_steps": 5000, "loss": 1.7026, "lr": 3.580214372004956e-05, "epoch": 0.422, "percentage": 42.2, "elapsed_time": "0:10:31", "remaining_time": "0:14:25"}
212
+ {"current_steps": 2120, "total_steps": 5000, "loss": 0.7502, "lr": 3.564448228912682e-05, "epoch": 0.424, "percentage": 42.4, "elapsed_time": "0:10:34", "remaining_time": "0:14:22"}
213
+ {"current_steps": 2130, "total_steps": 5000, "loss": 0.7163, "lr": 3.548630206115443e-05, "epoch": 0.426, "percentage": 42.6, "elapsed_time": "0:10:37", "remaining_time": "0:14:18"}
214
+ {"current_steps": 2140, "total_steps": 5000, "loss": 1.2891, "lr": 3.532761074561355e-05, "epoch": 0.428, "percentage": 42.8, "elapsed_time": "0:10:40", "remaining_time": "0:14:15"}
215
+ {"current_steps": 2150, "total_steps": 5000, "loss": 2.0099, "lr": 3.516841607689501e-05, "epoch": 0.43, "percentage": 43.0, "elapsed_time": "0:10:42", "remaining_time": "0:14:11"}
216
+ {"current_steps": 2160, "total_steps": 5000, "loss": 2.6718, "lr": 3.5008725813922386e-05, "epoch": 0.432, "percentage": 43.2, "elapsed_time": "0:10:45", "remaining_time": "0:14:08"}
217
+ {"current_steps": 2170, "total_steps": 5000, "loss": 3.9758, "lr": 3.484854773977378e-05, "epoch": 0.434, "percentage": 43.4, "elapsed_time": "0:10:48", "remaining_time": "0:14:05"}
218
+ {"current_steps": 2180, "total_steps": 5000, "loss": 4.9642, "lr": 3.4687889661302576e-05, "epoch": 0.436, "percentage": 43.6, "elapsed_time": "0:10:51", "remaining_time": "0:14:02"}
219
+ {"current_steps": 2190, "total_steps": 5000, "loss": 1.3115, "lr": 3.452675940875686e-05, "epoch": 0.438, "percentage": 43.8, "elapsed_time": "0:10:54", "remaining_time": "0:13:59"}
220
+ {"current_steps": 2200, "total_steps": 5000, "loss": 2.1822, "lr": 3.436516483539781e-05, "epoch": 0.44, "percentage": 44.0, "elapsed_time": "0:10:56", "remaining_time": "0:13:55"}
221
+ {"current_steps": 2210, "total_steps": 5000, "loss": 0.6386, "lr": 3.4203113817116957e-05, "epoch": 0.442, "percentage": 44.2, "elapsed_time": "0:10:59", "remaining_time": "0:13:52"}
222
+ {"current_steps": 2220, "total_steps": 5000, "loss": 0.6365, "lr": 3.4040614252052305e-05, "epoch": 0.444, "percentage": 44.4, "elapsed_time": "0:11:02", "remaining_time": "0:13:49"}
223
+ {"current_steps": 2230, "total_steps": 5000, "loss": 0.897, "lr": 3.387767406020343e-05, "epoch": 0.446, "percentage": 44.6, "elapsed_time": "0:11:05", "remaining_time": "0:13:46"}
224
+ {"current_steps": 2240, "total_steps": 5000, "loss": 1.4772, "lr": 3.3714301183045385e-05, "epoch": 0.448, "percentage": 44.8, "elapsed_time": "0:11:08", "remaining_time": "0:13:43"}
225
+ {"current_steps": 2250, "total_steps": 5000, "loss": 0.6316, "lr": 3.355050358314172e-05, "epoch": 0.45, "percentage": 45.0, "elapsed_time": "0:11:11", "remaining_time": "0:13:40"}
226
+ {"current_steps": 2260, "total_steps": 5000, "loss": 0.5972, "lr": 3.338628924375638e-05, "epoch": 0.452, "percentage": 45.2, "elapsed_time": "0:11:14", "remaining_time": "0:13:37"}
227
+ {"current_steps": 2270, "total_steps": 5000, "loss": 0.7165, "lr": 3.322166616846458e-05, "epoch": 0.454, "percentage": 45.4, "elapsed_time": "0:11:16", "remaining_time": "0:13:34"}
228
+ {"current_steps": 2280, "total_steps": 5000, "loss": 1.8711, "lr": 3.305664238076278e-05, "epoch": 0.456, "percentage": 45.6, "elapsed_time": "0:11:19", "remaining_time": "0:13:31"}
229
+ {"current_steps": 2290, "total_steps": 5000, "loss": 0.9576, "lr": 3.289122592367757e-05, "epoch": 0.458, "percentage": 45.8, "elapsed_time": "0:11:23", "remaining_time": "0:13:28"}
230
+ {"current_steps": 2300, "total_steps": 5000, "loss": 1.4457, "lr": 3.272542485937369e-05, "epoch": 0.46, "percentage": 46.0, "elapsed_time": "0:11:25", "remaining_time": "0:13:25"}
231
+ {"current_steps": 2310, "total_steps": 5000, "loss": 0.9553, "lr": 3.2559247268761115e-05, "epoch": 0.462, "percentage": 46.2, "elapsed_time": "0:11:29", "remaining_time": "0:13:22"}
232
+ {"current_steps": 2320, "total_steps": 5000, "loss": 1.3703, "lr": 3.239270125110117e-05, "epoch": 0.464, "percentage": 46.4, "elapsed_time": "0:11:31", "remaining_time": "0:13:19"}
233
+ {"current_steps": 2330, "total_steps": 5000, "loss": 1.9259, "lr": 3.222579492361179e-05, "epoch": 0.466, "percentage": 46.6, "elapsed_time": "0:11:34", "remaining_time": "0:13:16"}
234
+ {"current_steps": 2340, "total_steps": 5000, "loss": 0.9021, "lr": 3.205853642107192e-05, "epoch": 0.468, "percentage": 46.8, "elapsed_time": "0:11:37", "remaining_time": "0:13:13"}
235
+ {"current_steps": 2350, "total_steps": 5000, "loss": 2.2195, "lr": 3.1890933895424976e-05, "epoch": 0.47, "percentage": 47.0, "elapsed_time": "0:11:40", "remaining_time": "0:13:10"}
236
+ {"current_steps": 2360, "total_steps": 5000, "loss": 0.724, "lr": 3.172299551538164e-05, "epoch": 0.472, "percentage": 47.2, "elapsed_time": "0:11:43", "remaining_time": "0:13:07"}
237
+ {"current_steps": 2370, "total_steps": 5000, "loss": 2.0286, "lr": 3.155472946602162e-05, "epoch": 0.474, "percentage": 47.4, "elapsed_time": "0:11:46", "remaining_time": "0:13:04"}
238
+ {"current_steps": 2380, "total_steps": 5000, "loss": 0.9387, "lr": 3.138614394839476e-05, "epoch": 0.476, "percentage": 47.6, "elapsed_time": "0:11:49", "remaining_time": "0:13:01"}
239
+ {"current_steps": 2390, "total_steps": 5000, "loss": 0.8844, "lr": 3.121724717912138e-05, "epoch": 0.478, "percentage": 47.8, "elapsed_time": "0:11:52", "remaining_time": "0:12:58"}
240
+ {"current_steps": 2400, "total_steps": 5000, "loss": 0.8334, "lr": 3.104804738999169e-05, "epoch": 0.48, "percentage": 48.0, "elapsed_time": "0:11:55", "remaining_time": "0:12:55"}
241
+ {"current_steps": 2410, "total_steps": 5000, "loss": 0.7062, "lr": 3.087855282756475e-05, "epoch": 0.482, "percentage": 48.2, "elapsed_time": "0:11:58", "remaining_time": "0:12:52"}
242
+ {"current_steps": 2420, "total_steps": 5000, "loss": 1.1329, "lr": 3.0708771752766394e-05, "epoch": 0.484, "percentage": 48.4, "elapsed_time": "0:12:01", "remaining_time": "0:12:49"}
243
+ {"current_steps": 2430, "total_steps": 5000, "loss": 3.1218, "lr": 3.053871244048669e-05, "epoch": 0.486, "percentage": 48.6, "elapsed_time": "0:12:04", "remaining_time": "0:12:46"}
244
+ {"current_steps": 2440, "total_steps": 5000, "loss": 1.795, "lr": 3.0368383179176585e-05, "epoch": 0.488, "percentage": 48.8, "elapsed_time": "0:12:07", "remaining_time": "0:12:43"}
245
+ {"current_steps": 2450, "total_steps": 5000, "loss": 1.3635, "lr": 3.0197792270443982e-05, "epoch": 0.49, "percentage": 49.0, "elapsed_time": "0:12:10", "remaining_time": "0:12:39"}
246
+ {"current_steps": 2460, "total_steps": 5000, "loss": 1.5214, "lr": 3.002694802864912e-05, "epoch": 0.492, "percentage": 49.2, "elapsed_time": "0:12:12", "remaining_time": "0:12:36"}
247
+ {"current_steps": 2470, "total_steps": 5000, "loss": 1.1378, "lr": 2.98558587804993e-05, "epoch": 0.494, "percentage": 49.4, "elapsed_time": "0:12:15", "remaining_time": "0:12:33"}
248
+ {"current_steps": 2480, "total_steps": 5000, "loss": 1.3969, "lr": 2.9684532864643122e-05, "epoch": 0.496, "percentage": 49.6, "elapsed_time": "0:12:18", "remaining_time": "0:12:30"}
249
+ {"current_steps": 2490, "total_steps": 5000, "loss": 0.7386, "lr": 2.9512978631264006e-05, "epoch": 0.498, "percentage": 49.8, "elapsed_time": "0:12:21", "remaining_time": "0:12:27"}
250
+ {"current_steps": 2500, "total_steps": 5000, "loss": 1.2025, "lr": 2.9341204441673266e-05, "epoch": 0.5, "percentage": 50.0, "elapsed_time": "0:12:24", "remaining_time": "0:12:24"}
251
+ {"current_steps": 2510, "total_steps": 5000, "loss": 0.8712, "lr": 2.916921866790256e-05, "epoch": 0.502, "percentage": 50.2, "elapsed_time": "0:12:27", "remaining_time": "0:12:21"}
252
+ {"current_steps": 2520, "total_steps": 5000, "loss": 2.7228, "lr": 2.8997029692295874e-05, "epoch": 0.504, "percentage": 50.4, "elapsed_time": "0:12:30", "remaining_time": "0:12:18"}
253
+ {"current_steps": 2530, "total_steps": 5000, "loss": 1.6352, "lr": 2.8824645907100954e-05, "epoch": 0.506, "percentage": 50.6, "elapsed_time": "0:12:33", "remaining_time": "0:12:15"}
254
+ {"current_steps": 2540, "total_steps": 5000, "loss": 0.7708, "lr": 2.8652075714060295e-05, "epoch": 0.508, "percentage": 50.8, "elapsed_time": "0:12:36", "remaining_time": "0:12:12"}
255
+ {"current_steps": 2550, "total_steps": 5000, "loss": 1.7158, "lr": 2.8479327524001636e-05, "epoch": 0.51, "percentage": 51.0, "elapsed_time": "0:12:38", "remaining_time": "0:12:09"}
256
+ {"current_steps": 2560, "total_steps": 5000, "loss": 1.9952, "lr": 2.8306409756428064e-05, "epoch": 0.512, "percentage": 51.2, "elapsed_time": "0:12:41", "remaining_time": "0:12:05"}
257
+ {"current_steps": 2570, "total_steps": 5000, "loss": 1.7763, "lr": 2.8133330839107608e-05, "epoch": 0.514, "percentage": 51.4, "elapsed_time": "0:12:44", "remaining_time": "0:12:02"}
258
+ {"current_steps": 2580, "total_steps": 5000, "loss": 1.1427, "lr": 2.7960099207662532e-05, "epoch": 0.516, "percentage": 51.6, "elapsed_time": "0:12:47", "remaining_time": "0:11:59"}
259
+ {"current_steps": 2590, "total_steps": 5000, "loss": 1.3634, "lr": 2.7786723305158136e-05, "epoch": 0.518, "percentage": 51.8, "elapsed_time": "0:12:50", "remaining_time": "0:11:57"}
260
+ {"current_steps": 2600, "total_steps": 5000, "loss": 1.1372, "lr": 2.761321158169134e-05, "epoch": 0.52, "percentage": 52.0, "elapsed_time": "0:12:53", "remaining_time": "0:11:53"}
261
+ {"current_steps": 2610, "total_steps": 5000, "loss": 0.6111, "lr": 2.7439572493978736e-05, "epoch": 0.522, "percentage": 52.2, "elapsed_time": "0:12:56", "remaining_time": "0:11:50"}
262
+ {"current_steps": 2620, "total_steps": 5000, "loss": 1.1249, "lr": 2.726581450494451e-05, "epoch": 0.524, "percentage": 52.4, "elapsed_time": "0:12:59", "remaining_time": "0:11:47"}
263
+ {"current_steps": 2630, "total_steps": 5000, "loss": 4.3159, "lr": 2.7091946083307896e-05, "epoch": 0.526, "percentage": 52.6, "elapsed_time": "0:13:02", "remaining_time": "0:11:44"}
264
+ {"current_steps": 2640, "total_steps": 5000, "loss": 0.9282, "lr": 2.6917975703170466e-05, "epoch": 0.528, "percentage": 52.8, "elapsed_time": "0:13:04", "remaining_time": "0:11:41"}
265
+ {"current_steps": 2650, "total_steps": 5000, "loss": 1.3395, "lr": 2.674391184360313e-05, "epoch": 0.53, "percentage": 53.0, "elapsed_time": "0:13:07", "remaining_time": "0:11:38"}
266
+ {"current_steps": 2660, "total_steps": 5000, "loss": 0.7275, "lr": 2.656976298823284e-05, "epoch": 0.532, "percentage": 53.2, "elapsed_time": "0:13:10", "remaining_time": "0:11:35"}
267
+ {"current_steps": 2670, "total_steps": 5000, "loss": 5.1828, "lr": 2.6395537624829096e-05, "epoch": 0.534, "percentage": 53.4, "elapsed_time": "0:13:13", "remaining_time": "0:11:32"}
268
+ {"current_steps": 2680, "total_steps": 5000, "loss": 1.1841, "lr": 2.6221244244890336e-05, "epoch": 0.536, "percentage": 53.6, "elapsed_time": "0:13:15", "remaining_time": "0:11:28"}
269
+ {"current_steps": 2690, "total_steps": 5000, "loss": 2.1432, "lr": 2.604689134322999e-05, "epoch": 0.538, "percentage": 53.8, "elapsed_time": "0:13:18", "remaining_time": "0:11:26"}
270
+ {"current_steps": 2700, "total_steps": 5000, "loss": 1.1397, "lr": 2.587248741756253e-05, "epoch": 0.54, "percentage": 54.0, "elapsed_time": "0:13:21", "remaining_time": "0:11:23"}
271
+ {"current_steps": 2710, "total_steps": 5000, "loss": 1.8865, "lr": 2.5698040968089225e-05, "epoch": 0.542, "percentage": 54.2, "elapsed_time": "0:13:24", "remaining_time": "0:11:20"}
272
+ {"current_steps": 2720, "total_steps": 5000, "loss": 1.7895, "lr": 2.5523560497083926e-05, "epoch": 0.544, "percentage": 54.4, "elapsed_time": "0:13:28", "remaining_time": "0:11:17"}
273
+ {"current_steps": 2730, "total_steps": 5000, "loss": 0.5246, "lr": 2.5349054508478637e-05, "epoch": 0.546, "percentage": 54.6, "elapsed_time": "0:13:30", "remaining_time": "0:11:13"}
274
+ {"current_steps": 2740, "total_steps": 5000, "loss": 2.2288, "lr": 2.517453150744904e-05, "epoch": 0.548, "percentage": 54.8, "elapsed_time": "0:13:33", "remaining_time": "0:11:10"}
275
+ {"current_steps": 2750, "total_steps": 5000, "loss": 1.1424, "lr": 2.5e-05, "epoch": 0.55, "percentage": 55.0, "elapsed_time": "0:13:36", "remaining_time": "0:11:07"}
276
+ {"current_steps": 2760, "total_steps": 5000, "loss": 0.5168, "lr": 2.4825468492550964e-05, "epoch": 0.552, "percentage": 55.2, "elapsed_time": "0:13:38", "remaining_time": "0:11:04"}
277
+ {"current_steps": 2770, "total_steps": 5000, "loss": 1.6828, "lr": 2.4650945491521372e-05, "epoch": 0.554, "percentage": 55.4, "elapsed_time": "0:13:41", "remaining_time": "0:11:01"}
278
+ {"current_steps": 2780, "total_steps": 5000, "loss": 1.3219, "lr": 2.447643950291608e-05, "epoch": 0.556, "percentage": 55.6, "elapsed_time": "0:13:44", "remaining_time": "0:10:58"}
279
+ {"current_steps": 2790, "total_steps": 5000, "loss": 1.511, "lr": 2.4301959031910784e-05, "epoch": 0.558, "percentage": 55.8, "elapsed_time": "0:13:47", "remaining_time": "0:10:55"}
280
+ {"current_steps": 2800, "total_steps": 5000, "loss": 1.5983, "lr": 2.4127512582437485e-05, "epoch": 0.56, "percentage": 56.0, "elapsed_time": "0:13:50", "remaining_time": "0:10:52"}
281
+ {"current_steps": 2810, "total_steps": 5000, "loss": 0.7233, "lr": 2.3953108656770016e-05, "epoch": 0.562, "percentage": 56.2, "elapsed_time": "0:13:53", "remaining_time": "0:10:49"}
282
+ {"current_steps": 2820, "total_steps": 5000, "loss": 1.2128, "lr": 2.377875575510967e-05, "epoch": 0.564, "percentage": 56.4, "elapsed_time": "0:13:56", "remaining_time": "0:10:46"}
283
+ {"current_steps": 2830, "total_steps": 5000, "loss": 1.3444, "lr": 2.3604462375170906e-05, "epoch": 0.566, "percentage": 56.6, "elapsed_time": "0:13:59", "remaining_time": "0:10:43"}
284
+ {"current_steps": 2840, "total_steps": 5000, "loss": 0.6756, "lr": 2.3430237011767167e-05, "epoch": 0.568, "percentage": 56.8, "elapsed_time": "0:14:02", "remaining_time": "0:10:41"}
285
+ {"current_steps": 2850, "total_steps": 5000, "loss": 2.1279, "lr": 2.3256088156396868e-05, "epoch": 0.57, "percentage": 57.0, "elapsed_time": "0:14:05", "remaining_time": "0:10:37"}
286
+ {"current_steps": 2860, "total_steps": 5000, "loss": 1.2081, "lr": 2.3082024296829536e-05, "epoch": 0.572, "percentage": 57.2, "elapsed_time": "0:14:08", "remaining_time": "0:10:34"}
287
+ {"current_steps": 2870, "total_steps": 5000, "loss": 1.6913, "lr": 2.2908053916692117e-05, "epoch": 0.574, "percentage": 57.4, "elapsed_time": "0:14:11", "remaining_time": "0:10:32"}
288
+ {"current_steps": 2880, "total_steps": 5000, "loss": 0.7114, "lr": 2.2734185495055503e-05, "epoch": 0.576, "percentage": 57.6, "elapsed_time": "0:14:14", "remaining_time": "0:10:29"}
289
+ {"current_steps": 2890, "total_steps": 5000, "loss": 1.5252, "lr": 2.2560427506021266e-05, "epoch": 0.578, "percentage": 57.8, "elapsed_time": "0:14:17", "remaining_time": "0:10:26"}
290
+ {"current_steps": 2900, "total_steps": 5000, "loss": 1.2513, "lr": 2.238678841830867e-05, "epoch": 0.58, "percentage": 58.0, "elapsed_time": "0:14:20", "remaining_time": "0:10:23"}
291
+ {"current_steps": 2910, "total_steps": 5000, "loss": 1.5359, "lr": 2.2213276694841866e-05, "epoch": 0.582, "percentage": 58.2, "elapsed_time": "0:14:23", "remaining_time": "0:10:20"}
292
+ {"current_steps": 2920, "total_steps": 5000, "loss": 1.3664, "lr": 2.2039900792337474e-05, "epoch": 0.584, "percentage": 58.4, "elapsed_time": "0:14:26", "remaining_time": "0:10:17"}
293
+ {"current_steps": 2930, "total_steps": 5000, "loss": 1.102, "lr": 2.186666916089239e-05, "epoch": 0.586, "percentage": 58.6, "elapsed_time": "0:14:29", "remaining_time": "0:10:14"}
294
+ {"current_steps": 2940, "total_steps": 5000, "loss": 1.5992, "lr": 2.1693590243571938e-05, "epoch": 0.588, "percentage": 58.8, "elapsed_time": "0:14:32", "remaining_time": "0:10:11"}
295
+ {"current_steps": 2950, "total_steps": 5000, "loss": 0.8194, "lr": 2.1520672475998373e-05, "epoch": 0.59, "percentage": 59.0, "elapsed_time": "0:14:35", "remaining_time": "0:10:08"}
296
+ {"current_steps": 2960, "total_steps": 5000, "loss": 0.8753, "lr": 2.1347924285939714e-05, "epoch": 0.592, "percentage": 59.2, "elapsed_time": "0:14:38", "remaining_time": "0:10:05"}
297
+ {"current_steps": 2970, "total_steps": 5000, "loss": 1.8568, "lr": 2.117535409289905e-05, "epoch": 0.594, "percentage": 59.4, "elapsed_time": "0:14:41", "remaining_time": "0:10:02"}
298
+ {"current_steps": 2980, "total_steps": 5000, "loss": 2.0932, "lr": 2.1002970307704132e-05, "epoch": 0.596, "percentage": 59.6, "elapsed_time": "0:14:44", "remaining_time": "0:09:59"}
299
+ {"current_steps": 2990, "total_steps": 5000, "loss": 0.6447, "lr": 2.0830781332097446e-05, "epoch": 0.598, "percentage": 59.8, "elapsed_time": "0:14:47", "remaining_time": "0:09:56"}
300
+ {"current_steps": 3000, "total_steps": 5000, "loss": 0.8866, "lr": 2.0658795558326743e-05, "epoch": 0.6, "percentage": 60.0, "elapsed_time": "0:14:50", "remaining_time": "0:09:53"}
301
+ {"current_steps": 3010, "total_steps": 5000, "loss": 2.3415, "lr": 2.0487021368736003e-05, "epoch": 0.602, "percentage": 60.2, "elapsed_time": "0:14:52", "remaining_time": "0:09:50"}
302
+ {"current_steps": 3020, "total_steps": 5000, "loss": 1.244, "lr": 2.031546713535688e-05, "epoch": 0.604, "percentage": 60.4, "elapsed_time": "0:14:55", "remaining_time": "0:09:47"}
303
+ {"current_steps": 3030, "total_steps": 5000, "loss": 2.6138, "lr": 2.0144141219500705e-05, "epoch": 0.606, "percentage": 60.6, "elapsed_time": "0:14:58", "remaining_time": "0:09:44"}
304
+ {"current_steps": 3040, "total_steps": 5000, "loss": 0.8315, "lr": 1.9973051971350888e-05, "epoch": 0.608, "percentage": 60.8, "elapsed_time": "0:15:01", "remaining_time": "0:09:41"}
305
+ {"current_steps": 3050, "total_steps": 5000, "loss": 4.5689, "lr": 1.980220772955602e-05, "epoch": 0.61, "percentage": 61.0, "elapsed_time": "0:15:04", "remaining_time": "0:09:38"}
306
+ {"current_steps": 3060, "total_steps": 5000, "loss": 1.2894, "lr": 1.963161682082342e-05, "epoch": 0.612, "percentage": 61.2, "elapsed_time": "0:15:07", "remaining_time": "0:09:35"}
307
+ {"current_steps": 3070, "total_steps": 5000, "loss": 1.1867, "lr": 1.946128755951332e-05, "epoch": 0.614, "percentage": 61.4, "elapsed_time": "0:15:10", "remaining_time": "0:09:32"}
308
+ {"current_steps": 3080, "total_steps": 5000, "loss": 1.8681, "lr": 1.9291228247233605e-05, "epoch": 0.616, "percentage": 61.6, "elapsed_time": "0:15:13", "remaining_time": "0:09:29"}
309
+ {"current_steps": 3090, "total_steps": 5000, "loss": 1.1927, "lr": 1.912144717243525e-05, "epoch": 0.618, "percentage": 61.8, "elapsed_time": "0:15:16", "remaining_time": "0:09:26"}
310
+ {"current_steps": 3100, "total_steps": 5000, "loss": 1.6265, "lr": 1.895195261000831e-05, "epoch": 0.62, "percentage": 62.0, "elapsed_time": "0:15:19", "remaining_time": "0:09:23"}
311
+ {"current_steps": 3110, "total_steps": 5000, "loss": 1.3608, "lr": 1.8782752820878634e-05, "epoch": 0.622, "percentage": 62.2, "elapsed_time": "0:15:21", "remaining_time": "0:09:20"}
312
+ {"current_steps": 3120, "total_steps": 5000, "loss": 1.4472, "lr": 1.8613856051605243e-05, "epoch": 0.624, "percentage": 62.4, "elapsed_time": "0:15:24", "remaining_time": "0:09:17"}
313
+ {"current_steps": 3130, "total_steps": 5000, "loss": 0.6093, "lr": 1.8445270533978388e-05, "epoch": 0.626, "percentage": 62.6, "elapsed_time": "0:15:27", "remaining_time": "0:09:14"}
314
+ {"current_steps": 3140, "total_steps": 5000, "loss": 1.3848, "lr": 1.827700448461836e-05, "epoch": 0.628, "percentage": 62.8, "elapsed_time": "0:15:30", "remaining_time": "0:09:11"}
315
+ {"current_steps": 3150, "total_steps": 5000, "loss": 1.1096, "lr": 1.8109066104575023e-05, "epoch": 0.63, "percentage": 63.0, "elapsed_time": "0:15:33", "remaining_time": "0:09:08"}
316
+ {"current_steps": 3160, "total_steps": 5000, "loss": 0.8174, "lr": 1.7941463578928086e-05, "epoch": 0.632, "percentage": 63.2, "elapsed_time": "0:15:35", "remaining_time": "0:09:04"}
317
+ {"current_steps": 3170, "total_steps": 5000, "loss": 1.5061, "lr": 1.7774205076388206e-05, "epoch": 0.634, "percentage": 63.4, "elapsed_time": "0:15:38", "remaining_time": "0:09:01"}
318
+ {"current_steps": 3180, "total_steps": 5000, "loss": 0.7052, "lr": 1.7607298748898842e-05, "epoch": 0.636, "percentage": 63.6, "elapsed_time": "0:15:41", "remaining_time": "0:08:58"}
319
+ {"current_steps": 3190, "total_steps": 5000, "loss": 2.0225, "lr": 1.744075273123889e-05, "epoch": 0.638, "percentage": 63.8, "elapsed_time": "0:15:44", "remaining_time": "0:08:55"}
320
+ {"current_steps": 3200, "total_steps": 5000, "loss": 2.2687, "lr": 1.7274575140626318e-05, "epoch": 0.64, "percentage": 64.0, "elapsed_time": "0:15:47", "remaining_time": "0:08:52"}
321
+ {"current_steps": 3210, "total_steps": 5000, "loss": 0.6663, "lr": 1.7108774076322443e-05, "epoch": 0.642, "percentage": 64.2, "elapsed_time": "0:15:50", "remaining_time": "0:08:49"}
322
+ {"current_steps": 3220, "total_steps": 5000, "loss": 1.259, "lr": 1.6943357619237226e-05, "epoch": 0.644, "percentage": 64.4, "elapsed_time": "0:15:53", "remaining_time": "0:08:47"}
323
+ {"current_steps": 3230, "total_steps": 5000, "loss": 1.1647, "lr": 1.677833383153542e-05, "epoch": 0.646, "percentage": 64.6, "elapsed_time": "0:15:56", "remaining_time": "0:08:44"}
324
+ {"current_steps": 3240, "total_steps": 5000, "loss": 0.9512, "lr": 1.6613710756243626e-05, "epoch": 0.648, "percentage": 64.8, "elapsed_time": "0:15:59", "remaining_time": "0:08:40"}
325
+ {"current_steps": 3250, "total_steps": 5000, "loss": 3.6506, "lr": 1.6449496416858284e-05, "epoch": 0.65, "percentage": 65.0, "elapsed_time": "0:16:02", "remaining_time": "0:08:38"}
326
+ {"current_steps": 3260, "total_steps": 5000, "loss": 1.8059, "lr": 1.6285698816954624e-05, "epoch": 0.652, "percentage": 65.2, "elapsed_time": "0:16:05", "remaining_time": "0:08:35"}
327
+ {"current_steps": 3270, "total_steps": 5000, "loss": 1.8584, "lr": 1.612232593979658e-05, "epoch": 0.654, "percentage": 65.4, "elapsed_time": "0:16:07", "remaining_time": "0:08:32"}
328
+ {"current_steps": 3280, "total_steps": 5000, "loss": 0.8049, "lr": 1.5959385747947698e-05, "epoch": 0.656, "percentage": 65.6, "elapsed_time": "0:16:10", "remaining_time": "0:08:28"}
329
+ {"current_steps": 3290, "total_steps": 5000, "loss": 1.5368, "lr": 1.5796886182883053e-05, "epoch": 0.658, "percentage": 65.8, "elapsed_time": "0:16:13", "remaining_time": "0:08:26"}
330
+ {"current_steps": 3300, "total_steps": 5000, "loss": 1.5826, "lr": 1.56348351646022e-05, "epoch": 0.66, "percentage": 66.0, "elapsed_time": "0:16:16", "remaining_time": "0:08:22"}
331
+ {"current_steps": 3310, "total_steps": 5000, "loss": 0.8753, "lr": 1.547324059124315e-05, "epoch": 0.662, "percentage": 66.2, "elapsed_time": "0:16:19", "remaining_time": "0:08:20"}
332
+ {"current_steps": 3320, "total_steps": 5000, "loss": 1.7769, "lr": 1.5312110338697426e-05, "epoch": 0.664, "percentage": 66.4, "elapsed_time": "0:16:22", "remaining_time": "0:08:16"}
333
+ {"current_steps": 3330, "total_steps": 5000, "loss": 3.1034, "lr": 1.5151452260226224e-05, "epoch": 0.666, "percentage": 66.6, "elapsed_time": "0:16:24", "remaining_time": "0:08:13"}
334
+ {"current_steps": 3340, "total_steps": 5000, "loss": 2.3354, "lr": 1.4991274186077632e-05, "epoch": 0.668, "percentage": 66.8, "elapsed_time": "0:16:27", "remaining_time": "0:08:10"}
335
+ {"current_steps": 3350, "total_steps": 5000, "loss": 0.603, "lr": 1.4831583923104999e-05, "epoch": 0.67, "percentage": 67.0, "elapsed_time": "0:16:30", "remaining_time": "0:08:07"}
336
+ {"current_steps": 3360, "total_steps": 5000, "loss": 0.6025, "lr": 1.467238925438646e-05, "epoch": 0.672, "percentage": 67.2, "elapsed_time": "0:16:33", "remaining_time": "0:08:05"}
337
+ {"current_steps": 3370, "total_steps": 5000, "loss": 0.9105, "lr": 1.4513697938845572e-05, "epoch": 0.674, "percentage": 67.4, "elapsed_time": "0:16:36", "remaining_time": "0:08:01"}
338
+ {"current_steps": 3380, "total_steps": 5000, "loss": 0.8274, "lr": 1.4355517710873184e-05, "epoch": 0.676, "percentage": 67.6, "elapsed_time": "0:16:39", "remaining_time": "0:07:59"}
339
+ {"current_steps": 3390, "total_steps": 5000, "loss": 1.5935, "lr": 1.4197856279950438e-05, "epoch": 0.678, "percentage": 67.8, "elapsed_time": "0:16:42", "remaining_time": "0:07:56"}
340
+ {"current_steps": 3400, "total_steps": 5000, "loss": 0.7384, "lr": 1.4040721330273062e-05, "epoch": 0.68, "percentage": 68.0, "elapsed_time": "0:16:44", "remaining_time": "0:07:52"}
341
+ {"current_steps": 3410, "total_steps": 5000, "loss": 1.8132, "lr": 1.388412052037682e-05, "epoch": 0.682, "percentage": 68.2, "elapsed_time": "0:16:47", "remaining_time": "0:07:49"}
342
+ {"current_steps": 3420, "total_steps": 5000, "loss": 0.7159, "lr": 1.3728061482764238e-05, "epoch": 0.684, "percentage": 68.4, "elapsed_time": "0:16:50", "remaining_time": "0:07:46"}
343
+ {"current_steps": 3430, "total_steps": 5000, "loss": 0.8062, "lr": 1.3572551823532654e-05, "epoch": 0.686, "percentage": 68.6, "elapsed_time": "0:16:53", "remaining_time": "0:07:44"}
344
+ {"current_steps": 3440, "total_steps": 5000, "loss": 1.4223, "lr": 1.3417599122003464e-05, "epoch": 0.688, "percentage": 68.8, "elapsed_time": "0:16:56", "remaining_time": "0:07:41"}
345
+ {"current_steps": 3450, "total_steps": 5000, "loss": 0.8253, "lr": 1.3263210930352737e-05, "epoch": 0.69, "percentage": 69.0, "elapsed_time": "0:16:59", "remaining_time": "0:07:38"}
346
+ {"current_steps": 3460, "total_steps": 5000, "loss": 1.6049, "lr": 1.3109394773243117e-05, "epoch": 0.692, "percentage": 69.2, "elapsed_time": "0:17:02", "remaining_time": "0:07:35"}
347
+ {"current_steps": 3470, "total_steps": 5000, "loss": 0.6334, "lr": 1.2956158147457115e-05, "epoch": 0.694, "percentage": 69.4, "elapsed_time": "0:17:04", "remaining_time": "0:07:31"}
348
+ {"current_steps": 3480, "total_steps": 5000, "loss": 1.605, "lr": 1.280350852153168e-05, "epoch": 0.696, "percentage": 69.6, "elapsed_time": "0:17:07", "remaining_time": "0:07:29"}
349
+ {"current_steps": 3490, "total_steps": 5000, "loss": 1.3822, "lr": 1.2651453335394231e-05, "epoch": 0.698, "percentage": 69.8, "elapsed_time": "0:17:10", "remaining_time": "0:07:26"}
350
+ {"current_steps": 3500, "total_steps": 5000, "loss": 1.154, "lr": 1.2500000000000006e-05, "epoch": 0.7, "percentage": 70.0, "elapsed_time": "0:17:13", "remaining_time": "0:07:23"}
351
+ {"current_steps": 3510, "total_steps": 5000, "loss": 1.3894, "lr": 1.234915589697091e-05, "epoch": 0.702, "percentage": 70.2, "elapsed_time": "0:17:16", "remaining_time": "0:07:20"}
352
+ {"current_steps": 3520, "total_steps": 5000, "loss": 0.8913, "lr": 1.2198928378235716e-05, "epoch": 0.704, "percentage": 70.4, "elapsed_time": "0:17:19", "remaining_time": "0:07:17"}
353
+ {"current_steps": 3530, "total_steps": 5000, "loss": 1.4518, "lr": 1.2049324765671749e-05, "epoch": 0.706, "percentage": 70.6, "elapsed_time": "0:17:22", "remaining_time": "0:07:14"}
354
+ {"current_steps": 3540, "total_steps": 5000, "loss": 2.2489, "lr": 1.1900352350748026e-05, "epoch": 0.708, "percentage": 70.8, "elapsed_time": "0:17:25", "remaining_time": "0:07:11"}
355
+ {"current_steps": 3550, "total_steps": 5000, "loss": 0.8946, "lr": 1.175201839416988e-05, "epoch": 0.71, "percentage": 71.0, "elapsed_time": "0:17:28", "remaining_time": "0:07:08"}
356
+ {"current_steps": 3560, "total_steps": 5000, "loss": 0.5814, "lr": 1.1604330125525079e-05, "epoch": 0.712, "percentage": 71.2, "elapsed_time": "0:17:30", "remaining_time": "0:07:04"}
357
+ {"current_steps": 3570, "total_steps": 5000, "loss": 4.9924, "lr": 1.1457294742931507e-05, "epoch": 0.714, "percentage": 71.4, "elapsed_time": "0:17:33", "remaining_time": "0:07:02"}
358
+ {"current_steps": 3580, "total_steps": 5000, "loss": 0.9913, "lr": 1.1310919412686247e-05, "epoch": 0.716, "percentage": 71.6, "elapsed_time": "0:17:36", "remaining_time": "0:06:59"}
359
+ {"current_steps": 3590, "total_steps": 5000, "loss": 4.148, "lr": 1.11652112689164e-05, "epoch": 0.718, "percentage": 71.8, "elapsed_time": "0:17:39", "remaining_time": "0:06:56"}
360
+ {"current_steps": 3600, "total_steps": 5000, "loss": 1.184, "lr": 1.1020177413231334e-05, "epoch": 0.72, "percentage": 72.0, "elapsed_time": "0:17:42", "remaining_time": "0:06:53"}
361
+ {"current_steps": 3610, "total_steps": 5000, "loss": 0.8565, "lr": 1.0875824914376553e-05, "epoch": 0.722, "percentage": 72.2, "elapsed_time": "0:17:45", "remaining_time": "0:06:50"}
362
+ {"current_steps": 3620, "total_steps": 5000, "loss": 1.6703, "lr": 1.0732160807889211e-05, "epoch": 0.724, "percentage": 72.4, "elapsed_time": "0:17:47", "remaining_time": "0:06:47"}
363
+ {"current_steps": 3630, "total_steps": 5000, "loss": 1.2277, "lr": 1.058919209575517e-05, "epoch": 0.726, "percentage": 72.6, "elapsed_time": "0:17:51", "remaining_time": "0:06:44"}
364
+ {"current_steps": 3640, "total_steps": 5000, "loss": 0.7438, "lr": 1.0446925746067768e-05, "epoch": 0.728, "percentage": 72.8, "elapsed_time": "0:17:54", "remaining_time": "0:06:41"}
365
+ {"current_steps": 3650, "total_steps": 5000, "loss": 3.7347, "lr": 1.0305368692688174e-05, "epoch": 0.73, "percentage": 73.0, "elapsed_time": "0:17:57", "remaining_time": "0:06:38"}
366
+ {"current_steps": 3660, "total_steps": 5000, "loss": 1.8496, "lr": 1.0164527834907467e-05, "epoch": 0.732, "percentage": 73.2, "elapsed_time": "0:17:59", "remaining_time": "0:06:35"}
367
+ {"current_steps": 3670, "total_steps": 5000, "loss": 1.2388, "lr": 1.0024410037110357e-05, "epoch": 0.734, "percentage": 73.4, "elapsed_time": "0:18:02", "remaining_time": "0:06:32"}
368
+ {"current_steps": 3680, "total_steps": 5000, "loss": 5.6336, "lr": 9.88502212844063e-06, "epoch": 0.736, "percentage": 73.6, "elapsed_time": "0:18:05", "remaining_time": "0:06:29"}
369
+ {"current_steps": 3690, "total_steps": 5000, "loss": 1.434, "lr": 9.746370902468311e-06, "epoch": 0.738, "percentage": 73.8, "elapsed_time": "0:18:08", "remaining_time": "0:06:26"}
370
+ {"current_steps": 3700, "total_steps": 5000, "loss": 2.9898, "lr": 9.608463116858542e-06, "epoch": 0.74, "percentage": 74.0, "elapsed_time": "0:18:10", "remaining_time": "0:06:23"}
371
+ {"current_steps": 3710, "total_steps": 5000, "loss": 0.9298, "lr": 9.471305493042243e-06, "epoch": 0.742, "percentage": 74.2, "elapsed_time": "0:18:13", "remaining_time": "0:06:20"}
372
+ {"current_steps": 3720, "total_steps": 5000, "loss": 2.3826, "lr": 9.334904715888495e-06, "epoch": 0.744, "percentage": 74.4, "elapsed_time": "0:18:16", "remaining_time": "0:06:17"}
373
+ {"current_steps": 3730, "total_steps": 5000, "loss": 0.8495, "lr": 9.199267433378727e-06, "epoch": 0.746, "percentage": 74.6, "elapsed_time": "0:18:18", "remaining_time": "0:06:14"}
374
+ {"current_steps": 3740, "total_steps": 5000, "loss": 0.6518, "lr": 9.064400256282757e-06, "epoch": 0.748, "percentage": 74.8, "elapsed_time": "0:18:21", "remaining_time": "0:06:11"}
375
+ {"current_steps": 3750, "total_steps": 5000, "loss": 0.8714, "lr": 8.930309757836517e-06, "epoch": 0.75, "percentage": 75.0, "elapsed_time": "0:18:24", "remaining_time": "0:06:08"}
376
+ {"current_steps": 3760, "total_steps": 5000, "loss": 1.378, "lr": 8.797002473421728e-06, "epoch": 0.752, "percentage": 75.2, "elapsed_time": "0:18:27", "remaining_time": "0:06:05"}
377
+ {"current_steps": 3770, "total_steps": 5000, "loss": 1.1048, "lr": 8.664484900247363e-06, "epoch": 0.754, "percentage": 75.4, "elapsed_time": "0:18:30", "remaining_time": "0:06:02"}
378
+ {"current_steps": 3780, "total_steps": 5000, "loss": 0.9402, "lr": 8.532763497032987e-06, "epoch": 0.756, "percentage": 75.6, "elapsed_time": "0:18:33", "remaining_time": "0:05:59"}
379
+ {"current_steps": 3790, "total_steps": 5000, "loss": 1.3253, "lr": 8.40184468369396e-06, "epoch": 0.758, "percentage": 75.8, "elapsed_time": "0:18:36", "remaining_time": "0:05:56"}
380
+ {"current_steps": 3800, "total_steps": 5000, "loss": 1.0062, "lr": 8.271734841028553e-06, "epoch": 0.76, "percentage": 76.0, "elapsed_time": "0:18:39", "remaining_time": "0:05:53"}
381
+ {"current_steps": 3810, "total_steps": 5000, "loss": 0.9064, "lr": 8.142440310406924e-06, "epoch": 0.762, "percentage": 76.2, "elapsed_time": "0:18:42", "remaining_time": "0:05:50"}
382
+ {"current_steps": 3820, "total_steps": 5000, "loss": 3.918, "lr": 8.013967393462094e-06, "epoch": 0.764, "percentage": 76.4, "elapsed_time": "0:18:45", "remaining_time": "0:05:47"}
383
+ {"current_steps": 3830, "total_steps": 5000, "loss": 0.4079, "lr": 7.886322351782783e-06, "epoch": 0.766, "percentage": 76.6, "elapsed_time": "0:18:48", "remaining_time": "0:05:44"}
384
+ {"current_steps": 3840, "total_steps": 5000, "loss": 0.8509, "lr": 7.759511406608255e-06, "epoch": 0.768, "percentage": 76.8, "elapsed_time": "0:18:51", "remaining_time": "0:05:41"}
385
+ {"current_steps": 3850, "total_steps": 5000, "loss": 1.9055, "lr": 7.633540738525066e-06, "epoch": 0.77, "percentage": 77.0, "elapsed_time": "0:18:53", "remaining_time": "0:05:38"}
386
+ {"current_steps": 3860, "total_steps": 5000, "loss": 6.2419, "lr": 7.508416487165862e-06, "epoch": 0.772, "percentage": 77.2, "elapsed_time": "0:18:56", "remaining_time": "0:05:35"}
387
+ {"current_steps": 3870, "total_steps": 5000, "loss": 1.1085, "lr": 7.384144750910133e-06, "epoch": 0.774, "percentage": 77.4, "elapsed_time": "0:18:59", "remaining_time": "0:05:32"}
388
+ {"current_steps": 3880, "total_steps": 5000, "loss": 4.1298, "lr": 7.260731586586983e-06, "epoch": 0.776, "percentage": 77.6, "elapsed_time": "0:19:02", "remaining_time": "0:05:29"}
389
+ {"current_steps": 3890, "total_steps": 5000, "loss": 0.7264, "lr": 7.138183009179922e-06, "epoch": 0.778, "percentage": 77.8, "elapsed_time": "0:19:05", "remaining_time": "0:05:26"}
390
+ {"current_steps": 3900, "total_steps": 5000, "loss": 1.0637, "lr": 7.016504991533726e-06, "epoch": 0.78, "percentage": 78.0, "elapsed_time": "0:19:08", "remaining_time": "0:05:23"}
391
+ {"current_steps": 3910, "total_steps": 5000, "loss": 1.2957, "lr": 6.895703464063319e-06, "epoch": 0.782, "percentage": 78.2, "elapsed_time": "0:19:11", "remaining_time": "0:05:20"}
392
+ {"current_steps": 3920, "total_steps": 5000, "loss": 0.4924, "lr": 6.775784314464717e-06, "epoch": 0.784, "percentage": 78.4, "elapsed_time": "0:19:13", "remaining_time": "0:05:17"}
393
+ {"current_steps": 3930, "total_steps": 5000, "loss": 1.0074, "lr": 6.656753387428089e-06, "epoch": 0.786, "percentage": 78.6, "elapsed_time": "0:19:16", "remaining_time": "0:05:14"}
394
+ {"current_steps": 3940, "total_steps": 5000, "loss": 1.7825, "lr": 6.538616484352902e-06, "epoch": 0.788, "percentage": 78.8, "elapsed_time": "0:19:19", "remaining_time": "0:05:11"}
395
+ {"current_steps": 3950, "total_steps": 5000, "loss": 1.2334, "lr": 6.421379363065142e-06, "epoch": 0.79, "percentage": 79.0, "elapsed_time": "0:19:22", "remaining_time": "0:05:08"}
396
+ {"current_steps": 3960, "total_steps": 5000, "loss": 1.9249, "lr": 6.305047737536707e-06, "epoch": 0.792, "percentage": 79.2, "elapsed_time": "0:19:24", "remaining_time": "0:05:05"}
397
+ {"current_steps": 3970, "total_steps": 5000, "loss": 1.4351, "lr": 6.189627277606894e-06, "epoch": 0.794, "percentage": 79.4, "elapsed_time": "0:19:27", "remaining_time": "0:05:02"}
398
+ {"current_steps": 3980, "total_steps": 5000, "loss": 1.6044, "lr": 6.075123608706093e-06, "epoch": 0.796, "percentage": 79.6, "elapsed_time": "0:19:30", "remaining_time": "0:04:59"}
399
+ {"current_steps": 3990, "total_steps": 5000, "loss": 1.7348, "lr": 5.961542311581586e-06, "epoch": 0.798, "percentage": 79.8, "elapsed_time": "0:19:33", "remaining_time": "0:04:57"}
400
+ {"current_steps": 4000, "total_steps": 5000, "loss": 12.6939, "lr": 5.848888922025553e-06, "epoch": 0.8, "percentage": 80.0, "elapsed_time": "0:19:36", "remaining_time": "0:04:54"}
401
+ {"current_steps": 4010, "total_steps": 5000, "loss": 0.8231, "lr": 5.737168930605272e-06, "epoch": 0.802, "percentage": 80.2, "elapsed_time": "0:19:39", "remaining_time": "0:04:51"}
402
+ {"current_steps": 4020, "total_steps": 5000, "loss": 2.239, "lr": 5.626387782395512e-06, "epoch": 0.804, "percentage": 80.4, "elapsed_time": "0:19:42", "remaining_time": "0:04:48"}
403
+ {"current_steps": 4030, "total_steps": 5000, "loss": 3.1218, "lr": 5.5165508767131415e-06, "epoch": 0.806, "percentage": 80.6, "elapsed_time": "0:19:45", "remaining_time": "0:04:45"}
404
+ {"current_steps": 4040, "total_steps": 5000, "loss": 0.5524, "lr": 5.4076635668540075e-06, "epoch": 0.808, "percentage": 80.8, "elapsed_time": "0:19:48", "remaining_time": "0:04:42"}
405
+ {"current_steps": 4050, "total_steps": 5000, "loss": 2.4598, "lr": 5.299731159831953e-06, "epoch": 0.81, "percentage": 81.0, "elapsed_time": "0:19:50", "remaining_time": "0:04:39"}
406
+ {"current_steps": 4060, "total_steps": 5000, "loss": 0.634, "lr": 5.192758916120236e-06, "epoch": 0.812, "percentage": 81.2, "elapsed_time": "0:19:53", "remaining_time": "0:04:36"}
407
+ {"current_steps": 4070, "total_steps": 5000, "loss": 0.8943, "lr": 5.086752049395094e-06, "epoch": 0.814, "percentage": 81.4, "elapsed_time": "0:19:56", "remaining_time": "0:04:33"}
408
+ {"current_steps": 4080, "total_steps": 5000, "loss": 1.1776, "lr": 4.981715726281666e-06, "epoch": 0.816, "percentage": 81.6, "elapsed_time": "0:19:59", "remaining_time": "0:04:30"}
409
+ {"current_steps": 4090, "total_steps": 5000, "loss": 0.918, "lr": 4.877655066102149e-06, "epoch": 0.818, "percentage": 81.8, "elapsed_time": "0:20:02", "remaining_time": "0:04:27"}
410
+ {"current_steps": 4100, "total_steps": 5000, "loss": 0.3834, "lr": 4.7745751406263165e-06, "epoch": 0.82, "percentage": 82.0, "elapsed_time": "0:20:04", "remaining_time": "0:04:24"}
411
+ {"current_steps": 4110, "total_steps": 5000, "loss": 1.0476, "lr": 4.672480973824311e-06, "epoch": 0.822, "percentage": 82.2, "elapsed_time": "0:20:07", "remaining_time": "0:04:21"}
412
+ {"current_steps": 4120, "total_steps": 5000, "loss": 0.7036, "lr": 4.571377541621788e-06, "epoch": 0.824, "percentage": 82.4, "elapsed_time": "0:20:10", "remaining_time": "0:04:18"}
413
+ {"current_steps": 4130, "total_steps": 5000, "loss": 1.4096, "lr": 4.4712697716574e-06, "epoch": 0.826, "percentage": 82.6, "elapsed_time": "0:20:13", "remaining_time": "0:04:15"}
414
+ {"current_steps": 4140, "total_steps": 5000, "loss": 1.2209, "lr": 4.372162543042624e-06, "epoch": 0.828, "percentage": 82.8, "elapsed_time": "0:20:15", "remaining_time": "0:04:12"}
415
+ {"current_steps": 4150, "total_steps": 5000, "loss": 1.4758, "lr": 4.274060686123959e-06, "epoch": 0.83, "percentage": 83.0, "elapsed_time": "0:20:18", "remaining_time": "0:04:09"}
416
+ {"current_steps": 4160, "total_steps": 5000, "loss": 1.546, "lr": 4.176968982247514e-06, "epoch": 0.832, "percentage": 83.2, "elapsed_time": "0:20:21", "remaining_time": "0:04:06"}
417
+ {"current_steps": 4170, "total_steps": 5000, "loss": 0.8864, "lr": 4.08089216352596e-06, "epoch": 0.834, "percentage": 83.4, "elapsed_time": "0:20:24", "remaining_time": "0:04:03"}
418
+ {"current_steps": 4180, "total_steps": 5000, "loss": 1.4541, "lr": 3.985834912607894e-06, "epoch": 0.836, "percentage": 83.6, "elapsed_time": "0:20:27", "remaining_time": "0:04:00"}
419
+ {"current_steps": 4190, "total_steps": 5000, "loss": 0.6756, "lr": 3.891801862449629e-06, "epoch": 0.838, "percentage": 83.8, "elapsed_time": "0:20:30", "remaining_time": "0:03:57"}
420
+ {"current_steps": 4200, "total_steps": 5000, "loss": 1.3881, "lr": 3.798797596089351e-06, "epoch": 0.84, "percentage": 84.0, "elapsed_time": "0:20:33", "remaining_time": "0:03:54"}
421
+ {"current_steps": 4210, "total_steps": 5000, "loss": 2.3349, "lr": 3.7068266464238084e-06, "epoch": 0.842, "percentage": 84.2, "elapsed_time": "0:20:36", "remaining_time": "0:03:52"}
422
+ {"current_steps": 4220, "total_steps": 5000, "loss": 0.7973, "lr": 3.6158934959873353e-06, "epoch": 0.844, "percentage": 84.4, "elapsed_time": "0:20:39", "remaining_time": "0:03:49"}
423
+ {"current_steps": 4230, "total_steps": 5000, "loss": 1.1455, "lr": 3.5260025767333893e-06, "epoch": 0.846, "percentage": 84.6, "elapsed_time": "0:20:42", "remaining_time": "0:03:46"}
424
+ {"current_steps": 4240, "total_steps": 5000, "loss": 2.3828, "lr": 3.4371582698185633e-06, "epoch": 0.848, "percentage": 84.8, "elapsed_time": "0:20:45", "remaining_time": "0:03:43"}
425
+ {"current_steps": 4250, "total_steps": 5000, "loss": 0.8389, "lr": 3.3493649053890326e-06, "epoch": 0.85, "percentage": 85.0, "elapsed_time": "0:20:48", "remaining_time": "0:03:40"}
426
+ {"current_steps": 4260, "total_steps": 5000, "loss": 0.7594, "lr": 3.262626762369525e-06, "epoch": 0.852, "percentage": 85.2, "elapsed_time": "0:20:51", "remaining_time": "0:03:37"}
427
+ {"current_steps": 4270, "total_steps": 5000, "loss": 0.8006, "lr": 3.176948068254762e-06, "epoch": 0.854, "percentage": 85.4, "elapsed_time": "0:20:53", "remaining_time": "0:03:34"}
428
+ {"current_steps": 4280, "total_steps": 5000, "loss": 1.4222, "lr": 3.092332998903416e-06, "epoch": 0.856, "percentage": 85.6, "elapsed_time": "0:20:56", "remaining_time": "0:03:31"}
429
+ {"current_steps": 4290, "total_steps": 5000, "loss": 2.5477, "lr": 3.0087856783345914e-06, "epoch": 0.858, "percentage": 85.8, "elapsed_time": "0:20:59", "remaining_time": "0:03:28"}
430
+ {"current_steps": 4300, "total_steps": 5000, "loss": 2.2044, "lr": 2.9263101785268254e-06, "epoch": 0.86, "percentage": 86.0, "elapsed_time": "0:21:02", "remaining_time": "0:03:25"}
431
+ {"current_steps": 4310, "total_steps": 5000, "loss": 1.5267, "lr": 2.8449105192196316e-06, "epoch": 0.862, "percentage": 86.2, "elapsed_time": "0:21:04", "remaining_time": "0:03:22"}
432
+ {"current_steps": 4320, "total_steps": 5000, "loss": 0.6063, "lr": 2.764590667717562e-06, "epoch": 0.864, "percentage": 86.4, "elapsed_time": "0:21:07", "remaining_time": "0:03:19"}
433
+ {"current_steps": 4330, "total_steps": 5000, "loss": 2.6636, "lr": 2.6853545386968606e-06, "epoch": 0.866, "percentage": 86.6, "elapsed_time": "0:21:11", "remaining_time": "0:03:16"}
434
+ {"current_steps": 4340, "total_steps": 5000, "loss": 0.9864, "lr": 2.6072059940146775e-06, "epoch": 0.868, "percentage": 86.8, "elapsed_time": "0:21:13", "remaining_time": "0:03:13"}
435
+ {"current_steps": 4350, "total_steps": 5000, "loss": 2.3756, "lr": 2.5301488425208296e-06, "epoch": 0.87, "percentage": 87.0, "elapsed_time": "0:21:16", "remaining_time": "0:03:10"}
436
+ {"current_steps": 4360, "total_steps": 5000, "loss": 0.962, "lr": 2.454186839872158e-06, "epoch": 0.872, "percentage": 87.2, "elapsed_time": "0:21:19", "remaining_time": "0:03:07"}
437
+ {"current_steps": 4370, "total_steps": 5000, "loss": 0.8765, "lr": 2.379323688349516e-06, "epoch": 0.874, "percentage": 87.4, "elapsed_time": "0:21:22", "remaining_time": "0:03:04"}
438
+ {"current_steps": 4380, "total_steps": 5000, "loss": 1.3187, "lr": 2.3055630366772856e-06, "epoch": 0.876, "percentage": 87.6, "elapsed_time": "0:21:24", "remaining_time": "0:03:01"}
439
+ {"current_steps": 4390, "total_steps": 5000, "loss": 0.996, "lr": 2.2329084798455746e-06, "epoch": 0.878, "percentage": 87.8, "elapsed_time": "0:21:28", "remaining_time": "0:02:58"}
440
+ {"current_steps": 4400, "total_steps": 5000, "loss": 1.1203, "lr": 2.1613635589349756e-06, "epoch": 0.88, "percentage": 88.0, "elapsed_time": "0:21:30", "remaining_time": "0:02:56"}
441
+ {"current_steps": 4410, "total_steps": 5000, "loss": 1.5124, "lr": 2.0909317609440095e-06, "epoch": 0.882, "percentage": 88.2, "elapsed_time": "0:21:34", "remaining_time": "0:02:53"}
442
+ {"current_steps": 4420, "total_steps": 5000, "loss": 0.5098, "lr": 2.0216165186191407e-06, "epoch": 0.884, "percentage": 88.4, "elapsed_time": "0:21:36", "remaining_time": "0:02:50"}
443
+ {"current_steps": 4430, "total_steps": 5000, "loss": 0.9764, "lr": 1.95342121028749e-06, "epoch": 0.886, "percentage": 88.6, "elapsed_time": "0:21:39", "remaining_time": "0:02:47"}
444
+ {"current_steps": 4440, "total_steps": 5000, "loss": 1.6156, "lr": 1.8863491596921745e-06, "epoch": 0.888, "percentage": 88.8, "elapsed_time": "0:21:42", "remaining_time": "0:02:44"}
445
+ {"current_steps": 4450, "total_steps": 5000, "loss": 1.3607, "lr": 1.8204036358303173e-06, "epoch": 0.89, "percentage": 89.0, "elapsed_time": "0:21:45", "remaining_time": "0:02:41"}
446
+ {"current_steps": 4460, "total_steps": 5000, "loss": 1.291, "lr": 1.7555878527937164e-06, "epoch": 0.892, "percentage": 89.2, "elapsed_time": "0:21:48", "remaining_time": "0:02:38"}
447
+ {"current_steps": 4470, "total_steps": 5000, "loss": 0.6693, "lr": 1.6919049696121958e-06, "epoch": 0.894, "percentage": 89.4, "elapsed_time": "0:21:51", "remaining_time": "0:02:35"}
448
+ {"current_steps": 4480, "total_steps": 5000, "loss": 1.8125, "lr": 1.629358090099639e-06, "epoch": 0.896, "percentage": 89.6, "elapsed_time": "0:21:54", "remaining_time": "0:02:32"}
449
+ {"current_steps": 4490, "total_steps": 5000, "loss": 1.3824, "lr": 1.5679502627027136e-06, "epoch": 0.898, "percentage": 89.8, "elapsed_time": "0:21:57", "remaining_time": "0:02:29"}
450
+ {"current_steps": 4500, "total_steps": 5000, "loss": 1.466, "lr": 1.5076844803522922e-06, "epoch": 0.9, "percentage": 90.0, "elapsed_time": "0:22:00", "remaining_time": "0:02:26"}
451
+ {"current_steps": 4510, "total_steps": 5000, "loss": 0.8571, "lr": 1.4485636803175829e-06, "epoch": 0.902, "percentage": 90.2, "elapsed_time": "0:22:03", "remaining_time": "0:02:23"}
452
+ {"current_steps": 4520, "total_steps": 5000, "loss": 1.0712, "lr": 1.3905907440629752e-06, "epoch": 0.904, "percentage": 90.4, "elapsed_time": "0:22:06", "remaining_time": "0:02:20"}
453
+ {"current_steps": 4530, "total_steps": 5000, "loss": 1.328, "lr": 1.333768497107593e-06, "epoch": 0.906, "percentage": 90.6, "elapsed_time": "0:22:09", "remaining_time": "0:02:17"}
454
+ {"current_steps": 4540, "total_steps": 5000, "loss": 3.858, "lr": 1.2780997088875869e-06, "epoch": 0.908, "percentage": 90.8, "elapsed_time": "0:22:11", "remaining_time": "0:02:14"}
455
+ {"current_steps": 4550, "total_steps": 5000, "loss": 1.3761, "lr": 1.2235870926211619e-06, "epoch": 0.91, "percentage": 91.0, "elapsed_time": "0:22:14", "remaining_time": "0:02:12"}
456
+ {"current_steps": 4560, "total_steps": 5000, "loss": 0.9014, "lr": 1.170233305176327e-06, "epoch": 0.912, "percentage": 91.2, "elapsed_time": "0:22:17", "remaining_time": "0:02:09"}
457
+ {"current_steps": 4570, "total_steps": 5000, "loss": 1.2054, "lr": 1.1180409469414094e-06, "epoch": 0.914, "percentage": 91.4, "elapsed_time": "0:22:20", "remaining_time": "0:02:06"}
458
+ {"current_steps": 4580, "total_steps": 5000, "loss": 1.1962, "lr": 1.067012561698319e-06, "epoch": 0.916, "percentage": 91.6, "elapsed_time": "0:22:23", "remaining_time": "0:02:03"}
459
+ {"current_steps": 4590, "total_steps": 5000, "loss": 1.1866, "lr": 1.0171506364985622e-06, "epoch": 0.918, "percentage": 91.8, "elapsed_time": "0:22:26", "remaining_time": "0:02:00"}
460
+ {"current_steps": 4600, "total_steps": 5000, "loss": 1.4084, "lr": 9.684576015420278e-07, "epoch": 0.92, "percentage": 92.0, "elapsed_time": "0:22:29", "remaining_time": "0:01:57"}
461
+ {"current_steps": 4610, "total_steps": 5000, "loss": 0.4785, "lr": 9.209358300585474e-07, "epoch": 0.922, "percentage": 92.2, "elapsed_time": "0:22:32", "remaining_time": "0:01:54"}
462
+ {"current_steps": 4620, "total_steps": 5000, "loss": 1.0671, "lr": 8.745876381922147e-07, "epoch": 0.924, "percentage": 92.4, "elapsed_time": "0:22:35", "remaining_time": "0:01:51"}
463
+ {"current_steps": 4630, "total_steps": 5000, "loss": 3.2901, "lr": 8.294152848885157e-07, "epoch": 0.926, "percentage": 92.6, "elapsed_time": "0:22:37", "remaining_time": "0:01:48"}
464
+ {"current_steps": 4640, "total_steps": 5000, "loss": 1.0359, "lr": 7.854209717842231e-07, "epoch": 0.928, "percentage": 92.8, "elapsed_time": "0:22:40", "remaining_time": "0:01:45"}
465
+ {"current_steps": 4650, "total_steps": 5000, "loss": 0.5115, "lr": 7.426068431000882e-07, "epoch": 0.93, "percentage": 93.0, "elapsed_time": "0:22:43", "remaining_time": "0:01:42"}
466
+ {"current_steps": 4660, "total_steps": 5000, "loss": 1.0589, "lr": 7.009749855363456e-07, "epoch": 0.932, "percentage": 93.2, "elapsed_time": "0:22:46", "remaining_time": "0:01:39"}
467
+ {"current_steps": 4670, "total_steps": 5000, "loss": 0.3984, "lr": 6.605274281709928e-07, "epoch": 0.934, "percentage": 93.4, "elapsed_time": "0:22:49", "remaining_time": "0:01:36"}
468
+ {"current_steps": 4680, "total_steps": 5000, "loss": 2.5647, "lr": 6.212661423609184e-07, "epoch": 0.936, "percentage": 93.6, "elapsed_time": "0:22:52", "remaining_time": "0:01:33"}
469
+ {"current_steps": 4690, "total_steps": 5000, "loss": 0.93, "lr": 5.83193041645802e-07, "epoch": 0.938, "percentage": 93.8, "elapsed_time": "0:22:54", "remaining_time": "0:01:30"}
470
+ {"current_steps": 4700, "total_steps": 5000, "loss": 1.4605, "lr": 5.463099816548579e-07, "epoch": 0.94, "percentage": 94.0, "elapsed_time": "0:22:58", "remaining_time": "0:01:27"}
471
+ {"current_steps": 4710, "total_steps": 5000, "loss": 0.7053, "lr": 5.106187600163987e-07, "epoch": 0.942, "percentage": 94.2, "elapsed_time": "0:23:00", "remaining_time": "0:01:25"}
472
+ {"current_steps": 4720, "total_steps": 5000, "loss": 1.9524, "lr": 4.7612111627021175e-07, "epoch": 0.944, "percentage": 94.4, "elapsed_time": "0:23:03", "remaining_time": "0:01:22"}
473
+ {"current_steps": 4730, "total_steps": 5000, "loss": 2.0867, "lr": 4.4281873178278475e-07, "epoch": 0.946, "percentage": 94.6, "elapsed_time": "0:23:06", "remaining_time": "0:01:19"}
474
+ {"current_steps": 4740, "total_steps": 5000, "loss": 0.6371, "lr": 4.107132296653549e-07, "epoch": 0.948, "percentage": 94.8, "elapsed_time": "0:23:09", "remaining_time": "0:01:16"}
475
+ {"current_steps": 4750, "total_steps": 5000, "loss": 0.5299, "lr": 3.7980617469479953e-07, "epoch": 0.95, "percentage": 95.0, "elapsed_time": "0:23:12", "remaining_time": "0:01:13"}
476
+ {"current_steps": 4760, "total_steps": 5000, "loss": 4.1515, "lr": 3.5009907323737825e-07, "epoch": 0.952, "percentage": 95.2, "elapsed_time": "0:23:14", "remaining_time": "0:01:10"}
477
+ {"current_steps": 4770, "total_steps": 5000, "loss": 0.762, "lr": 3.215933731753024e-07, "epoch": 0.954, "percentage": 95.4, "elapsed_time": "0:23:17", "remaining_time": "0:01:07"}
478
+ {"current_steps": 4780, "total_steps": 5000, "loss": 0.9055, "lr": 2.942904638361804e-07, "epoch": 0.956, "percentage": 95.6, "elapsed_time": "0:23:20", "remaining_time": "0:01:04"}
479
+ {"current_steps": 4790, "total_steps": 5000, "loss": 1.026, "lr": 2.681916759252917e-07, "epoch": 0.958, "percentage": 95.8, "elapsed_time": "0:23:23", "remaining_time": "0:01:01"}
480
+ {"current_steps": 4800, "total_steps": 5000, "loss": 1.9403, "lr": 2.4329828146074095e-07, "epoch": 0.96, "percentage": 96.0, "elapsed_time": "0:23:26", "remaining_time": "0:00:58"}
481
+ {"current_steps": 4810, "total_steps": 5000, "loss": 1.8509, "lr": 2.1961149371145795e-07, "epoch": 0.962, "percentage": 96.2, "elapsed_time": "0:23:29", "remaining_time": "0:00:55"}
482
+ {"current_steps": 4820, "total_steps": 5000, "loss": 1.5779, "lr": 1.9713246713805588e-07, "epoch": 0.964, "percentage": 96.4, "elapsed_time": "0:23:32", "remaining_time": "0:00:52"}
483
+ {"current_steps": 4830, "total_steps": 5000, "loss": 1.2824, "lr": 1.7586229733657644e-07, "epoch": 0.966, "percentage": 96.6, "elapsed_time": "0:23:34", "remaining_time": "0:00:49"}
484
+ {"current_steps": 4840, "total_steps": 5000, "loss": 1.8272, "lr": 1.5580202098509077e-07, "epoch": 0.968, "percentage": 96.8, "elapsed_time": "0:23:37", "remaining_time": "0:00:46"}
485
+ {"current_steps": 4850, "total_steps": 5000, "loss": 1.7201, "lr": 1.3695261579316777e-07, "epoch": 0.97, "percentage": 97.0, "elapsed_time": "0:23:40", "remaining_time": "0:00:43"}
486
+ {"current_steps": 4860, "total_steps": 5000, "loss": 1.2317, "lr": 1.193150004542204e-07, "epoch": 0.972, "percentage": 97.2, "elapsed_time": "0:23:42", "remaining_time": "0:00:40"}
487
+ {"current_steps": 4870, "total_steps": 5000, "loss": 1.2935, "lr": 1.0289003460074165e-07, "epoch": 0.974, "percentage": 97.4, "elapsed_time": "0:23:45", "remaining_time": "0:00:38"}
488
+ {"current_steps": 4880, "total_steps": 5000, "loss": 1.2833, "lr": 8.767851876239074e-08, "epoch": 0.976, "percentage": 97.6, "elapsed_time": "0:23:48", "remaining_time": "0:00:35"}
489
+ {"current_steps": 4890, "total_steps": 5000, "loss": 1.021, "lr": 7.368119432699383e-08, "epoch": 0.978, "percentage": 97.8, "elapsed_time": "0:23:51", "remaining_time": "0:00:32"}
490
+ {"current_steps": 4900, "total_steps": 5000, "loss": 1.1247, "lr": 6.089874350439506e-08, "epoch": 0.98, "percentage": 98.0, "elapsed_time": "0:23:54", "remaining_time": "0:00:29"}
491
+ {"current_steps": 4910, "total_steps": 5000, "loss": 1.2268, "lr": 4.9331789293211026e-08, "epoch": 0.982, "percentage": 98.2, "elapsed_time": "0:23:57", "remaining_time": "0:00:26"}
492
+ {"current_steps": 4920, "total_steps": 5000, "loss": 4.2583, "lr": 3.8980895450474455e-08, "epoch": 0.984, "percentage": 98.4, "elapsed_time": "0:24:00", "remaining_time": "0:00:23"}
493
+ {"current_steps": 4930, "total_steps": 5000, "loss": 0.7995, "lr": 2.9846566464150626e-08, "epoch": 0.986, "percentage": 98.6, "elapsed_time": "0:24:03", "remaining_time": "0:00:20"}
494
+ {"current_steps": 4940, "total_steps": 5000, "loss": 1.3818, "lr": 2.192924752854042e-08, "epoch": 0.988, "percentage": 98.8, "elapsed_time": "0:24:06", "remaining_time": "0:00:17"}
495
+ {"current_steps": 4950, "total_steps": 5000, "loss": 2.6165, "lr": 1.522932452260595e-08, "epoch": 0.99, "percentage": 99.0, "elapsed_time": "0:24:09", "remaining_time": "0:00:14"}
496
+ {"current_steps": 4960, "total_steps": 5000, "loss": 1.7912, "lr": 9.747123991141194e-09, "epoch": 0.992, "percentage": 99.2, "elapsed_time": "0:24:12", "remaining_time": "0:00:11"}
497
+ {"current_steps": 4970, "total_steps": 5000, "loss": 1.421, "lr": 5.48291312886251e-09, "epoch": 0.994, "percentage": 99.4, "elapsed_time": "0:24:15", "remaining_time": "0:00:08"}
498
+ {"current_steps": 4980, "total_steps": 5000, "loss": 1.2866, "lr": 2.4368997673940297e-09, "epoch": 0.996, "percentage": 99.6, "elapsed_time": "0:24:18", "remaining_time": "0:00:05"}
499
+ {"current_steps": 4990, "total_steps": 5000, "loss": 0.6754, "lr": 6.092323651313292e-10, "epoch": 0.998, "percentage": 99.8, "elapsed_time": "0:24:20", "remaining_time": "0:00:02"}
500
+ {"current_steps": 5000, "total_steps": 5000, "loss": 1.1699, "lr": 0.0, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:24:23", "remaining_time": "0:00:00"}
501
+ {"current_steps": 5000, "total_steps": 5000, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:24:24", "remaining_time": "0:00:00"}
Llama-2-13b-chat-hf/DomainBench/Finance/trainer_state.json ADDED
@@ -0,0 +1,3542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.002,
13
+ "grad_norm": 1.1971019506454468,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 3.5335,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.004,
20
+ "grad_norm": 0.6988129019737244,
21
+ "learning_rate": 2.0000000000000003e-06,
22
+ "loss": 5.5042,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.006,
27
+ "grad_norm": 0.7433498501777649,
28
+ "learning_rate": 3e-06,
29
+ "loss": 2.9337,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.008,
34
+ "grad_norm": 0.6900968551635742,
35
+ "learning_rate": 4.000000000000001e-06,
36
+ "loss": 8.5725,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.01,
41
+ "grad_norm": 0.0,
42
+ "learning_rate": 5e-06,
43
+ "loss": 2.2742,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.012,
48
+ "grad_norm": 0.980725884437561,
49
+ "learning_rate": 6e-06,
50
+ "loss": 4.5745,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.014,
55
+ "grad_norm": 1.36703360080719,
56
+ "learning_rate": 7.000000000000001e-06,
57
+ "loss": 3.9419,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.016,
62
+ "grad_norm": 1.5291451215744019,
63
+ "learning_rate": 8.000000000000001e-06,
64
+ "loss": 2.2781,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.018,
69
+ "grad_norm": 0.7842139601707458,
70
+ "learning_rate": 9e-06,
71
+ "loss": 6.3897,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.02,
76
+ "grad_norm": 0.620259702205658,
77
+ "learning_rate": 1e-05,
78
+ "loss": 3.6837,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.022,
83
+ "grad_norm": 0.0,
84
+ "learning_rate": 1.1000000000000001e-05,
85
+ "loss": 1.3266,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.024,
90
+ "grad_norm": 0.2774917483329773,
91
+ "learning_rate": 1.2e-05,
92
+ "loss": 6.1833,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.026,
97
+ "grad_norm": 1.9525136947631836,
98
+ "learning_rate": 1.3000000000000001e-05,
99
+ "loss": 2.6712,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.028,
104
+ "grad_norm": 3.779365062713623,
105
+ "learning_rate": 1.4000000000000001e-05,
106
+ "loss": 2.5445,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.03,
111
+ "grad_norm": 1.3333756923675537,
112
+ "learning_rate": 1.5e-05,
113
+ "loss": 2.7276,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.032,
118
+ "grad_norm": 0.9332829713821411,
119
+ "learning_rate": 1.6000000000000003e-05,
120
+ "loss": 1.7329,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.034,
125
+ "grad_norm": 1.1419305801391602,
126
+ "learning_rate": 1.7000000000000003e-05,
127
+ "loss": 1.916,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.036,
132
+ "grad_norm": 1.4852521419525146,
133
+ "learning_rate": 1.8e-05,
134
+ "loss": 6.3988,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.038,
139
+ "grad_norm": 0.8550328016281128,
140
+ "learning_rate": 1.9e-05,
141
+ "loss": 2.1053,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.04,
146
+ "grad_norm": 2.170074939727783,
147
+ "learning_rate": 2e-05,
148
+ "loss": 0.798,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.042,
153
+ "grad_norm": 0.7900694012641907,
154
+ "learning_rate": 2.1e-05,
155
+ "loss": 2.0661,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.044,
160
+ "grad_norm": 0.0,
161
+ "learning_rate": 2.2000000000000003e-05,
162
+ "loss": 2.4783,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.046,
167
+ "grad_norm": 0.0,
168
+ "learning_rate": 2.3000000000000003e-05,
169
+ "loss": 0.7402,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.048,
174
+ "grad_norm": 0.35598117113113403,
175
+ "learning_rate": 2.4e-05,
176
+ "loss": 2.5115,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.05,
181
+ "grad_norm": 5.627530574798584,
182
+ "learning_rate": 2.5e-05,
183
+ "loss": 2.13,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.052,
188
+ "grad_norm": 3.1692419052124023,
189
+ "learning_rate": 2.6000000000000002e-05,
190
+ "loss": 1.6962,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.054,
195
+ "grad_norm": 5.671966075897217,
196
+ "learning_rate": 2.7000000000000002e-05,
197
+ "loss": 2.8705,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.056,
202
+ "grad_norm": 0.8001578450202942,
203
+ "learning_rate": 2.8000000000000003e-05,
204
+ "loss": 2.9734,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.058,
209
+ "grad_norm": 0.21718740463256836,
210
+ "learning_rate": 2.9e-05,
211
+ "loss": 1.7669,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.06,
216
+ "grad_norm": 1.2626160383224487,
217
+ "learning_rate": 3e-05,
218
+ "loss": 2.1499,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.062,
223
+ "grad_norm": 0.0,
224
+ "learning_rate": 3.1e-05,
225
+ "loss": 1.6354,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.064,
230
+ "grad_norm": 3.380746603012085,
231
+ "learning_rate": 3.2000000000000005e-05,
232
+ "loss": 2.1457,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.066,
237
+ "grad_norm": 0.2737733721733093,
238
+ "learning_rate": 3.3e-05,
239
+ "loss": 3.0738,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.068,
244
+ "grad_norm": 0.797559380531311,
245
+ "learning_rate": 3.4000000000000007e-05,
246
+ "loss": 2.4357,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.07,
251
+ "grad_norm": 1.7590610980987549,
252
+ "learning_rate": 3.5e-05,
253
+ "loss": 1.7431,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.072,
258
+ "grad_norm": 0.7487906217575073,
259
+ "learning_rate": 3.6e-05,
260
+ "loss": 1.7527,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.074,
265
+ "grad_norm": 0.6480693221092224,
266
+ "learning_rate": 3.7e-05,
267
+ "loss": 6.1666,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.076,
272
+ "grad_norm": 0.8775593638420105,
273
+ "learning_rate": 3.8e-05,
274
+ "loss": 0.5917,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.078,
279
+ "grad_norm": 5.630661964416504,
280
+ "learning_rate": 3.9000000000000006e-05,
281
+ "loss": 1.5061,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.08,
286
+ "grad_norm": 0.7237743735313416,
287
+ "learning_rate": 4e-05,
288
+ "loss": 1.5694,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.082,
293
+ "grad_norm": 1.8499999046325684,
294
+ "learning_rate": 4.1e-05,
295
+ "loss": 1.4762,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.084,
300
+ "grad_norm": 3.2792017459869385,
301
+ "learning_rate": 4.2e-05,
302
+ "loss": 1.0468,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.086,
307
+ "grad_norm": 1.644893765449524,
308
+ "learning_rate": 4.3e-05,
309
+ "loss": 7.1942,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.088,
314
+ "grad_norm": 3.8417279720306396,
315
+ "learning_rate": 4.4000000000000006e-05,
316
+ "loss": 2.706,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.09,
321
+ "grad_norm": 9.292684555053711,
322
+ "learning_rate": 4.5e-05,
323
+ "loss": 1.8559,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.092,
328
+ "grad_norm": 8.052363395690918,
329
+ "learning_rate": 4.600000000000001e-05,
330
+ "loss": 1.2105,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.094,
335
+ "grad_norm": 2.8528406620025635,
336
+ "learning_rate": 4.7e-05,
337
+ "loss": 2.8524,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.096,
342
+ "grad_norm": 1.0747981071472168,
343
+ "learning_rate": 4.8e-05,
344
+ "loss": 2.0881,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.098,
349
+ "grad_norm": 1.3833454847335815,
350
+ "learning_rate": 4.9e-05,
351
+ "loss": 2.4156,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 0.1,
356
+ "grad_norm": 0.5067809820175171,
357
+ "learning_rate": 5e-05,
358
+ "loss": 0.9531,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 0.102,
363
+ "grad_norm": 1.2673537731170654,
364
+ "learning_rate": 4.999939076763487e-05,
365
+ "loss": 3.9691,
366
+ "step": 510
367
+ },
368
+ {
369
+ "epoch": 0.104,
370
+ "grad_norm": 0.0,
371
+ "learning_rate": 4.999756310023261e-05,
372
+ "loss": 0.8777,
373
+ "step": 520
374
+ },
375
+ {
376
+ "epoch": 0.106,
377
+ "grad_norm": 0.7635637521743774,
378
+ "learning_rate": 4.999451708687114e-05,
379
+ "loss": 0.9434,
380
+ "step": 530
381
+ },
382
+ {
383
+ "epoch": 0.108,
384
+ "grad_norm": 12.323412895202637,
385
+ "learning_rate": 4.999025287600886e-05,
386
+ "loss": 1.5592,
387
+ "step": 540
388
+ },
389
+ {
390
+ "epoch": 0.11,
391
+ "grad_norm": 0.7043775916099548,
392
+ "learning_rate": 4.99847706754774e-05,
393
+ "loss": 0.6868,
394
+ "step": 550
395
+ },
396
+ {
397
+ "epoch": 0.112,
398
+ "grad_norm": 2.084545612335205,
399
+ "learning_rate": 4.997807075247146e-05,
400
+ "loss": 0.956,
401
+ "step": 560
402
+ },
403
+ {
404
+ "epoch": 0.114,
405
+ "grad_norm": 1.2179222106933594,
406
+ "learning_rate": 4.997015343353585e-05,
407
+ "loss": 0.8766,
408
+ "step": 570
409
+ },
410
+ {
411
+ "epoch": 0.116,
412
+ "grad_norm": 0.2723235785961151,
413
+ "learning_rate": 4.996101910454953e-05,
414
+ "loss": 0.5423,
415
+ "step": 580
416
+ },
417
+ {
418
+ "epoch": 0.118,
419
+ "grad_norm": 19.425006866455078,
420
+ "learning_rate": 4.995066821070679e-05,
421
+ "loss": 1.7762,
422
+ "step": 590
423
+ },
424
+ {
425
+ "epoch": 0.12,
426
+ "grad_norm": 7.556680679321289,
427
+ "learning_rate": 4.993910125649561e-05,
428
+ "loss": 2.294,
429
+ "step": 600
430
+ },
431
+ {
432
+ "epoch": 0.122,
433
+ "grad_norm": 1.423615574836731,
434
+ "learning_rate": 4.992631880567301e-05,
435
+ "loss": 0.4881,
436
+ "step": 610
437
+ },
438
+ {
439
+ "epoch": 0.124,
440
+ "grad_norm": 0.8066297769546509,
441
+ "learning_rate": 4.991232148123761e-05,
442
+ "loss": 3.3744,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 0.126,
447
+ "grad_norm": 1.3097137212753296,
448
+ "learning_rate": 4.989710996539926e-05,
449
+ "loss": 1.2847,
450
+ "step": 630
451
+ },
452
+ {
453
+ "epoch": 0.128,
454
+ "grad_norm": 0.45432719588279724,
455
+ "learning_rate": 4.988068499954578e-05,
456
+ "loss": 0.9029,
457
+ "step": 640
458
+ },
459
+ {
460
+ "epoch": 0.13,
461
+ "grad_norm": 0.6234528422355652,
462
+ "learning_rate": 4.9863047384206835e-05,
463
+ "loss": 1.3799,
464
+ "step": 650
465
+ },
466
+ {
467
+ "epoch": 0.132,
468
+ "grad_norm": 5.1088385581970215,
469
+ "learning_rate": 4.984419797901491e-05,
470
+ "loss": 3.7841,
471
+ "step": 660
472
+ },
473
+ {
474
+ "epoch": 0.134,
475
+ "grad_norm": 1.5142585039138794,
476
+ "learning_rate": 4.982413770266342e-05,
477
+ "loss": 0.9186,
478
+ "step": 670
479
+ },
480
+ {
481
+ "epoch": 0.136,
482
+ "grad_norm": 1.2981692552566528,
483
+ "learning_rate": 4.980286753286195e-05,
484
+ "loss": 1.4738,
485
+ "step": 680
486
+ },
487
+ {
488
+ "epoch": 0.138,
489
+ "grad_norm": 12.405978202819824,
490
+ "learning_rate": 4.978038850628854e-05,
491
+ "loss": 3.8651,
492
+ "step": 690
493
+ },
494
+ {
495
+ "epoch": 0.14,
496
+ "grad_norm": 3.6295487880706787,
497
+ "learning_rate": 4.975670171853926e-05,
498
+ "loss": 1.1881,
499
+ "step": 700
500
+ },
501
+ {
502
+ "epoch": 0.142,
503
+ "grad_norm": 1.0809906721115112,
504
+ "learning_rate": 4.9731808324074717e-05,
505
+ "loss": 0.8245,
506
+ "step": 710
507
+ },
508
+ {
509
+ "epoch": 0.144,
510
+ "grad_norm": 3.7995641231536865,
511
+ "learning_rate": 4.9705709536163824e-05,
512
+ "loss": 1.2642,
513
+ "step": 720
514
+ },
515
+ {
516
+ "epoch": 0.146,
517
+ "grad_norm": 0.8331463932991028,
518
+ "learning_rate": 4.96784066268247e-05,
519
+ "loss": 1.6256,
520
+ "step": 730
521
+ },
522
+ {
523
+ "epoch": 0.148,
524
+ "grad_norm": 5.477302551269531,
525
+ "learning_rate": 4.964990092676263e-05,
526
+ "loss": 9.4718,
527
+ "step": 740
528
+ },
529
+ {
530
+ "epoch": 0.15,
531
+ "grad_norm": 2.345902681350708,
532
+ "learning_rate": 4.962019382530521e-05,
533
+ "loss": 1.6593,
534
+ "step": 750
535
+ },
536
+ {
537
+ "epoch": 0.152,
538
+ "grad_norm": 1.1139212846755981,
539
+ "learning_rate": 4.9589286770334654e-05,
540
+ "loss": 1.4136,
541
+ "step": 760
542
+ },
543
+ {
544
+ "epoch": 0.154,
545
+ "grad_norm": 1.1984540224075317,
546
+ "learning_rate": 4.9557181268217227e-05,
547
+ "loss": 2.5935,
548
+ "step": 770
549
+ },
550
+ {
551
+ "epoch": 0.156,
552
+ "grad_norm": 1.6878119707107544,
553
+ "learning_rate": 4.952387888372979e-05,
554
+ "loss": 2.0708,
555
+ "step": 780
556
+ },
557
+ {
558
+ "epoch": 0.158,
559
+ "grad_norm": 0.7853636145591736,
560
+ "learning_rate": 4.94893812399836e-05,
561
+ "loss": 0.8737,
562
+ "step": 790
563
+ },
564
+ {
565
+ "epoch": 0.16,
566
+ "grad_norm": 0.6999472379684448,
567
+ "learning_rate": 4.9453690018345144e-05,
568
+ "loss": 2.0691,
569
+ "step": 800
570
+ },
571
+ {
572
+ "epoch": 0.162,
573
+ "grad_norm": 5.1682329177856445,
574
+ "learning_rate": 4.94168069583542e-05,
575
+ "loss": 0.7324,
576
+ "step": 810
577
+ },
578
+ {
579
+ "epoch": 0.164,
580
+ "grad_norm": 0.0,
581
+ "learning_rate": 4.937873385763908e-05,
582
+ "loss": 0.577,
583
+ "step": 820
584
+ },
585
+ {
586
+ "epoch": 0.166,
587
+ "grad_norm": 0.8014002442359924,
588
+ "learning_rate": 4.933947257182901e-05,
589
+ "loss": 0.6428,
590
+ "step": 830
591
+ },
592
+ {
593
+ "epoch": 0.168,
594
+ "grad_norm": 1.618863821029663,
595
+ "learning_rate": 4.929902501446366e-05,
596
+ "loss": 1.4208,
597
+ "step": 840
598
+ },
599
+ {
600
+ "epoch": 0.17,
601
+ "grad_norm": 8.870545387268066,
602
+ "learning_rate": 4.925739315689991e-05,
603
+ "loss": 1.1485,
604
+ "step": 850
605
+ },
606
+ {
607
+ "epoch": 0.172,
608
+ "grad_norm": 0.5625079274177551,
609
+ "learning_rate": 4.9214579028215776e-05,
610
+ "loss": 0.8729,
611
+ "step": 860
612
+ },
613
+ {
614
+ "epoch": 0.174,
615
+ "grad_norm": 1.9488248825073242,
616
+ "learning_rate": 4.917058471511149e-05,
617
+ "loss": 2.3371,
618
+ "step": 870
619
+ },
620
+ {
621
+ "epoch": 0.176,
622
+ "grad_norm": 1.2681325674057007,
623
+ "learning_rate": 4.912541236180779e-05,
624
+ "loss": 0.6887,
625
+ "step": 880
626
+ },
627
+ {
628
+ "epoch": 0.178,
629
+ "grad_norm": 1.7612295150756836,
630
+ "learning_rate": 4.907906416994146e-05,
631
+ "loss": 0.8769,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 0.18,
636
+ "grad_norm": 0.9437585473060608,
637
+ "learning_rate": 4.9031542398457974e-05,
638
+ "loss": 2.4827,
639
+ "step": 900
640
+ },
641
+ {
642
+ "epoch": 0.182,
643
+ "grad_norm": 0.5411396026611328,
644
+ "learning_rate": 4.898284936350144e-05,
645
+ "loss": 0.5722,
646
+ "step": 910
647
+ },
648
+ {
649
+ "epoch": 0.184,
650
+ "grad_norm": 1.9333531856536865,
651
+ "learning_rate": 4.893298743830168e-05,
652
+ "loss": 1.3822,
653
+ "step": 920
654
+ },
655
+ {
656
+ "epoch": 0.186,
657
+ "grad_norm": 1.7342445850372314,
658
+ "learning_rate": 4.888195905305859e-05,
659
+ "loss": 0.7233,
660
+ "step": 930
661
+ },
662
+ {
663
+ "epoch": 0.188,
664
+ "grad_norm": 2.7781763076782227,
665
+ "learning_rate": 4.882976669482367e-05,
666
+ "loss": 9.3579,
667
+ "step": 940
668
+ },
669
+ {
670
+ "epoch": 0.19,
671
+ "grad_norm": 11.274718284606934,
672
+ "learning_rate": 4.877641290737884e-05,
673
+ "loss": 2.2926,
674
+ "step": 950
675
+ },
676
+ {
677
+ "epoch": 0.192,
678
+ "grad_norm": 2.7516400814056396,
679
+ "learning_rate": 4.8721900291112415e-05,
680
+ "loss": 0.8563,
681
+ "step": 960
682
+ },
683
+ {
684
+ "epoch": 0.194,
685
+ "grad_norm": 2.3136308193206787,
686
+ "learning_rate": 4.8666231502892415e-05,
687
+ "loss": 2.229,
688
+ "step": 970
689
+ },
690
+ {
691
+ "epoch": 0.196,
692
+ "grad_norm": 2.1585183143615723,
693
+ "learning_rate": 4.860940925593703e-05,
694
+ "loss": 1.7804,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 0.198,
699
+ "grad_norm": 2.338162899017334,
700
+ "learning_rate": 4.855143631968242e-05,
701
+ "loss": 0.826,
702
+ "step": 990
703
+ },
704
+ {
705
+ "epoch": 0.2,
706
+ "grad_norm": 1.9523894786834717,
707
+ "learning_rate": 4.849231551964771e-05,
708
+ "loss": 1.746,
709
+ "step": 1000
710
+ },
711
+ {
712
+ "epoch": 0.202,
713
+ "grad_norm": 1.1914769411087036,
714
+ "learning_rate": 4.843204973729729e-05,
715
+ "loss": 2.6019,
716
+ "step": 1010
717
+ },
718
+ {
719
+ "epoch": 0.204,
720
+ "grad_norm": 0.8508822917938232,
721
+ "learning_rate": 4.837064190990036e-05,
722
+ "loss": 1.5425,
723
+ "step": 1020
724
+ },
725
+ {
726
+ "epoch": 0.206,
727
+ "grad_norm": 1.5970311164855957,
728
+ "learning_rate": 4.830809503038781e-05,
729
+ "loss": 1.2792,
730
+ "step": 1030
731
+ },
732
+ {
733
+ "epoch": 0.208,
734
+ "grad_norm": 1.7772223949432373,
735
+ "learning_rate": 4.8244412147206284e-05,
736
+ "loss": 0.9529,
737
+ "step": 1040
738
+ },
739
+ {
740
+ "epoch": 0.21,
741
+ "grad_norm": 2.8910939693450928,
742
+ "learning_rate": 4.817959636416969e-05,
743
+ "loss": 1.3505,
744
+ "step": 1050
745
+ },
746
+ {
747
+ "epoch": 0.212,
748
+ "grad_norm": 1.4733608961105347,
749
+ "learning_rate": 4.8113650840307834e-05,
750
+ "loss": 0.9163,
751
+ "step": 1060
752
+ },
753
+ {
754
+ "epoch": 0.214,
755
+ "grad_norm": 0.32765626907348633,
756
+ "learning_rate": 4.8046578789712515e-05,
757
+ "loss": 1.228,
758
+ "step": 1070
759
+ },
760
+ {
761
+ "epoch": 0.216,
762
+ "grad_norm": 9.417176246643066,
763
+ "learning_rate": 4.797838348138086e-05,
764
+ "loss": 1.1117,
765
+ "step": 1080
766
+ },
767
+ {
768
+ "epoch": 0.218,
769
+ "grad_norm": 0.6818392872810364,
770
+ "learning_rate": 4.790906823905599e-05,
771
+ "loss": 1.7063,
772
+ "step": 1090
773
+ },
774
+ {
775
+ "epoch": 0.22,
776
+ "grad_norm": 0.9097204208374023,
777
+ "learning_rate": 4.783863644106502e-05,
778
+ "loss": 0.9846,
779
+ "step": 1100
780
+ },
781
+ {
782
+ "epoch": 0.222,
783
+ "grad_norm": 2.9016330242156982,
784
+ "learning_rate": 4.776709152015443e-05,
785
+ "loss": 0.9996,
786
+ "step": 1110
787
+ },
788
+ {
789
+ "epoch": 0.224,
790
+ "grad_norm": 2.820084810256958,
791
+ "learning_rate": 4.769443696332272e-05,
792
+ "loss": 1.5269,
793
+ "step": 1120
794
+ },
795
+ {
796
+ "epoch": 0.226,
797
+ "grad_norm": 0.6218633651733398,
798
+ "learning_rate": 4.762067631165049e-05,
799
+ "loss": 1.7736,
800
+ "step": 1130
801
+ },
802
+ {
803
+ "epoch": 0.228,
804
+ "grad_norm": 9.925037384033203,
805
+ "learning_rate": 4.754581316012785e-05,
806
+ "loss": 1.0233,
807
+ "step": 1140
808
+ },
809
+ {
810
+ "epoch": 0.23,
811
+ "grad_norm": 11.37203598022461,
812
+ "learning_rate": 4.7469851157479177e-05,
813
+ "loss": 1.3813,
814
+ "step": 1150
815
+ },
816
+ {
817
+ "epoch": 0.232,
818
+ "grad_norm": 0.0,
819
+ "learning_rate": 4.7392794005985326e-05,
820
+ "loss": 0.6272,
821
+ "step": 1160
822
+ },
823
+ {
824
+ "epoch": 0.234,
825
+ "grad_norm": 0.4212048351764679,
826
+ "learning_rate": 4.731464546130314e-05,
827
+ "loss": 2.541,
828
+ "step": 1170
829
+ },
830
+ {
831
+ "epoch": 0.236,
832
+ "grad_norm": 0.8374232053756714,
833
+ "learning_rate": 4.723540933228244e-05,
834
+ "loss": 1.0791,
835
+ "step": 1180
836
+ },
837
+ {
838
+ "epoch": 0.238,
839
+ "grad_norm": 3.9064877033233643,
840
+ "learning_rate": 4.715508948078037e-05,
841
+ "loss": 1.5918,
842
+ "step": 1190
843
+ },
844
+ {
845
+ "epoch": 0.24,
846
+ "grad_norm": 1.2058210372924805,
847
+ "learning_rate": 4.707368982147318e-05,
848
+ "loss": 1.3247,
849
+ "step": 1200
850
+ },
851
+ {
852
+ "epoch": 0.242,
853
+ "grad_norm": 1.6068898439407349,
854
+ "learning_rate": 4.6991214321665414e-05,
855
+ "loss": 1.0801,
856
+ "step": 1210
857
+ },
858
+ {
859
+ "epoch": 0.244,
860
+ "grad_norm": 0.9633479118347168,
861
+ "learning_rate": 4.690766700109659e-05,
862
+ "loss": 2.2492,
863
+ "step": 1220
864
+ },
865
+ {
866
+ "epoch": 0.246,
867
+ "grad_norm": 3.4308974742889404,
868
+ "learning_rate": 4.682305193174524e-05,
869
+ "loss": 0.9898,
870
+ "step": 1230
871
+ },
872
+ {
873
+ "epoch": 0.248,
874
+ "grad_norm": 2.3105175495147705,
875
+ "learning_rate": 4.6737373237630476e-05,
876
+ "loss": 0.8907,
877
+ "step": 1240
878
+ },
879
+ {
880
+ "epoch": 0.25,
881
+ "grad_norm": 1.836309790611267,
882
+ "learning_rate": 4.665063509461097e-05,
883
+ "loss": 0.7756,
884
+ "step": 1250
885
+ },
886
+ {
887
+ "epoch": 0.252,
888
+ "grad_norm": 2.3956916332244873,
889
+ "learning_rate": 4.656284173018144e-05,
890
+ "loss": 0.6585,
891
+ "step": 1260
892
+ },
893
+ {
894
+ "epoch": 0.254,
895
+ "grad_norm": 1.4388060569763184,
896
+ "learning_rate": 4.6473997423266614e-05,
897
+ "loss": 1.2712,
898
+ "step": 1270
899
+ },
900
+ {
901
+ "epoch": 0.256,
902
+ "grad_norm": 0.7336168885231018,
903
+ "learning_rate": 4.638410650401267e-05,
904
+ "loss": 1.2611,
905
+ "step": 1280
906
+ },
907
+ {
908
+ "epoch": 0.258,
909
+ "grad_norm": 0.7995616793632507,
910
+ "learning_rate": 4.629317335357619e-05,
911
+ "loss": 1.0479,
912
+ "step": 1290
913
+ },
914
+ {
915
+ "epoch": 0.26,
916
+ "grad_norm": 1.2063746452331543,
917
+ "learning_rate": 4.620120240391065e-05,
918
+ "loss": 1.6104,
919
+ "step": 1300
920
+ },
921
+ {
922
+ "epoch": 0.262,
923
+ "grad_norm": 0.9148932695388794,
924
+ "learning_rate": 4.610819813755038e-05,
925
+ "loss": 1.3264,
926
+ "step": 1310
927
+ },
928
+ {
929
+ "epoch": 0.264,
930
+ "grad_norm": 11.944299697875977,
931
+ "learning_rate": 4.601416508739211e-05,
932
+ "loss": 1.1883,
933
+ "step": 1320
934
+ },
935
+ {
936
+ "epoch": 0.266,
937
+ "grad_norm": 6.062456130981445,
938
+ "learning_rate": 4.591910783647404e-05,
939
+ "loss": 1.4327,
940
+ "step": 1330
941
+ },
942
+ {
943
+ "epoch": 0.268,
944
+ "grad_norm": 8.919118881225586,
945
+ "learning_rate": 4.5823031017752485e-05,
946
+ "loss": 1.3558,
947
+ "step": 1340
948
+ },
949
+ {
950
+ "epoch": 0.27,
951
+ "grad_norm": 0.7179931402206421,
952
+ "learning_rate": 4.572593931387604e-05,
953
+ "loss": 2.6668,
954
+ "step": 1350
955
+ },
956
+ {
957
+ "epoch": 0.272,
958
+ "grad_norm": 1.1414953470230103,
959
+ "learning_rate": 4.562783745695738e-05,
960
+ "loss": 0.4225,
961
+ "step": 1360
962
+ },
963
+ {
964
+ "epoch": 0.274,
965
+ "grad_norm": 2.28456449508667,
966
+ "learning_rate": 4.5528730228342605e-05,
967
+ "loss": 1.1548,
968
+ "step": 1370
969
+ },
970
+ {
971
+ "epoch": 0.276,
972
+ "grad_norm": 4.372885227203369,
973
+ "learning_rate": 4.542862245837821e-05,
974
+ "loss": 0.9096,
975
+ "step": 1380
976
+ },
977
+ {
978
+ "epoch": 0.278,
979
+ "grad_norm": 0.26524534821510315,
980
+ "learning_rate": 4.532751902617569e-05,
981
+ "loss": 1.3681,
982
+ "step": 1390
983
+ },
984
+ {
985
+ "epoch": 0.28,
986
+ "grad_norm": 1.4623117446899414,
987
+ "learning_rate": 4.522542485937369e-05,
988
+ "loss": 1.286,
989
+ "step": 1400
990
+ },
991
+ {
992
+ "epoch": 0.282,
993
+ "grad_norm": 1.503664255142212,
994
+ "learning_rate": 4.512234493389785e-05,
995
+ "loss": 1.0204,
996
+ "step": 1410
997
+ },
998
+ {
999
+ "epoch": 0.284,
1000
+ "grad_norm": 0.5991083979606628,
1001
+ "learning_rate": 4.5018284273718336e-05,
1002
+ "loss": 1.083,
1003
+ "step": 1420
1004
+ },
1005
+ {
1006
+ "epoch": 0.286,
1007
+ "grad_norm": 0.6346163749694824,
1008
+ "learning_rate": 4.491324795060491e-05,
1009
+ "loss": 0.5139,
1010
+ "step": 1430
1011
+ },
1012
+ {
1013
+ "epoch": 0.288,
1014
+ "grad_norm": 1.6709283590316772,
1015
+ "learning_rate": 4.480724108387977e-05,
1016
+ "loss": 1.591,
1017
+ "step": 1440
1018
+ },
1019
+ {
1020
+ "epoch": 0.29,
1021
+ "grad_norm": 0.49724531173706055,
1022
+ "learning_rate": 4.4700268840168045e-05,
1023
+ "loss": 0.4862,
1024
+ "step": 1450
1025
+ },
1026
+ {
1027
+ "epoch": 0.292,
1028
+ "grad_norm": 3.7753849029541016,
1029
+ "learning_rate": 4.4592336433146e-05,
1030
+ "loss": 0.7162,
1031
+ "step": 1460
1032
+ },
1033
+ {
1034
+ "epoch": 0.294,
1035
+ "grad_norm": 1.118366003036499,
1036
+ "learning_rate": 4.448344912328686e-05,
1037
+ "loss": 1.6884,
1038
+ "step": 1470
1039
+ },
1040
+ {
1041
+ "epoch": 0.296,
1042
+ "grad_norm": 0.3718686103820801,
1043
+ "learning_rate": 4.4373612217604496e-05,
1044
+ "loss": 0.9289,
1045
+ "step": 1480
1046
+ },
1047
+ {
1048
+ "epoch": 0.298,
1049
+ "grad_norm": 2.443821668624878,
1050
+ "learning_rate": 4.426283106939474e-05,
1051
+ "loss": 1.546,
1052
+ "step": 1490
1053
+ },
1054
+ {
1055
+ "epoch": 0.3,
1056
+ "grad_norm": 1.6871048212051392,
1057
+ "learning_rate": 4.415111107797445e-05,
1058
+ "loss": 0.9777,
1059
+ "step": 1500
1060
+ },
1061
+ {
1062
+ "epoch": 0.302,
1063
+ "grad_norm": 1.2891165018081665,
1064
+ "learning_rate": 4.403845768841842e-05,
1065
+ "loss": 0.6637,
1066
+ "step": 1510
1067
+ },
1068
+ {
1069
+ "epoch": 0.304,
1070
+ "grad_norm": 1.3466380834579468,
1071
+ "learning_rate": 4.3924876391293915e-05,
1072
+ "loss": 1.7305,
1073
+ "step": 1520
1074
+ },
1075
+ {
1076
+ "epoch": 0.306,
1077
+ "grad_norm": 1.7780977487564087,
1078
+ "learning_rate": 4.381037272239311e-05,
1079
+ "loss": 0.5603,
1080
+ "step": 1530
1081
+ },
1082
+ {
1083
+ "epoch": 0.308,
1084
+ "grad_norm": 0.5368999242782593,
1085
+ "learning_rate": 4.36949522624633e-05,
1086
+ "loss": 3.2643,
1087
+ "step": 1540
1088
+ },
1089
+ {
1090
+ "epoch": 0.31,
1091
+ "grad_norm": 0.0,
1092
+ "learning_rate": 4.357862063693486e-05,
1093
+ "loss": 3.7458,
1094
+ "step": 1550
1095
+ },
1096
+ {
1097
+ "epoch": 0.312,
1098
+ "grad_norm": 0.3356248736381531,
1099
+ "learning_rate": 4.3461383515647106e-05,
1100
+ "loss": 1.9472,
1101
+ "step": 1560
1102
+ },
1103
+ {
1104
+ "epoch": 0.314,
1105
+ "grad_norm": 0.4538246989250183,
1106
+ "learning_rate": 4.334324661257191e-05,
1107
+ "loss": 1.0152,
1108
+ "step": 1570
1109
+ },
1110
+ {
1111
+ "epoch": 0.316,
1112
+ "grad_norm": 0.0,
1113
+ "learning_rate": 4.3224215685535294e-05,
1114
+ "loss": 0.9588,
1115
+ "step": 1580
1116
+ },
1117
+ {
1118
+ "epoch": 0.318,
1119
+ "grad_norm": 4.520866394042969,
1120
+ "learning_rate": 4.3104296535936695e-05,
1121
+ "loss": 1.1131,
1122
+ "step": 1590
1123
+ },
1124
+ {
1125
+ "epoch": 0.32,
1126
+ "grad_norm": 1.177220106124878,
1127
+ "learning_rate": 4.2983495008466276e-05,
1128
+ "loss": 2.4659,
1129
+ "step": 1600
1130
+ },
1131
+ {
1132
+ "epoch": 0.322,
1133
+ "grad_norm": 1.685763955116272,
1134
+ "learning_rate": 4.2861816990820084e-05,
1135
+ "loss": 0.8816,
1136
+ "step": 1610
1137
+ },
1138
+ {
1139
+ "epoch": 0.324,
1140
+ "grad_norm": 0.0,
1141
+ "learning_rate": 4.273926841341302e-05,
1142
+ "loss": 0.519,
1143
+ "step": 1620
1144
+ },
1145
+ {
1146
+ "epoch": 0.326,
1147
+ "grad_norm": 0.0,
1148
+ "learning_rate": 4.261585524908987e-05,
1149
+ "loss": 2.3195,
1150
+ "step": 1630
1151
+ },
1152
+ {
1153
+ "epoch": 0.328,
1154
+ "grad_norm": 3.9909987449645996,
1155
+ "learning_rate": 4.249158351283414e-05,
1156
+ "loss": 3.6418,
1157
+ "step": 1640
1158
+ },
1159
+ {
1160
+ "epoch": 0.33,
1161
+ "grad_norm": 2.534325361251831,
1162
+ "learning_rate": 4.2366459261474933e-05,
1163
+ "loss": 2.1609,
1164
+ "step": 1650
1165
+ },
1166
+ {
1167
+ "epoch": 0.332,
1168
+ "grad_norm": 1.4876514673233032,
1169
+ "learning_rate": 4.224048859339175e-05,
1170
+ "loss": 1.4675,
1171
+ "step": 1660
1172
+ },
1173
+ {
1174
+ "epoch": 0.334,
1175
+ "grad_norm": 8.85960578918457,
1176
+ "learning_rate": 4.211367764821722e-05,
1177
+ "loss": 1.3251,
1178
+ "step": 1670
1179
+ },
1180
+ {
1181
+ "epoch": 0.336,
1182
+ "grad_norm": 0.4190084636211395,
1183
+ "learning_rate": 4.198603260653792e-05,
1184
+ "loss": 1.0805,
1185
+ "step": 1680
1186
+ },
1187
+ {
1188
+ "epoch": 0.338,
1189
+ "grad_norm": 1.5515415668487549,
1190
+ "learning_rate": 4.185755968959308e-05,
1191
+ "loss": 0.5501,
1192
+ "step": 1690
1193
+ },
1194
+ {
1195
+ "epoch": 0.34,
1196
+ "grad_norm": 2.4281444549560547,
1197
+ "learning_rate": 4.172826515897146e-05,
1198
+ "loss": 0.7312,
1199
+ "step": 1700
1200
+ },
1201
+ {
1202
+ "epoch": 0.342,
1203
+ "grad_norm": 0.6593759655952454,
1204
+ "learning_rate": 4.1598155316306044e-05,
1205
+ "loss": 0.5805,
1206
+ "step": 1710
1207
+ },
1208
+ {
1209
+ "epoch": 0.344,
1210
+ "grad_norm": 1.126512050628662,
1211
+ "learning_rate": 4.146723650296701e-05,
1212
+ "loss": 1.9512,
1213
+ "step": 1720
1214
+ },
1215
+ {
1216
+ "epoch": 0.346,
1217
+ "grad_norm": 0.6861071586608887,
1218
+ "learning_rate": 4.133551509975264e-05,
1219
+ "loss": 1.5697,
1220
+ "step": 1730
1221
+ },
1222
+ {
1223
+ "epoch": 0.348,
1224
+ "grad_norm": 0.5916430354118347,
1225
+ "learning_rate": 4.1202997526578276e-05,
1226
+ "loss": 0.8957,
1227
+ "step": 1740
1228
+ },
1229
+ {
1230
+ "epoch": 0.35,
1231
+ "grad_norm": 0.6755764484405518,
1232
+ "learning_rate": 4.1069690242163484e-05,
1233
+ "loss": 0.489,
1234
+ "step": 1750
1235
+ },
1236
+ {
1237
+ "epoch": 0.352,
1238
+ "grad_norm": 0.0,
1239
+ "learning_rate": 4.093559974371725e-05,
1240
+ "loss": 2.1314,
1241
+ "step": 1760
1242
+ },
1243
+ {
1244
+ "epoch": 0.354,
1245
+ "grad_norm": 0.4118390381336212,
1246
+ "learning_rate": 4.080073256662127e-05,
1247
+ "loss": 3.0047,
1248
+ "step": 1770
1249
+ },
1250
+ {
1251
+ "epoch": 0.356,
1252
+ "grad_norm": 5.131174564361572,
1253
+ "learning_rate": 4.066509528411152e-05,
1254
+ "loss": 1.2667,
1255
+ "step": 1780
1256
+ },
1257
+ {
1258
+ "epoch": 0.358,
1259
+ "grad_norm": 0.9545110464096069,
1260
+ "learning_rate": 4.052869450695776e-05,
1261
+ "loss": 0.9325,
1262
+ "step": 1790
1263
+ },
1264
+ {
1265
+ "epoch": 0.36,
1266
+ "grad_norm": 0.9308171272277832,
1267
+ "learning_rate": 4.039153688314145e-05,
1268
+ "loss": 2.1198,
1269
+ "step": 1800
1270
+ },
1271
+ {
1272
+ "epoch": 0.362,
1273
+ "grad_norm": 1.648389220237732,
1274
+ "learning_rate": 4.02536290975317e-05,
1275
+ "loss": 0.809,
1276
+ "step": 1810
1277
+ },
1278
+ {
1279
+ "epoch": 0.364,
1280
+ "grad_norm": 1.658455729484558,
1281
+ "learning_rate": 4.011497787155938e-05,
1282
+ "loss": 0.9603,
1283
+ "step": 1820
1284
+ },
1285
+ {
1286
+ "epoch": 0.366,
1287
+ "grad_norm": 7.657645225524902,
1288
+ "learning_rate": 3.997558996288965e-05,
1289
+ "loss": 2.463,
1290
+ "step": 1830
1291
+ },
1292
+ {
1293
+ "epoch": 0.368,
1294
+ "grad_norm": 1.101691484451294,
1295
+ "learning_rate": 3.983547216509254e-05,
1296
+ "loss": 0.6611,
1297
+ "step": 1840
1298
+ },
1299
+ {
1300
+ "epoch": 0.37,
1301
+ "grad_norm": 0.0,
1302
+ "learning_rate": 3.969463130731183e-05,
1303
+ "loss": 0.8496,
1304
+ "step": 1850
1305
+ },
1306
+ {
1307
+ "epoch": 0.372,
1308
+ "grad_norm": 1.1520494222640991,
1309
+ "learning_rate": 3.955307425393224e-05,
1310
+ "loss": 2.0151,
1311
+ "step": 1860
1312
+ },
1313
+ {
1314
+ "epoch": 0.374,
1315
+ "grad_norm": 0.7042891383171082,
1316
+ "learning_rate": 3.941080790424484e-05,
1317
+ "loss": 1.1714,
1318
+ "step": 1870
1319
+ },
1320
+ {
1321
+ "epoch": 0.376,
1322
+ "grad_norm": 1.9343875646591187,
1323
+ "learning_rate": 3.92678391921108e-05,
1324
+ "loss": 2.1127,
1325
+ "step": 1880
1326
+ },
1327
+ {
1328
+ "epoch": 0.378,
1329
+ "grad_norm": 3.507776975631714,
1330
+ "learning_rate": 3.912417508562345e-05,
1331
+ "loss": 3.0312,
1332
+ "step": 1890
1333
+ },
1334
+ {
1335
+ "epoch": 0.38,
1336
+ "grad_norm": 1.134413480758667,
1337
+ "learning_rate": 3.897982258676867e-05,
1338
+ "loss": 1.052,
1339
+ "step": 1900
1340
+ },
1341
+ {
1342
+ "epoch": 0.382,
1343
+ "grad_norm": 1.6466419696807861,
1344
+ "learning_rate": 3.883478873108361e-05,
1345
+ "loss": 1.2737,
1346
+ "step": 1910
1347
+ },
1348
+ {
1349
+ "epoch": 0.384,
1350
+ "grad_norm": 0.9162160158157349,
1351
+ "learning_rate": 3.868908058731376e-05,
1352
+ "loss": 0.487,
1353
+ "step": 1920
1354
+ },
1355
+ {
1356
+ "epoch": 0.386,
1357
+ "grad_norm": 1.0865650177001953,
1358
+ "learning_rate": 3.85427052570685e-05,
1359
+ "loss": 1.1498,
1360
+ "step": 1930
1361
+ },
1362
+ {
1363
+ "epoch": 0.388,
1364
+ "grad_norm": 2.519901990890503,
1365
+ "learning_rate": 3.8395669874474915e-05,
1366
+ "loss": 1.3501,
1367
+ "step": 1940
1368
+ },
1369
+ {
1370
+ "epoch": 0.39,
1371
+ "grad_norm": 1.382398247718811,
1372
+ "learning_rate": 3.824798160583012e-05,
1373
+ "loss": 0.695,
1374
+ "step": 1950
1375
+ },
1376
+ {
1377
+ "epoch": 0.392,
1378
+ "grad_norm": 3.123533010482788,
1379
+ "learning_rate": 3.8099647649251986e-05,
1380
+ "loss": 1.1411,
1381
+ "step": 1960
1382
+ },
1383
+ {
1384
+ "epoch": 0.394,
1385
+ "grad_norm": 0.35227781534194946,
1386
+ "learning_rate": 3.795067523432826e-05,
1387
+ "loss": 1.3583,
1388
+ "step": 1970
1389
+ },
1390
+ {
1391
+ "epoch": 0.396,
1392
+ "grad_norm": 14.265485763549805,
1393
+ "learning_rate": 3.780107162176429e-05,
1394
+ "loss": 1.43,
1395
+ "step": 1980
1396
+ },
1397
+ {
1398
+ "epoch": 0.398,
1399
+ "grad_norm": 0.9283212423324585,
1400
+ "learning_rate": 3.765084410302909e-05,
1401
+ "loss": 2.6459,
1402
+ "step": 1990
1403
+ },
1404
+ {
1405
+ "epoch": 0.4,
1406
+ "grad_norm": 1.3758940696716309,
1407
+ "learning_rate": 3.7500000000000003e-05,
1408
+ "loss": 2.1941,
1409
+ "step": 2000
1410
+ },
1411
+ {
1412
+ "epoch": 0.402,
1413
+ "grad_norm": 0.39502596855163574,
1414
+ "learning_rate": 3.7348546664605777e-05,
1415
+ "loss": 1.1788,
1416
+ "step": 2010
1417
+ },
1418
+ {
1419
+ "epoch": 0.404,
1420
+ "grad_norm": 2.504502534866333,
1421
+ "learning_rate": 3.719649147846832e-05,
1422
+ "loss": 0.6889,
1423
+ "step": 2020
1424
+ },
1425
+ {
1426
+ "epoch": 0.406,
1427
+ "grad_norm": 0.4152251183986664,
1428
+ "learning_rate": 3.704384185254288e-05,
1429
+ "loss": 1.0726,
1430
+ "step": 2030
1431
+ },
1432
+ {
1433
+ "epoch": 0.408,
1434
+ "grad_norm": 2.400519371032715,
1435
+ "learning_rate": 3.689060522675689e-05,
1436
+ "loss": 1.1677,
1437
+ "step": 2040
1438
+ },
1439
+ {
1440
+ "epoch": 0.41,
1441
+ "grad_norm": 1.742875337600708,
1442
+ "learning_rate": 3.673678906964727e-05,
1443
+ "loss": 1.1147,
1444
+ "step": 2050
1445
+ },
1446
+ {
1447
+ "epoch": 0.412,
1448
+ "grad_norm": 1.1500080823898315,
1449
+ "learning_rate": 3.6582400877996546e-05,
1450
+ "loss": 0.9126,
1451
+ "step": 2060
1452
+ },
1453
+ {
1454
+ "epoch": 0.414,
1455
+ "grad_norm": 19.241657257080078,
1456
+ "learning_rate": 3.642744817646736e-05,
1457
+ "loss": 2.0398,
1458
+ "step": 2070
1459
+ },
1460
+ {
1461
+ "epoch": 0.416,
1462
+ "grad_norm": 0.7482590675354004,
1463
+ "learning_rate": 3.627193851723577e-05,
1464
+ "loss": 1.3157,
1465
+ "step": 2080
1466
+ },
1467
+ {
1468
+ "epoch": 0.418,
1469
+ "grad_norm": 1.0819116830825806,
1470
+ "learning_rate": 3.611587947962319e-05,
1471
+ "loss": 0.8404,
1472
+ "step": 2090
1473
+ },
1474
+ {
1475
+ "epoch": 0.42,
1476
+ "grad_norm": 0.6795836687088013,
1477
+ "learning_rate": 3.5959278669726935e-05,
1478
+ "loss": 1.571,
1479
+ "step": 2100
1480
+ },
1481
+ {
1482
+ "epoch": 0.422,
1483
+ "grad_norm": 1.1088377237319946,
1484
+ "learning_rate": 3.580214372004956e-05,
1485
+ "loss": 1.7026,
1486
+ "step": 2110
1487
+ },
1488
+ {
1489
+ "epoch": 0.424,
1490
+ "grad_norm": 3.5846972465515137,
1491
+ "learning_rate": 3.564448228912682e-05,
1492
+ "loss": 0.7502,
1493
+ "step": 2120
1494
+ },
1495
+ {
1496
+ "epoch": 0.426,
1497
+ "grad_norm": 0.6942911148071289,
1498
+ "learning_rate": 3.548630206115443e-05,
1499
+ "loss": 0.7163,
1500
+ "step": 2130
1501
+ },
1502
+ {
1503
+ "epoch": 0.428,
1504
+ "grad_norm": 1.9937869310379028,
1505
+ "learning_rate": 3.532761074561355e-05,
1506
+ "loss": 1.2891,
1507
+ "step": 2140
1508
+ },
1509
+ {
1510
+ "epoch": 0.43,
1511
+ "grad_norm": 0.8418222665786743,
1512
+ "learning_rate": 3.516841607689501e-05,
1513
+ "loss": 2.0099,
1514
+ "step": 2150
1515
+ },
1516
+ {
1517
+ "epoch": 0.432,
1518
+ "grad_norm": 6.1483893394470215,
1519
+ "learning_rate": 3.5008725813922386e-05,
1520
+ "loss": 2.6718,
1521
+ "step": 2160
1522
+ },
1523
+ {
1524
+ "epoch": 0.434,
1525
+ "grad_norm": 2.0732738971710205,
1526
+ "learning_rate": 3.484854773977378e-05,
1527
+ "loss": 3.9758,
1528
+ "step": 2170
1529
+ },
1530
+ {
1531
+ "epoch": 0.436,
1532
+ "grad_norm": 1.1674740314483643,
1533
+ "learning_rate": 3.4687889661302576e-05,
1534
+ "loss": 4.9642,
1535
+ "step": 2180
1536
+ },
1537
+ {
1538
+ "epoch": 0.438,
1539
+ "grad_norm": 0.39993491768836975,
1540
+ "learning_rate": 3.452675940875686e-05,
1541
+ "loss": 1.3115,
1542
+ "step": 2190
1543
+ },
1544
+ {
1545
+ "epoch": 0.44,
1546
+ "grad_norm": 0.35298460721969604,
1547
+ "learning_rate": 3.436516483539781e-05,
1548
+ "loss": 2.1822,
1549
+ "step": 2200
1550
+ },
1551
+ {
1552
+ "epoch": 0.442,
1553
+ "grad_norm": 1.8225891590118408,
1554
+ "learning_rate": 3.4203113817116957e-05,
1555
+ "loss": 0.6386,
1556
+ "step": 2210
1557
+ },
1558
+ {
1559
+ "epoch": 0.444,
1560
+ "grad_norm": 0.0,
1561
+ "learning_rate": 3.4040614252052305e-05,
1562
+ "loss": 0.6365,
1563
+ "step": 2220
1564
+ },
1565
+ {
1566
+ "epoch": 0.446,
1567
+ "grad_norm": 3.6752185821533203,
1568
+ "learning_rate": 3.387767406020343e-05,
1569
+ "loss": 0.897,
1570
+ "step": 2230
1571
+ },
1572
+ {
1573
+ "epoch": 0.448,
1574
+ "grad_norm": 0.5872332453727722,
1575
+ "learning_rate": 3.3714301183045385e-05,
1576
+ "loss": 1.4772,
1577
+ "step": 2240
1578
+ },
1579
+ {
1580
+ "epoch": 0.45,
1581
+ "grad_norm": 0.3744696080684662,
1582
+ "learning_rate": 3.355050358314172e-05,
1583
+ "loss": 0.6316,
1584
+ "step": 2250
1585
+ },
1586
+ {
1587
+ "epoch": 0.452,
1588
+ "grad_norm": 1.4100755453109741,
1589
+ "learning_rate": 3.338628924375638e-05,
1590
+ "loss": 0.5972,
1591
+ "step": 2260
1592
+ },
1593
+ {
1594
+ "epoch": 0.454,
1595
+ "grad_norm": 0.578823447227478,
1596
+ "learning_rate": 3.322166616846458e-05,
1597
+ "loss": 0.7165,
1598
+ "step": 2270
1599
+ },
1600
+ {
1601
+ "epoch": 0.456,
1602
+ "grad_norm": 0.9964269995689392,
1603
+ "learning_rate": 3.305664238076278e-05,
1604
+ "loss": 1.8711,
1605
+ "step": 2280
1606
+ },
1607
+ {
1608
+ "epoch": 0.458,
1609
+ "grad_norm": 0.8978599905967712,
1610
+ "learning_rate": 3.289122592367757e-05,
1611
+ "loss": 0.9576,
1612
+ "step": 2290
1613
+ },
1614
+ {
1615
+ "epoch": 0.46,
1616
+ "grad_norm": 0.8042961359024048,
1617
+ "learning_rate": 3.272542485937369e-05,
1618
+ "loss": 1.4457,
1619
+ "step": 2300
1620
+ },
1621
+ {
1622
+ "epoch": 0.462,
1623
+ "grad_norm": 4.5866875648498535,
1624
+ "learning_rate": 3.2559247268761115e-05,
1625
+ "loss": 0.9553,
1626
+ "step": 2310
1627
+ },
1628
+ {
1629
+ "epoch": 0.464,
1630
+ "grad_norm": 0.3632621765136719,
1631
+ "learning_rate": 3.239270125110117e-05,
1632
+ "loss": 1.3703,
1633
+ "step": 2320
1634
+ },
1635
+ {
1636
+ "epoch": 0.466,
1637
+ "grad_norm": 0.7157305479049683,
1638
+ "learning_rate": 3.222579492361179e-05,
1639
+ "loss": 1.9259,
1640
+ "step": 2330
1641
+ },
1642
+ {
1643
+ "epoch": 0.468,
1644
+ "grad_norm": 1.0667433738708496,
1645
+ "learning_rate": 3.205853642107192e-05,
1646
+ "loss": 0.9021,
1647
+ "step": 2340
1648
+ },
1649
+ {
1650
+ "epoch": 0.47,
1651
+ "grad_norm": 1.5877084732055664,
1652
+ "learning_rate": 3.1890933895424976e-05,
1653
+ "loss": 2.2195,
1654
+ "step": 2350
1655
+ },
1656
+ {
1657
+ "epoch": 0.472,
1658
+ "grad_norm": 1.945678472518921,
1659
+ "learning_rate": 3.172299551538164e-05,
1660
+ "loss": 0.724,
1661
+ "step": 2360
1662
+ },
1663
+ {
1664
+ "epoch": 0.474,
1665
+ "grad_norm": 0.0,
1666
+ "learning_rate": 3.155472946602162e-05,
1667
+ "loss": 2.0286,
1668
+ "step": 2370
1669
+ },
1670
+ {
1671
+ "epoch": 0.476,
1672
+ "grad_norm": 7.778709411621094,
1673
+ "learning_rate": 3.138614394839476e-05,
1674
+ "loss": 0.9387,
1675
+ "step": 2380
1676
+ },
1677
+ {
1678
+ "epoch": 0.478,
1679
+ "grad_norm": 0.7516165971755981,
1680
+ "learning_rate": 3.121724717912138e-05,
1681
+ "loss": 0.8844,
1682
+ "step": 2390
1683
+ },
1684
+ {
1685
+ "epoch": 0.48,
1686
+ "grad_norm": 3.163661241531372,
1687
+ "learning_rate": 3.104804738999169e-05,
1688
+ "loss": 0.8334,
1689
+ "step": 2400
1690
+ },
1691
+ {
1692
+ "epoch": 0.482,
1693
+ "grad_norm": 1.4817134141921997,
1694
+ "learning_rate": 3.087855282756475e-05,
1695
+ "loss": 0.7062,
1696
+ "step": 2410
1697
+ },
1698
+ {
1699
+ "epoch": 0.484,
1700
+ "grad_norm": 1.825772762298584,
1701
+ "learning_rate": 3.0708771752766394e-05,
1702
+ "loss": 1.1329,
1703
+ "step": 2420
1704
+ },
1705
+ {
1706
+ "epoch": 0.486,
1707
+ "grad_norm": 0.3289898931980133,
1708
+ "learning_rate": 3.053871244048669e-05,
1709
+ "loss": 3.1218,
1710
+ "step": 2430
1711
+ },
1712
+ {
1713
+ "epoch": 0.488,
1714
+ "grad_norm": 1.0258142948150635,
1715
+ "learning_rate": 3.0368383179176585e-05,
1716
+ "loss": 1.795,
1717
+ "step": 2440
1718
+ },
1719
+ {
1720
+ "epoch": 0.49,
1721
+ "grad_norm": 4.297471523284912,
1722
+ "learning_rate": 3.0197792270443982e-05,
1723
+ "loss": 1.3635,
1724
+ "step": 2450
1725
+ },
1726
+ {
1727
+ "epoch": 0.492,
1728
+ "grad_norm": 0.9445592761039734,
1729
+ "learning_rate": 3.002694802864912e-05,
1730
+ "loss": 1.5214,
1731
+ "step": 2460
1732
+ },
1733
+ {
1734
+ "epoch": 0.494,
1735
+ "grad_norm": 3.7605855464935303,
1736
+ "learning_rate": 2.98558587804993e-05,
1737
+ "loss": 1.1378,
1738
+ "step": 2470
1739
+ },
1740
+ {
1741
+ "epoch": 0.496,
1742
+ "grad_norm": 1.3011572360992432,
1743
+ "learning_rate": 2.9684532864643122e-05,
1744
+ "loss": 1.3969,
1745
+ "step": 2480
1746
+ },
1747
+ {
1748
+ "epoch": 0.498,
1749
+ "grad_norm": 1.2740352153778076,
1750
+ "learning_rate": 2.9512978631264006e-05,
1751
+ "loss": 0.7386,
1752
+ "step": 2490
1753
+ },
1754
+ {
1755
+ "epoch": 0.5,
1756
+ "grad_norm": 0.6233336329460144,
1757
+ "learning_rate": 2.9341204441673266e-05,
1758
+ "loss": 1.2025,
1759
+ "step": 2500
1760
+ },
1761
+ {
1762
+ "epoch": 0.502,
1763
+ "grad_norm": 4.989047050476074,
1764
+ "learning_rate": 2.916921866790256e-05,
1765
+ "loss": 0.8712,
1766
+ "step": 2510
1767
+ },
1768
+ {
1769
+ "epoch": 0.504,
1770
+ "grad_norm": 2.683638095855713,
1771
+ "learning_rate": 2.8997029692295874e-05,
1772
+ "loss": 2.7228,
1773
+ "step": 2520
1774
+ },
1775
+ {
1776
+ "epoch": 0.506,
1777
+ "grad_norm": 1.5171597003936768,
1778
+ "learning_rate": 2.8824645907100954e-05,
1779
+ "loss": 1.6352,
1780
+ "step": 2530
1781
+ },
1782
+ {
1783
+ "epoch": 0.508,
1784
+ "grad_norm": 0.639072835445404,
1785
+ "learning_rate": 2.8652075714060295e-05,
1786
+ "loss": 0.7708,
1787
+ "step": 2540
1788
+ },
1789
+ {
1790
+ "epoch": 0.51,
1791
+ "grad_norm": 0.4355096220970154,
1792
+ "learning_rate": 2.8479327524001636e-05,
1793
+ "loss": 1.7158,
1794
+ "step": 2550
1795
+ },
1796
+ {
1797
+ "epoch": 0.512,
1798
+ "grad_norm": 5.390672206878662,
1799
+ "learning_rate": 2.8306409756428064e-05,
1800
+ "loss": 1.9952,
1801
+ "step": 2560
1802
+ },
1803
+ {
1804
+ "epoch": 0.514,
1805
+ "grad_norm": 1.1911485195159912,
1806
+ "learning_rate": 2.8133330839107608e-05,
1807
+ "loss": 1.7763,
1808
+ "step": 2570
1809
+ },
1810
+ {
1811
+ "epoch": 0.516,
1812
+ "grad_norm": 1.0947262048721313,
1813
+ "learning_rate": 2.7960099207662532e-05,
1814
+ "loss": 1.1427,
1815
+ "step": 2580
1816
+ },
1817
+ {
1818
+ "epoch": 0.518,
1819
+ "grad_norm": 0.9010633230209351,
1820
+ "learning_rate": 2.7786723305158136e-05,
1821
+ "loss": 1.3634,
1822
+ "step": 2590
1823
+ },
1824
+ {
1825
+ "epoch": 0.52,
1826
+ "grad_norm": 3.9178924560546875,
1827
+ "learning_rate": 2.761321158169134e-05,
1828
+ "loss": 1.1372,
1829
+ "step": 2600
1830
+ },
1831
+ {
1832
+ "epoch": 0.522,
1833
+ "grad_norm": 1.1135903596878052,
1834
+ "learning_rate": 2.7439572493978736e-05,
1835
+ "loss": 0.6111,
1836
+ "step": 2610
1837
+ },
1838
+ {
1839
+ "epoch": 0.524,
1840
+ "grad_norm": 2.4544811248779297,
1841
+ "learning_rate": 2.726581450494451e-05,
1842
+ "loss": 1.1249,
1843
+ "step": 2620
1844
+ },
1845
+ {
1846
+ "epoch": 0.526,
1847
+ "grad_norm": 0.459357887506485,
1848
+ "learning_rate": 2.7091946083307896e-05,
1849
+ "loss": 4.3159,
1850
+ "step": 2630
1851
+ },
1852
+ {
1853
+ "epoch": 0.528,
1854
+ "grad_norm": 2.8313136100769043,
1855
+ "learning_rate": 2.6917975703170466e-05,
1856
+ "loss": 0.9282,
1857
+ "step": 2640
1858
+ },
1859
+ {
1860
+ "epoch": 0.53,
1861
+ "grad_norm": 3.777279853820801,
1862
+ "learning_rate": 2.674391184360313e-05,
1863
+ "loss": 1.3395,
1864
+ "step": 2650
1865
+ },
1866
+ {
1867
+ "epoch": 0.532,
1868
+ "grad_norm": 3.277451992034912,
1869
+ "learning_rate": 2.656976298823284e-05,
1870
+ "loss": 0.7275,
1871
+ "step": 2660
1872
+ },
1873
+ {
1874
+ "epoch": 0.534,
1875
+ "grad_norm": 0.6467308402061462,
1876
+ "learning_rate": 2.6395537624829096e-05,
1877
+ "loss": 5.1828,
1878
+ "step": 2670
1879
+ },
1880
+ {
1881
+ "epoch": 0.536,
1882
+ "grad_norm": 0.0,
1883
+ "learning_rate": 2.6221244244890336e-05,
1884
+ "loss": 1.1841,
1885
+ "step": 2680
1886
+ },
1887
+ {
1888
+ "epoch": 0.538,
1889
+ "grad_norm": 0.5595866441726685,
1890
+ "learning_rate": 2.604689134322999e-05,
1891
+ "loss": 2.1432,
1892
+ "step": 2690
1893
+ },
1894
+ {
1895
+ "epoch": 0.54,
1896
+ "grad_norm": 2.23224139213562,
1897
+ "learning_rate": 2.587248741756253e-05,
1898
+ "loss": 1.1397,
1899
+ "step": 2700
1900
+ },
1901
+ {
1902
+ "epoch": 0.542,
1903
+ "grad_norm": 14.135408401489258,
1904
+ "learning_rate": 2.5698040968089225e-05,
1905
+ "loss": 1.8865,
1906
+ "step": 2710
1907
+ },
1908
+ {
1909
+ "epoch": 0.544,
1910
+ "grad_norm": 3.3315963745117188,
1911
+ "learning_rate": 2.5523560497083926e-05,
1912
+ "loss": 1.7895,
1913
+ "step": 2720
1914
+ },
1915
+ {
1916
+ "epoch": 0.546,
1917
+ "grad_norm": 0.6247251629829407,
1918
+ "learning_rate": 2.5349054508478637e-05,
1919
+ "loss": 0.5246,
1920
+ "step": 2730
1921
+ },
1922
+ {
1923
+ "epoch": 0.548,
1924
+ "grad_norm": 1.406416654586792,
1925
+ "learning_rate": 2.517453150744904e-05,
1926
+ "loss": 2.2288,
1927
+ "step": 2740
1928
+ },
1929
+ {
1930
+ "epoch": 0.55,
1931
+ "grad_norm": 8.639768600463867,
1932
+ "learning_rate": 2.5e-05,
1933
+ "loss": 1.1424,
1934
+ "step": 2750
1935
+ },
1936
+ {
1937
+ "epoch": 0.552,
1938
+ "grad_norm": 1.132100224494934,
1939
+ "learning_rate": 2.4825468492550964e-05,
1940
+ "loss": 0.5168,
1941
+ "step": 2760
1942
+ },
1943
+ {
1944
+ "epoch": 0.554,
1945
+ "grad_norm": 0.6441847681999207,
1946
+ "learning_rate": 2.4650945491521372e-05,
1947
+ "loss": 1.6828,
1948
+ "step": 2770
1949
+ },
1950
+ {
1951
+ "epoch": 0.556,
1952
+ "grad_norm": 0.38985392451286316,
1953
+ "learning_rate": 2.447643950291608e-05,
1954
+ "loss": 1.3219,
1955
+ "step": 2780
1956
+ },
1957
+ {
1958
+ "epoch": 0.558,
1959
+ "grad_norm": 6.931550025939941,
1960
+ "learning_rate": 2.4301959031910784e-05,
1961
+ "loss": 1.511,
1962
+ "step": 2790
1963
+ },
1964
+ {
1965
+ "epoch": 0.56,
1966
+ "grad_norm": 2.3524391651153564,
1967
+ "learning_rate": 2.4127512582437485e-05,
1968
+ "loss": 1.5983,
1969
+ "step": 2800
1970
+ },
1971
+ {
1972
+ "epoch": 0.562,
1973
+ "grad_norm": 2.679114580154419,
1974
+ "learning_rate": 2.3953108656770016e-05,
1975
+ "loss": 0.7233,
1976
+ "step": 2810
1977
+ },
1978
+ {
1979
+ "epoch": 0.564,
1980
+ "grad_norm": 4.362655162811279,
1981
+ "learning_rate": 2.377875575510967e-05,
1982
+ "loss": 1.2128,
1983
+ "step": 2820
1984
+ },
1985
+ {
1986
+ "epoch": 0.566,
1987
+ "grad_norm": 1.256465196609497,
1988
+ "learning_rate": 2.3604462375170906e-05,
1989
+ "loss": 1.3444,
1990
+ "step": 2830
1991
+ },
1992
+ {
1993
+ "epoch": 0.568,
1994
+ "grad_norm": 1.7675756216049194,
1995
+ "learning_rate": 2.3430237011767167e-05,
1996
+ "loss": 0.6756,
1997
+ "step": 2840
1998
+ },
1999
+ {
2000
+ "epoch": 0.57,
2001
+ "grad_norm": 0.9454560875892639,
2002
+ "learning_rate": 2.3256088156396868e-05,
2003
+ "loss": 2.1279,
2004
+ "step": 2850
2005
+ },
2006
+ {
2007
+ "epoch": 0.572,
2008
+ "grad_norm": 1.4755468368530273,
2009
+ "learning_rate": 2.3082024296829536e-05,
2010
+ "loss": 1.2081,
2011
+ "step": 2860
2012
+ },
2013
+ {
2014
+ "epoch": 0.574,
2015
+ "grad_norm": 0.39675667881965637,
2016
+ "learning_rate": 2.2908053916692117e-05,
2017
+ "loss": 1.6913,
2018
+ "step": 2870
2019
+ },
2020
+ {
2021
+ "epoch": 0.576,
2022
+ "grad_norm": 3.55375337600708,
2023
+ "learning_rate": 2.2734185495055503e-05,
2024
+ "loss": 0.7114,
2025
+ "step": 2880
2026
+ },
2027
+ {
2028
+ "epoch": 0.578,
2029
+ "grad_norm": 1.0292774438858032,
2030
+ "learning_rate": 2.2560427506021266e-05,
2031
+ "loss": 1.5252,
2032
+ "step": 2890
2033
+ },
2034
+ {
2035
+ "epoch": 0.58,
2036
+ "grad_norm": 0.8896195888519287,
2037
+ "learning_rate": 2.238678841830867e-05,
2038
+ "loss": 1.2513,
2039
+ "step": 2900
2040
+ },
2041
+ {
2042
+ "epoch": 0.582,
2043
+ "grad_norm": 4.262208461761475,
2044
+ "learning_rate": 2.2213276694841866e-05,
2045
+ "loss": 1.5359,
2046
+ "step": 2910
2047
+ },
2048
+ {
2049
+ "epoch": 0.584,
2050
+ "grad_norm": 0.8088265061378479,
2051
+ "learning_rate": 2.2039900792337474e-05,
2052
+ "loss": 1.3664,
2053
+ "step": 2920
2054
+ },
2055
+ {
2056
+ "epoch": 0.586,
2057
+ "grad_norm": 2.1337993144989014,
2058
+ "learning_rate": 2.186666916089239e-05,
2059
+ "loss": 1.102,
2060
+ "step": 2930
2061
+ },
2062
+ {
2063
+ "epoch": 0.588,
2064
+ "grad_norm": 2.935187816619873,
2065
+ "learning_rate": 2.1693590243571938e-05,
2066
+ "loss": 1.5992,
2067
+ "step": 2940
2068
+ },
2069
+ {
2070
+ "epoch": 0.59,
2071
+ "grad_norm": 0.628711998462677,
2072
+ "learning_rate": 2.1520672475998373e-05,
2073
+ "loss": 0.8194,
2074
+ "step": 2950
2075
+ },
2076
+ {
2077
+ "epoch": 0.592,
2078
+ "grad_norm": 0.5109361410140991,
2079
+ "learning_rate": 2.1347924285939714e-05,
2080
+ "loss": 0.8753,
2081
+ "step": 2960
2082
+ },
2083
+ {
2084
+ "epoch": 0.594,
2085
+ "grad_norm": 2.374326229095459,
2086
+ "learning_rate": 2.117535409289905e-05,
2087
+ "loss": 1.8568,
2088
+ "step": 2970
2089
+ },
2090
+ {
2091
+ "epoch": 0.596,
2092
+ "grad_norm": 2.674623966217041,
2093
+ "learning_rate": 2.1002970307704132e-05,
2094
+ "loss": 2.0932,
2095
+ "step": 2980
2096
+ },
2097
+ {
2098
+ "epoch": 0.598,
2099
+ "grad_norm": 0.7172428369522095,
2100
+ "learning_rate": 2.0830781332097446e-05,
2101
+ "loss": 0.6447,
2102
+ "step": 2990
2103
+ },
2104
+ {
2105
+ "epoch": 0.6,
2106
+ "grad_norm": 2.8360252380371094,
2107
+ "learning_rate": 2.0658795558326743e-05,
2108
+ "loss": 0.8866,
2109
+ "step": 3000
2110
+ },
2111
+ {
2112
+ "epoch": 0.602,
2113
+ "grad_norm": 1.225557804107666,
2114
+ "learning_rate": 2.0487021368736003e-05,
2115
+ "loss": 2.3415,
2116
+ "step": 3010
2117
+ },
2118
+ {
2119
+ "epoch": 0.604,
2120
+ "grad_norm": 0.8167237043380737,
2121
+ "learning_rate": 2.031546713535688e-05,
2122
+ "loss": 1.244,
2123
+ "step": 3020
2124
+ },
2125
+ {
2126
+ "epoch": 0.606,
2127
+ "grad_norm": 9.67261791229248,
2128
+ "learning_rate": 2.0144141219500705e-05,
2129
+ "loss": 2.6138,
2130
+ "step": 3030
2131
+ },
2132
+ {
2133
+ "epoch": 0.608,
2134
+ "grad_norm": 0.6742550730705261,
2135
+ "learning_rate": 1.9973051971350888e-05,
2136
+ "loss": 0.8315,
2137
+ "step": 3040
2138
+ },
2139
+ {
2140
+ "epoch": 0.61,
2141
+ "grad_norm": 18.961151123046875,
2142
+ "learning_rate": 1.980220772955602e-05,
2143
+ "loss": 4.5689,
2144
+ "step": 3050
2145
+ },
2146
+ {
2147
+ "epoch": 0.612,
2148
+ "grad_norm": 5.60457181930542,
2149
+ "learning_rate": 1.963161682082342e-05,
2150
+ "loss": 1.2894,
2151
+ "step": 3060
2152
+ },
2153
+ {
2154
+ "epoch": 0.614,
2155
+ "grad_norm": 1.3066905736923218,
2156
+ "learning_rate": 1.946128755951332e-05,
2157
+ "loss": 1.1867,
2158
+ "step": 3070
2159
+ },
2160
+ {
2161
+ "epoch": 0.616,
2162
+ "grad_norm": 3.874143362045288,
2163
+ "learning_rate": 1.9291228247233605e-05,
2164
+ "loss": 1.8681,
2165
+ "step": 3080
2166
+ },
2167
+ {
2168
+ "epoch": 0.618,
2169
+ "grad_norm": 2.0997185707092285,
2170
+ "learning_rate": 1.912144717243525e-05,
2171
+ "loss": 1.1927,
2172
+ "step": 3090
2173
+ },
2174
+ {
2175
+ "epoch": 0.62,
2176
+ "grad_norm": 3.868537664413452,
2177
+ "learning_rate": 1.895195261000831e-05,
2178
+ "loss": 1.6265,
2179
+ "step": 3100
2180
+ },
2181
+ {
2182
+ "epoch": 0.622,
2183
+ "grad_norm": 0.4815676808357239,
2184
+ "learning_rate": 1.8782752820878634e-05,
2185
+ "loss": 1.3608,
2186
+ "step": 3110
2187
+ },
2188
+ {
2189
+ "epoch": 0.624,
2190
+ "grad_norm": 4.783424377441406,
2191
+ "learning_rate": 1.8613856051605243e-05,
2192
+ "loss": 1.4472,
2193
+ "step": 3120
2194
+ },
2195
+ {
2196
+ "epoch": 0.626,
2197
+ "grad_norm": 0.5334388017654419,
2198
+ "learning_rate": 1.8445270533978388e-05,
2199
+ "loss": 0.6093,
2200
+ "step": 3130
2201
+ },
2202
+ {
2203
+ "epoch": 0.628,
2204
+ "grad_norm": 2.9873673915863037,
2205
+ "learning_rate": 1.827700448461836e-05,
2206
+ "loss": 1.3848,
2207
+ "step": 3140
2208
+ },
2209
+ {
2210
+ "epoch": 0.63,
2211
+ "grad_norm": 1.5935121774673462,
2212
+ "learning_rate": 1.8109066104575023e-05,
2213
+ "loss": 1.1096,
2214
+ "step": 3150
2215
+ },
2216
+ {
2217
+ "epoch": 0.632,
2218
+ "grad_norm": 0.0,
2219
+ "learning_rate": 1.7941463578928086e-05,
2220
+ "loss": 0.8174,
2221
+ "step": 3160
2222
+ },
2223
+ {
2224
+ "epoch": 0.634,
2225
+ "grad_norm": 0.5703982710838318,
2226
+ "learning_rate": 1.7774205076388206e-05,
2227
+ "loss": 1.5061,
2228
+ "step": 3170
2229
+ },
2230
+ {
2231
+ "epoch": 0.636,
2232
+ "grad_norm": 0.0,
2233
+ "learning_rate": 1.7607298748898842e-05,
2234
+ "loss": 0.7052,
2235
+ "step": 3180
2236
+ },
2237
+ {
2238
+ "epoch": 0.638,
2239
+ "grad_norm": 3.1492154598236084,
2240
+ "learning_rate": 1.744075273123889e-05,
2241
+ "loss": 2.0225,
2242
+ "step": 3190
2243
+ },
2244
+ {
2245
+ "epoch": 0.64,
2246
+ "grad_norm": 6.029577255249023,
2247
+ "learning_rate": 1.7274575140626318e-05,
2248
+ "loss": 2.2687,
2249
+ "step": 3200
2250
+ },
2251
+ {
2252
+ "epoch": 0.642,
2253
+ "grad_norm": 0.691862940788269,
2254
+ "learning_rate": 1.7108774076322443e-05,
2255
+ "loss": 0.6663,
2256
+ "step": 3210
2257
+ },
2258
+ {
2259
+ "epoch": 0.644,
2260
+ "grad_norm": 6.771577835083008,
2261
+ "learning_rate": 1.6943357619237226e-05,
2262
+ "loss": 1.259,
2263
+ "step": 3220
2264
+ },
2265
+ {
2266
+ "epoch": 0.646,
2267
+ "grad_norm": 1.3319804668426514,
2268
+ "learning_rate": 1.677833383153542e-05,
2269
+ "loss": 1.1647,
2270
+ "step": 3230
2271
+ },
2272
+ {
2273
+ "epoch": 0.648,
2274
+ "grad_norm": 3.718688488006592,
2275
+ "learning_rate": 1.6613710756243626e-05,
2276
+ "loss": 0.9512,
2277
+ "step": 3240
2278
+ },
2279
+ {
2280
+ "epoch": 0.65,
2281
+ "grad_norm": 1.9150598049163818,
2282
+ "learning_rate": 1.6449496416858284e-05,
2283
+ "loss": 3.6506,
2284
+ "step": 3250
2285
+ },
2286
+ {
2287
+ "epoch": 0.652,
2288
+ "grad_norm": 2.3984363079071045,
2289
+ "learning_rate": 1.6285698816954624e-05,
2290
+ "loss": 1.8059,
2291
+ "step": 3260
2292
+ },
2293
+ {
2294
+ "epoch": 0.654,
2295
+ "grad_norm": 1.429632544517517,
2296
+ "learning_rate": 1.612232593979658e-05,
2297
+ "loss": 1.8584,
2298
+ "step": 3270
2299
+ },
2300
+ {
2301
+ "epoch": 0.656,
2302
+ "grad_norm": 0.3058028817176819,
2303
+ "learning_rate": 1.5959385747947698e-05,
2304
+ "loss": 0.8049,
2305
+ "step": 3280
2306
+ },
2307
+ {
2308
+ "epoch": 0.658,
2309
+ "grad_norm": 2.075374126434326,
2310
+ "learning_rate": 1.5796886182883053e-05,
2311
+ "loss": 1.5368,
2312
+ "step": 3290
2313
+ },
2314
+ {
2315
+ "epoch": 0.66,
2316
+ "grad_norm": 1.1144261360168457,
2317
+ "learning_rate": 1.56348351646022e-05,
2318
+ "loss": 1.5826,
2319
+ "step": 3300
2320
+ },
2321
+ {
2322
+ "epoch": 0.662,
2323
+ "grad_norm": 1.5412263870239258,
2324
+ "learning_rate": 1.547324059124315e-05,
2325
+ "loss": 0.8753,
2326
+ "step": 3310
2327
+ },
2328
+ {
2329
+ "epoch": 0.664,
2330
+ "grad_norm": 0.0,
2331
+ "learning_rate": 1.5312110338697426e-05,
2332
+ "loss": 1.7769,
2333
+ "step": 3320
2334
+ },
2335
+ {
2336
+ "epoch": 0.666,
2337
+ "grad_norm": 0.9992184042930603,
2338
+ "learning_rate": 1.5151452260226224e-05,
2339
+ "loss": 3.1034,
2340
+ "step": 3330
2341
+ },
2342
+ {
2343
+ "epoch": 0.668,
2344
+ "grad_norm": 26.014408111572266,
2345
+ "learning_rate": 1.4991274186077632e-05,
2346
+ "loss": 2.3354,
2347
+ "step": 3340
2348
+ },
2349
+ {
2350
+ "epoch": 0.67,
2351
+ "grad_norm": 0.9760955572128296,
2352
+ "learning_rate": 1.4831583923104999e-05,
2353
+ "loss": 0.603,
2354
+ "step": 3350
2355
+ },
2356
+ {
2357
+ "epoch": 0.672,
2358
+ "grad_norm": 1.7530325651168823,
2359
+ "learning_rate": 1.467238925438646e-05,
2360
+ "loss": 0.6025,
2361
+ "step": 3360
2362
+ },
2363
+ {
2364
+ "epoch": 0.674,
2365
+ "grad_norm": 1.2257040739059448,
2366
+ "learning_rate": 1.4513697938845572e-05,
2367
+ "loss": 0.9105,
2368
+ "step": 3370
2369
+ },
2370
+ {
2371
+ "epoch": 0.676,
2372
+ "grad_norm": 0.3643525540828705,
2373
+ "learning_rate": 1.4355517710873184e-05,
2374
+ "loss": 0.8274,
2375
+ "step": 3380
2376
+ },
2377
+ {
2378
+ "epoch": 0.678,
2379
+ "grad_norm": 1.6802215576171875,
2380
+ "learning_rate": 1.4197856279950438e-05,
2381
+ "loss": 1.5935,
2382
+ "step": 3390
2383
+ },
2384
+ {
2385
+ "epoch": 0.68,
2386
+ "grad_norm": 1.3920577764511108,
2387
+ "learning_rate": 1.4040721330273062e-05,
2388
+ "loss": 0.7384,
2389
+ "step": 3400
2390
+ },
2391
+ {
2392
+ "epoch": 0.682,
2393
+ "grad_norm": 0.8468830585479736,
2394
+ "learning_rate": 1.388412052037682e-05,
2395
+ "loss": 1.8132,
2396
+ "step": 3410
2397
+ },
2398
+ {
2399
+ "epoch": 0.684,
2400
+ "grad_norm": 0.4457054138183594,
2401
+ "learning_rate": 1.3728061482764238e-05,
2402
+ "loss": 0.7159,
2403
+ "step": 3420
2404
+ },
2405
+ {
2406
+ "epoch": 0.686,
2407
+ "grad_norm": 0.8082422018051147,
2408
+ "learning_rate": 1.3572551823532654e-05,
2409
+ "loss": 0.8062,
2410
+ "step": 3430
2411
+ },
2412
+ {
2413
+ "epoch": 0.688,
2414
+ "grad_norm": 3.860785722732544,
2415
+ "learning_rate": 1.3417599122003464e-05,
2416
+ "loss": 1.4223,
2417
+ "step": 3440
2418
+ },
2419
+ {
2420
+ "epoch": 0.69,
2421
+ "grad_norm": 1.3153111934661865,
2422
+ "learning_rate": 1.3263210930352737e-05,
2423
+ "loss": 0.8253,
2424
+ "step": 3450
2425
+ },
2426
+ {
2427
+ "epoch": 0.692,
2428
+ "grad_norm": 0.49992579221725464,
2429
+ "learning_rate": 1.3109394773243117e-05,
2430
+ "loss": 1.6049,
2431
+ "step": 3460
2432
+ },
2433
+ {
2434
+ "epoch": 0.694,
2435
+ "grad_norm": 1.5094339847564697,
2436
+ "learning_rate": 1.2956158147457115e-05,
2437
+ "loss": 0.6334,
2438
+ "step": 3470
2439
+ },
2440
+ {
2441
+ "epoch": 0.696,
2442
+ "grad_norm": 1.6272149085998535,
2443
+ "learning_rate": 1.280350852153168e-05,
2444
+ "loss": 1.605,
2445
+ "step": 3480
2446
+ },
2447
+ {
2448
+ "epoch": 0.698,
2449
+ "grad_norm": 0.6700888276100159,
2450
+ "learning_rate": 1.2651453335394231e-05,
2451
+ "loss": 1.3822,
2452
+ "step": 3490
2453
+ },
2454
+ {
2455
+ "epoch": 0.7,
2456
+ "grad_norm": 2.492363691329956,
2457
+ "learning_rate": 1.2500000000000006e-05,
2458
+ "loss": 1.154,
2459
+ "step": 3500
2460
+ },
2461
+ {
2462
+ "epoch": 0.702,
2463
+ "grad_norm": 2.3340728282928467,
2464
+ "learning_rate": 1.234915589697091e-05,
2465
+ "loss": 1.3894,
2466
+ "step": 3510
2467
+ },
2468
+ {
2469
+ "epoch": 0.704,
2470
+ "grad_norm": 0.0,
2471
+ "learning_rate": 1.2198928378235716e-05,
2472
+ "loss": 0.8913,
2473
+ "step": 3520
2474
+ },
2475
+ {
2476
+ "epoch": 0.706,
2477
+ "grad_norm": 2.37203311920166,
2478
+ "learning_rate": 1.2049324765671749e-05,
2479
+ "loss": 1.4518,
2480
+ "step": 3530
2481
+ },
2482
+ {
2483
+ "epoch": 0.708,
2484
+ "grad_norm": 0.4668540954589844,
2485
+ "learning_rate": 1.1900352350748026e-05,
2486
+ "loss": 2.2489,
2487
+ "step": 3540
2488
+ },
2489
+ {
2490
+ "epoch": 0.71,
2491
+ "grad_norm": 7.209711074829102,
2492
+ "learning_rate": 1.175201839416988e-05,
2493
+ "loss": 0.8946,
2494
+ "step": 3550
2495
+ },
2496
+ {
2497
+ "epoch": 0.712,
2498
+ "grad_norm": 0.0,
2499
+ "learning_rate": 1.1604330125525079e-05,
2500
+ "loss": 0.5814,
2501
+ "step": 3560
2502
+ },
2503
+ {
2504
+ "epoch": 0.714,
2505
+ "grad_norm": 1.9763480424880981,
2506
+ "learning_rate": 1.1457294742931507e-05,
2507
+ "loss": 4.9924,
2508
+ "step": 3570
2509
+ },
2510
+ {
2511
+ "epoch": 0.716,
2512
+ "grad_norm": 4.834700107574463,
2513
+ "learning_rate": 1.1310919412686247e-05,
2514
+ "loss": 0.9913,
2515
+ "step": 3580
2516
+ },
2517
+ {
2518
+ "epoch": 0.718,
2519
+ "grad_norm": 5.4178876876831055,
2520
+ "learning_rate": 1.11652112689164e-05,
2521
+ "loss": 4.148,
2522
+ "step": 3590
2523
+ },
2524
+ {
2525
+ "epoch": 0.72,
2526
+ "grad_norm": 0.0,
2527
+ "learning_rate": 1.1020177413231334e-05,
2528
+ "loss": 1.184,
2529
+ "step": 3600
2530
+ },
2531
+ {
2532
+ "epoch": 0.722,
2533
+ "grad_norm": 5.075471878051758,
2534
+ "learning_rate": 1.0875824914376553e-05,
2535
+ "loss": 0.8565,
2536
+ "step": 3610
2537
+ },
2538
+ {
2539
+ "epoch": 0.724,
2540
+ "grad_norm": 1.6416356563568115,
2541
+ "learning_rate": 1.0732160807889211e-05,
2542
+ "loss": 1.6703,
2543
+ "step": 3620
2544
+ },
2545
+ {
2546
+ "epoch": 0.726,
2547
+ "grad_norm": 0.3667546808719635,
2548
+ "learning_rate": 1.058919209575517e-05,
2549
+ "loss": 1.2277,
2550
+ "step": 3630
2551
+ },
2552
+ {
2553
+ "epoch": 0.728,
2554
+ "grad_norm": 0.44018077850341797,
2555
+ "learning_rate": 1.0446925746067768e-05,
2556
+ "loss": 0.7438,
2557
+ "step": 3640
2558
+ },
2559
+ {
2560
+ "epoch": 0.73,
2561
+ "grad_norm": 0.9236971735954285,
2562
+ "learning_rate": 1.0305368692688174e-05,
2563
+ "loss": 3.7347,
2564
+ "step": 3650
2565
+ },
2566
+ {
2567
+ "epoch": 0.732,
2568
+ "grad_norm": 0.0,
2569
+ "learning_rate": 1.0164527834907467e-05,
2570
+ "loss": 1.8496,
2571
+ "step": 3660
2572
+ },
2573
+ {
2574
+ "epoch": 0.734,
2575
+ "grad_norm": 0.9189653396606445,
2576
+ "learning_rate": 1.0024410037110357e-05,
2577
+ "loss": 1.2388,
2578
+ "step": 3670
2579
+ },
2580
+ {
2581
+ "epoch": 0.736,
2582
+ "grad_norm": 2.626263380050659,
2583
+ "learning_rate": 9.88502212844063e-06,
2584
+ "loss": 5.6336,
2585
+ "step": 3680
2586
+ },
2587
+ {
2588
+ "epoch": 0.738,
2589
+ "grad_norm": 6.583788871765137,
2590
+ "learning_rate": 9.746370902468311e-06,
2591
+ "loss": 1.434,
2592
+ "step": 3690
2593
+ },
2594
+ {
2595
+ "epoch": 0.74,
2596
+ "grad_norm": 0.29883110523223877,
2597
+ "learning_rate": 9.608463116858542e-06,
2598
+ "loss": 2.9898,
2599
+ "step": 3700
2600
+ },
2601
+ {
2602
+ "epoch": 0.742,
2603
+ "grad_norm": 1.0266231298446655,
2604
+ "learning_rate": 9.471305493042243e-06,
2605
+ "loss": 0.9298,
2606
+ "step": 3710
2607
+ },
2608
+ {
2609
+ "epoch": 0.744,
2610
+ "grad_norm": 0.641528844833374,
2611
+ "learning_rate": 9.334904715888495e-06,
2612
+ "loss": 2.3826,
2613
+ "step": 3720
2614
+ },
2615
+ {
2616
+ "epoch": 0.746,
2617
+ "grad_norm": 0.8033006191253662,
2618
+ "learning_rate": 9.199267433378727e-06,
2619
+ "loss": 0.8495,
2620
+ "step": 3730
2621
+ },
2622
+ {
2623
+ "epoch": 0.748,
2624
+ "grad_norm": 2.0159640312194824,
2625
+ "learning_rate": 9.064400256282757e-06,
2626
+ "loss": 0.6518,
2627
+ "step": 3740
2628
+ },
2629
+ {
2630
+ "epoch": 0.75,
2631
+ "grad_norm": 3.390242099761963,
2632
+ "learning_rate": 8.930309757836517e-06,
2633
+ "loss": 0.8714,
2634
+ "step": 3750
2635
+ },
2636
+ {
2637
+ "epoch": 0.752,
2638
+ "grad_norm": 2.8902716636657715,
2639
+ "learning_rate": 8.797002473421728e-06,
2640
+ "loss": 1.378,
2641
+ "step": 3760
2642
+ },
2643
+ {
2644
+ "epoch": 0.754,
2645
+ "grad_norm": 0.5341398119926453,
2646
+ "learning_rate": 8.664484900247363e-06,
2647
+ "loss": 1.1048,
2648
+ "step": 3770
2649
+ },
2650
+ {
2651
+ "epoch": 0.756,
2652
+ "grad_norm": 1.0644313097000122,
2653
+ "learning_rate": 8.532763497032987e-06,
2654
+ "loss": 0.9402,
2655
+ "step": 3780
2656
+ },
2657
+ {
2658
+ "epoch": 0.758,
2659
+ "grad_norm": 0.3949926197528839,
2660
+ "learning_rate": 8.40184468369396e-06,
2661
+ "loss": 1.3253,
2662
+ "step": 3790
2663
+ },
2664
+ {
2665
+ "epoch": 0.76,
2666
+ "grad_norm": 7.875147819519043,
2667
+ "learning_rate": 8.271734841028553e-06,
2668
+ "loss": 1.0062,
2669
+ "step": 3800
2670
+ },
2671
+ {
2672
+ "epoch": 0.762,
2673
+ "grad_norm": 0.3310488760471344,
2674
+ "learning_rate": 8.142440310406924e-06,
2675
+ "loss": 0.9064,
2676
+ "step": 3810
2677
+ },
2678
+ {
2679
+ "epoch": 0.764,
2680
+ "grad_norm": 3.954543352127075,
2681
+ "learning_rate": 8.013967393462094e-06,
2682
+ "loss": 3.918,
2683
+ "step": 3820
2684
+ },
2685
+ {
2686
+ "epoch": 0.766,
2687
+ "grad_norm": 0.5071019530296326,
2688
+ "learning_rate": 7.886322351782783e-06,
2689
+ "loss": 0.4079,
2690
+ "step": 3830
2691
+ },
2692
+ {
2693
+ "epoch": 0.768,
2694
+ "grad_norm": 1.4938267469406128,
2695
+ "learning_rate": 7.759511406608255e-06,
2696
+ "loss": 0.8509,
2697
+ "step": 3840
2698
+ },
2699
+ {
2700
+ "epoch": 0.77,
2701
+ "grad_norm": 0.6899465322494507,
2702
+ "learning_rate": 7.633540738525066e-06,
2703
+ "loss": 1.9055,
2704
+ "step": 3850
2705
+ },
2706
+ {
2707
+ "epoch": 0.772,
2708
+ "grad_norm": 59.65831756591797,
2709
+ "learning_rate": 7.508416487165862e-06,
2710
+ "loss": 6.2419,
2711
+ "step": 3860
2712
+ },
2713
+ {
2714
+ "epoch": 0.774,
2715
+ "grad_norm": 0.5461062788963318,
2716
+ "learning_rate": 7.384144750910133e-06,
2717
+ "loss": 1.1085,
2718
+ "step": 3870
2719
+ },
2720
+ {
2721
+ "epoch": 0.776,
2722
+ "grad_norm": 0.7306953072547913,
2723
+ "learning_rate": 7.260731586586983e-06,
2724
+ "loss": 4.1298,
2725
+ "step": 3880
2726
+ },
2727
+ {
2728
+ "epoch": 0.778,
2729
+ "grad_norm": 1.150995135307312,
2730
+ "learning_rate": 7.138183009179922e-06,
2731
+ "loss": 0.7264,
2732
+ "step": 3890
2733
+ },
2734
+ {
2735
+ "epoch": 0.78,
2736
+ "grad_norm": 6.218198299407959,
2737
+ "learning_rate": 7.016504991533726e-06,
2738
+ "loss": 1.0637,
2739
+ "step": 3900
2740
+ },
2741
+ {
2742
+ "epoch": 0.782,
2743
+ "grad_norm": 1.58785080909729,
2744
+ "learning_rate": 6.895703464063319e-06,
2745
+ "loss": 1.2957,
2746
+ "step": 3910
2747
+ },
2748
+ {
2749
+ "epoch": 0.784,
2750
+ "grad_norm": 0.5615648627281189,
2751
+ "learning_rate": 6.775784314464717e-06,
2752
+ "loss": 0.4924,
2753
+ "step": 3920
2754
+ },
2755
+ {
2756
+ "epoch": 0.786,
2757
+ "grad_norm": 0.8472492098808289,
2758
+ "learning_rate": 6.656753387428089e-06,
2759
+ "loss": 1.0074,
2760
+ "step": 3930
2761
+ },
2762
+ {
2763
+ "epoch": 0.788,
2764
+ "grad_norm": 1.1373908519744873,
2765
+ "learning_rate": 6.538616484352902e-06,
2766
+ "loss": 1.7825,
2767
+ "step": 3940
2768
+ },
2769
+ {
2770
+ "epoch": 0.79,
2771
+ "grad_norm": 1.4689478874206543,
2772
+ "learning_rate": 6.421379363065142e-06,
2773
+ "loss": 1.2334,
2774
+ "step": 3950
2775
+ },
2776
+ {
2777
+ "epoch": 0.792,
2778
+ "grad_norm": 4.063680171966553,
2779
+ "learning_rate": 6.305047737536707e-06,
2780
+ "loss": 1.9249,
2781
+ "step": 3960
2782
+ },
2783
+ {
2784
+ "epoch": 0.794,
2785
+ "grad_norm": 3.548961877822876,
2786
+ "learning_rate": 6.189627277606894e-06,
2787
+ "loss": 1.4351,
2788
+ "step": 3970
2789
+ },
2790
+ {
2791
+ "epoch": 0.796,
2792
+ "grad_norm": 0.8903224468231201,
2793
+ "learning_rate": 6.075123608706093e-06,
2794
+ "loss": 1.6044,
2795
+ "step": 3980
2796
+ },
2797
+ {
2798
+ "epoch": 0.798,
2799
+ "grad_norm": 1.5656462907791138,
2800
+ "learning_rate": 5.961542311581586e-06,
2801
+ "loss": 1.7348,
2802
+ "step": 3990
2803
+ },
2804
+ {
2805
+ "epoch": 0.8,
2806
+ "grad_norm": 0.4290873110294342,
2807
+ "learning_rate": 5.848888922025553e-06,
2808
+ "loss": 12.6939,
2809
+ "step": 4000
2810
+ },
2811
+ {
2812
+ "epoch": 0.802,
2813
+ "grad_norm": 7.812740325927734,
2814
+ "learning_rate": 5.737168930605272e-06,
2815
+ "loss": 0.8231,
2816
+ "step": 4010
2817
+ },
2818
+ {
2819
+ "epoch": 0.804,
2820
+ "grad_norm": 0.8808218240737915,
2821
+ "learning_rate": 5.626387782395512e-06,
2822
+ "loss": 2.239,
2823
+ "step": 4020
2824
+ },
2825
+ {
2826
+ "epoch": 0.806,
2827
+ "grad_norm": 5.278824806213379,
2828
+ "learning_rate": 5.5165508767131415e-06,
2829
+ "loss": 3.1218,
2830
+ "step": 4030
2831
+ },
2832
+ {
2833
+ "epoch": 0.808,
2834
+ "grad_norm": 1.9946264028549194,
2835
+ "learning_rate": 5.4076635668540075e-06,
2836
+ "loss": 0.5524,
2837
+ "step": 4040
2838
+ },
2839
+ {
2840
+ "epoch": 0.81,
2841
+ "grad_norm": 2.5950546264648438,
2842
+ "learning_rate": 5.299731159831953e-06,
2843
+ "loss": 2.4598,
2844
+ "step": 4050
2845
+ },
2846
+ {
2847
+ "epoch": 0.812,
2848
+ "grad_norm": 0.0,
2849
+ "learning_rate": 5.192758916120236e-06,
2850
+ "loss": 0.634,
2851
+ "step": 4060
2852
+ },
2853
+ {
2854
+ "epoch": 0.814,
2855
+ "grad_norm": 0.4671136438846588,
2856
+ "learning_rate": 5.086752049395094e-06,
2857
+ "loss": 0.8943,
2858
+ "step": 4070
2859
+ },
2860
+ {
2861
+ "epoch": 0.816,
2862
+ "grad_norm": 3.0124940872192383,
2863
+ "learning_rate": 4.981715726281666e-06,
2864
+ "loss": 1.1776,
2865
+ "step": 4080
2866
+ },
2867
+ {
2868
+ "epoch": 0.818,
2869
+ "grad_norm": 1.0693814754486084,
2870
+ "learning_rate": 4.877655066102149e-06,
2871
+ "loss": 0.918,
2872
+ "step": 4090
2873
+ },
2874
+ {
2875
+ "epoch": 0.82,
2876
+ "grad_norm": 0.39503878355026245,
2877
+ "learning_rate": 4.7745751406263165e-06,
2878
+ "loss": 0.3834,
2879
+ "step": 4100
2880
+ },
2881
+ {
2882
+ "epoch": 0.822,
2883
+ "grad_norm": 2.3926539421081543,
2884
+ "learning_rate": 4.672480973824311e-06,
2885
+ "loss": 1.0476,
2886
+ "step": 4110
2887
+ },
2888
+ {
2889
+ "epoch": 0.824,
2890
+ "grad_norm": 2.5929503440856934,
2891
+ "learning_rate": 4.571377541621788e-06,
2892
+ "loss": 0.7036,
2893
+ "step": 4120
2894
+ },
2895
+ {
2896
+ "epoch": 0.826,
2897
+ "grad_norm": 1.2581967115402222,
2898
+ "learning_rate": 4.4712697716574e-06,
2899
+ "loss": 1.4096,
2900
+ "step": 4130
2901
+ },
2902
+ {
2903
+ "epoch": 0.828,
2904
+ "grad_norm": 3.9008257389068604,
2905
+ "learning_rate": 4.372162543042624e-06,
2906
+ "loss": 1.2209,
2907
+ "step": 4140
2908
+ },
2909
+ {
2910
+ "epoch": 0.83,
2911
+ "grad_norm": 0.9294935464859009,
2912
+ "learning_rate": 4.274060686123959e-06,
2913
+ "loss": 1.4758,
2914
+ "step": 4150
2915
+ },
2916
+ {
2917
+ "epoch": 0.832,
2918
+ "grad_norm": 0.4707418382167816,
2919
+ "learning_rate": 4.176968982247514e-06,
2920
+ "loss": 1.546,
2921
+ "step": 4160
2922
+ },
2923
+ {
2924
+ "epoch": 0.834,
2925
+ "grad_norm": 0.9732871055603027,
2926
+ "learning_rate": 4.08089216352596e-06,
2927
+ "loss": 0.8864,
2928
+ "step": 4170
2929
+ },
2930
+ {
2931
+ "epoch": 0.836,
2932
+ "grad_norm": 1.237377405166626,
2933
+ "learning_rate": 3.985834912607894e-06,
2934
+ "loss": 1.4541,
2935
+ "step": 4180
2936
+ },
2937
+ {
2938
+ "epoch": 0.838,
2939
+ "grad_norm": 1.330290675163269,
2940
+ "learning_rate": 3.891801862449629e-06,
2941
+ "loss": 0.6756,
2942
+ "step": 4190
2943
+ },
2944
+ {
2945
+ "epoch": 0.84,
2946
+ "grad_norm": 0.49262121319770813,
2947
+ "learning_rate": 3.798797596089351e-06,
2948
+ "loss": 1.3881,
2949
+ "step": 4200
2950
+ },
2951
+ {
2952
+ "epoch": 0.842,
2953
+ "grad_norm": 4.391107082366943,
2954
+ "learning_rate": 3.7068266464238084e-06,
2955
+ "loss": 2.3349,
2956
+ "step": 4210
2957
+ },
2958
+ {
2959
+ "epoch": 0.844,
2960
+ "grad_norm": 1.4304258823394775,
2961
+ "learning_rate": 3.6158934959873353e-06,
2962
+ "loss": 0.7973,
2963
+ "step": 4220
2964
+ },
2965
+ {
2966
+ "epoch": 0.846,
2967
+ "grad_norm": 1.131519079208374,
2968
+ "learning_rate": 3.5260025767333893e-06,
2969
+ "loss": 1.1455,
2970
+ "step": 4230
2971
+ },
2972
+ {
2973
+ "epoch": 0.848,
2974
+ "grad_norm": 4.368002414703369,
2975
+ "learning_rate": 3.4371582698185633e-06,
2976
+ "loss": 2.3828,
2977
+ "step": 4240
2978
+ },
2979
+ {
2980
+ "epoch": 0.85,
2981
+ "grad_norm": 3.4001667499542236,
2982
+ "learning_rate": 3.3493649053890326e-06,
2983
+ "loss": 0.8389,
2984
+ "step": 4250
2985
+ },
2986
+ {
2987
+ "epoch": 0.852,
2988
+ "grad_norm": 1.048996090888977,
2989
+ "learning_rate": 3.262626762369525e-06,
2990
+ "loss": 0.7594,
2991
+ "step": 4260
2992
+ },
2993
+ {
2994
+ "epoch": 0.854,
2995
+ "grad_norm": 0.4330098330974579,
2996
+ "learning_rate": 3.176948068254762e-06,
2997
+ "loss": 0.8006,
2998
+ "step": 4270
2999
+ },
3000
+ {
3001
+ "epoch": 0.856,
3002
+ "grad_norm": 5.37194299697876,
3003
+ "learning_rate": 3.092332998903416e-06,
3004
+ "loss": 1.4222,
3005
+ "step": 4280
3006
+ },
3007
+ {
3008
+ "epoch": 0.858,
3009
+ "grad_norm": 2.6493430137634277,
3010
+ "learning_rate": 3.0087856783345914e-06,
3011
+ "loss": 2.5477,
3012
+ "step": 4290
3013
+ },
3014
+ {
3015
+ "epoch": 0.86,
3016
+ "grad_norm": 1.8400111198425293,
3017
+ "learning_rate": 2.9263101785268254e-06,
3018
+ "loss": 2.2044,
3019
+ "step": 4300
3020
+ },
3021
+ {
3022
+ "epoch": 0.862,
3023
+ "grad_norm": 18.55029296875,
3024
+ "learning_rate": 2.8449105192196316e-06,
3025
+ "loss": 1.5267,
3026
+ "step": 4310
3027
+ },
3028
+ {
3029
+ "epoch": 0.864,
3030
+ "grad_norm": 1.1221182346343994,
3031
+ "learning_rate": 2.764590667717562e-06,
3032
+ "loss": 0.6063,
3033
+ "step": 4320
3034
+ },
3035
+ {
3036
+ "epoch": 0.866,
3037
+ "grad_norm": 1.4529755115509033,
3038
+ "learning_rate": 2.6853545386968606e-06,
3039
+ "loss": 2.6636,
3040
+ "step": 4330
3041
+ },
3042
+ {
3043
+ "epoch": 0.868,
3044
+ "grad_norm": 0.7516627907752991,
3045
+ "learning_rate": 2.6072059940146775e-06,
3046
+ "loss": 0.9864,
3047
+ "step": 4340
3048
+ },
3049
+ {
3050
+ "epoch": 0.87,
3051
+ "grad_norm": 2.175539970397949,
3052
+ "learning_rate": 2.5301488425208296e-06,
3053
+ "loss": 2.3756,
3054
+ "step": 4350
3055
+ },
3056
+ {
3057
+ "epoch": 0.872,
3058
+ "grad_norm": 3.53667950630188,
3059
+ "learning_rate": 2.454186839872158e-06,
3060
+ "loss": 0.962,
3061
+ "step": 4360
3062
+ },
3063
+ {
3064
+ "epoch": 0.874,
3065
+ "grad_norm": 1.7746353149414062,
3066
+ "learning_rate": 2.379323688349516e-06,
3067
+ "loss": 0.8765,
3068
+ "step": 4370
3069
+ },
3070
+ {
3071
+ "epoch": 0.876,
3072
+ "grad_norm": 2.2868964672088623,
3073
+ "learning_rate": 2.3055630366772856e-06,
3074
+ "loss": 1.3187,
3075
+ "step": 4380
3076
+ },
3077
+ {
3078
+ "epoch": 0.878,
3079
+ "grad_norm": 2.6441867351531982,
3080
+ "learning_rate": 2.2329084798455746e-06,
3081
+ "loss": 0.996,
3082
+ "step": 4390
3083
+ },
3084
+ {
3085
+ "epoch": 0.88,
3086
+ "grad_norm": 0.8521894216537476,
3087
+ "learning_rate": 2.1613635589349756e-06,
3088
+ "loss": 1.1203,
3089
+ "step": 4400
3090
+ },
3091
+ {
3092
+ "epoch": 0.882,
3093
+ "grad_norm": 2.128058433532715,
3094
+ "learning_rate": 2.0909317609440095e-06,
3095
+ "loss": 1.5124,
3096
+ "step": 4410
3097
+ },
3098
+ {
3099
+ "epoch": 0.884,
3100
+ "grad_norm": 1.3996164798736572,
3101
+ "learning_rate": 2.0216165186191407e-06,
3102
+ "loss": 0.5098,
3103
+ "step": 4420
3104
+ },
3105
+ {
3106
+ "epoch": 0.886,
3107
+ "grad_norm": 0.6261931657791138,
3108
+ "learning_rate": 1.95342121028749e-06,
3109
+ "loss": 0.9764,
3110
+ "step": 4430
3111
+ },
3112
+ {
3113
+ "epoch": 0.888,
3114
+ "grad_norm": 2.9988150596618652,
3115
+ "learning_rate": 1.8863491596921745e-06,
3116
+ "loss": 1.6156,
3117
+ "step": 4440
3118
+ },
3119
+ {
3120
+ "epoch": 0.89,
3121
+ "grad_norm": 0.7288640141487122,
3122
+ "learning_rate": 1.8204036358303173e-06,
3123
+ "loss": 1.3607,
3124
+ "step": 4450
3125
+ },
3126
+ {
3127
+ "epoch": 0.892,
3128
+ "grad_norm": 0.7692397832870483,
3129
+ "learning_rate": 1.7555878527937164e-06,
3130
+ "loss": 1.291,
3131
+ "step": 4460
3132
+ },
3133
+ {
3134
+ "epoch": 0.894,
3135
+ "grad_norm": 1.286188006401062,
3136
+ "learning_rate": 1.6919049696121958e-06,
3137
+ "loss": 0.6693,
3138
+ "step": 4470
3139
+ },
3140
+ {
3141
+ "epoch": 0.896,
3142
+ "grad_norm": 0.9771424531936646,
3143
+ "learning_rate": 1.629358090099639e-06,
3144
+ "loss": 1.8125,
3145
+ "step": 4480
3146
+ },
3147
+ {
3148
+ "epoch": 0.898,
3149
+ "grad_norm": 2.4204814434051514,
3150
+ "learning_rate": 1.5679502627027136e-06,
3151
+ "loss": 1.3824,
3152
+ "step": 4490
3153
+ },
3154
+ {
3155
+ "epoch": 0.9,
3156
+ "grad_norm": 4.12672758102417,
3157
+ "learning_rate": 1.5076844803522922e-06,
3158
+ "loss": 1.466,
3159
+ "step": 4500
3160
+ },
3161
+ {
3162
+ "epoch": 0.902,
3163
+ "grad_norm": 2.546496868133545,
3164
+ "learning_rate": 1.4485636803175829e-06,
3165
+ "loss": 0.8571,
3166
+ "step": 4510
3167
+ },
3168
+ {
3169
+ "epoch": 0.904,
3170
+ "grad_norm": 1.0487430095672607,
3171
+ "learning_rate": 1.3905907440629752e-06,
3172
+ "loss": 1.0712,
3173
+ "step": 4520
3174
+ },
3175
+ {
3176
+ "epoch": 0.906,
3177
+ "grad_norm": 1.2966355085372925,
3178
+ "learning_rate": 1.333768497107593e-06,
3179
+ "loss": 1.328,
3180
+ "step": 4530
3181
+ },
3182
+ {
3183
+ "epoch": 0.908,
3184
+ "grad_norm": 0.441129595041275,
3185
+ "learning_rate": 1.2780997088875869e-06,
3186
+ "loss": 3.858,
3187
+ "step": 4540
3188
+ },
3189
+ {
3190
+ "epoch": 0.91,
3191
+ "grad_norm": 1.228864073753357,
3192
+ "learning_rate": 1.2235870926211619e-06,
3193
+ "loss": 1.3761,
3194
+ "step": 4550
3195
+ },
3196
+ {
3197
+ "epoch": 0.912,
3198
+ "grad_norm": 0.6363465189933777,
3199
+ "learning_rate": 1.170233305176327e-06,
3200
+ "loss": 0.9014,
3201
+ "step": 4560
3202
+ },
3203
+ {
3204
+ "epoch": 0.914,
3205
+ "grad_norm": 1.6291348934173584,
3206
+ "learning_rate": 1.1180409469414094e-06,
3207
+ "loss": 1.2054,
3208
+ "step": 4570
3209
+ },
3210
+ {
3211
+ "epoch": 0.916,
3212
+ "grad_norm": 0.9478392601013184,
3213
+ "learning_rate": 1.067012561698319e-06,
3214
+ "loss": 1.1962,
3215
+ "step": 4580
3216
+ },
3217
+ {
3218
+ "epoch": 0.918,
3219
+ "grad_norm": 0.8788161277770996,
3220
+ "learning_rate": 1.0171506364985622e-06,
3221
+ "loss": 1.1866,
3222
+ "step": 4590
3223
+ },
3224
+ {
3225
+ "epoch": 0.92,
3226
+ "grad_norm": 7.388576984405518,
3227
+ "learning_rate": 9.684576015420278e-07,
3228
+ "loss": 1.4084,
3229
+ "step": 4600
3230
+ },
3231
+ {
3232
+ "epoch": 0.922,
3233
+ "grad_norm": 0.774747908115387,
3234
+ "learning_rate": 9.209358300585474e-07,
3235
+ "loss": 0.4785,
3236
+ "step": 4610
3237
+ },
3238
+ {
3239
+ "epoch": 0.924,
3240
+ "grad_norm": 3.6744463443756104,
3241
+ "learning_rate": 8.745876381922147e-07,
3242
+ "loss": 1.0671,
3243
+ "step": 4620
3244
+ },
3245
+ {
3246
+ "epoch": 0.926,
3247
+ "grad_norm": 15.845490455627441,
3248
+ "learning_rate": 8.294152848885157e-07,
3249
+ "loss": 3.2901,
3250
+ "step": 4630
3251
+ },
3252
+ {
3253
+ "epoch": 0.928,
3254
+ "grad_norm": 0.889503002166748,
3255
+ "learning_rate": 7.854209717842231e-07,
3256
+ "loss": 1.0359,
3257
+ "step": 4640
3258
+ },
3259
+ {
3260
+ "epoch": 0.93,
3261
+ "grad_norm": 0.6462771892547607,
3262
+ "learning_rate": 7.426068431000882e-07,
3263
+ "loss": 0.5115,
3264
+ "step": 4650
3265
+ },
3266
+ {
3267
+ "epoch": 0.932,
3268
+ "grad_norm": 1.1371968984603882,
3269
+ "learning_rate": 7.009749855363456e-07,
3270
+ "loss": 1.0589,
3271
+ "step": 4660
3272
+ },
3273
+ {
3274
+ "epoch": 0.934,
3275
+ "grad_norm": 0.5716176629066467,
3276
+ "learning_rate": 6.605274281709928e-07,
3277
+ "loss": 0.3984,
3278
+ "step": 4670
3279
+ },
3280
+ {
3281
+ "epoch": 0.936,
3282
+ "grad_norm": 3.242506742477417,
3283
+ "learning_rate": 6.212661423609184e-07,
3284
+ "loss": 2.5647,
3285
+ "step": 4680
3286
+ },
3287
+ {
3288
+ "epoch": 0.938,
3289
+ "grad_norm": 1.7465883493423462,
3290
+ "learning_rate": 5.83193041645802e-07,
3291
+ "loss": 0.93,
3292
+ "step": 4690
3293
+ },
3294
+ {
3295
+ "epoch": 0.94,
3296
+ "grad_norm": 5.38319730758667,
3297
+ "learning_rate": 5.463099816548579e-07,
3298
+ "loss": 1.4605,
3299
+ "step": 4700
3300
+ },
3301
+ {
3302
+ "epoch": 0.942,
3303
+ "grad_norm": 0.0,
3304
+ "learning_rate": 5.106187600163987e-07,
3305
+ "loss": 0.7053,
3306
+ "step": 4710
3307
+ },
3308
+ {
3309
+ "epoch": 0.944,
3310
+ "grad_norm": 16.922529220581055,
3311
+ "learning_rate": 4.7612111627021175e-07,
3312
+ "loss": 1.9524,
3313
+ "step": 4720
3314
+ },
3315
+ {
3316
+ "epoch": 0.946,
3317
+ "grad_norm": 0.44225987792015076,
3318
+ "learning_rate": 4.4281873178278475e-07,
3319
+ "loss": 2.0867,
3320
+ "step": 4730
3321
+ },
3322
+ {
3323
+ "epoch": 0.948,
3324
+ "grad_norm": 4.4445719718933105,
3325
+ "learning_rate": 4.107132296653549e-07,
3326
+ "loss": 0.6371,
3327
+ "step": 4740
3328
+ },
3329
+ {
3330
+ "epoch": 0.95,
3331
+ "grad_norm": 1.2280045747756958,
3332
+ "learning_rate": 3.7980617469479953e-07,
3333
+ "loss": 0.5299,
3334
+ "step": 4750
3335
+ },
3336
+ {
3337
+ "epoch": 0.952,
3338
+ "grad_norm": 2.0964748859405518,
3339
+ "learning_rate": 3.5009907323737825e-07,
3340
+ "loss": 4.1515,
3341
+ "step": 4760
3342
+ },
3343
+ {
3344
+ "epoch": 0.954,
3345
+ "grad_norm": 0.37474894523620605,
3346
+ "learning_rate": 3.215933731753024e-07,
3347
+ "loss": 0.762,
3348
+ "step": 4770
3349
+ },
3350
+ {
3351
+ "epoch": 0.956,
3352
+ "grad_norm": 0.0,
3353
+ "learning_rate": 2.942904638361804e-07,
3354
+ "loss": 0.9055,
3355
+ "step": 4780
3356
+ },
3357
+ {
3358
+ "epoch": 0.958,
3359
+ "grad_norm": 2.7244997024536133,
3360
+ "learning_rate": 2.681916759252917e-07,
3361
+ "loss": 1.026,
3362
+ "step": 4790
3363
+ },
3364
+ {
3365
+ "epoch": 0.96,
3366
+ "grad_norm": 0.0,
3367
+ "learning_rate": 2.4329828146074095e-07,
3368
+ "loss": 1.9403,
3369
+ "step": 4800
3370
+ },
3371
+ {
3372
+ "epoch": 0.962,
3373
+ "grad_norm": 11.823533058166504,
3374
+ "learning_rate": 2.1961149371145795e-07,
3375
+ "loss": 1.8509,
3376
+ "step": 4810
3377
+ },
3378
+ {
3379
+ "epoch": 0.964,
3380
+ "grad_norm": 0.9294369220733643,
3381
+ "learning_rate": 1.9713246713805588e-07,
3382
+ "loss": 1.5779,
3383
+ "step": 4820
3384
+ },
3385
+ {
3386
+ "epoch": 0.966,
3387
+ "grad_norm": 1.7021973133087158,
3388
+ "learning_rate": 1.7586229733657644e-07,
3389
+ "loss": 1.2824,
3390
+ "step": 4830
3391
+ },
3392
+ {
3393
+ "epoch": 0.968,
3394
+ "grad_norm": 0.9092425107955933,
3395
+ "learning_rate": 1.5580202098509077e-07,
3396
+ "loss": 1.8272,
3397
+ "step": 4840
3398
+ },
3399
+ {
3400
+ "epoch": 0.97,
3401
+ "grad_norm": 0.22959110140800476,
3402
+ "learning_rate": 1.3695261579316777e-07,
3403
+ "loss": 1.7201,
3404
+ "step": 4850
3405
+ },
3406
+ {
3407
+ "epoch": 0.972,
3408
+ "grad_norm": 1.6746439933776855,
3409
+ "learning_rate": 1.193150004542204e-07,
3410
+ "loss": 1.2317,
3411
+ "step": 4860
3412
+ },
3413
+ {
3414
+ "epoch": 0.974,
3415
+ "grad_norm": 4.792318344116211,
3416
+ "learning_rate": 1.0289003460074165e-07,
3417
+ "loss": 1.2935,
3418
+ "step": 4870
3419
+ },
3420
+ {
3421
+ "epoch": 0.976,
3422
+ "grad_norm": 0.7605292797088623,
3423
+ "learning_rate": 8.767851876239074e-08,
3424
+ "loss": 1.2833,
3425
+ "step": 4880
3426
+ },
3427
+ {
3428
+ "epoch": 0.978,
3429
+ "grad_norm": 0.4360625147819519,
3430
+ "learning_rate": 7.368119432699383e-08,
3431
+ "loss": 1.021,
3432
+ "step": 4890
3433
+ },
3434
+ {
3435
+ "epoch": 0.98,
3436
+ "grad_norm": 0.5451242923736572,
3437
+ "learning_rate": 6.089874350439506e-08,
3438
+ "loss": 1.1247,
3439
+ "step": 4900
3440
+ },
3441
+ {
3442
+ "epoch": 0.982,
3443
+ "grad_norm": 0.9947279691696167,
3444
+ "learning_rate": 4.9331789293211026e-08,
3445
+ "loss": 1.2268,
3446
+ "step": 4910
3447
+ },
3448
+ {
3449
+ "epoch": 0.984,
3450
+ "grad_norm": 9.412683486938477,
3451
+ "learning_rate": 3.8980895450474455e-08,
3452
+ "loss": 4.2583,
3453
+ "step": 4920
3454
+ },
3455
+ {
3456
+ "epoch": 0.986,
3457
+ "grad_norm": 0.6823949217796326,
3458
+ "learning_rate": 2.9846566464150626e-08,
3459
+ "loss": 0.7995,
3460
+ "step": 4930
3461
+ },
3462
+ {
3463
+ "epoch": 0.988,
3464
+ "grad_norm": 1.5976966619491577,
3465
+ "learning_rate": 2.192924752854042e-08,
3466
+ "loss": 1.3818,
3467
+ "step": 4940
3468
+ },
3469
+ {
3470
+ "epoch": 0.99,
3471
+ "grad_norm": 17.52108383178711,
3472
+ "learning_rate": 1.522932452260595e-08,
3473
+ "loss": 2.6165,
3474
+ "step": 4950
3475
+ },
3476
+ {
3477
+ "epoch": 0.992,
3478
+ "grad_norm": 3.4375789165496826,
3479
+ "learning_rate": 9.747123991141194e-09,
3480
+ "loss": 1.7912,
3481
+ "step": 4960
3482
+ },
3483
+ {
3484
+ "epoch": 0.994,
3485
+ "grad_norm": 1.8169946670532227,
3486
+ "learning_rate": 5.48291312886251e-09,
3487
+ "loss": 1.421,
3488
+ "step": 4970
3489
+ },
3490
+ {
3491
+ "epoch": 0.996,
3492
+ "grad_norm": 0.7665383815765381,
3493
+ "learning_rate": 2.4368997673940297e-09,
3494
+ "loss": 1.2866,
3495
+ "step": 4980
3496
+ },
3497
+ {
3498
+ "epoch": 0.998,
3499
+ "grad_norm": 0.4504777491092682,
3500
+ "learning_rate": 6.092323651313292e-10,
3501
+ "loss": 0.6754,
3502
+ "step": 4990
3503
+ },
3504
+ {
3505
+ "epoch": 1.0,
3506
+ "grad_norm": 0.5862205028533936,
3507
+ "learning_rate": 0.0,
3508
+ "loss": 1.1699,
3509
+ "step": 5000
3510
+ },
3511
+ {
3512
+ "epoch": 1.0,
3513
+ "step": 5000,
3514
+ "total_flos": 1.151346780094464e+16,
3515
+ "train_loss": 1.6451873833656312,
3516
+ "train_runtime": 1464.1065,
3517
+ "train_samples_per_second": 3.415,
3518
+ "train_steps_per_second": 3.415
3519
+ }
3520
+ ],
3521
+ "logging_steps": 10,
3522
+ "max_steps": 5000,
3523
+ "num_input_tokens_seen": 0,
3524
+ "num_train_epochs": 1,
3525
+ "save_steps": 4000,
3526
+ "stateful_callbacks": {
3527
+ "TrainerControl": {
3528
+ "args": {
3529
+ "should_epoch_stop": false,
3530
+ "should_evaluate": false,
3531
+ "should_log": false,
3532
+ "should_save": true,
3533
+ "should_training_stop": true
3534
+ },
3535
+ "attributes": {}
3536
+ }
3537
+ },
3538
+ "total_flos": 1.151346780094464e+16,
3539
+ "train_batch_size": 1,
3540
+ "trial_name": null,
3541
+ "trial_params": null
3542
+ }
Llama-2-13b-chat-hf/DomainBench/Finance/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f9f7a5ac6abcf0832ff55273a20894e8519bd6572cf76a25d10dc91b646f33c
3
+ size 5432
Llama-2-13b-chat-hf/DomainBench/Finance/training_loss.png ADDED
Llama-2-13b-chat-hf/DomainBench/Geography/README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: /hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: threshold_3-lamb_0.1-lr_5e-5
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # threshold_3-lamb_0.1-lr_5e-5
18
+
19
+ This model is a fine-tuned version of [/hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf](https://huggingface.co//hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf) on the geosignal dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-05
39
+ - train_batch_size: 1
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
43
+ - lr_scheduler_type: cosine
44
+ - lr_scheduler_warmup_ratio: 0.1
45
+ - num_epochs: 1.0
46
+
47
+ ### Training results
48
+
49
+
50
+
51
+ ### Framework versions
52
+
53
+ - PEFT 0.12.0
54
+ - Transformers 4.46.1
55
+ - Pytorch 2.5.1+cu124
56
+ - Datasets 3.1.0
57
+ - Tokenizers 0.20.3
Llama-2-13b-chat-hf/DomainBench/Geography/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "q_proj",
24
+ "v_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
Llama-2-13b-chat-hf/DomainBench/Geography/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0893ead1d969d0b9cb57398624cdedc62eb1cde6eed3b8cdec11d83f405a8a67
3
+ size 26235704
Llama-2-13b-chat-hf/DomainBench/Geography/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 3.424475897929728e+16,
4
+ "train_loss": 1.0425229248046874,
5
+ "train_runtime": 1241.3169,
6
+ "train_samples_per_second": 4.028,
7
+ "train_steps_per_second": 4.028
8
+ }
Llama-2-13b-chat-hf/DomainBench/Geography/logfile.txt ADDED
The diff for this file is too large to render. See raw diff
 
Llama-2-13b-chat-hf/DomainBench/Geography/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
Llama-2-13b-chat-hf/DomainBench/Geography/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Llama-2-13b-chat-hf/DomainBench/Geography/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
Llama-2-13b-chat-hf/DomainBench/Geography/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": false,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "split_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
Llama-2-13b-chat-hf/DomainBench/Geography/train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 3.424475897929728e+16,
4
+ "train_loss": 1.0425229248046874,
5
+ "train_runtime": 1241.3169,
6
+ "train_samples_per_second": 4.028,
7
+ "train_steps_per_second": 4.028
8
+ }
Llama-2-13b-chat-hf/DomainBench/Geography/trainer_log.jsonl ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 5000, "loss": 1.5419, "lr": 1.0000000000000002e-06, "epoch": 0.002, "percentage": 0.2, "elapsed_time": "0:00:03", "remaining_time": "0:29:48"}
2
+ {"current_steps": 20, "total_steps": 5000, "loss": 2.7315, "lr": 2.0000000000000003e-06, "epoch": 0.004, "percentage": 0.4, "elapsed_time": "0:00:06", "remaining_time": "0:25:41"}
3
+ {"current_steps": 30, "total_steps": 5000, "loss": 1.8219, "lr": 3e-06, "epoch": 0.006, "percentage": 0.6, "elapsed_time": "0:00:08", "remaining_time": "0:23:27"}
4
+ {"current_steps": 40, "total_steps": 5000, "loss": 1.5216, "lr": 4.000000000000001e-06, "epoch": 0.008, "percentage": 0.8, "elapsed_time": "0:00:11", "remaining_time": "0:24:19"}
5
+ {"current_steps": 50, "total_steps": 5000, "loss": 5.3329, "lr": 5e-06, "epoch": 0.01, "percentage": 1.0, "elapsed_time": "0:00:14", "remaining_time": "0:24:26"}
6
+ {"current_steps": 60, "total_steps": 5000, "loss": 2.0144, "lr": 6e-06, "epoch": 0.012, "percentage": 1.2, "elapsed_time": "0:00:17", "remaining_time": "0:24:17"}
7
+ {"current_steps": 70, "total_steps": 5000, "loss": 2.6128, "lr": 7.000000000000001e-06, "epoch": 0.014, "percentage": 1.4, "elapsed_time": "0:00:20", "remaining_time": "0:24:05"}
8
+ {"current_steps": 80, "total_steps": 5000, "loss": 2.3414, "lr": 8.000000000000001e-06, "epoch": 0.016, "percentage": 1.6, "elapsed_time": "0:00:23", "remaining_time": "0:23:57"}
9
+ {"current_steps": 90, "total_steps": 5000, "loss": 3.2327, "lr": 9e-06, "epoch": 0.018, "percentage": 1.8, "elapsed_time": "0:00:26", "remaining_time": "0:23:42"}
10
+ {"current_steps": 100, "total_steps": 5000, "loss": 2.7985, "lr": 1e-05, "epoch": 0.02, "percentage": 2.0, "elapsed_time": "0:00:29", "remaining_time": "0:23:45"}
11
+ {"current_steps": 110, "total_steps": 5000, "loss": 2.397, "lr": 1.1000000000000001e-05, "epoch": 0.022, "percentage": 2.2, "elapsed_time": "0:00:31", "remaining_time": "0:23:33"}
12
+ {"current_steps": 120, "total_steps": 5000, "loss": 1.0396, "lr": 1.2e-05, "epoch": 0.024, "percentage": 2.4, "elapsed_time": "0:00:34", "remaining_time": "0:23:19"}
13
+ {"current_steps": 130, "total_steps": 5000, "loss": 3.2413, "lr": 1.3000000000000001e-05, "epoch": 0.026, "percentage": 2.6, "elapsed_time": "0:00:37", "remaining_time": "0:23:06"}
14
+ {"current_steps": 140, "total_steps": 5000, "loss": 3.3462, "lr": 1.4000000000000001e-05, "epoch": 0.028, "percentage": 2.8, "elapsed_time": "0:00:39", "remaining_time": "0:22:46"}
15
+ {"current_steps": 150, "total_steps": 5000, "loss": 1.3302, "lr": 1.5e-05, "epoch": 0.03, "percentage": 3.0, "elapsed_time": "0:00:41", "remaining_time": "0:22:08"}
16
+ {"current_steps": 160, "total_steps": 5000, "loss": 1.2748, "lr": 1.6000000000000003e-05, "epoch": 0.032, "percentage": 3.2, "elapsed_time": "0:00:42", "remaining_time": "0:21:36"}
17
+ {"current_steps": 170, "total_steps": 5000, "loss": 3.3206, "lr": 1.7000000000000003e-05, "epoch": 0.034, "percentage": 3.4, "elapsed_time": "0:00:45", "remaining_time": "0:21:29"}
18
+ {"current_steps": 180, "total_steps": 5000, "loss": 1.3943, "lr": 1.8e-05, "epoch": 0.036, "percentage": 3.6, "elapsed_time": "0:00:47", "remaining_time": "0:21:16"}
19
+ {"current_steps": 190, "total_steps": 5000, "loss": 1.2942, "lr": 1.9e-05, "epoch": 0.038, "percentage": 3.8, "elapsed_time": "0:00:49", "remaining_time": "0:21:03"}
20
+ {"current_steps": 200, "total_steps": 5000, "loss": 1.4252, "lr": 2e-05, "epoch": 0.04, "percentage": 4.0, "elapsed_time": "0:00:52", "remaining_time": "0:20:53"}
21
+ {"current_steps": 210, "total_steps": 5000, "loss": 1.6539, "lr": 2.1e-05, "epoch": 0.042, "percentage": 4.2, "elapsed_time": "0:00:54", "remaining_time": "0:20:41"}
22
+ {"current_steps": 220, "total_steps": 5000, "loss": 1.8091, "lr": 2.2000000000000003e-05, "epoch": 0.044, "percentage": 4.4, "elapsed_time": "0:00:57", "remaining_time": "0:20:45"}
23
+ {"current_steps": 230, "total_steps": 5000, "loss": 1.2866, "lr": 2.3000000000000003e-05, "epoch": 0.046, "percentage": 4.6, "elapsed_time": "0:01:00", "remaining_time": "0:20:53"}
24
+ {"current_steps": 240, "total_steps": 5000, "loss": 1.7432, "lr": 2.4e-05, "epoch": 0.048, "percentage": 4.8, "elapsed_time": "0:01:03", "remaining_time": "0:20:56"}
25
+ {"current_steps": 250, "total_steps": 5000, "loss": 1.6918, "lr": 2.5e-05, "epoch": 0.05, "percentage": 5.0, "elapsed_time": "0:01:06", "remaining_time": "0:20:59"}
26
+ {"current_steps": 260, "total_steps": 5000, "loss": 0.9121, "lr": 2.6000000000000002e-05, "epoch": 0.052, "percentage": 5.2, "elapsed_time": "0:01:08", "remaining_time": "0:20:56"}
27
+ {"current_steps": 270, "total_steps": 5000, "loss": 0.6088, "lr": 2.7000000000000002e-05, "epoch": 0.054, "percentage": 5.4, "elapsed_time": "0:01:11", "remaining_time": "0:20:53"}
28
+ {"current_steps": 280, "total_steps": 5000, "loss": 0.6236, "lr": 2.8000000000000003e-05, "epoch": 0.056, "percentage": 5.6, "elapsed_time": "0:01:13", "remaining_time": "0:20:40"}
29
+ {"current_steps": 290, "total_steps": 5000, "loss": 0.847, "lr": 2.9e-05, "epoch": 0.058, "percentage": 5.8, "elapsed_time": "0:01:16", "remaining_time": "0:20:40"}
30
+ {"current_steps": 300, "total_steps": 5000, "loss": 0.9911, "lr": 3e-05, "epoch": 0.06, "percentage": 6.0, "elapsed_time": "0:01:18", "remaining_time": "0:20:28"}
31
+ {"current_steps": 310, "total_steps": 5000, "loss": 1.4213, "lr": 3.1e-05, "epoch": 0.062, "percentage": 6.2, "elapsed_time": "0:01:21", "remaining_time": "0:20:26"}
32
+ {"current_steps": 320, "total_steps": 5000, "loss": 0.436, "lr": 3.2000000000000005e-05, "epoch": 0.064, "percentage": 6.4, "elapsed_time": "0:01:23", "remaining_time": "0:20:15"}
33
+ {"current_steps": 330, "total_steps": 5000, "loss": 1.0218, "lr": 3.3e-05, "epoch": 0.066, "percentage": 6.6, "elapsed_time": "0:01:25", "remaining_time": "0:20:16"}
34
+ {"current_steps": 340, "total_steps": 5000, "loss": 1.2908, "lr": 3.4000000000000007e-05, "epoch": 0.068, "percentage": 6.8, "elapsed_time": "0:01:27", "remaining_time": "0:20:02"}
35
+ {"current_steps": 350, "total_steps": 5000, "loss": 1.0009, "lr": 3.5e-05, "epoch": 0.07, "percentage": 7.0, "elapsed_time": "0:01:30", "remaining_time": "0:19:56"}
36
+ {"current_steps": 360, "total_steps": 5000, "loss": 0.6865, "lr": 3.6e-05, "epoch": 0.072, "percentage": 7.2, "elapsed_time": "0:01:32", "remaining_time": "0:19:50"}
37
+ {"current_steps": 370, "total_steps": 5000, "loss": 0.9106, "lr": 3.7e-05, "epoch": 0.074, "percentage": 7.4, "elapsed_time": "0:01:34", "remaining_time": "0:19:42"}
38
+ {"current_steps": 380, "total_steps": 5000, "loss": 1.4635, "lr": 3.8e-05, "epoch": 0.076, "percentage": 7.6, "elapsed_time": "0:01:37", "remaining_time": "0:19:40"}
39
+ {"current_steps": 390, "total_steps": 5000, "loss": 1.3782, "lr": 3.9000000000000006e-05, "epoch": 0.078, "percentage": 7.8, "elapsed_time": "0:01:39", "remaining_time": "0:19:38"}
40
+ {"current_steps": 400, "total_steps": 5000, "loss": 0.8778, "lr": 4e-05, "epoch": 0.08, "percentage": 8.0, "elapsed_time": "0:01:42", "remaining_time": "0:19:33"}
41
+ {"current_steps": 410, "total_steps": 5000, "loss": 0.8344, "lr": 4.1e-05, "epoch": 0.082, "percentage": 8.2, "elapsed_time": "0:01:44", "remaining_time": "0:19:25"}
42
+ {"current_steps": 420, "total_steps": 5000, "loss": 0.5733, "lr": 4.2e-05, "epoch": 0.084, "percentage": 8.4, "elapsed_time": "0:01:46", "remaining_time": "0:19:22"}
43
+ {"current_steps": 430, "total_steps": 5000, "loss": 0.6683, "lr": 4.3e-05, "epoch": 0.086, "percentage": 8.6, "elapsed_time": "0:01:48", "remaining_time": "0:19:17"}
44
+ {"current_steps": 440, "total_steps": 5000, "loss": 3.1046, "lr": 4.4000000000000006e-05, "epoch": 0.088, "percentage": 8.8, "elapsed_time": "0:01:50", "remaining_time": "0:19:09"}
45
+ {"current_steps": 450, "total_steps": 5000, "loss": 0.981, "lr": 4.5e-05, "epoch": 0.09, "percentage": 9.0, "elapsed_time": "0:01:53", "remaining_time": "0:19:11"}
46
+ {"current_steps": 460, "total_steps": 5000, "loss": 2.1118, "lr": 4.600000000000001e-05, "epoch": 0.092, "percentage": 9.2, "elapsed_time": "0:01:56", "remaining_time": "0:19:09"}
47
+ {"current_steps": 470, "total_steps": 5000, "loss": 0.3037, "lr": 4.7e-05, "epoch": 0.094, "percentage": 9.4, "elapsed_time": "0:01:58", "remaining_time": "0:18:58"}
48
+ {"current_steps": 480, "total_steps": 5000, "loss": 1.2638, "lr": 4.8e-05, "epoch": 0.096, "percentage": 9.6, "elapsed_time": "0:02:00", "remaining_time": "0:18:53"}
49
+ {"current_steps": 490, "total_steps": 5000, "loss": 2.3644, "lr": 4.9e-05, "epoch": 0.098, "percentage": 9.8, "elapsed_time": "0:02:02", "remaining_time": "0:18:49"}
50
+ {"current_steps": 500, "total_steps": 5000, "loss": 0.8317, "lr": 5e-05, "epoch": 0.1, "percentage": 10.0, "elapsed_time": "0:02:05", "remaining_time": "0:18:51"}
51
+ {"current_steps": 510, "total_steps": 5000, "loss": 2.1864, "lr": 4.999939076763487e-05, "epoch": 0.102, "percentage": 10.2, "elapsed_time": "0:02:08", "remaining_time": "0:18:47"}
52
+ {"current_steps": 520, "total_steps": 5000, "loss": 1.3502, "lr": 4.999756310023261e-05, "epoch": 0.104, "percentage": 10.4, "elapsed_time": "0:02:10", "remaining_time": "0:18:48"}
53
+ {"current_steps": 530, "total_steps": 5000, "loss": 1.161, "lr": 4.999451708687114e-05, "epoch": 0.106, "percentage": 10.6, "elapsed_time": "0:02:13", "remaining_time": "0:18:46"}
54
+ {"current_steps": 540, "total_steps": 5000, "loss": 0.7291, "lr": 4.999025287600886e-05, "epoch": 0.108, "percentage": 10.8, "elapsed_time": "0:02:16", "remaining_time": "0:18:45"}
55
+ {"current_steps": 550, "total_steps": 5000, "loss": 0.4316, "lr": 4.99847706754774e-05, "epoch": 0.11, "percentage": 11.0, "elapsed_time": "0:02:18", "remaining_time": "0:18:40"}
56
+ {"current_steps": 560, "total_steps": 5000, "loss": 1.2009, "lr": 4.997807075247146e-05, "epoch": 0.112, "percentage": 11.2, "elapsed_time": "0:02:21", "remaining_time": "0:18:42"}
57
+ {"current_steps": 570, "total_steps": 5000, "loss": 0.5649, "lr": 4.997015343353585e-05, "epoch": 0.114, "percentage": 11.4, "elapsed_time": "0:02:24", "remaining_time": "0:18:39"}
58
+ {"current_steps": 580, "total_steps": 5000, "loss": 0.4128, "lr": 4.996101910454953e-05, "epoch": 0.116, "percentage": 11.6, "elapsed_time": "0:02:25", "remaining_time": "0:18:32"}
59
+ {"current_steps": 590, "total_steps": 5000, "loss": 0.977, "lr": 4.995066821070679e-05, "epoch": 0.118, "percentage": 11.8, "elapsed_time": "0:02:28", "remaining_time": "0:18:28"}
60
+ {"current_steps": 600, "total_steps": 5000, "loss": 1.3527, "lr": 4.993910125649561e-05, "epoch": 0.12, "percentage": 12.0, "elapsed_time": "0:02:30", "remaining_time": "0:18:26"}
61
+ {"current_steps": 610, "total_steps": 5000, "loss": 0.6522, "lr": 4.992631880567301e-05, "epoch": 0.122, "percentage": 12.2, "elapsed_time": "0:02:33", "remaining_time": "0:18:25"}
62
+ {"current_steps": 620, "total_steps": 5000, "loss": 0.7698, "lr": 4.991232148123761e-05, "epoch": 0.124, "percentage": 12.4, "elapsed_time": "0:02:35", "remaining_time": "0:18:20"}
63
+ {"current_steps": 630, "total_steps": 5000, "loss": 0.9953, "lr": 4.989710996539926e-05, "epoch": 0.126, "percentage": 12.6, "elapsed_time": "0:02:38", "remaining_time": "0:18:17"}
64
+ {"current_steps": 640, "total_steps": 5000, "loss": 0.8877, "lr": 4.988068499954578e-05, "epoch": 0.128, "percentage": 12.8, "elapsed_time": "0:02:41", "remaining_time": "0:18:16"}
65
+ {"current_steps": 650, "total_steps": 5000, "loss": 0.564, "lr": 4.9863047384206835e-05, "epoch": 0.13, "percentage": 13.0, "elapsed_time": "0:02:44", "remaining_time": "0:18:18"}
66
+ {"current_steps": 660, "total_steps": 5000, "loss": 0.451, "lr": 4.984419797901491e-05, "epoch": 0.132, "percentage": 13.2, "elapsed_time": "0:02:46", "remaining_time": "0:18:13"}
67
+ {"current_steps": 670, "total_steps": 5000, "loss": 1.5067, "lr": 4.982413770266342e-05, "epoch": 0.134, "percentage": 13.4, "elapsed_time": "0:02:48", "remaining_time": "0:18:06"}
68
+ {"current_steps": 680, "total_steps": 5000, "loss": 1.6702, "lr": 4.980286753286195e-05, "epoch": 0.136, "percentage": 13.6, "elapsed_time": "0:02:50", "remaining_time": "0:18:06"}
69
+ {"current_steps": 690, "total_steps": 5000, "loss": 0.7115, "lr": 4.978038850628854e-05, "epoch": 0.138, "percentage": 13.8, "elapsed_time": "0:02:53", "remaining_time": "0:18:03"}
70
+ {"current_steps": 700, "total_steps": 5000, "loss": 0.9633, "lr": 4.975670171853926e-05, "epoch": 0.14, "percentage": 14.0, "elapsed_time": "0:02:55", "remaining_time": "0:17:59"}
71
+ {"current_steps": 710, "total_steps": 5000, "loss": 1.1906, "lr": 4.9731808324074717e-05, "epoch": 0.142, "percentage": 14.2, "elapsed_time": "0:02:58", "remaining_time": "0:17:57"}
72
+ {"current_steps": 720, "total_steps": 5000, "loss": 1.7433, "lr": 4.9705709536163824e-05, "epoch": 0.144, "percentage": 14.4, "elapsed_time": "0:03:01", "remaining_time": "0:17:56"}
73
+ {"current_steps": 730, "total_steps": 5000, "loss": 0.483, "lr": 4.96784066268247e-05, "epoch": 0.146, "percentage": 14.6, "elapsed_time": "0:03:03", "remaining_time": "0:17:54"}
74
+ {"current_steps": 740, "total_steps": 5000, "loss": 1.0321, "lr": 4.964990092676263e-05, "epoch": 0.148, "percentage": 14.8, "elapsed_time": "0:03:06", "remaining_time": "0:17:52"}
75
+ {"current_steps": 750, "total_steps": 5000, "loss": 1.0468, "lr": 4.962019382530521e-05, "epoch": 0.15, "percentage": 15.0, "elapsed_time": "0:03:08", "remaining_time": "0:17:48"}
76
+ {"current_steps": 760, "total_steps": 5000, "loss": 0.5741, "lr": 4.9589286770334654e-05, "epoch": 0.152, "percentage": 15.2, "elapsed_time": "0:03:10", "remaining_time": "0:17:44"}
77
+ {"current_steps": 770, "total_steps": 5000, "loss": 0.8734, "lr": 4.9557181268217227e-05, "epoch": 0.154, "percentage": 15.4, "elapsed_time": "0:03:13", "remaining_time": "0:17:40"}
78
+ {"current_steps": 780, "total_steps": 5000, "loss": 2.3025, "lr": 4.952387888372979e-05, "epoch": 0.156, "percentage": 15.6, "elapsed_time": "0:03:15", "remaining_time": "0:17:37"}
79
+ {"current_steps": 790, "total_steps": 5000, "loss": 1.3188, "lr": 4.94893812399836e-05, "epoch": 0.158, "percentage": 15.8, "elapsed_time": "0:03:17", "remaining_time": "0:17:34"}
80
+ {"current_steps": 800, "total_steps": 5000, "loss": 2.0348, "lr": 4.9453690018345144e-05, "epoch": 0.16, "percentage": 16.0, "elapsed_time": "0:03:20", "remaining_time": "0:17:31"}
81
+ {"current_steps": 810, "total_steps": 5000, "loss": 1.8253, "lr": 4.94168069583542e-05, "epoch": 0.162, "percentage": 16.2, "elapsed_time": "0:03:22", "remaining_time": "0:17:29"}
82
+ {"current_steps": 820, "total_steps": 5000, "loss": 0.7857, "lr": 4.937873385763908e-05, "epoch": 0.164, "percentage": 16.4, "elapsed_time": "0:03:25", "remaining_time": "0:17:26"}
83
+ {"current_steps": 830, "total_steps": 5000, "loss": 1.6698, "lr": 4.933947257182901e-05, "epoch": 0.166, "percentage": 16.6, "elapsed_time": "0:03:27", "remaining_time": "0:17:23"}
84
+ {"current_steps": 840, "total_steps": 5000, "loss": 1.3913, "lr": 4.929902501446366e-05, "epoch": 0.168, "percentage": 16.8, "elapsed_time": "0:03:30", "remaining_time": "0:17:23"}
85
+ {"current_steps": 850, "total_steps": 5000, "loss": 1.0676, "lr": 4.925739315689991e-05, "epoch": 0.17, "percentage": 17.0, "elapsed_time": "0:03:33", "remaining_time": "0:17:21"}
86
+ {"current_steps": 860, "total_steps": 5000, "loss": 1.6685, "lr": 4.9214579028215776e-05, "epoch": 0.172, "percentage": 17.2, "elapsed_time": "0:03:35", "remaining_time": "0:17:18"}
87
+ {"current_steps": 870, "total_steps": 5000, "loss": 1.4978, "lr": 4.917058471511149e-05, "epoch": 0.174, "percentage": 17.4, "elapsed_time": "0:03:38", "remaining_time": "0:17:17"}
88
+ {"current_steps": 880, "total_steps": 5000, "loss": 8.4068, "lr": 4.912541236180779e-05, "epoch": 0.176, "percentage": 17.6, "elapsed_time": "0:03:41", "remaining_time": "0:17:14"}
89
+ {"current_steps": 890, "total_steps": 5000, "loss": 1.0415, "lr": 4.907906416994146e-05, "epoch": 0.178, "percentage": 17.8, "elapsed_time": "0:03:43", "remaining_time": "0:17:11"}
90
+ {"current_steps": 900, "total_steps": 5000, "loss": 1.7801, "lr": 4.9031542398457974e-05, "epoch": 0.18, "percentage": 18.0, "elapsed_time": "0:03:46", "remaining_time": "0:17:11"}
91
+ {"current_steps": 910, "total_steps": 5000, "loss": 0.4855, "lr": 4.898284936350144e-05, "epoch": 0.182, "percentage": 18.2, "elapsed_time": "0:03:48", "remaining_time": "0:17:06"}
92
+ {"current_steps": 920, "total_steps": 5000, "loss": 0.6015, "lr": 4.893298743830168e-05, "epoch": 0.184, "percentage": 18.4, "elapsed_time": "0:03:50", "remaining_time": "0:17:03"}
93
+ {"current_steps": 930, "total_steps": 5000, "loss": 0.7006, "lr": 4.888195905305859e-05, "epoch": 0.186, "percentage": 18.6, "elapsed_time": "0:03:53", "remaining_time": "0:16:59"}
94
+ {"current_steps": 940, "total_steps": 5000, "loss": 0.3036, "lr": 4.882976669482367e-05, "epoch": 0.188, "percentage": 18.8, "elapsed_time": "0:03:54", "remaining_time": "0:16:54"}
95
+ {"current_steps": 950, "total_steps": 5000, "loss": 0.2464, "lr": 4.877641290737884e-05, "epoch": 0.19, "percentage": 19.0, "elapsed_time": "0:03:56", "remaining_time": "0:16:47"}
96
+ {"current_steps": 960, "total_steps": 5000, "loss": 0.4622, "lr": 4.8721900291112415e-05, "epoch": 0.192, "percentage": 19.2, "elapsed_time": "0:03:58", "remaining_time": "0:16:44"}
97
+ {"current_steps": 970, "total_steps": 5000, "loss": 0.5846, "lr": 4.8666231502892415e-05, "epoch": 0.194, "percentage": 19.4, "elapsed_time": "0:04:01", "remaining_time": "0:16:41"}
98
+ {"current_steps": 980, "total_steps": 5000, "loss": 0.5897, "lr": 4.860940925593703e-05, "epoch": 0.196, "percentage": 19.6, "elapsed_time": "0:04:03", "remaining_time": "0:16:39"}
99
+ {"current_steps": 990, "total_steps": 5000, "loss": 0.6564, "lr": 4.855143631968242e-05, "epoch": 0.198, "percentage": 19.8, "elapsed_time": "0:04:05", "remaining_time": "0:16:34"}
100
+ {"current_steps": 1000, "total_steps": 5000, "loss": 0.6761, "lr": 4.849231551964771e-05, "epoch": 0.2, "percentage": 20.0, "elapsed_time": "0:04:07", "remaining_time": "0:16:31"}
101
+ {"current_steps": 1010, "total_steps": 5000, "loss": 0.9705, "lr": 4.843204973729729e-05, "epoch": 0.202, "percentage": 20.2, "elapsed_time": "0:04:10", "remaining_time": "0:16:29"}
102
+ {"current_steps": 1020, "total_steps": 5000, "loss": 0.6534, "lr": 4.837064190990036e-05, "epoch": 0.204, "percentage": 20.4, "elapsed_time": "0:04:12", "remaining_time": "0:16:27"}
103
+ {"current_steps": 1030, "total_steps": 5000, "loss": 1.8363, "lr": 4.830809503038781e-05, "epoch": 0.206, "percentage": 20.6, "elapsed_time": "0:04:16", "remaining_time": "0:16:27"}
104
+ {"current_steps": 1040, "total_steps": 5000, "loss": 1.5076, "lr": 4.8244412147206284e-05, "epoch": 0.208, "percentage": 20.8, "elapsed_time": "0:04:17", "remaining_time": "0:16:22"}
105
+ {"current_steps": 1050, "total_steps": 5000, "loss": 0.9317, "lr": 4.817959636416969e-05, "epoch": 0.21, "percentage": 21.0, "elapsed_time": "0:04:20", "remaining_time": "0:16:20"}
106
+ {"current_steps": 1060, "total_steps": 5000, "loss": 1.2908, "lr": 4.8113650840307834e-05, "epoch": 0.212, "percentage": 21.2, "elapsed_time": "0:04:23", "remaining_time": "0:16:20"}
107
+ {"current_steps": 1070, "total_steps": 5000, "loss": 0.5742, "lr": 4.8046578789712515e-05, "epoch": 0.214, "percentage": 21.4, "elapsed_time": "0:04:26", "remaining_time": "0:16:19"}
108
+ {"current_steps": 1080, "total_steps": 5000, "loss": 1.2138, "lr": 4.797838348138086e-05, "epoch": 0.216, "percentage": 21.6, "elapsed_time": "0:04:28", "remaining_time": "0:16:14"}
109
+ {"current_steps": 1090, "total_steps": 5000, "loss": 1.9534, "lr": 4.790906823905599e-05, "epoch": 0.218, "percentage": 21.8, "elapsed_time": "0:04:31", "remaining_time": "0:16:13"}
110
+ {"current_steps": 1100, "total_steps": 5000, "loss": 1.2392, "lr": 4.783863644106502e-05, "epoch": 0.22, "percentage": 22.0, "elapsed_time": "0:04:33", "remaining_time": "0:16:10"}
111
+ {"current_steps": 1110, "total_steps": 5000, "loss": 0.7664, "lr": 4.776709152015443e-05, "epoch": 0.222, "percentage": 22.2, "elapsed_time": "0:04:35", "remaining_time": "0:16:05"}
112
+ {"current_steps": 1120, "total_steps": 5000, "loss": 0.6328, "lr": 4.769443696332272e-05, "epoch": 0.224, "percentage": 22.4, "elapsed_time": "0:04:37", "remaining_time": "0:16:00"}
113
+ {"current_steps": 1130, "total_steps": 5000, "loss": 1.3397, "lr": 4.762067631165049e-05, "epoch": 0.226, "percentage": 22.6, "elapsed_time": "0:04:39", "remaining_time": "0:15:56"}
114
+ {"current_steps": 1140, "total_steps": 5000, "loss": 0.5316, "lr": 4.754581316012785e-05, "epoch": 0.228, "percentage": 22.8, "elapsed_time": "0:04:41", "remaining_time": "0:15:52"}
115
+ {"current_steps": 1150, "total_steps": 5000, "loss": 1.7526, "lr": 4.7469851157479177e-05, "epoch": 0.23, "percentage": 23.0, "elapsed_time": "0:04:43", "remaining_time": "0:15:49"}
116
+ {"current_steps": 1160, "total_steps": 5000, "loss": 1.6985, "lr": 4.7392794005985326e-05, "epoch": 0.232, "percentage": 23.2, "elapsed_time": "0:04:46", "remaining_time": "0:15:49"}
117
+ {"current_steps": 1170, "total_steps": 5000, "loss": 1.7021, "lr": 4.731464546130314e-05, "epoch": 0.234, "percentage": 23.4, "elapsed_time": "0:04:49", "remaining_time": "0:15:47"}
118
+ {"current_steps": 1180, "total_steps": 5000, "loss": 0.6692, "lr": 4.723540933228244e-05, "epoch": 0.236, "percentage": 23.6, "elapsed_time": "0:04:52", "remaining_time": "0:15:45"}
119
+ {"current_steps": 1190, "total_steps": 5000, "loss": 0.8183, "lr": 4.715508948078037e-05, "epoch": 0.238, "percentage": 23.8, "elapsed_time": "0:04:54", "remaining_time": "0:15:43"}
120
+ {"current_steps": 1200, "total_steps": 5000, "loss": 0.7391, "lr": 4.707368982147318e-05, "epoch": 0.24, "percentage": 24.0, "elapsed_time": "0:04:56", "remaining_time": "0:15:39"}
121
+ {"current_steps": 1210, "total_steps": 5000, "loss": 1.0601, "lr": 4.6991214321665414e-05, "epoch": 0.242, "percentage": 24.2, "elapsed_time": "0:04:58", "remaining_time": "0:15:36"}
122
+ {"current_steps": 1220, "total_steps": 5000, "loss": 0.6689, "lr": 4.690766700109659e-05, "epoch": 0.244, "percentage": 24.4, "elapsed_time": "0:05:01", "remaining_time": "0:15:34"}
123
+ {"current_steps": 1230, "total_steps": 5000, "loss": 1.2384, "lr": 4.682305193174524e-05, "epoch": 0.246, "percentage": 24.6, "elapsed_time": "0:05:03", "remaining_time": "0:15:31"}
124
+ {"current_steps": 1240, "total_steps": 5000, "loss": 0.5366, "lr": 4.6737373237630476e-05, "epoch": 0.248, "percentage": 24.8, "elapsed_time": "0:05:05", "remaining_time": "0:15:27"}
125
+ {"current_steps": 1250, "total_steps": 5000, "loss": 0.9924, "lr": 4.665063509461097e-05, "epoch": 0.25, "percentage": 25.0, "elapsed_time": "0:05:08", "remaining_time": "0:15:24"}
126
+ {"current_steps": 1260, "total_steps": 5000, "loss": 1.1548, "lr": 4.656284173018144e-05, "epoch": 0.252, "percentage": 25.2, "elapsed_time": "0:05:10", "remaining_time": "0:15:21"}
127
+ {"current_steps": 1270, "total_steps": 5000, "loss": 0.798, "lr": 4.6473997423266614e-05, "epoch": 0.254, "percentage": 25.4, "elapsed_time": "0:05:12", "remaining_time": "0:15:18"}
128
+ {"current_steps": 1280, "total_steps": 5000, "loss": 0.8444, "lr": 4.638410650401267e-05, "epoch": 0.256, "percentage": 25.6, "elapsed_time": "0:05:15", "remaining_time": "0:15:18"}
129
+ {"current_steps": 1290, "total_steps": 5000, "loss": 1.4516, "lr": 4.629317335357619e-05, "epoch": 0.258, "percentage": 25.8, "elapsed_time": "0:05:18", "remaining_time": "0:15:16"}
130
+ {"current_steps": 1300, "total_steps": 5000, "loss": 0.4612, "lr": 4.620120240391065e-05, "epoch": 0.26, "percentage": 26.0, "elapsed_time": "0:05:21", "remaining_time": "0:15:13"}
131
+ {"current_steps": 1310, "total_steps": 5000, "loss": 0.8674, "lr": 4.610819813755038e-05, "epoch": 0.262, "percentage": 26.2, "elapsed_time": "0:05:23", "remaining_time": "0:15:10"}
132
+ {"current_steps": 1320, "total_steps": 5000, "loss": 0.8115, "lr": 4.601416508739211e-05, "epoch": 0.264, "percentage": 26.4, "elapsed_time": "0:05:26", "remaining_time": "0:15:09"}
133
+ {"current_steps": 1330, "total_steps": 5000, "loss": 0.4957, "lr": 4.591910783647404e-05, "epoch": 0.266, "percentage": 26.6, "elapsed_time": "0:05:29", "remaining_time": "0:15:07"}
134
+ {"current_steps": 1340, "total_steps": 5000, "loss": 0.862, "lr": 4.5823031017752485e-05, "epoch": 0.268, "percentage": 26.8, "elapsed_time": "0:05:31", "remaining_time": "0:15:04"}
135
+ {"current_steps": 1350, "total_steps": 5000, "loss": 0.2812, "lr": 4.572593931387604e-05, "epoch": 0.27, "percentage": 27.0, "elapsed_time": "0:05:32", "remaining_time": "0:14:59"}
136
+ {"current_steps": 1360, "total_steps": 5000, "loss": 2.1906, "lr": 4.562783745695738e-05, "epoch": 0.272, "percentage": 27.2, "elapsed_time": "0:05:35", "remaining_time": "0:14:57"}
137
+ {"current_steps": 1370, "total_steps": 5000, "loss": 0.9072, "lr": 4.5528730228342605e-05, "epoch": 0.274, "percentage": 27.4, "elapsed_time": "0:05:37", "remaining_time": "0:14:54"}
138
+ {"current_steps": 1380, "total_steps": 5000, "loss": 0.5203, "lr": 4.542862245837821e-05, "epoch": 0.276, "percentage": 27.6, "elapsed_time": "0:05:39", "remaining_time": "0:14:51"}
139
+ {"current_steps": 1390, "total_steps": 5000, "loss": 0.8603, "lr": 4.532751902617569e-05, "epoch": 0.278, "percentage": 27.8, "elapsed_time": "0:05:42", "remaining_time": "0:14:48"}
140
+ {"current_steps": 1400, "total_steps": 5000, "loss": 0.7643, "lr": 4.522542485937369e-05, "epoch": 0.28, "percentage": 28.0, "elapsed_time": "0:05:44", "remaining_time": "0:14:45"}
141
+ {"current_steps": 1410, "total_steps": 5000, "loss": 0.8807, "lr": 4.512234493389785e-05, "epoch": 0.282, "percentage": 28.2, "elapsed_time": "0:05:47", "remaining_time": "0:14:45"}
142
+ {"current_steps": 1420, "total_steps": 5000, "loss": 0.8509, "lr": 4.5018284273718336e-05, "epoch": 0.284, "percentage": 28.4, "elapsed_time": "0:05:50", "remaining_time": "0:14:43"}
143
+ {"current_steps": 1430, "total_steps": 5000, "loss": 1.7827, "lr": 4.491324795060491e-05, "epoch": 0.286, "percentage": 28.6, "elapsed_time": "0:05:53", "remaining_time": "0:14:41"}
144
+ {"current_steps": 1440, "total_steps": 5000, "loss": 0.7381, "lr": 4.480724108387977e-05, "epoch": 0.288, "percentage": 28.8, "elapsed_time": "0:05:56", "remaining_time": "0:14:40"}
145
+ {"current_steps": 1450, "total_steps": 5000, "loss": 1.691, "lr": 4.4700268840168045e-05, "epoch": 0.29, "percentage": 29.0, "elapsed_time": "0:05:58", "remaining_time": "0:14:38"}
146
+ {"current_steps": 1460, "total_steps": 5000, "loss": 2.0687, "lr": 4.4592336433146e-05, "epoch": 0.292, "percentage": 29.2, "elapsed_time": "0:06:01", "remaining_time": "0:14:35"}
147
+ {"current_steps": 1470, "total_steps": 5000, "loss": 1.8049, "lr": 4.448344912328686e-05, "epoch": 0.294, "percentage": 29.4, "elapsed_time": "0:06:04", "remaining_time": "0:14:34"}
148
+ {"current_steps": 1480, "total_steps": 5000, "loss": 0.8578, "lr": 4.4373612217604496e-05, "epoch": 0.296, "percentage": 29.6, "elapsed_time": "0:06:05", "remaining_time": "0:14:30"}
149
+ {"current_steps": 1490, "total_steps": 5000, "loss": 1.169, "lr": 4.426283106939474e-05, "epoch": 0.298, "percentage": 29.8, "elapsed_time": "0:06:08", "remaining_time": "0:14:28"}
150
+ {"current_steps": 1500, "total_steps": 5000, "loss": 0.3027, "lr": 4.415111107797445e-05, "epoch": 0.3, "percentage": 30.0, "elapsed_time": "0:06:11", "remaining_time": "0:14:26"}
151
+ {"current_steps": 1510, "total_steps": 5000, "loss": 1.0326, "lr": 4.403845768841842e-05, "epoch": 0.302, "percentage": 30.2, "elapsed_time": "0:06:14", "remaining_time": "0:14:24"}
152
+ {"current_steps": 1520, "total_steps": 5000, "loss": 1.0785, "lr": 4.3924876391293915e-05, "epoch": 0.304, "percentage": 30.4, "elapsed_time": "0:06:15", "remaining_time": "0:14:20"}
153
+ {"current_steps": 1530, "total_steps": 5000, "loss": 1.4215, "lr": 4.381037272239311e-05, "epoch": 0.306, "percentage": 30.6, "elapsed_time": "0:06:17", "remaining_time": "0:14:17"}
154
+ {"current_steps": 1540, "total_steps": 5000, "loss": 0.4891, "lr": 4.36949522624633e-05, "epoch": 0.308, "percentage": 30.8, "elapsed_time": "0:06:20", "remaining_time": "0:14:15"}
155
+ {"current_steps": 1550, "total_steps": 5000, "loss": 1.3672, "lr": 4.357862063693486e-05, "epoch": 0.31, "percentage": 31.0, "elapsed_time": "0:06:22", "remaining_time": "0:14:12"}
156
+ {"current_steps": 1560, "total_steps": 5000, "loss": 1.0202, "lr": 4.3461383515647106e-05, "epoch": 0.312, "percentage": 31.2, "elapsed_time": "0:06:25", "remaining_time": "0:14:10"}
157
+ {"current_steps": 1570, "total_steps": 5000, "loss": 0.9313, "lr": 4.334324661257191e-05, "epoch": 0.314, "percentage": 31.4, "elapsed_time": "0:06:28", "remaining_time": "0:14:07"}
158
+ {"current_steps": 1580, "total_steps": 5000, "loss": 0.4453, "lr": 4.3224215685535294e-05, "epoch": 0.316, "percentage": 31.6, "elapsed_time": "0:06:30", "remaining_time": "0:14:04"}
159
+ {"current_steps": 1590, "total_steps": 5000, "loss": 1.694, "lr": 4.3104296535936695e-05, "epoch": 0.318, "percentage": 31.8, "elapsed_time": "0:06:33", "remaining_time": "0:14:03"}
160
+ {"current_steps": 1600, "total_steps": 5000, "loss": 1.4264, "lr": 4.2983495008466276e-05, "epoch": 0.32, "percentage": 32.0, "elapsed_time": "0:06:36", "remaining_time": "0:14:01"}
161
+ {"current_steps": 1610, "total_steps": 5000, "loss": 0.5798, "lr": 4.2861816990820084e-05, "epoch": 0.322, "percentage": 32.2, "elapsed_time": "0:06:38", "remaining_time": "0:14:00"}
162
+ {"current_steps": 1620, "total_steps": 5000, "loss": 0.6301, "lr": 4.273926841341302e-05, "epoch": 0.324, "percentage": 32.4, "elapsed_time": "0:06:41", "remaining_time": "0:13:58"}
163
+ {"current_steps": 1630, "total_steps": 5000, "loss": 0.9712, "lr": 4.261585524908987e-05, "epoch": 0.326, "percentage": 32.6, "elapsed_time": "0:06:44", "remaining_time": "0:13:55"}
164
+ {"current_steps": 1640, "total_steps": 5000, "loss": 0.7751, "lr": 4.249158351283414e-05, "epoch": 0.328, "percentage": 32.8, "elapsed_time": "0:06:46", "remaining_time": "0:13:52"}
165
+ {"current_steps": 1650, "total_steps": 5000, "loss": 0.5724, "lr": 4.2366459261474933e-05, "epoch": 0.33, "percentage": 33.0, "elapsed_time": "0:06:48", "remaining_time": "0:13:48"}
166
+ {"current_steps": 1660, "total_steps": 5000, "loss": 0.9216, "lr": 4.224048859339175e-05, "epoch": 0.332, "percentage": 33.2, "elapsed_time": "0:06:50", "remaining_time": "0:13:45"}
167
+ {"current_steps": 1670, "total_steps": 5000, "loss": 0.5461, "lr": 4.211367764821722e-05, "epoch": 0.334, "percentage": 33.4, "elapsed_time": "0:06:52", "remaining_time": "0:13:42"}
168
+ {"current_steps": 1680, "total_steps": 5000, "loss": 1.0833, "lr": 4.198603260653792e-05, "epoch": 0.336, "percentage": 33.6, "elapsed_time": "0:06:55", "remaining_time": "0:13:40"}
169
+ {"current_steps": 1690, "total_steps": 5000, "loss": 1.6468, "lr": 4.185755968959308e-05, "epoch": 0.338, "percentage": 33.8, "elapsed_time": "0:06:58", "remaining_time": "0:13:38"}
170
+ {"current_steps": 1700, "total_steps": 5000, "loss": 0.6583, "lr": 4.172826515897146e-05, "epoch": 0.34, "percentage": 34.0, "elapsed_time": "0:07:00", "remaining_time": "0:13:35"}
171
+ {"current_steps": 1710, "total_steps": 5000, "loss": 1.1721, "lr": 4.1598155316306044e-05, "epoch": 0.342, "percentage": 34.2, "elapsed_time": "0:07:02", "remaining_time": "0:13:33"}
172
+ {"current_steps": 1720, "total_steps": 5000, "loss": 0.8588, "lr": 4.146723650296701e-05, "epoch": 0.344, "percentage": 34.4, "elapsed_time": "0:07:05", "remaining_time": "0:13:30"}
173
+ {"current_steps": 1730, "total_steps": 5000, "loss": 0.4645, "lr": 4.133551509975264e-05, "epoch": 0.346, "percentage": 34.6, "elapsed_time": "0:07:07", "remaining_time": "0:13:28"}
174
+ {"current_steps": 1740, "total_steps": 5000, "loss": 1.4741, "lr": 4.1202997526578276e-05, "epoch": 0.348, "percentage": 34.8, "elapsed_time": "0:07:10", "remaining_time": "0:13:26"}
175
+ {"current_steps": 1750, "total_steps": 5000, "loss": 0.9873, "lr": 4.1069690242163484e-05, "epoch": 0.35, "percentage": 35.0, "elapsed_time": "0:07:12", "remaining_time": "0:13:23"}
176
+ {"current_steps": 1760, "total_steps": 5000, "loss": 0.6202, "lr": 4.093559974371725e-05, "epoch": 0.352, "percentage": 35.2, "elapsed_time": "0:07:15", "remaining_time": "0:13:22"}
177
+ {"current_steps": 1770, "total_steps": 5000, "loss": 0.7872, "lr": 4.080073256662127e-05, "epoch": 0.354, "percentage": 35.4, "elapsed_time": "0:07:18", "remaining_time": "0:13:20"}
178
+ {"current_steps": 1780, "total_steps": 5000, "loss": 1.9155, "lr": 4.066509528411152e-05, "epoch": 0.356, "percentage": 35.6, "elapsed_time": "0:07:21", "remaining_time": "0:13:18"}
179
+ {"current_steps": 1790, "total_steps": 5000, "loss": 0.5979, "lr": 4.052869450695776e-05, "epoch": 0.358, "percentage": 35.8, "elapsed_time": "0:07:23", "remaining_time": "0:13:15"}
180
+ {"current_steps": 1800, "total_steps": 5000, "loss": 0.8478, "lr": 4.039153688314145e-05, "epoch": 0.36, "percentage": 36.0, "elapsed_time": "0:07:26", "remaining_time": "0:13:13"}
181
+ {"current_steps": 1810, "total_steps": 5000, "loss": 0.623, "lr": 4.02536290975317e-05, "epoch": 0.362, "percentage": 36.2, "elapsed_time": "0:07:28", "remaining_time": "0:13:10"}
182
+ {"current_steps": 1820, "total_steps": 5000, "loss": 1.4658, "lr": 4.011497787155938e-05, "epoch": 0.364, "percentage": 36.4, "elapsed_time": "0:07:30", "remaining_time": "0:13:07"}
183
+ {"current_steps": 1830, "total_steps": 5000, "loss": 1.9824, "lr": 3.997558996288965e-05, "epoch": 0.366, "percentage": 36.6, "elapsed_time": "0:07:33", "remaining_time": "0:13:04"}
184
+ {"current_steps": 1840, "total_steps": 5000, "loss": 0.626, "lr": 3.983547216509254e-05, "epoch": 0.368, "percentage": 36.8, "elapsed_time": "0:07:35", "remaining_time": "0:13:02"}
185
+ {"current_steps": 1850, "total_steps": 5000, "loss": 0.5819, "lr": 3.969463130731183e-05, "epoch": 0.37, "percentage": 37.0, "elapsed_time": "0:07:37", "remaining_time": "0:12:59"}
186
+ {"current_steps": 1860, "total_steps": 5000, "loss": 0.7553, "lr": 3.955307425393224e-05, "epoch": 0.372, "percentage": 37.2, "elapsed_time": "0:07:40", "remaining_time": "0:12:57"}
187
+ {"current_steps": 1870, "total_steps": 5000, "loss": 1.3426, "lr": 3.941080790424484e-05, "epoch": 0.374, "percentage": 37.4, "elapsed_time": "0:07:42", "remaining_time": "0:12:54"}
188
+ {"current_steps": 1880, "total_steps": 5000, "loss": 0.8386, "lr": 3.92678391921108e-05, "epoch": 0.376, "percentage": 37.6, "elapsed_time": "0:07:45", "remaining_time": "0:12:52"}
189
+ {"current_steps": 1890, "total_steps": 5000, "loss": 0.5088, "lr": 3.912417508562345e-05, "epoch": 0.378, "percentage": 37.8, "elapsed_time": "0:07:47", "remaining_time": "0:12:50"}
190
+ {"current_steps": 1900, "total_steps": 5000, "loss": 2.6207, "lr": 3.897982258676867e-05, "epoch": 0.38, "percentage": 38.0, "elapsed_time": "0:07:50", "remaining_time": "0:12:47"}
191
+ {"current_steps": 1910, "total_steps": 5000, "loss": 0.8114, "lr": 3.883478873108361e-05, "epoch": 0.382, "percentage": 38.2, "elapsed_time": "0:07:53", "remaining_time": "0:12:46"}
192
+ {"current_steps": 1920, "total_steps": 5000, "loss": 0.489, "lr": 3.868908058731376e-05, "epoch": 0.384, "percentage": 38.4, "elapsed_time": "0:07:55", "remaining_time": "0:12:43"}
193
+ {"current_steps": 1930, "total_steps": 5000, "loss": 0.6035, "lr": 3.85427052570685e-05, "epoch": 0.386, "percentage": 38.6, "elapsed_time": "0:07:58", "remaining_time": "0:12:40"}
194
+ {"current_steps": 1940, "total_steps": 5000, "loss": 0.9305, "lr": 3.8395669874474915e-05, "epoch": 0.388, "percentage": 38.8, "elapsed_time": "0:08:00", "remaining_time": "0:12:38"}
195
+ {"current_steps": 1950, "total_steps": 5000, "loss": 0.5244, "lr": 3.824798160583012e-05, "epoch": 0.39, "percentage": 39.0, "elapsed_time": "0:08:02", "remaining_time": "0:12:34"}
196
+ {"current_steps": 1960, "total_steps": 5000, "loss": 1.4947, "lr": 3.8099647649251986e-05, "epoch": 0.392, "percentage": 39.2, "elapsed_time": "0:08:05", "remaining_time": "0:12:32"}
197
+ {"current_steps": 1970, "total_steps": 5000, "loss": 0.6931, "lr": 3.795067523432826e-05, "epoch": 0.394, "percentage": 39.4, "elapsed_time": "0:08:07", "remaining_time": "0:12:30"}
198
+ {"current_steps": 1980, "total_steps": 5000, "loss": 0.8547, "lr": 3.780107162176429e-05, "epoch": 0.396, "percentage": 39.6, "elapsed_time": "0:08:09", "remaining_time": "0:12:27"}
199
+ {"current_steps": 1990, "total_steps": 5000, "loss": 2.4946, "lr": 3.765084410302909e-05, "epoch": 0.398, "percentage": 39.8, "elapsed_time": "0:08:12", "remaining_time": "0:12:25"}
200
+ {"current_steps": 2000, "total_steps": 5000, "loss": 1.2444, "lr": 3.7500000000000003e-05, "epoch": 0.4, "percentage": 40.0, "elapsed_time": "0:08:15", "remaining_time": "0:12:23"}
201
+ {"current_steps": 2010, "total_steps": 5000, "loss": 1.2603, "lr": 3.7348546664605777e-05, "epoch": 0.402, "percentage": 40.2, "elapsed_time": "0:08:18", "remaining_time": "0:12:21"}
202
+ {"current_steps": 2020, "total_steps": 5000, "loss": 0.5348, "lr": 3.719649147846832e-05, "epoch": 0.404, "percentage": 40.4, "elapsed_time": "0:08:20", "remaining_time": "0:12:19"}
203
+ {"current_steps": 2030, "total_steps": 5000, "loss": 0.6968, "lr": 3.704384185254288e-05, "epoch": 0.406, "percentage": 40.6, "elapsed_time": "0:08:23", "remaining_time": "0:12:16"}
204
+ {"current_steps": 2040, "total_steps": 5000, "loss": 2.8761, "lr": 3.689060522675689e-05, "epoch": 0.408, "percentage": 40.8, "elapsed_time": "0:08:25", "remaining_time": "0:12:13"}
205
+ {"current_steps": 2050, "total_steps": 5000, "loss": 1.6509, "lr": 3.673678906964727e-05, "epoch": 0.41, "percentage": 41.0, "elapsed_time": "0:08:27", "remaining_time": "0:12:10"}
206
+ {"current_steps": 2060, "total_steps": 5000, "loss": 0.8742, "lr": 3.6582400877996546e-05, "epoch": 0.412, "percentage": 41.2, "elapsed_time": "0:08:30", "remaining_time": "0:12:08"}
207
+ {"current_steps": 2070, "total_steps": 5000, "loss": 1.241, "lr": 3.642744817646736e-05, "epoch": 0.414, "percentage": 41.4, "elapsed_time": "0:08:33", "remaining_time": "0:12:06"}
208
+ {"current_steps": 2080, "total_steps": 5000, "loss": 0.6697, "lr": 3.627193851723577e-05, "epoch": 0.416, "percentage": 41.6, "elapsed_time": "0:08:36", "remaining_time": "0:12:04"}
209
+ {"current_steps": 2090, "total_steps": 5000, "loss": 0.8601, "lr": 3.611587947962319e-05, "epoch": 0.418, "percentage": 41.8, "elapsed_time": "0:08:38", "remaining_time": "0:12:01"}
210
+ {"current_steps": 2100, "total_steps": 5000, "loss": 1.6495, "lr": 3.5959278669726935e-05, "epoch": 0.42, "percentage": 42.0, "elapsed_time": "0:08:41", "remaining_time": "0:11:59"}
211
+ {"current_steps": 2110, "total_steps": 5000, "loss": 1.1695, "lr": 3.580214372004956e-05, "epoch": 0.422, "percentage": 42.2, "elapsed_time": "0:08:44", "remaining_time": "0:11:57"}
212
+ {"current_steps": 2120, "total_steps": 5000, "loss": 2.1528, "lr": 3.564448228912682e-05, "epoch": 0.424, "percentage": 42.4, "elapsed_time": "0:08:46", "remaining_time": "0:11:54"}
213
+ {"current_steps": 2130, "total_steps": 5000, "loss": 0.7449, "lr": 3.548630206115443e-05, "epoch": 0.426, "percentage": 42.6, "elapsed_time": "0:08:48", "remaining_time": "0:11:52"}
214
+ {"current_steps": 2140, "total_steps": 5000, "loss": 0.7268, "lr": 3.532761074561355e-05, "epoch": 0.428, "percentage": 42.8, "elapsed_time": "0:08:51", "remaining_time": "0:11:49"}
215
+ {"current_steps": 2150, "total_steps": 5000, "loss": 0.7794, "lr": 3.516841607689501e-05, "epoch": 0.43, "percentage": 43.0, "elapsed_time": "0:08:53", "remaining_time": "0:11:47"}
216
+ {"current_steps": 2160, "total_steps": 5000, "loss": 0.727, "lr": 3.5008725813922386e-05, "epoch": 0.432, "percentage": 43.2, "elapsed_time": "0:08:56", "remaining_time": "0:11:45"}
217
+ {"current_steps": 2170, "total_steps": 5000, "loss": 2.1652, "lr": 3.484854773977378e-05, "epoch": 0.434, "percentage": 43.4, "elapsed_time": "0:08:59", "remaining_time": "0:11:43"}
218
+ {"current_steps": 2180, "total_steps": 5000, "loss": 0.4095, "lr": 3.4687889661302576e-05, "epoch": 0.436, "percentage": 43.6, "elapsed_time": "0:09:02", "remaining_time": "0:11:41"}
219
+ {"current_steps": 2190, "total_steps": 5000, "loss": 0.8675, "lr": 3.452675940875686e-05, "epoch": 0.438, "percentage": 43.8, "elapsed_time": "0:09:04", "remaining_time": "0:11:38"}
220
+ {"current_steps": 2200, "total_steps": 5000, "loss": 0.8658, "lr": 3.436516483539781e-05, "epoch": 0.44, "percentage": 44.0, "elapsed_time": "0:09:07", "remaining_time": "0:11:36"}
221
+ {"current_steps": 2210, "total_steps": 5000, "loss": 0.3739, "lr": 3.4203113817116957e-05, "epoch": 0.442, "percentage": 44.2, "elapsed_time": "0:09:09", "remaining_time": "0:11:33"}
222
+ {"current_steps": 2220, "total_steps": 5000, "loss": 0.8909, "lr": 3.4040614252052305e-05, "epoch": 0.444, "percentage": 44.4, "elapsed_time": "0:09:12", "remaining_time": "0:11:31"}
223
+ {"current_steps": 2230, "total_steps": 5000, "loss": 1.4721, "lr": 3.387767406020343e-05, "epoch": 0.446, "percentage": 44.6, "elapsed_time": "0:09:15", "remaining_time": "0:11:29"}
224
+ {"current_steps": 2240, "total_steps": 5000, "loss": 1.001, "lr": 3.3714301183045385e-05, "epoch": 0.448, "percentage": 44.8, "elapsed_time": "0:09:18", "remaining_time": "0:11:27"}
225
+ {"current_steps": 2250, "total_steps": 5000, "loss": 0.7794, "lr": 3.355050358314172e-05, "epoch": 0.45, "percentage": 45.0, "elapsed_time": "0:09:20", "remaining_time": "0:11:25"}
226
+ {"current_steps": 2260, "total_steps": 5000, "loss": 0.3814, "lr": 3.338628924375638e-05, "epoch": 0.452, "percentage": 45.2, "elapsed_time": "0:09:22", "remaining_time": "0:11:22"}
227
+ {"current_steps": 2270, "total_steps": 5000, "loss": 0.5824, "lr": 3.322166616846458e-05, "epoch": 0.454, "percentage": 45.4, "elapsed_time": "0:09:25", "remaining_time": "0:11:20"}
228
+ {"current_steps": 2280, "total_steps": 5000, "loss": 1.3243, "lr": 3.305664238076278e-05, "epoch": 0.456, "percentage": 45.6, "elapsed_time": "0:09:28", "remaining_time": "0:11:17"}
229
+ {"current_steps": 2290, "total_steps": 5000, "loss": 0.4481, "lr": 3.289122592367757e-05, "epoch": 0.458, "percentage": 45.8, "elapsed_time": "0:09:30", "remaining_time": "0:11:14"}
230
+ {"current_steps": 2300, "total_steps": 5000, "loss": 0.5601, "lr": 3.272542485937369e-05, "epoch": 0.46, "percentage": 46.0, "elapsed_time": "0:09:32", "remaining_time": "0:11:12"}
231
+ {"current_steps": 2310, "total_steps": 5000, "loss": 0.6695, "lr": 3.2559247268761115e-05, "epoch": 0.462, "percentage": 46.2, "elapsed_time": "0:09:34", "remaining_time": "0:11:09"}
232
+ {"current_steps": 2320, "total_steps": 5000, "loss": 0.9525, "lr": 3.239270125110117e-05, "epoch": 0.464, "percentage": 46.4, "elapsed_time": "0:09:37", "remaining_time": "0:11:07"}
233
+ {"current_steps": 2330, "total_steps": 5000, "loss": 0.5488, "lr": 3.222579492361179e-05, "epoch": 0.466, "percentage": 46.6, "elapsed_time": "0:09:39", "remaining_time": "0:11:04"}
234
+ {"current_steps": 2340, "total_steps": 5000, "loss": 0.8862, "lr": 3.205853642107192e-05, "epoch": 0.468, "percentage": 46.8, "elapsed_time": "0:09:41", "remaining_time": "0:11:01"}
235
+ {"current_steps": 2350, "total_steps": 5000, "loss": 1.1218, "lr": 3.1890933895424976e-05, "epoch": 0.47, "percentage": 47.0, "elapsed_time": "0:09:44", "remaining_time": "0:10:58"}
236
+ {"current_steps": 2360, "total_steps": 5000, "loss": 1.4365, "lr": 3.172299551538164e-05, "epoch": 0.472, "percentage": 47.2, "elapsed_time": "0:09:47", "remaining_time": "0:10:56"}
237
+ {"current_steps": 2370, "total_steps": 5000, "loss": 1.4355, "lr": 3.155472946602162e-05, "epoch": 0.474, "percentage": 47.4, "elapsed_time": "0:09:49", "remaining_time": "0:10:54"}
238
+ {"current_steps": 2380, "total_steps": 5000, "loss": 1.6462, "lr": 3.138614394839476e-05, "epoch": 0.476, "percentage": 47.6, "elapsed_time": "0:09:52", "remaining_time": "0:10:52"}
239
+ {"current_steps": 2390, "total_steps": 5000, "loss": 0.6112, "lr": 3.121724717912138e-05, "epoch": 0.478, "percentage": 47.8, "elapsed_time": "0:09:54", "remaining_time": "0:10:48"}
240
+ {"current_steps": 2400, "total_steps": 5000, "loss": 0.2338, "lr": 3.104804738999169e-05, "epoch": 0.48, "percentage": 48.0, "elapsed_time": "0:09:56", "remaining_time": "0:10:45"}
241
+ {"current_steps": 2410, "total_steps": 5000, "loss": 0.5969, "lr": 3.087855282756475e-05, "epoch": 0.482, "percentage": 48.2, "elapsed_time": "0:09:59", "remaining_time": "0:10:44"}
242
+ {"current_steps": 2420, "total_steps": 5000, "loss": 0.8462, "lr": 3.0708771752766394e-05, "epoch": 0.484, "percentage": 48.4, "elapsed_time": "0:10:01", "remaining_time": "0:10:41"}
243
+ {"current_steps": 2430, "total_steps": 5000, "loss": 0.4652, "lr": 3.053871244048669e-05, "epoch": 0.486, "percentage": 48.6, "elapsed_time": "0:10:04", "remaining_time": "0:10:38"}
244
+ {"current_steps": 2440, "total_steps": 5000, "loss": 0.212, "lr": 3.0368383179176585e-05, "epoch": 0.488, "percentage": 48.8, "elapsed_time": "0:10:06", "remaining_time": "0:10:36"}
245
+ {"current_steps": 2450, "total_steps": 5000, "loss": 0.697, "lr": 3.0197792270443982e-05, "epoch": 0.49, "percentage": 49.0, "elapsed_time": "0:10:09", "remaining_time": "0:10:34"}
246
+ {"current_steps": 2460, "total_steps": 5000, "loss": 1.3351, "lr": 3.002694802864912e-05, "epoch": 0.492, "percentage": 49.2, "elapsed_time": "0:10:12", "remaining_time": "0:10:32"}
247
+ {"current_steps": 2470, "total_steps": 5000, "loss": 1.8147, "lr": 2.98558587804993e-05, "epoch": 0.494, "percentage": 49.4, "elapsed_time": "0:10:15", "remaining_time": "0:10:29"}
248
+ {"current_steps": 2480, "total_steps": 5000, "loss": 0.9978, "lr": 2.9684532864643122e-05, "epoch": 0.496, "percentage": 49.6, "elapsed_time": "0:10:18", "remaining_time": "0:10:28"}
249
+ {"current_steps": 2490, "total_steps": 5000, "loss": 1.6129, "lr": 2.9512978631264006e-05, "epoch": 0.498, "percentage": 49.8, "elapsed_time": "0:10:20", "remaining_time": "0:10:25"}
250
+ {"current_steps": 2500, "total_steps": 5000, "loss": 0.745, "lr": 2.9341204441673266e-05, "epoch": 0.5, "percentage": 50.0, "elapsed_time": "0:10:23", "remaining_time": "0:10:23"}
251
+ {"current_steps": 2510, "total_steps": 5000, "loss": 0.5694, "lr": 2.916921866790256e-05, "epoch": 0.502, "percentage": 50.2, "elapsed_time": "0:10:25", "remaining_time": "0:10:20"}
252
+ {"current_steps": 2520, "total_steps": 5000, "loss": 0.453, "lr": 2.8997029692295874e-05, "epoch": 0.504, "percentage": 50.4, "elapsed_time": "0:10:27", "remaining_time": "0:10:17"}
253
+ {"current_steps": 2530, "total_steps": 5000, "loss": 0.7539, "lr": 2.8824645907100954e-05, "epoch": 0.506, "percentage": 50.6, "elapsed_time": "0:10:29", "remaining_time": "0:10:14"}
254
+ {"current_steps": 2540, "total_steps": 5000, "loss": 0.5247, "lr": 2.8652075714060295e-05, "epoch": 0.508, "percentage": 50.8, "elapsed_time": "0:10:31", "remaining_time": "0:10:11"}
255
+ {"current_steps": 2550, "total_steps": 5000, "loss": 0.7769, "lr": 2.8479327524001636e-05, "epoch": 0.51, "percentage": 51.0, "elapsed_time": "0:10:33", "remaining_time": "0:10:08"}
256
+ {"current_steps": 2560, "total_steps": 5000, "loss": 0.7204, "lr": 2.8306409756428064e-05, "epoch": 0.512, "percentage": 51.2, "elapsed_time": "0:10:36", "remaining_time": "0:10:06"}
257
+ {"current_steps": 2570, "total_steps": 5000, "loss": 0.9135, "lr": 2.8133330839107608e-05, "epoch": 0.514, "percentage": 51.4, "elapsed_time": "0:10:39", "remaining_time": "0:10:04"}
258
+ {"current_steps": 2580, "total_steps": 5000, "loss": 0.5892, "lr": 2.7960099207662532e-05, "epoch": 0.516, "percentage": 51.6, "elapsed_time": "0:10:41", "remaining_time": "0:10:01"}
259
+ {"current_steps": 2590, "total_steps": 5000, "loss": 0.5568, "lr": 2.7786723305158136e-05, "epoch": 0.518, "percentage": 51.8, "elapsed_time": "0:10:43", "remaining_time": "0:09:58"}
260
+ {"current_steps": 2600, "total_steps": 5000, "loss": 1.3712, "lr": 2.761321158169134e-05, "epoch": 0.52, "percentage": 52.0, "elapsed_time": "0:10:46", "remaining_time": "0:09:56"}
261
+ {"current_steps": 2610, "total_steps": 5000, "loss": 0.9695, "lr": 2.7439572493978736e-05, "epoch": 0.522, "percentage": 52.2, "elapsed_time": "0:10:48", "remaining_time": "0:09:54"}
262
+ {"current_steps": 2620, "total_steps": 5000, "loss": 0.7138, "lr": 2.726581450494451e-05, "epoch": 0.524, "percentage": 52.4, "elapsed_time": "0:10:51", "remaining_time": "0:09:51"}
263
+ {"current_steps": 2630, "total_steps": 5000, "loss": 1.0675, "lr": 2.7091946083307896e-05, "epoch": 0.526, "percentage": 52.6, "elapsed_time": "0:10:53", "remaining_time": "0:09:48"}
264
+ {"current_steps": 2640, "total_steps": 5000, "loss": 0.8781, "lr": 2.6917975703170466e-05, "epoch": 0.528, "percentage": 52.8, "elapsed_time": "0:10:55", "remaining_time": "0:09:46"}
265
+ {"current_steps": 2650, "total_steps": 5000, "loss": 0.5354, "lr": 2.674391184360313e-05, "epoch": 0.53, "percentage": 53.0, "elapsed_time": "0:10:58", "remaining_time": "0:09:43"}
266
+ {"current_steps": 2660, "total_steps": 5000, "loss": 0.456, "lr": 2.656976298823284e-05, "epoch": 0.532, "percentage": 53.2, "elapsed_time": "0:11:00", "remaining_time": "0:09:41"}
267
+ {"current_steps": 2670, "total_steps": 5000, "loss": 2.0864, "lr": 2.6395537624829096e-05, "epoch": 0.534, "percentage": 53.4, "elapsed_time": "0:11:02", "remaining_time": "0:09:38"}
268
+ {"current_steps": 2680, "total_steps": 5000, "loss": 0.8204, "lr": 2.6221244244890336e-05, "epoch": 0.536, "percentage": 53.6, "elapsed_time": "0:11:05", "remaining_time": "0:09:35"}
269
+ {"current_steps": 2690, "total_steps": 5000, "loss": 0.6519, "lr": 2.604689134322999e-05, "epoch": 0.538, "percentage": 53.8, "elapsed_time": "0:11:06", "remaining_time": "0:09:32"}
270
+ {"current_steps": 2700, "total_steps": 5000, "loss": 0.7875, "lr": 2.587248741756253e-05, "epoch": 0.54, "percentage": 54.0, "elapsed_time": "0:11:08", "remaining_time": "0:09:29"}
271
+ {"current_steps": 2710, "total_steps": 5000, "loss": 1.4801, "lr": 2.5698040968089225e-05, "epoch": 0.542, "percentage": 54.2, "elapsed_time": "0:11:11", "remaining_time": "0:09:27"}
272
+ {"current_steps": 2720, "total_steps": 5000, "loss": 0.9097, "lr": 2.5523560497083926e-05, "epoch": 0.544, "percentage": 54.4, "elapsed_time": "0:11:14", "remaining_time": "0:09:25"}
273
+ {"current_steps": 2730, "total_steps": 5000, "loss": 0.5698, "lr": 2.5349054508478637e-05, "epoch": 0.546, "percentage": 54.6, "elapsed_time": "0:11:16", "remaining_time": "0:09:22"}
274
+ {"current_steps": 2740, "total_steps": 5000, "loss": 0.4508, "lr": 2.517453150744904e-05, "epoch": 0.548, "percentage": 54.8, "elapsed_time": "0:11:17", "remaining_time": "0:09:19"}
275
+ {"current_steps": 2750, "total_steps": 5000, "loss": 0.4544, "lr": 2.5e-05, "epoch": 0.55, "percentage": 55.0, "elapsed_time": "0:11:20", "remaining_time": "0:09:16"}
276
+ {"current_steps": 2760, "total_steps": 5000, "loss": 1.0367, "lr": 2.4825468492550964e-05, "epoch": 0.552, "percentage": 55.2, "elapsed_time": "0:11:23", "remaining_time": "0:09:14"}
277
+ {"current_steps": 2770, "total_steps": 5000, "loss": 5.9188, "lr": 2.4650945491521372e-05, "epoch": 0.554, "percentage": 55.4, "elapsed_time": "0:11:25", "remaining_time": "0:09:11"}
278
+ {"current_steps": 2780, "total_steps": 5000, "loss": 0.784, "lr": 2.447643950291608e-05, "epoch": 0.556, "percentage": 55.6, "elapsed_time": "0:11:28", "remaining_time": "0:09:09"}
279
+ {"current_steps": 2790, "total_steps": 5000, "loss": 1.3575, "lr": 2.4301959031910784e-05, "epoch": 0.558, "percentage": 55.8, "elapsed_time": "0:11:31", "remaining_time": "0:09:07"}
280
+ {"current_steps": 2800, "total_steps": 5000, "loss": 1.2835, "lr": 2.4127512582437485e-05, "epoch": 0.56, "percentage": 56.0, "elapsed_time": "0:11:33", "remaining_time": "0:09:04"}
281
+ {"current_steps": 2810, "total_steps": 5000, "loss": 2.173, "lr": 2.3953108656770016e-05, "epoch": 0.562, "percentage": 56.2, "elapsed_time": "0:11:35", "remaining_time": "0:09:02"}
282
+ {"current_steps": 2820, "total_steps": 5000, "loss": 0.354, "lr": 2.377875575510967e-05, "epoch": 0.564, "percentage": 56.4, "elapsed_time": "0:11:38", "remaining_time": "0:08:59"}
283
+ {"current_steps": 2830, "total_steps": 5000, "loss": 0.8905, "lr": 2.3604462375170906e-05, "epoch": 0.566, "percentage": 56.6, "elapsed_time": "0:11:41", "remaining_time": "0:08:57"}
284
+ {"current_steps": 2840, "total_steps": 5000, "loss": 0.9289, "lr": 2.3430237011767167e-05, "epoch": 0.568, "percentage": 56.8, "elapsed_time": "0:11:44", "remaining_time": "0:08:55"}
285
+ {"current_steps": 2850, "total_steps": 5000, "loss": 0.512, "lr": 2.3256088156396868e-05, "epoch": 0.57, "percentage": 57.0, "elapsed_time": "0:11:46", "remaining_time": "0:08:52"}
286
+ {"current_steps": 2860, "total_steps": 5000, "loss": 0.6724, "lr": 2.3082024296829536e-05, "epoch": 0.572, "percentage": 57.2, "elapsed_time": "0:11:49", "remaining_time": "0:08:50"}
287
+ {"current_steps": 2870, "total_steps": 5000, "loss": 0.7876, "lr": 2.2908053916692117e-05, "epoch": 0.574, "percentage": 57.4, "elapsed_time": "0:11:51", "remaining_time": "0:08:48"}
288
+ {"current_steps": 2880, "total_steps": 5000, "loss": 1.3706, "lr": 2.2734185495055503e-05, "epoch": 0.576, "percentage": 57.6, "elapsed_time": "0:11:54", "remaining_time": "0:08:45"}
289
+ {"current_steps": 2890, "total_steps": 5000, "loss": 1.0091, "lr": 2.2560427506021266e-05, "epoch": 0.578, "percentage": 57.8, "elapsed_time": "0:11:57", "remaining_time": "0:08:43"}
290
+ {"current_steps": 2900, "total_steps": 5000, "loss": 0.4665, "lr": 2.238678841830867e-05, "epoch": 0.58, "percentage": 58.0, "elapsed_time": "0:12:00", "remaining_time": "0:08:41"}
291
+ {"current_steps": 2910, "total_steps": 5000, "loss": 1.412, "lr": 2.2213276694841866e-05, "epoch": 0.582, "percentage": 58.2, "elapsed_time": "0:12:02", "remaining_time": "0:08:38"}
292
+ {"current_steps": 2920, "total_steps": 5000, "loss": 1.3582, "lr": 2.2039900792337474e-05, "epoch": 0.584, "percentage": 58.4, "elapsed_time": "0:12:04", "remaining_time": "0:08:36"}
293
+ {"current_steps": 2930, "total_steps": 5000, "loss": 0.3297, "lr": 2.186666916089239e-05, "epoch": 0.586, "percentage": 58.6, "elapsed_time": "0:12:06", "remaining_time": "0:08:33"}
294
+ {"current_steps": 2940, "total_steps": 5000, "loss": 0.5486, "lr": 2.1693590243571938e-05, "epoch": 0.588, "percentage": 58.8, "elapsed_time": "0:12:08", "remaining_time": "0:08:30"}
295
+ {"current_steps": 2950, "total_steps": 5000, "loss": 0.341, "lr": 2.1520672475998373e-05, "epoch": 0.59, "percentage": 59.0, "elapsed_time": "0:12:11", "remaining_time": "0:08:28"}
296
+ {"current_steps": 2960, "total_steps": 5000, "loss": 1.8438, "lr": 2.1347924285939714e-05, "epoch": 0.592, "percentage": 59.2, "elapsed_time": "0:12:14", "remaining_time": "0:08:26"}
297
+ {"current_steps": 2970, "total_steps": 5000, "loss": 1.0661, "lr": 2.117535409289905e-05, "epoch": 0.594, "percentage": 59.4, "elapsed_time": "0:12:17", "remaining_time": "0:08:23"}
298
+ {"current_steps": 2980, "total_steps": 5000, "loss": 0.5445, "lr": 2.1002970307704132e-05, "epoch": 0.596, "percentage": 59.6, "elapsed_time": "0:12:19", "remaining_time": "0:08:21"}
299
+ {"current_steps": 2990, "total_steps": 5000, "loss": 0.735, "lr": 2.0830781332097446e-05, "epoch": 0.598, "percentage": 59.8, "elapsed_time": "0:12:22", "remaining_time": "0:08:18"}
300
+ {"current_steps": 3000, "total_steps": 5000, "loss": 1.2389, "lr": 2.0658795558326743e-05, "epoch": 0.6, "percentage": 60.0, "elapsed_time": "0:12:24", "remaining_time": "0:08:16"}
301
+ {"current_steps": 3010, "total_steps": 5000, "loss": 0.6412, "lr": 2.0487021368736003e-05, "epoch": 0.602, "percentage": 60.2, "elapsed_time": "0:12:27", "remaining_time": "0:08:14"}
302
+ {"current_steps": 3020, "total_steps": 5000, "loss": 0.6114, "lr": 2.031546713535688e-05, "epoch": 0.604, "percentage": 60.4, "elapsed_time": "0:12:30", "remaining_time": "0:08:11"}
303
+ {"current_steps": 3030, "total_steps": 5000, "loss": 0.846, "lr": 2.0144141219500705e-05, "epoch": 0.606, "percentage": 60.6, "elapsed_time": "0:12:32", "remaining_time": "0:08:09"}
304
+ {"current_steps": 3040, "total_steps": 5000, "loss": 1.1015, "lr": 1.9973051971350888e-05, "epoch": 0.608, "percentage": 60.8, "elapsed_time": "0:12:35", "remaining_time": "0:08:07"}
305
+ {"current_steps": 3050, "total_steps": 5000, "loss": 0.2744, "lr": 1.980220772955602e-05, "epoch": 0.61, "percentage": 61.0, "elapsed_time": "0:12:37", "remaining_time": "0:08:04"}
306
+ {"current_steps": 3060, "total_steps": 5000, "loss": 1.048, "lr": 1.963161682082342e-05, "epoch": 0.612, "percentage": 61.2, "elapsed_time": "0:12:40", "remaining_time": "0:08:02"}
307
+ {"current_steps": 3070, "total_steps": 5000, "loss": 1.5106, "lr": 1.946128755951332e-05, "epoch": 0.614, "percentage": 61.4, "elapsed_time": "0:12:43", "remaining_time": "0:07:59"}
308
+ {"current_steps": 3080, "total_steps": 5000, "loss": 1.3313, "lr": 1.9291228247233605e-05, "epoch": 0.616, "percentage": 61.6, "elapsed_time": "0:12:45", "remaining_time": "0:07:57"}
309
+ {"current_steps": 3090, "total_steps": 5000, "loss": 1.1983, "lr": 1.912144717243525e-05, "epoch": 0.618, "percentage": 61.8, "elapsed_time": "0:12:48", "remaining_time": "0:07:55"}
310
+ {"current_steps": 3100, "total_steps": 5000, "loss": 0.8728, "lr": 1.895195261000831e-05, "epoch": 0.62, "percentage": 62.0, "elapsed_time": "0:12:51", "remaining_time": "0:07:52"}
311
+ {"current_steps": 3110, "total_steps": 5000, "loss": 0.6307, "lr": 1.8782752820878634e-05, "epoch": 0.622, "percentage": 62.2, "elapsed_time": "0:12:53", "remaining_time": "0:07:50"}
312
+ {"current_steps": 3120, "total_steps": 5000, "loss": 0.7477, "lr": 1.8613856051605243e-05, "epoch": 0.624, "percentage": 62.4, "elapsed_time": "0:12:56", "remaining_time": "0:07:47"}
313
+ {"current_steps": 3130, "total_steps": 5000, "loss": 1.0535, "lr": 1.8445270533978388e-05, "epoch": 0.626, "percentage": 62.6, "elapsed_time": "0:12:58", "remaining_time": "0:07:45"}
314
+ {"current_steps": 3140, "total_steps": 5000, "loss": 1.0675, "lr": 1.827700448461836e-05, "epoch": 0.628, "percentage": 62.8, "elapsed_time": "0:13:01", "remaining_time": "0:07:43"}
315
+ {"current_steps": 3150, "total_steps": 5000, "loss": 1.6361, "lr": 1.8109066104575023e-05, "epoch": 0.63, "percentage": 63.0, "elapsed_time": "0:13:03", "remaining_time": "0:07:40"}
316
+ {"current_steps": 3160, "total_steps": 5000, "loss": 1.1624, "lr": 1.7941463578928086e-05, "epoch": 0.632, "percentage": 63.2, "elapsed_time": "0:13:06", "remaining_time": "0:07:38"}
317
+ {"current_steps": 3170, "total_steps": 5000, "loss": 0.8855, "lr": 1.7774205076388206e-05, "epoch": 0.634, "percentage": 63.4, "elapsed_time": "0:13:08", "remaining_time": "0:07:35"}
318
+ {"current_steps": 3180, "total_steps": 5000, "loss": 1.0965, "lr": 1.7607298748898842e-05, "epoch": 0.636, "percentage": 63.6, "elapsed_time": "0:13:11", "remaining_time": "0:07:33"}
319
+ {"current_steps": 3190, "total_steps": 5000, "loss": 0.3191, "lr": 1.744075273123889e-05, "epoch": 0.638, "percentage": 63.8, "elapsed_time": "0:13:13", "remaining_time": "0:07:30"}
320
+ {"current_steps": 3200, "total_steps": 5000, "loss": 0.6535, "lr": 1.7274575140626318e-05, "epoch": 0.64, "percentage": 64.0, "elapsed_time": "0:13:15", "remaining_time": "0:07:27"}
321
+ {"current_steps": 3210, "total_steps": 5000, "loss": 0.4069, "lr": 1.7108774076322443e-05, "epoch": 0.642, "percentage": 64.2, "elapsed_time": "0:13:16", "remaining_time": "0:07:24"}
322
+ {"current_steps": 3220, "total_steps": 5000, "loss": 0.5898, "lr": 1.6943357619237226e-05, "epoch": 0.644, "percentage": 64.4, "elapsed_time": "0:13:18", "remaining_time": "0:07:21"}
323
+ {"current_steps": 3230, "total_steps": 5000, "loss": 0.3429, "lr": 1.677833383153542e-05, "epoch": 0.646, "percentage": 64.6, "elapsed_time": "0:13:21", "remaining_time": "0:07:19"}
324
+ {"current_steps": 3240, "total_steps": 5000, "loss": 0.6403, "lr": 1.6613710756243626e-05, "epoch": 0.648, "percentage": 64.8, "elapsed_time": "0:13:23", "remaining_time": "0:07:16"}
325
+ {"current_steps": 3250, "total_steps": 5000, "loss": 1.1898, "lr": 1.6449496416858284e-05, "epoch": 0.65, "percentage": 65.0, "elapsed_time": "0:13:26", "remaining_time": "0:07:14"}
326
+ {"current_steps": 3260, "total_steps": 5000, "loss": 0.4049, "lr": 1.6285698816954624e-05, "epoch": 0.652, "percentage": 65.2, "elapsed_time": "0:13:28", "remaining_time": "0:07:11"}
327
+ {"current_steps": 3270, "total_steps": 5000, "loss": 0.6081, "lr": 1.612232593979658e-05, "epoch": 0.654, "percentage": 65.4, "elapsed_time": "0:13:30", "remaining_time": "0:07:09"}
328
+ {"current_steps": 3280, "total_steps": 5000, "loss": 0.8107, "lr": 1.5959385747947698e-05, "epoch": 0.656, "percentage": 65.6, "elapsed_time": "0:13:33", "remaining_time": "0:07:06"}
329
+ {"current_steps": 3290, "total_steps": 5000, "loss": 0.6606, "lr": 1.5796886182883053e-05, "epoch": 0.658, "percentage": 65.8, "elapsed_time": "0:13:35", "remaining_time": "0:07:03"}
330
+ {"current_steps": 3300, "total_steps": 5000, "loss": 0.4767, "lr": 1.56348351646022e-05, "epoch": 0.66, "percentage": 66.0, "elapsed_time": "0:13:37", "remaining_time": "0:07:01"}
331
+ {"current_steps": 3310, "total_steps": 5000, "loss": 0.988, "lr": 1.547324059124315e-05, "epoch": 0.662, "percentage": 66.2, "elapsed_time": "0:13:39", "remaining_time": "0:06:58"}
332
+ {"current_steps": 3320, "total_steps": 5000, "loss": 1.3237, "lr": 1.5312110338697426e-05, "epoch": 0.664, "percentage": 66.4, "elapsed_time": "0:13:42", "remaining_time": "0:06:56"}
333
+ {"current_steps": 3330, "total_steps": 5000, "loss": 0.4173, "lr": 1.5151452260226224e-05, "epoch": 0.666, "percentage": 66.6, "elapsed_time": "0:13:44", "remaining_time": "0:06:53"}
334
+ {"current_steps": 3340, "total_steps": 5000, "loss": 0.7819, "lr": 1.4991274186077632e-05, "epoch": 0.668, "percentage": 66.8, "elapsed_time": "0:13:47", "remaining_time": "0:06:51"}
335
+ {"current_steps": 3350, "total_steps": 5000, "loss": 0.334, "lr": 1.4831583923104999e-05, "epoch": 0.67, "percentage": 67.0, "elapsed_time": "0:13:48", "remaining_time": "0:06:48"}
336
+ {"current_steps": 3360, "total_steps": 5000, "loss": 0.8675, "lr": 1.467238925438646e-05, "epoch": 0.672, "percentage": 67.2, "elapsed_time": "0:13:51", "remaining_time": "0:06:46"}
337
+ {"current_steps": 3370, "total_steps": 5000, "loss": 0.6056, "lr": 1.4513697938845572e-05, "epoch": 0.674, "percentage": 67.4, "elapsed_time": "0:13:54", "remaining_time": "0:06:43"}
338
+ {"current_steps": 3380, "total_steps": 5000, "loss": 0.2135, "lr": 1.4355517710873184e-05, "epoch": 0.676, "percentage": 67.6, "elapsed_time": "0:13:56", "remaining_time": "0:06:41"}
339
+ {"current_steps": 3390, "total_steps": 5000, "loss": 1.0051, "lr": 1.4197856279950438e-05, "epoch": 0.678, "percentage": 67.8, "elapsed_time": "0:13:59", "remaining_time": "0:06:38"}
340
+ {"current_steps": 3400, "total_steps": 5000, "loss": 0.4415, "lr": 1.4040721330273062e-05, "epoch": 0.68, "percentage": 68.0, "elapsed_time": "0:14:02", "remaining_time": "0:06:36"}
341
+ {"current_steps": 3410, "total_steps": 5000, "loss": 0.625, "lr": 1.388412052037682e-05, "epoch": 0.682, "percentage": 68.2, "elapsed_time": "0:14:05", "remaining_time": "0:06:34"}
342
+ {"current_steps": 3420, "total_steps": 5000, "loss": 1.0217, "lr": 1.3728061482764238e-05, "epoch": 0.684, "percentage": 68.4, "elapsed_time": "0:14:08", "remaining_time": "0:06:31"}
343
+ {"current_steps": 3430, "total_steps": 5000, "loss": 1.101, "lr": 1.3572551823532654e-05, "epoch": 0.686, "percentage": 68.6, "elapsed_time": "0:14:10", "remaining_time": "0:06:29"}
344
+ {"current_steps": 3440, "total_steps": 5000, "loss": 0.9139, "lr": 1.3417599122003464e-05, "epoch": 0.688, "percentage": 68.8, "elapsed_time": "0:14:13", "remaining_time": "0:06:27"}
345
+ {"current_steps": 3450, "total_steps": 5000, "loss": 0.7426, "lr": 1.3263210930352737e-05, "epoch": 0.69, "percentage": 69.0, "elapsed_time": "0:14:16", "remaining_time": "0:06:24"}
346
+ {"current_steps": 3460, "total_steps": 5000, "loss": 1.0954, "lr": 1.3109394773243117e-05, "epoch": 0.692, "percentage": 69.2, "elapsed_time": "0:14:18", "remaining_time": "0:06:22"}
347
+ {"current_steps": 3470, "total_steps": 5000, "loss": 0.6788, "lr": 1.2956158147457115e-05, "epoch": 0.694, "percentage": 69.4, "elapsed_time": "0:14:21", "remaining_time": "0:06:19"}
348
+ {"current_steps": 3480, "total_steps": 5000, "loss": 0.8617, "lr": 1.280350852153168e-05, "epoch": 0.696, "percentage": 69.6, "elapsed_time": "0:14:24", "remaining_time": "0:06:17"}
349
+ {"current_steps": 3490, "total_steps": 5000, "loss": 0.653, "lr": 1.2651453335394231e-05, "epoch": 0.698, "percentage": 69.8, "elapsed_time": "0:14:27", "remaining_time": "0:06:15"}
350
+ {"current_steps": 3500, "total_steps": 5000, "loss": 0.8003, "lr": 1.2500000000000006e-05, "epoch": 0.7, "percentage": 70.0, "elapsed_time": "0:14:29", "remaining_time": "0:06:12"}
351
+ {"current_steps": 3510, "total_steps": 5000, "loss": 0.5579, "lr": 1.234915589697091e-05, "epoch": 0.702, "percentage": 70.2, "elapsed_time": "0:14:31", "remaining_time": "0:06:10"}
352
+ {"current_steps": 3520, "total_steps": 5000, "loss": 1.5354, "lr": 1.2198928378235716e-05, "epoch": 0.704, "percentage": 70.4, "elapsed_time": "0:14:34", "remaining_time": "0:06:07"}
353
+ {"current_steps": 3530, "total_steps": 5000, "loss": 1.6175, "lr": 1.2049324765671749e-05, "epoch": 0.706, "percentage": 70.6, "elapsed_time": "0:14:37", "remaining_time": "0:06:05"}
354
+ {"current_steps": 3540, "total_steps": 5000, "loss": 0.4771, "lr": 1.1900352350748026e-05, "epoch": 0.708, "percentage": 70.8, "elapsed_time": "0:14:39", "remaining_time": "0:06:02"}
355
+ {"current_steps": 3550, "total_steps": 5000, "loss": 0.779, "lr": 1.175201839416988e-05, "epoch": 0.71, "percentage": 71.0, "elapsed_time": "0:14:42", "remaining_time": "0:06:00"}
356
+ {"current_steps": 3560, "total_steps": 5000, "loss": 1.1478, "lr": 1.1604330125525079e-05, "epoch": 0.712, "percentage": 71.2, "elapsed_time": "0:14:44", "remaining_time": "0:05:57"}
357
+ {"current_steps": 3570, "total_steps": 5000, "loss": 0.7484, "lr": 1.1457294742931507e-05, "epoch": 0.714, "percentage": 71.4, "elapsed_time": "0:14:47", "remaining_time": "0:05:55"}
358
+ {"current_steps": 3580, "total_steps": 5000, "loss": 1.0581, "lr": 1.1310919412686247e-05, "epoch": 0.716, "percentage": 71.6, "elapsed_time": "0:14:49", "remaining_time": "0:05:52"}
359
+ {"current_steps": 3590, "total_steps": 5000, "loss": 0.989, "lr": 1.11652112689164e-05, "epoch": 0.718, "percentage": 71.8, "elapsed_time": "0:14:51", "remaining_time": "0:05:50"}
360
+ {"current_steps": 3600, "total_steps": 5000, "loss": 1.5538, "lr": 1.1020177413231334e-05, "epoch": 0.72, "percentage": 72.0, "elapsed_time": "0:14:53", "remaining_time": "0:05:47"}
361
+ {"current_steps": 3610, "total_steps": 5000, "loss": 0.9328, "lr": 1.0875824914376553e-05, "epoch": 0.722, "percentage": 72.2, "elapsed_time": "0:14:55", "remaining_time": "0:05:44"}
362
+ {"current_steps": 3620, "total_steps": 5000, "loss": 1.4623, "lr": 1.0732160807889211e-05, "epoch": 0.724, "percentage": 72.4, "elapsed_time": "0:14:58", "remaining_time": "0:05:42"}
363
+ {"current_steps": 3630, "total_steps": 5000, "loss": 1.0274, "lr": 1.058919209575517e-05, "epoch": 0.726, "percentage": 72.6, "elapsed_time": "0:15:00", "remaining_time": "0:05:40"}
364
+ {"current_steps": 3640, "total_steps": 5000, "loss": 0.5183, "lr": 1.0446925746067768e-05, "epoch": 0.728, "percentage": 72.8, "elapsed_time": "0:15:03", "remaining_time": "0:05:37"}
365
+ {"current_steps": 3650, "total_steps": 5000, "loss": 0.8523, "lr": 1.0305368692688174e-05, "epoch": 0.73, "percentage": 73.0, "elapsed_time": "0:15:05", "remaining_time": "0:05:34"}
366
+ {"current_steps": 3660, "total_steps": 5000, "loss": 0.9334, "lr": 1.0164527834907467e-05, "epoch": 0.732, "percentage": 73.2, "elapsed_time": "0:15:07", "remaining_time": "0:05:32"}
367
+ {"current_steps": 3670, "total_steps": 5000, "loss": 1.2435, "lr": 1.0024410037110357e-05, "epoch": 0.734, "percentage": 73.4, "elapsed_time": "0:15:09", "remaining_time": "0:05:29"}
368
+ {"current_steps": 3680, "total_steps": 5000, "loss": 1.5611, "lr": 9.88502212844063e-06, "epoch": 0.736, "percentage": 73.6, "elapsed_time": "0:15:12", "remaining_time": "0:05:27"}
369
+ {"current_steps": 3690, "total_steps": 5000, "loss": 0.9649, "lr": 9.746370902468311e-06, "epoch": 0.738, "percentage": 73.8, "elapsed_time": "0:15:14", "remaining_time": "0:05:24"}
370
+ {"current_steps": 3700, "total_steps": 5000, "loss": 0.6053, "lr": 9.608463116858542e-06, "epoch": 0.74, "percentage": 74.0, "elapsed_time": "0:15:17", "remaining_time": "0:05:22"}
371
+ {"current_steps": 3710, "total_steps": 5000, "loss": 0.6377, "lr": 9.471305493042243e-06, "epoch": 0.742, "percentage": 74.2, "elapsed_time": "0:15:19", "remaining_time": "0:05:19"}
372
+ {"current_steps": 3720, "total_steps": 5000, "loss": 1.0698, "lr": 9.334904715888495e-06, "epoch": 0.744, "percentage": 74.4, "elapsed_time": "0:15:21", "remaining_time": "0:05:17"}
373
+ {"current_steps": 3730, "total_steps": 5000, "loss": 0.9858, "lr": 9.199267433378727e-06, "epoch": 0.746, "percentage": 74.6, "elapsed_time": "0:15:24", "remaining_time": "0:05:14"}
374
+ {"current_steps": 3740, "total_steps": 5000, "loss": 0.3602, "lr": 9.064400256282757e-06, "epoch": 0.748, "percentage": 74.8, "elapsed_time": "0:15:26", "remaining_time": "0:05:12"}
375
+ {"current_steps": 3750, "total_steps": 5000, "loss": 0.6672, "lr": 8.930309757836517e-06, "epoch": 0.75, "percentage": 75.0, "elapsed_time": "0:15:28", "remaining_time": "0:05:09"}
376
+ {"current_steps": 3760, "total_steps": 5000, "loss": 0.5627, "lr": 8.797002473421728e-06, "epoch": 0.752, "percentage": 75.2, "elapsed_time": "0:15:30", "remaining_time": "0:05:06"}
377
+ {"current_steps": 3770, "total_steps": 5000, "loss": 1.0572, "lr": 8.664484900247363e-06, "epoch": 0.754, "percentage": 75.4, "elapsed_time": "0:15:33", "remaining_time": "0:05:04"}
378
+ {"current_steps": 3780, "total_steps": 5000, "loss": 1.2507, "lr": 8.532763497032987e-06, "epoch": 0.756, "percentage": 75.6, "elapsed_time": "0:15:36", "remaining_time": "0:05:02"}
379
+ {"current_steps": 3790, "total_steps": 5000, "loss": 0.7735, "lr": 8.40184468369396e-06, "epoch": 0.758, "percentage": 75.8, "elapsed_time": "0:15:39", "remaining_time": "0:04:59"}
380
+ {"current_steps": 3800, "total_steps": 5000, "loss": 1.298, "lr": 8.271734841028553e-06, "epoch": 0.76, "percentage": 76.0, "elapsed_time": "0:15:42", "remaining_time": "0:04:57"}
381
+ {"current_steps": 3810, "total_steps": 5000, "loss": 0.7805, "lr": 8.142440310406924e-06, "epoch": 0.762, "percentage": 76.2, "elapsed_time": "0:15:45", "remaining_time": "0:04:55"}
382
+ {"current_steps": 3820, "total_steps": 5000, "loss": 0.7498, "lr": 8.013967393462094e-06, "epoch": 0.764, "percentage": 76.4, "elapsed_time": "0:15:47", "remaining_time": "0:04:52"}
383
+ {"current_steps": 3830, "total_steps": 5000, "loss": 1.1696, "lr": 7.886322351782783e-06, "epoch": 0.766, "percentage": 76.6, "elapsed_time": "0:15:50", "remaining_time": "0:04:50"}
384
+ {"current_steps": 3840, "total_steps": 5000, "loss": 0.6709, "lr": 7.759511406608255e-06, "epoch": 0.768, "percentage": 76.8, "elapsed_time": "0:15:52", "remaining_time": "0:04:47"}
385
+ {"current_steps": 3850, "total_steps": 5000, "loss": 0.9733, "lr": 7.633540738525066e-06, "epoch": 0.77, "percentage": 77.0, "elapsed_time": "0:15:55", "remaining_time": "0:04:45"}
386
+ {"current_steps": 3860, "total_steps": 5000, "loss": 0.6711, "lr": 7.508416487165862e-06, "epoch": 0.772, "percentage": 77.2, "elapsed_time": "0:15:57", "remaining_time": "0:04:42"}
387
+ {"current_steps": 3870, "total_steps": 5000, "loss": 0.7134, "lr": 7.384144750910133e-06, "epoch": 0.774, "percentage": 77.4, "elapsed_time": "0:16:00", "remaining_time": "0:04:40"}
388
+ {"current_steps": 3880, "total_steps": 5000, "loss": 1.0323, "lr": 7.260731586586983e-06, "epoch": 0.776, "percentage": 77.6, "elapsed_time": "0:16:02", "remaining_time": "0:04:37"}
389
+ {"current_steps": 3890, "total_steps": 5000, "loss": 0.4097, "lr": 7.138183009179922e-06, "epoch": 0.778, "percentage": 77.8, "elapsed_time": "0:16:04", "remaining_time": "0:04:35"}
390
+ {"current_steps": 3900, "total_steps": 5000, "loss": 1.0325, "lr": 7.016504991533726e-06, "epoch": 0.78, "percentage": 78.0, "elapsed_time": "0:16:07", "remaining_time": "0:04:32"}
391
+ {"current_steps": 3910, "total_steps": 5000, "loss": 0.2871, "lr": 6.895703464063319e-06, "epoch": 0.782, "percentage": 78.2, "elapsed_time": "0:16:10", "remaining_time": "0:04:30"}
392
+ {"current_steps": 3920, "total_steps": 5000, "loss": 0.8634, "lr": 6.775784314464717e-06, "epoch": 0.784, "percentage": 78.4, "elapsed_time": "0:16:12", "remaining_time": "0:04:27"}
393
+ {"current_steps": 3930, "total_steps": 5000, "loss": 1.8368, "lr": 6.656753387428089e-06, "epoch": 0.786, "percentage": 78.6, "elapsed_time": "0:16:15", "remaining_time": "0:04:25"}
394
+ {"current_steps": 3940, "total_steps": 5000, "loss": 0.5746, "lr": 6.538616484352902e-06, "epoch": 0.788, "percentage": 78.8, "elapsed_time": "0:16:17", "remaining_time": "0:04:23"}
395
+ {"current_steps": 3950, "total_steps": 5000, "loss": 2.333, "lr": 6.421379363065142e-06, "epoch": 0.79, "percentage": 79.0, "elapsed_time": "0:16:20", "remaining_time": "0:04:20"}
396
+ {"current_steps": 3960, "total_steps": 5000, "loss": 0.393, "lr": 6.305047737536707e-06, "epoch": 0.792, "percentage": 79.2, "elapsed_time": "0:16:22", "remaining_time": "0:04:18"}
397
+ {"current_steps": 3970, "total_steps": 5000, "loss": 1.0324, "lr": 6.189627277606894e-06, "epoch": 0.794, "percentage": 79.4, "elapsed_time": "0:16:25", "remaining_time": "0:04:15"}
398
+ {"current_steps": 3980, "total_steps": 5000, "loss": 0.9407, "lr": 6.075123608706093e-06, "epoch": 0.796, "percentage": 79.6, "elapsed_time": "0:16:27", "remaining_time": "0:04:13"}
399
+ {"current_steps": 3990, "total_steps": 5000, "loss": 1.2251, "lr": 5.961542311581586e-06, "epoch": 0.798, "percentage": 79.8, "elapsed_time": "0:16:30", "remaining_time": "0:04:10"}
400
+ {"current_steps": 4000, "total_steps": 5000, "loss": 0.635, "lr": 5.848888922025553e-06, "epoch": 0.8, "percentage": 80.0, "elapsed_time": "0:16:32", "remaining_time": "0:04:08"}
401
+ {"current_steps": 4010, "total_steps": 5000, "loss": 0.3039, "lr": 5.737168930605272e-06, "epoch": 0.802, "percentage": 80.2, "elapsed_time": "0:16:35", "remaining_time": "0:04:05"}
402
+ {"current_steps": 4020, "total_steps": 5000, "loss": 1.0606, "lr": 5.626387782395512e-06, "epoch": 0.804, "percentage": 80.4, "elapsed_time": "0:16:38", "remaining_time": "0:04:03"}
403
+ {"current_steps": 4030, "total_steps": 5000, "loss": 2.3978, "lr": 5.5165508767131415e-06, "epoch": 0.806, "percentage": 80.6, "elapsed_time": "0:16:40", "remaining_time": "0:04:00"}
404
+ {"current_steps": 4040, "total_steps": 5000, "loss": 1.7771, "lr": 5.4076635668540075e-06, "epoch": 0.808, "percentage": 80.8, "elapsed_time": "0:16:42", "remaining_time": "0:03:58"}
405
+ {"current_steps": 4050, "total_steps": 5000, "loss": 1.5544, "lr": 5.299731159831953e-06, "epoch": 0.81, "percentage": 81.0, "elapsed_time": "0:16:45", "remaining_time": "0:03:55"}
406
+ {"current_steps": 4060, "total_steps": 5000, "loss": 0.574, "lr": 5.192758916120236e-06, "epoch": 0.812, "percentage": 81.2, "elapsed_time": "0:16:48", "remaining_time": "0:03:53"}
407
+ {"current_steps": 4070, "total_steps": 5000, "loss": 0.9646, "lr": 5.086752049395094e-06, "epoch": 0.814, "percentage": 81.4, "elapsed_time": "0:16:50", "remaining_time": "0:03:50"}
408
+ {"current_steps": 4080, "total_steps": 5000, "loss": 0.422, "lr": 4.981715726281666e-06, "epoch": 0.816, "percentage": 81.6, "elapsed_time": "0:16:52", "remaining_time": "0:03:48"}
409
+ {"current_steps": 4090, "total_steps": 5000, "loss": 0.6738, "lr": 4.877655066102149e-06, "epoch": 0.818, "percentage": 81.8, "elapsed_time": "0:16:54", "remaining_time": "0:03:45"}
410
+ {"current_steps": 4100, "total_steps": 5000, "loss": 0.1991, "lr": 4.7745751406263165e-06, "epoch": 0.82, "percentage": 82.0, "elapsed_time": "0:16:57", "remaining_time": "0:03:43"}
411
+ {"current_steps": 4110, "total_steps": 5000, "loss": 0.7171, "lr": 4.672480973824311e-06, "epoch": 0.822, "percentage": 82.2, "elapsed_time": "0:17:00", "remaining_time": "0:03:40"}
412
+ {"current_steps": 4120, "total_steps": 5000, "loss": 0.7478, "lr": 4.571377541621788e-06, "epoch": 0.824, "percentage": 82.4, "elapsed_time": "0:17:02", "remaining_time": "0:03:38"}
413
+ {"current_steps": 4130, "total_steps": 5000, "loss": 0.6985, "lr": 4.4712697716574e-06, "epoch": 0.826, "percentage": 82.6, "elapsed_time": "0:17:05", "remaining_time": "0:03:35"}
414
+ {"current_steps": 4140, "total_steps": 5000, "loss": 0.7893, "lr": 4.372162543042624e-06, "epoch": 0.828, "percentage": 82.8, "elapsed_time": "0:17:07", "remaining_time": "0:03:33"}
415
+ {"current_steps": 4150, "total_steps": 5000, "loss": 0.8127, "lr": 4.274060686123959e-06, "epoch": 0.83, "percentage": 83.0, "elapsed_time": "0:17:10", "remaining_time": "0:03:31"}
416
+ {"current_steps": 4160, "total_steps": 5000, "loss": 0.5259, "lr": 4.176968982247514e-06, "epoch": 0.832, "percentage": 83.2, "elapsed_time": "0:17:13", "remaining_time": "0:03:28"}
417
+ {"current_steps": 4170, "total_steps": 5000, "loss": 0.9835, "lr": 4.08089216352596e-06, "epoch": 0.834, "percentage": 83.4, "elapsed_time": "0:17:15", "remaining_time": "0:03:26"}
418
+ {"current_steps": 4180, "total_steps": 5000, "loss": 0.6508, "lr": 3.985834912607894e-06, "epoch": 0.836, "percentage": 83.6, "elapsed_time": "0:17:17", "remaining_time": "0:03:23"}
419
+ {"current_steps": 4190, "total_steps": 5000, "loss": 0.437, "lr": 3.891801862449629e-06, "epoch": 0.838, "percentage": 83.8, "elapsed_time": "0:17:19", "remaining_time": "0:03:20"}
420
+ {"current_steps": 4200, "total_steps": 5000, "loss": 1.6048, "lr": 3.798797596089351e-06, "epoch": 0.84, "percentage": 84.0, "elapsed_time": "0:17:22", "remaining_time": "0:03:18"}
421
+ {"current_steps": 4210, "total_steps": 5000, "loss": 0.8315, "lr": 3.7068266464238084e-06, "epoch": 0.842, "percentage": 84.2, "elapsed_time": "0:17:25", "remaining_time": "0:03:16"}
422
+ {"current_steps": 4220, "total_steps": 5000, "loss": 0.3853, "lr": 3.6158934959873353e-06, "epoch": 0.844, "percentage": 84.4, "elapsed_time": "0:17:26", "remaining_time": "0:03:13"}
423
+ {"current_steps": 4230, "total_steps": 5000, "loss": 1.3448, "lr": 3.5260025767333893e-06, "epoch": 0.846, "percentage": 84.6, "elapsed_time": "0:17:29", "remaining_time": "0:03:11"}
424
+ {"current_steps": 4240, "total_steps": 5000, "loss": 0.2405, "lr": 3.4371582698185633e-06, "epoch": 0.848, "percentage": 84.8, "elapsed_time": "0:17:31", "remaining_time": "0:03:08"}
425
+ {"current_steps": 4250, "total_steps": 5000, "loss": 0.8112, "lr": 3.3493649053890326e-06, "epoch": 0.85, "percentage": 85.0, "elapsed_time": "0:17:34", "remaining_time": "0:03:06"}
426
+ {"current_steps": 4260, "total_steps": 5000, "loss": 1.3852, "lr": 3.262626762369525e-06, "epoch": 0.852, "percentage": 85.2, "elapsed_time": "0:17:36", "remaining_time": "0:03:03"}
427
+ {"current_steps": 4270, "total_steps": 5000, "loss": 0.3188, "lr": 3.176948068254762e-06, "epoch": 0.854, "percentage": 85.4, "elapsed_time": "0:17:38", "remaining_time": "0:03:01"}
428
+ {"current_steps": 4280, "total_steps": 5000, "loss": 0.4712, "lr": 3.092332998903416e-06, "epoch": 0.856, "percentage": 85.6, "elapsed_time": "0:17:41", "remaining_time": "0:02:58"}
429
+ {"current_steps": 4290, "total_steps": 5000, "loss": 0.4124, "lr": 3.0087856783345914e-06, "epoch": 0.858, "percentage": 85.8, "elapsed_time": "0:17:43", "remaining_time": "0:02:56"}
430
+ {"current_steps": 4300, "total_steps": 5000, "loss": 1.4114, "lr": 2.9263101785268254e-06, "epoch": 0.86, "percentage": 86.0, "elapsed_time": "0:17:46", "remaining_time": "0:02:53"}
431
+ {"current_steps": 4310, "total_steps": 5000, "loss": 1.5525, "lr": 2.8449105192196316e-06, "epoch": 0.862, "percentage": 86.2, "elapsed_time": "0:17:49", "remaining_time": "0:02:51"}
432
+ {"current_steps": 4320, "total_steps": 5000, "loss": 1.1899, "lr": 2.764590667717562e-06, "epoch": 0.864, "percentage": 86.4, "elapsed_time": "0:17:51", "remaining_time": "0:02:48"}
433
+ {"current_steps": 4330, "total_steps": 5000, "loss": 0.5857, "lr": 2.6853545386968606e-06, "epoch": 0.866, "percentage": 86.6, "elapsed_time": "0:17:53", "remaining_time": "0:02:46"}
434
+ {"current_steps": 4340, "total_steps": 5000, "loss": 1.6246, "lr": 2.6072059940146775e-06, "epoch": 0.868, "percentage": 86.8, "elapsed_time": "0:17:56", "remaining_time": "0:02:43"}
435
+ {"current_steps": 4350, "total_steps": 5000, "loss": 0.2832, "lr": 2.5301488425208296e-06, "epoch": 0.87, "percentage": 87.0, "elapsed_time": "0:17:58", "remaining_time": "0:02:41"}
436
+ {"current_steps": 4360, "total_steps": 5000, "loss": 1.5246, "lr": 2.454186839872158e-06, "epoch": 0.872, "percentage": 87.2, "elapsed_time": "0:18:01", "remaining_time": "0:02:38"}
437
+ {"current_steps": 4370, "total_steps": 5000, "loss": 1.5678, "lr": 2.379323688349516e-06, "epoch": 0.874, "percentage": 87.4, "elapsed_time": "0:18:03", "remaining_time": "0:02:36"}
438
+ {"current_steps": 4380, "total_steps": 5000, "loss": 1.3038, "lr": 2.3055630366772856e-06, "epoch": 0.876, "percentage": 87.6, "elapsed_time": "0:18:05", "remaining_time": "0:02:33"}
439
+ {"current_steps": 4390, "total_steps": 5000, "loss": 0.7802, "lr": 2.2329084798455746e-06, "epoch": 0.878, "percentage": 87.8, "elapsed_time": "0:18:08", "remaining_time": "0:02:31"}
440
+ {"current_steps": 4400, "total_steps": 5000, "loss": 1.1187, "lr": 2.1613635589349756e-06, "epoch": 0.88, "percentage": 88.0, "elapsed_time": "0:18:10", "remaining_time": "0:02:28"}
441
+ {"current_steps": 4410, "total_steps": 5000, "loss": 1.7619, "lr": 2.0909317609440095e-06, "epoch": 0.882, "percentage": 88.2, "elapsed_time": "0:18:13", "remaining_time": "0:02:26"}
442
+ {"current_steps": 4420, "total_steps": 5000, "loss": 1.2435, "lr": 2.0216165186191407e-06, "epoch": 0.884, "percentage": 88.4, "elapsed_time": "0:18:16", "remaining_time": "0:02:23"}
443
+ {"current_steps": 4430, "total_steps": 5000, "loss": 1.7134, "lr": 1.95342121028749e-06, "epoch": 0.886, "percentage": 88.6, "elapsed_time": "0:18:19", "remaining_time": "0:02:21"}
444
+ {"current_steps": 4440, "total_steps": 5000, "loss": 0.8158, "lr": 1.8863491596921745e-06, "epoch": 0.888, "percentage": 88.8, "elapsed_time": "0:18:22", "remaining_time": "0:02:19"}
445
+ {"current_steps": 4450, "total_steps": 5000, "loss": 0.7814, "lr": 1.8204036358303173e-06, "epoch": 0.89, "percentage": 89.0, "elapsed_time": "0:18:25", "remaining_time": "0:02:16"}
446
+ {"current_steps": 4460, "total_steps": 5000, "loss": 0.6062, "lr": 1.7555878527937164e-06, "epoch": 0.892, "percentage": 89.2, "elapsed_time": "0:18:27", "remaining_time": "0:02:14"}
447
+ {"current_steps": 4470, "total_steps": 5000, "loss": 1.6605, "lr": 1.6919049696121958e-06, "epoch": 0.894, "percentage": 89.4, "elapsed_time": "0:18:29", "remaining_time": "0:02:11"}
448
+ {"current_steps": 4480, "total_steps": 5000, "loss": 0.6406, "lr": 1.629358090099639e-06, "epoch": 0.896, "percentage": 89.6, "elapsed_time": "0:18:31", "remaining_time": "0:02:09"}
449
+ {"current_steps": 4490, "total_steps": 5000, "loss": 0.8206, "lr": 1.5679502627027136e-06, "epoch": 0.898, "percentage": 89.8, "elapsed_time": "0:18:33", "remaining_time": "0:02:06"}
450
+ {"current_steps": 4500, "total_steps": 5000, "loss": 0.3695, "lr": 1.5076844803522922e-06, "epoch": 0.9, "percentage": 90.0, "elapsed_time": "0:18:35", "remaining_time": "0:02:03"}
451
+ {"current_steps": 4510, "total_steps": 5000, "loss": 1.0353, "lr": 1.4485636803175829e-06, "epoch": 0.902, "percentage": 90.2, "elapsed_time": "0:18:38", "remaining_time": "0:02:01"}
452
+ {"current_steps": 4520, "total_steps": 5000, "loss": 0.8934, "lr": 1.3905907440629752e-06, "epoch": 0.904, "percentage": 90.4, "elapsed_time": "0:18:41", "remaining_time": "0:01:59"}
453
+ {"current_steps": 4530, "total_steps": 5000, "loss": 0.5448, "lr": 1.333768497107593e-06, "epoch": 0.906, "percentage": 90.6, "elapsed_time": "0:18:44", "remaining_time": "0:01:56"}
454
+ {"current_steps": 4540, "total_steps": 5000, "loss": 0.6171, "lr": 1.2780997088875869e-06, "epoch": 0.908, "percentage": 90.8, "elapsed_time": "0:18:46", "remaining_time": "0:01:54"}
455
+ {"current_steps": 4550, "total_steps": 5000, "loss": 0.8105, "lr": 1.2235870926211619e-06, "epoch": 0.91, "percentage": 91.0, "elapsed_time": "0:18:49", "remaining_time": "0:01:51"}
456
+ {"current_steps": 4560, "total_steps": 5000, "loss": 0.6286, "lr": 1.170233305176327e-06, "epoch": 0.912, "percentage": 91.2, "elapsed_time": "0:18:51", "remaining_time": "0:01:49"}
457
+ {"current_steps": 4570, "total_steps": 5000, "loss": 1.37, "lr": 1.1180409469414094e-06, "epoch": 0.914, "percentage": 91.4, "elapsed_time": "0:18:54", "remaining_time": "0:01:46"}
458
+ {"current_steps": 4580, "total_steps": 5000, "loss": 0.6792, "lr": 1.067012561698319e-06, "epoch": 0.916, "percentage": 91.6, "elapsed_time": "0:18:55", "remaining_time": "0:01:44"}
459
+ {"current_steps": 4590, "total_steps": 5000, "loss": 3.2842, "lr": 1.0171506364985622e-06, "epoch": 0.918, "percentage": 91.8, "elapsed_time": "0:18:58", "remaining_time": "0:01:41"}
460
+ {"current_steps": 4600, "total_steps": 5000, "loss": 0.7875, "lr": 9.684576015420278e-07, "epoch": 0.92, "percentage": 92.0, "elapsed_time": "0:19:01", "remaining_time": "0:01:39"}
461
+ {"current_steps": 4610, "total_steps": 5000, "loss": 0.7768, "lr": 9.209358300585474e-07, "epoch": 0.922, "percentage": 92.2, "elapsed_time": "0:19:03", "remaining_time": "0:01:36"}
462
+ {"current_steps": 4620, "total_steps": 5000, "loss": 0.9518, "lr": 8.745876381922147e-07, "epoch": 0.924, "percentage": 92.4, "elapsed_time": "0:19:06", "remaining_time": "0:01:34"}
463
+ {"current_steps": 4630, "total_steps": 5000, "loss": 0.4208, "lr": 8.294152848885157e-07, "epoch": 0.926, "percentage": 92.6, "elapsed_time": "0:19:09", "remaining_time": "0:01:31"}
464
+ {"current_steps": 4640, "total_steps": 5000, "loss": 0.2602, "lr": 7.854209717842231e-07, "epoch": 0.928, "percentage": 92.8, "elapsed_time": "0:19:11", "remaining_time": "0:01:29"}
465
+ {"current_steps": 4650, "total_steps": 5000, "loss": 0.8418, "lr": 7.426068431000882e-07, "epoch": 0.93, "percentage": 93.0, "elapsed_time": "0:19:14", "remaining_time": "0:01:26"}
466
+ {"current_steps": 4660, "total_steps": 5000, "loss": 0.4949, "lr": 7.009749855363456e-07, "epoch": 0.932, "percentage": 93.2, "elapsed_time": "0:19:16", "remaining_time": "0:01:24"}
467
+ {"current_steps": 4670, "total_steps": 5000, "loss": 0.5919, "lr": 6.605274281709928e-07, "epoch": 0.934, "percentage": 93.4, "elapsed_time": "0:19:19", "remaining_time": "0:01:21"}
468
+ {"current_steps": 4680, "total_steps": 5000, "loss": 0.9009, "lr": 6.212661423609184e-07, "epoch": 0.936, "percentage": 93.6, "elapsed_time": "0:19:21", "remaining_time": "0:01:19"}
469
+ {"current_steps": 4690, "total_steps": 5000, "loss": 0.4119, "lr": 5.83193041645802e-07, "epoch": 0.938, "percentage": 93.8, "elapsed_time": "0:19:23", "remaining_time": "0:01:16"}
470
+ {"current_steps": 4700, "total_steps": 5000, "loss": 0.4794, "lr": 5.463099816548579e-07, "epoch": 0.94, "percentage": 94.0, "elapsed_time": "0:19:25", "remaining_time": "0:01:14"}
471
+ {"current_steps": 4710, "total_steps": 5000, "loss": 0.8312, "lr": 5.106187600163987e-07, "epoch": 0.942, "percentage": 94.2, "elapsed_time": "0:19:27", "remaining_time": "0:01:11"}
472
+ {"current_steps": 4720, "total_steps": 5000, "loss": 1.0177, "lr": 4.7612111627021175e-07, "epoch": 0.944, "percentage": 94.4, "elapsed_time": "0:19:30", "remaining_time": "0:01:09"}
473
+ {"current_steps": 4730, "total_steps": 5000, "loss": 0.7843, "lr": 4.4281873178278475e-07, "epoch": 0.946, "percentage": 94.6, "elapsed_time": "0:19:32", "remaining_time": "0:01:06"}
474
+ {"current_steps": 4740, "total_steps": 5000, "loss": 0.7314, "lr": 4.107132296653549e-07, "epoch": 0.948, "percentage": 94.8, "elapsed_time": "0:19:35", "remaining_time": "0:01:04"}
475
+ {"current_steps": 4750, "total_steps": 5000, "loss": 0.5168, "lr": 3.7980617469479953e-07, "epoch": 0.95, "percentage": 95.0, "elapsed_time": "0:19:37", "remaining_time": "0:01:01"}
476
+ {"current_steps": 4760, "total_steps": 5000, "loss": 1.4444, "lr": 3.5009907323737825e-07, "epoch": 0.952, "percentage": 95.2, "elapsed_time": "0:19:40", "remaining_time": "0:00:59"}
477
+ {"current_steps": 4770, "total_steps": 5000, "loss": 1.1199, "lr": 3.215933731753024e-07, "epoch": 0.954, "percentage": 95.4, "elapsed_time": "0:19:42", "remaining_time": "0:00:57"}
478
+ {"current_steps": 4780, "total_steps": 5000, "loss": 0.5519, "lr": 2.942904638361804e-07, "epoch": 0.956, "percentage": 95.6, "elapsed_time": "0:19:45", "remaining_time": "0:00:54"}
479
+ {"current_steps": 4790, "total_steps": 5000, "loss": 0.9452, "lr": 2.681916759252917e-07, "epoch": 0.958, "percentage": 95.8, "elapsed_time": "0:19:47", "remaining_time": "0:00:52"}
480
+ {"current_steps": 4800, "total_steps": 5000, "loss": 3.2854, "lr": 2.4329828146074095e-07, "epoch": 0.96, "percentage": 96.0, "elapsed_time": "0:19:49", "remaining_time": "0:00:49"}
481
+ {"current_steps": 4810, "total_steps": 5000, "loss": 0.5169, "lr": 2.1961149371145795e-07, "epoch": 0.962, "percentage": 96.2, "elapsed_time": "0:19:51", "remaining_time": "0:00:47"}
482
+ {"current_steps": 4820, "total_steps": 5000, "loss": 0.8747, "lr": 1.9713246713805588e-07, "epoch": 0.964, "percentage": 96.4, "elapsed_time": "0:19:53", "remaining_time": "0:00:44"}
483
+ {"current_steps": 4830, "total_steps": 5000, "loss": 1.0667, "lr": 1.7586229733657644e-07, "epoch": 0.966, "percentage": 96.6, "elapsed_time": "0:19:56", "remaining_time": "0:00:42"}
484
+ {"current_steps": 4840, "total_steps": 5000, "loss": 1.4877, "lr": 1.5580202098509077e-07, "epoch": 0.968, "percentage": 96.8, "elapsed_time": "0:19:59", "remaining_time": "0:00:39"}
485
+ {"current_steps": 4850, "total_steps": 5000, "loss": 0.9893, "lr": 1.3695261579316777e-07, "epoch": 0.97, "percentage": 97.0, "elapsed_time": "0:20:01", "remaining_time": "0:00:37"}
486
+ {"current_steps": 4860, "total_steps": 5000, "loss": 0.9399, "lr": 1.193150004542204e-07, "epoch": 0.972, "percentage": 97.2, "elapsed_time": "0:20:04", "remaining_time": "0:00:34"}
487
+ {"current_steps": 4870, "total_steps": 5000, "loss": 0.4823, "lr": 1.0289003460074165e-07, "epoch": 0.974, "percentage": 97.4, "elapsed_time": "0:20:07", "remaining_time": "0:00:32"}
488
+ {"current_steps": 4880, "total_steps": 5000, "loss": 0.999, "lr": 8.767851876239074e-08, "epoch": 0.976, "percentage": 97.6, "elapsed_time": "0:20:10", "remaining_time": "0:00:29"}
489
+ {"current_steps": 4890, "total_steps": 5000, "loss": 0.6337, "lr": 7.368119432699383e-08, "epoch": 0.978, "percentage": 97.8, "elapsed_time": "0:20:13", "remaining_time": "0:00:27"}
490
+ {"current_steps": 4900, "total_steps": 5000, "loss": 0.5041, "lr": 6.089874350439506e-08, "epoch": 0.98, "percentage": 98.0, "elapsed_time": "0:20:16", "remaining_time": "0:00:24"}
491
+ {"current_steps": 4910, "total_steps": 5000, "loss": 0.9211, "lr": 4.9331789293211026e-08, "epoch": 0.982, "percentage": 98.2, "elapsed_time": "0:20:18", "remaining_time": "0:00:22"}
492
+ {"current_steps": 4920, "total_steps": 5000, "loss": 0.5071, "lr": 3.8980895450474455e-08, "epoch": 0.984, "percentage": 98.4, "elapsed_time": "0:20:20", "remaining_time": "0:00:19"}
493
+ {"current_steps": 4930, "total_steps": 5000, "loss": 0.7782, "lr": 2.9846566464150626e-08, "epoch": 0.986, "percentage": 98.6, "elapsed_time": "0:20:23", "remaining_time": "0:00:17"}
494
+ {"current_steps": 4940, "total_steps": 5000, "loss": 0.9379, "lr": 2.192924752854042e-08, "epoch": 0.988, "percentage": 98.8, "elapsed_time": "0:20:25", "remaining_time": "0:00:14"}
495
+ {"current_steps": 4950, "total_steps": 5000, "loss": 0.552, "lr": 1.522932452260595e-08, "epoch": 0.99, "percentage": 99.0, "elapsed_time": "0:20:28", "remaining_time": "0:00:12"}
496
+ {"current_steps": 4960, "total_steps": 5000, "loss": 0.5151, "lr": 9.747123991141194e-09, "epoch": 0.992, "percentage": 99.2, "elapsed_time": "0:20:30", "remaining_time": "0:00:09"}
497
+ {"current_steps": 4970, "total_steps": 5000, "loss": 0.7505, "lr": 5.48291312886251e-09, "epoch": 0.994, "percentage": 99.4, "elapsed_time": "0:20:32", "remaining_time": "0:00:07"}
498
+ {"current_steps": 4980, "total_steps": 5000, "loss": 0.8377, "lr": 2.4368997673940297e-09, "epoch": 0.996, "percentage": 99.6, "elapsed_time": "0:20:35", "remaining_time": "0:00:04"}
499
+ {"current_steps": 4990, "total_steps": 5000, "loss": 1.0276, "lr": 6.092323651313292e-10, "epoch": 0.998, "percentage": 99.8, "elapsed_time": "0:20:38", "remaining_time": "0:00:02"}
500
+ {"current_steps": 5000, "total_steps": 5000, "loss": 0.4942, "lr": 0.0, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:20:41", "remaining_time": "0:00:00"}
501
+ {"current_steps": 5000, "total_steps": 5000, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:20:41", "remaining_time": "0:00:00"}
Llama-2-13b-chat-hf/DomainBench/Geography/trainer_state.json ADDED
@@ -0,0 +1,3542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.002,
13
+ "grad_norm": 0.12749333679676056,
14
+ "learning_rate": 1.0000000000000002e-06,
15
+ "loss": 1.5419,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.004,
20
+ "grad_norm": 0.4542064964771271,
21
+ "learning_rate": 2.0000000000000003e-06,
22
+ "loss": 2.7315,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.006,
27
+ "grad_norm": 0.33891889452934265,
28
+ "learning_rate": 3e-06,
29
+ "loss": 1.8219,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.008,
34
+ "grad_norm": 0.26358193159103394,
35
+ "learning_rate": 4.000000000000001e-06,
36
+ "loss": 1.5216,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.01,
41
+ "grad_norm": 0.5897868275642395,
42
+ "learning_rate": 5e-06,
43
+ "loss": 5.3329,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.012,
48
+ "grad_norm": 0.07071671634912491,
49
+ "learning_rate": 6e-06,
50
+ "loss": 2.0144,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.014,
55
+ "grad_norm": 1.6330965757369995,
56
+ "learning_rate": 7.000000000000001e-06,
57
+ "loss": 2.6128,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.016,
62
+ "grad_norm": 3.5290110111236572,
63
+ "learning_rate": 8.000000000000001e-06,
64
+ "loss": 2.3414,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.018,
69
+ "grad_norm": 0.41117197275161743,
70
+ "learning_rate": 9e-06,
71
+ "loss": 3.2327,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.02,
76
+ "grad_norm": 0.8093858957290649,
77
+ "learning_rate": 1e-05,
78
+ "loss": 2.7985,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.022,
83
+ "grad_norm": 6.350818157196045,
84
+ "learning_rate": 1.1000000000000001e-05,
85
+ "loss": 2.397,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.024,
90
+ "grad_norm": 0.4250973165035248,
91
+ "learning_rate": 1.2e-05,
92
+ "loss": 1.0396,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.026,
97
+ "grad_norm": 1.2124313116073608,
98
+ "learning_rate": 1.3000000000000001e-05,
99
+ "loss": 3.2413,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.028,
104
+ "grad_norm": 0.0,
105
+ "learning_rate": 1.4000000000000001e-05,
106
+ "loss": 3.3462,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.03,
111
+ "grad_norm": 1.524579644203186,
112
+ "learning_rate": 1.5e-05,
113
+ "loss": 1.3302,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.032,
118
+ "grad_norm": 4.701014518737793,
119
+ "learning_rate": 1.6000000000000003e-05,
120
+ "loss": 1.2748,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.034,
125
+ "grad_norm": 1.1295222043991089,
126
+ "learning_rate": 1.7000000000000003e-05,
127
+ "loss": 3.3206,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.036,
132
+ "grad_norm": 4.18114709854126,
133
+ "learning_rate": 1.8e-05,
134
+ "loss": 1.3943,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.038,
139
+ "grad_norm": 0.0,
140
+ "learning_rate": 1.9e-05,
141
+ "loss": 1.2942,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.04,
146
+ "grad_norm": 0.5731304883956909,
147
+ "learning_rate": 2e-05,
148
+ "loss": 1.4252,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.042,
153
+ "grad_norm": 0.0,
154
+ "learning_rate": 2.1e-05,
155
+ "loss": 1.6539,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.044,
160
+ "grad_norm": 0.9085258841514587,
161
+ "learning_rate": 2.2000000000000003e-05,
162
+ "loss": 1.8091,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.046,
167
+ "grad_norm": 0.22193272411823273,
168
+ "learning_rate": 2.3000000000000003e-05,
169
+ "loss": 1.2866,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.048,
174
+ "grad_norm": 0.32045862078666687,
175
+ "learning_rate": 2.4e-05,
176
+ "loss": 1.7432,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.05,
181
+ "grad_norm": 0.15331393480300903,
182
+ "learning_rate": 2.5e-05,
183
+ "loss": 1.6918,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.052,
188
+ "grad_norm": 0.13029654324054718,
189
+ "learning_rate": 2.6000000000000002e-05,
190
+ "loss": 0.9121,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.054,
195
+ "grad_norm": 0.27324891090393066,
196
+ "learning_rate": 2.7000000000000002e-05,
197
+ "loss": 0.6088,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.056,
202
+ "grad_norm": 0.3138855993747711,
203
+ "learning_rate": 2.8000000000000003e-05,
204
+ "loss": 0.6236,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.058,
209
+ "grad_norm": 0.0,
210
+ "learning_rate": 2.9e-05,
211
+ "loss": 0.847,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.06,
216
+ "grad_norm": 0.5300617218017578,
217
+ "learning_rate": 3e-05,
218
+ "loss": 0.9911,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.062,
223
+ "grad_norm": 0.41367462277412415,
224
+ "learning_rate": 3.1e-05,
225
+ "loss": 1.4213,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.064,
230
+ "grad_norm": 0.0,
231
+ "learning_rate": 3.2000000000000005e-05,
232
+ "loss": 0.436,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.066,
237
+ "grad_norm": 0.4312784671783447,
238
+ "learning_rate": 3.3e-05,
239
+ "loss": 1.0218,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.068,
244
+ "grad_norm": 2.3603317737579346,
245
+ "learning_rate": 3.4000000000000007e-05,
246
+ "loss": 1.2908,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.07,
251
+ "grad_norm": 0.43904104828834534,
252
+ "learning_rate": 3.5e-05,
253
+ "loss": 1.0009,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.072,
258
+ "grad_norm": 0.23096807301044464,
259
+ "learning_rate": 3.6e-05,
260
+ "loss": 0.6865,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.074,
265
+ "grad_norm": 1.1962610483169556,
266
+ "learning_rate": 3.7e-05,
267
+ "loss": 0.9106,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.076,
272
+ "grad_norm": 0.38495856523513794,
273
+ "learning_rate": 3.8e-05,
274
+ "loss": 1.4635,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.078,
279
+ "grad_norm": 1.5584137439727783,
280
+ "learning_rate": 3.9000000000000006e-05,
281
+ "loss": 1.3782,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.08,
286
+ "grad_norm": 2.283113956451416,
287
+ "learning_rate": 4e-05,
288
+ "loss": 0.8778,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.082,
293
+ "grad_norm": 0.4340592324733734,
294
+ "learning_rate": 4.1e-05,
295
+ "loss": 0.8344,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.084,
300
+ "grad_norm": 0.7569859623908997,
301
+ "learning_rate": 4.2e-05,
302
+ "loss": 0.5733,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.086,
307
+ "grad_norm": 0.17662915587425232,
308
+ "learning_rate": 4.3e-05,
309
+ "loss": 0.6683,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.088,
314
+ "grad_norm": 0.0,
315
+ "learning_rate": 4.4000000000000006e-05,
316
+ "loss": 3.1046,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.09,
321
+ "grad_norm": 0.09121380746364594,
322
+ "learning_rate": 4.5e-05,
323
+ "loss": 0.981,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.092,
328
+ "grad_norm": 5.895723342895508,
329
+ "learning_rate": 4.600000000000001e-05,
330
+ "loss": 2.1118,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.094,
335
+ "grad_norm": 0.0,
336
+ "learning_rate": 4.7e-05,
337
+ "loss": 0.3037,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.096,
342
+ "grad_norm": 2.7312169075012207,
343
+ "learning_rate": 4.8e-05,
344
+ "loss": 1.2638,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.098,
349
+ "grad_norm": 39.255226135253906,
350
+ "learning_rate": 4.9e-05,
351
+ "loss": 2.3644,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 0.1,
356
+ "grad_norm": 1.8758808374404907,
357
+ "learning_rate": 5e-05,
358
+ "loss": 0.8317,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 0.102,
363
+ "grad_norm": 14.16323184967041,
364
+ "learning_rate": 4.999939076763487e-05,
365
+ "loss": 2.1864,
366
+ "step": 510
367
+ },
368
+ {
369
+ "epoch": 0.104,
370
+ "grad_norm": 0.8826745748519897,
371
+ "learning_rate": 4.999756310023261e-05,
372
+ "loss": 1.3502,
373
+ "step": 520
374
+ },
375
+ {
376
+ "epoch": 0.106,
377
+ "grad_norm": 1.1676051616668701,
378
+ "learning_rate": 4.999451708687114e-05,
379
+ "loss": 1.161,
380
+ "step": 530
381
+ },
382
+ {
383
+ "epoch": 0.108,
384
+ "grad_norm": 1.338937759399414,
385
+ "learning_rate": 4.999025287600886e-05,
386
+ "loss": 0.7291,
387
+ "step": 540
388
+ },
389
+ {
390
+ "epoch": 0.11,
391
+ "grad_norm": 0.0,
392
+ "learning_rate": 4.99847706754774e-05,
393
+ "loss": 0.4316,
394
+ "step": 550
395
+ },
396
+ {
397
+ "epoch": 0.112,
398
+ "grad_norm": 1.1105668544769287,
399
+ "learning_rate": 4.997807075247146e-05,
400
+ "loss": 1.2009,
401
+ "step": 560
402
+ },
403
+ {
404
+ "epoch": 0.114,
405
+ "grad_norm": 4.630878448486328,
406
+ "learning_rate": 4.997015343353585e-05,
407
+ "loss": 0.5649,
408
+ "step": 570
409
+ },
410
+ {
411
+ "epoch": 0.116,
412
+ "grad_norm": 0.0,
413
+ "learning_rate": 4.996101910454953e-05,
414
+ "loss": 0.4128,
415
+ "step": 580
416
+ },
417
+ {
418
+ "epoch": 0.118,
419
+ "grad_norm": 3.0937798023223877,
420
+ "learning_rate": 4.995066821070679e-05,
421
+ "loss": 0.977,
422
+ "step": 590
423
+ },
424
+ {
425
+ "epoch": 0.12,
426
+ "grad_norm": 0.31115856766700745,
427
+ "learning_rate": 4.993910125649561e-05,
428
+ "loss": 1.3527,
429
+ "step": 600
430
+ },
431
+ {
432
+ "epoch": 0.122,
433
+ "grad_norm": 2.9121241569519043,
434
+ "learning_rate": 4.992631880567301e-05,
435
+ "loss": 0.6522,
436
+ "step": 610
437
+ },
438
+ {
439
+ "epoch": 0.124,
440
+ "grad_norm": 0.5493115782737732,
441
+ "learning_rate": 4.991232148123761e-05,
442
+ "loss": 0.7698,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 0.126,
447
+ "grad_norm": 0.0,
448
+ "learning_rate": 4.989710996539926e-05,
449
+ "loss": 0.9953,
450
+ "step": 630
451
+ },
452
+ {
453
+ "epoch": 0.128,
454
+ "grad_norm": 31.73255729675293,
455
+ "learning_rate": 4.988068499954578e-05,
456
+ "loss": 0.8877,
457
+ "step": 640
458
+ },
459
+ {
460
+ "epoch": 0.13,
461
+ "grad_norm": 0.3843584954738617,
462
+ "learning_rate": 4.9863047384206835e-05,
463
+ "loss": 0.564,
464
+ "step": 650
465
+ },
466
+ {
467
+ "epoch": 0.132,
468
+ "grad_norm": 0.27858206629753113,
469
+ "learning_rate": 4.984419797901491e-05,
470
+ "loss": 0.451,
471
+ "step": 660
472
+ },
473
+ {
474
+ "epoch": 0.134,
475
+ "grad_norm": 0.0,
476
+ "learning_rate": 4.982413770266342e-05,
477
+ "loss": 1.5067,
478
+ "step": 670
479
+ },
480
+ {
481
+ "epoch": 0.136,
482
+ "grad_norm": 1.5187193155288696,
483
+ "learning_rate": 4.980286753286195e-05,
484
+ "loss": 1.6702,
485
+ "step": 680
486
+ },
487
+ {
488
+ "epoch": 0.138,
489
+ "grad_norm": 0.8326213359832764,
490
+ "learning_rate": 4.978038850628854e-05,
491
+ "loss": 0.7115,
492
+ "step": 690
493
+ },
494
+ {
495
+ "epoch": 0.14,
496
+ "grad_norm": 7.2340898513793945,
497
+ "learning_rate": 4.975670171853926e-05,
498
+ "loss": 0.9633,
499
+ "step": 700
500
+ },
501
+ {
502
+ "epoch": 0.142,
503
+ "grad_norm": 1.582497477531433,
504
+ "learning_rate": 4.9731808324074717e-05,
505
+ "loss": 1.1906,
506
+ "step": 710
507
+ },
508
+ {
509
+ "epoch": 0.144,
510
+ "grad_norm": 2.172502040863037,
511
+ "learning_rate": 4.9705709536163824e-05,
512
+ "loss": 1.7433,
513
+ "step": 720
514
+ },
515
+ {
516
+ "epoch": 0.146,
517
+ "grad_norm": 1.8554096221923828,
518
+ "learning_rate": 4.96784066268247e-05,
519
+ "loss": 0.483,
520
+ "step": 730
521
+ },
522
+ {
523
+ "epoch": 0.148,
524
+ "grad_norm": 2.1049275398254395,
525
+ "learning_rate": 4.964990092676263e-05,
526
+ "loss": 1.0321,
527
+ "step": 740
528
+ },
529
+ {
530
+ "epoch": 0.15,
531
+ "grad_norm": 2.521188735961914,
532
+ "learning_rate": 4.962019382530521e-05,
533
+ "loss": 1.0468,
534
+ "step": 750
535
+ },
536
+ {
537
+ "epoch": 0.152,
538
+ "grad_norm": 0.0,
539
+ "learning_rate": 4.9589286770334654e-05,
540
+ "loss": 0.5741,
541
+ "step": 760
542
+ },
543
+ {
544
+ "epoch": 0.154,
545
+ "grad_norm": 0.0,
546
+ "learning_rate": 4.9557181268217227e-05,
547
+ "loss": 0.8734,
548
+ "step": 770
549
+ },
550
+ {
551
+ "epoch": 0.156,
552
+ "grad_norm": 0.5660704970359802,
553
+ "learning_rate": 4.952387888372979e-05,
554
+ "loss": 2.3025,
555
+ "step": 780
556
+ },
557
+ {
558
+ "epoch": 0.158,
559
+ "grad_norm": 0.5353797078132629,
560
+ "learning_rate": 4.94893812399836e-05,
561
+ "loss": 1.3188,
562
+ "step": 790
563
+ },
564
+ {
565
+ "epoch": 0.16,
566
+ "grad_norm": 0.0,
567
+ "learning_rate": 4.9453690018345144e-05,
568
+ "loss": 2.0348,
569
+ "step": 800
570
+ },
571
+ {
572
+ "epoch": 0.162,
573
+ "grad_norm": 0.0,
574
+ "learning_rate": 4.94168069583542e-05,
575
+ "loss": 1.8253,
576
+ "step": 810
577
+ },
578
+ {
579
+ "epoch": 0.164,
580
+ "grad_norm": 1.0347248315811157,
581
+ "learning_rate": 4.937873385763908e-05,
582
+ "loss": 0.7857,
583
+ "step": 820
584
+ },
585
+ {
586
+ "epoch": 0.166,
587
+ "grad_norm": 0.0,
588
+ "learning_rate": 4.933947257182901e-05,
589
+ "loss": 1.6698,
590
+ "step": 830
591
+ },
592
+ {
593
+ "epoch": 0.168,
594
+ "grad_norm": 0.3763255774974823,
595
+ "learning_rate": 4.929902501446366e-05,
596
+ "loss": 1.3913,
597
+ "step": 840
598
+ },
599
+ {
600
+ "epoch": 0.17,
601
+ "grad_norm": 4.2643327713012695,
602
+ "learning_rate": 4.925739315689991e-05,
603
+ "loss": 1.0676,
604
+ "step": 850
605
+ },
606
+ {
607
+ "epoch": 0.172,
608
+ "grad_norm": 9.34708023071289,
609
+ "learning_rate": 4.9214579028215776e-05,
610
+ "loss": 1.6685,
611
+ "step": 860
612
+ },
613
+ {
614
+ "epoch": 0.174,
615
+ "grad_norm": 0.23915188014507294,
616
+ "learning_rate": 4.917058471511149e-05,
617
+ "loss": 1.4978,
618
+ "step": 870
619
+ },
620
+ {
621
+ "epoch": 0.176,
622
+ "grad_norm": 0.6251460909843445,
623
+ "learning_rate": 4.912541236180779e-05,
624
+ "loss": 8.4068,
625
+ "step": 880
626
+ },
627
+ {
628
+ "epoch": 0.178,
629
+ "grad_norm": 6.69281005859375,
630
+ "learning_rate": 4.907906416994146e-05,
631
+ "loss": 1.0415,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 0.18,
636
+ "grad_norm": 0.18685762584209442,
637
+ "learning_rate": 4.9031542398457974e-05,
638
+ "loss": 1.7801,
639
+ "step": 900
640
+ },
641
+ {
642
+ "epoch": 0.182,
643
+ "grad_norm": 0.0,
644
+ "learning_rate": 4.898284936350144e-05,
645
+ "loss": 0.4855,
646
+ "step": 910
647
+ },
648
+ {
649
+ "epoch": 0.184,
650
+ "grad_norm": 0.500586986541748,
651
+ "learning_rate": 4.893298743830168e-05,
652
+ "loss": 0.6015,
653
+ "step": 920
654
+ },
655
+ {
656
+ "epoch": 0.186,
657
+ "grad_norm": 0.0,
658
+ "learning_rate": 4.888195905305859e-05,
659
+ "loss": 0.7006,
660
+ "step": 930
661
+ },
662
+ {
663
+ "epoch": 0.188,
664
+ "grad_norm": 0.9140529036521912,
665
+ "learning_rate": 4.882976669482367e-05,
666
+ "loss": 0.3036,
667
+ "step": 940
668
+ },
669
+ {
670
+ "epoch": 0.19,
671
+ "grad_norm": 0.0,
672
+ "learning_rate": 4.877641290737884e-05,
673
+ "loss": 0.2464,
674
+ "step": 950
675
+ },
676
+ {
677
+ "epoch": 0.192,
678
+ "grad_norm": 0.0,
679
+ "learning_rate": 4.8721900291112415e-05,
680
+ "loss": 0.4622,
681
+ "step": 960
682
+ },
683
+ {
684
+ "epoch": 0.194,
685
+ "grad_norm": 0.4619162082672119,
686
+ "learning_rate": 4.8666231502892415e-05,
687
+ "loss": 0.5846,
688
+ "step": 970
689
+ },
690
+ {
691
+ "epoch": 0.196,
692
+ "grad_norm": 0.39642179012298584,
693
+ "learning_rate": 4.860940925593703e-05,
694
+ "loss": 0.5897,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 0.198,
699
+ "grad_norm": 0.0,
700
+ "learning_rate": 4.855143631968242e-05,
701
+ "loss": 0.6564,
702
+ "step": 990
703
+ },
704
+ {
705
+ "epoch": 0.2,
706
+ "grad_norm": 3.3878490924835205,
707
+ "learning_rate": 4.849231551964771e-05,
708
+ "loss": 0.6761,
709
+ "step": 1000
710
+ },
711
+ {
712
+ "epoch": 0.202,
713
+ "grad_norm": 1.3075069189071655,
714
+ "learning_rate": 4.843204973729729e-05,
715
+ "loss": 0.9705,
716
+ "step": 1010
717
+ },
718
+ {
719
+ "epoch": 0.204,
720
+ "grad_norm": 1.7418557405471802,
721
+ "learning_rate": 4.837064190990036e-05,
722
+ "loss": 0.6534,
723
+ "step": 1020
724
+ },
725
+ {
726
+ "epoch": 0.206,
727
+ "grad_norm": 0.7898538708686829,
728
+ "learning_rate": 4.830809503038781e-05,
729
+ "loss": 1.8363,
730
+ "step": 1030
731
+ },
732
+ {
733
+ "epoch": 0.208,
734
+ "grad_norm": 6.782915115356445,
735
+ "learning_rate": 4.8244412147206284e-05,
736
+ "loss": 1.5076,
737
+ "step": 1040
738
+ },
739
+ {
740
+ "epoch": 0.21,
741
+ "grad_norm": 0.13786576688289642,
742
+ "learning_rate": 4.817959636416969e-05,
743
+ "loss": 0.9317,
744
+ "step": 1050
745
+ },
746
+ {
747
+ "epoch": 0.212,
748
+ "grad_norm": 1.196662425994873,
749
+ "learning_rate": 4.8113650840307834e-05,
750
+ "loss": 1.2908,
751
+ "step": 1060
752
+ },
753
+ {
754
+ "epoch": 0.214,
755
+ "grad_norm": 3.733067750930786,
756
+ "learning_rate": 4.8046578789712515e-05,
757
+ "loss": 0.5742,
758
+ "step": 1070
759
+ },
760
+ {
761
+ "epoch": 0.216,
762
+ "grad_norm": 0.0,
763
+ "learning_rate": 4.797838348138086e-05,
764
+ "loss": 1.2138,
765
+ "step": 1080
766
+ },
767
+ {
768
+ "epoch": 0.218,
769
+ "grad_norm": 10.585489273071289,
770
+ "learning_rate": 4.790906823905599e-05,
771
+ "loss": 1.9534,
772
+ "step": 1090
773
+ },
774
+ {
775
+ "epoch": 0.22,
776
+ "grad_norm": 3.255509853363037,
777
+ "learning_rate": 4.783863644106502e-05,
778
+ "loss": 1.2392,
779
+ "step": 1100
780
+ },
781
+ {
782
+ "epoch": 0.222,
783
+ "grad_norm": 1.0870699882507324,
784
+ "learning_rate": 4.776709152015443e-05,
785
+ "loss": 0.7664,
786
+ "step": 1110
787
+ },
788
+ {
789
+ "epoch": 0.224,
790
+ "grad_norm": 0.0,
791
+ "learning_rate": 4.769443696332272e-05,
792
+ "loss": 0.6328,
793
+ "step": 1120
794
+ },
795
+ {
796
+ "epoch": 0.226,
797
+ "grad_norm": 12.5580472946167,
798
+ "learning_rate": 4.762067631165049e-05,
799
+ "loss": 1.3397,
800
+ "step": 1130
801
+ },
802
+ {
803
+ "epoch": 0.228,
804
+ "grad_norm": 4.46583890914917,
805
+ "learning_rate": 4.754581316012785e-05,
806
+ "loss": 0.5316,
807
+ "step": 1140
808
+ },
809
+ {
810
+ "epoch": 0.23,
811
+ "grad_norm": 0.9223441481590271,
812
+ "learning_rate": 4.7469851157479177e-05,
813
+ "loss": 1.7526,
814
+ "step": 1150
815
+ },
816
+ {
817
+ "epoch": 0.232,
818
+ "grad_norm": 0.2915666997432709,
819
+ "learning_rate": 4.7392794005985326e-05,
820
+ "loss": 1.6985,
821
+ "step": 1160
822
+ },
823
+ {
824
+ "epoch": 0.234,
825
+ "grad_norm": 0.0,
826
+ "learning_rate": 4.731464546130314e-05,
827
+ "loss": 1.7021,
828
+ "step": 1170
829
+ },
830
+ {
831
+ "epoch": 0.236,
832
+ "grad_norm": 3.659092426300049,
833
+ "learning_rate": 4.723540933228244e-05,
834
+ "loss": 0.6692,
835
+ "step": 1180
836
+ },
837
+ {
838
+ "epoch": 0.238,
839
+ "grad_norm": 2.8215394020080566,
840
+ "learning_rate": 4.715508948078037e-05,
841
+ "loss": 0.8183,
842
+ "step": 1190
843
+ },
844
+ {
845
+ "epoch": 0.24,
846
+ "grad_norm": 1.1959506273269653,
847
+ "learning_rate": 4.707368982147318e-05,
848
+ "loss": 0.7391,
849
+ "step": 1200
850
+ },
851
+ {
852
+ "epoch": 0.242,
853
+ "grad_norm": 2.793842315673828,
854
+ "learning_rate": 4.6991214321665414e-05,
855
+ "loss": 1.0601,
856
+ "step": 1210
857
+ },
858
+ {
859
+ "epoch": 0.244,
860
+ "grad_norm": 0.8359806537628174,
861
+ "learning_rate": 4.690766700109659e-05,
862
+ "loss": 0.6689,
863
+ "step": 1220
864
+ },
865
+ {
866
+ "epoch": 0.246,
867
+ "grad_norm": 2.5736119747161865,
868
+ "learning_rate": 4.682305193174524e-05,
869
+ "loss": 1.2384,
870
+ "step": 1230
871
+ },
872
+ {
873
+ "epoch": 0.248,
874
+ "grad_norm": 0.0,
875
+ "learning_rate": 4.6737373237630476e-05,
876
+ "loss": 0.5366,
877
+ "step": 1240
878
+ },
879
+ {
880
+ "epoch": 0.25,
881
+ "grad_norm": 0.0,
882
+ "learning_rate": 4.665063509461097e-05,
883
+ "loss": 0.9924,
884
+ "step": 1250
885
+ },
886
+ {
887
+ "epoch": 0.252,
888
+ "grad_norm": 2.5116872787475586,
889
+ "learning_rate": 4.656284173018144e-05,
890
+ "loss": 1.1548,
891
+ "step": 1260
892
+ },
893
+ {
894
+ "epoch": 0.254,
895
+ "grad_norm": 0.1771220713853836,
896
+ "learning_rate": 4.6473997423266614e-05,
897
+ "loss": 0.798,
898
+ "step": 1270
899
+ },
900
+ {
901
+ "epoch": 0.256,
902
+ "grad_norm": 3.1786563396453857,
903
+ "learning_rate": 4.638410650401267e-05,
904
+ "loss": 0.8444,
905
+ "step": 1280
906
+ },
907
+ {
908
+ "epoch": 0.258,
909
+ "grad_norm": 2.526992082595825,
910
+ "learning_rate": 4.629317335357619e-05,
911
+ "loss": 1.4516,
912
+ "step": 1290
913
+ },
914
+ {
915
+ "epoch": 0.26,
916
+ "grad_norm": 1.599918007850647,
917
+ "learning_rate": 4.620120240391065e-05,
918
+ "loss": 0.4612,
919
+ "step": 1300
920
+ },
921
+ {
922
+ "epoch": 0.262,
923
+ "grad_norm": 0.9092065691947937,
924
+ "learning_rate": 4.610819813755038e-05,
925
+ "loss": 0.8674,
926
+ "step": 1310
927
+ },
928
+ {
929
+ "epoch": 0.264,
930
+ "grad_norm": 1.3519505262374878,
931
+ "learning_rate": 4.601416508739211e-05,
932
+ "loss": 0.8115,
933
+ "step": 1320
934
+ },
935
+ {
936
+ "epoch": 0.266,
937
+ "grad_norm": 2.8794472217559814,
938
+ "learning_rate": 4.591910783647404e-05,
939
+ "loss": 0.4957,
940
+ "step": 1330
941
+ },
942
+ {
943
+ "epoch": 0.268,
944
+ "grad_norm": 3.0845656394958496,
945
+ "learning_rate": 4.5823031017752485e-05,
946
+ "loss": 0.862,
947
+ "step": 1340
948
+ },
949
+ {
950
+ "epoch": 0.27,
951
+ "grad_norm": 0.0,
952
+ "learning_rate": 4.572593931387604e-05,
953
+ "loss": 0.2812,
954
+ "step": 1350
955
+ },
956
+ {
957
+ "epoch": 0.272,
958
+ "grad_norm": 0.0,
959
+ "learning_rate": 4.562783745695738e-05,
960
+ "loss": 2.1906,
961
+ "step": 1360
962
+ },
963
+ {
964
+ "epoch": 0.274,
965
+ "grad_norm": 8.815956115722656,
966
+ "learning_rate": 4.5528730228342605e-05,
967
+ "loss": 0.9072,
968
+ "step": 1370
969
+ },
970
+ {
971
+ "epoch": 0.276,
972
+ "grad_norm": 1.1590182781219482,
973
+ "learning_rate": 4.542862245837821e-05,
974
+ "loss": 0.5203,
975
+ "step": 1380
976
+ },
977
+ {
978
+ "epoch": 0.278,
979
+ "grad_norm": 3.897277593612671,
980
+ "learning_rate": 4.532751902617569e-05,
981
+ "loss": 0.8603,
982
+ "step": 1390
983
+ },
984
+ {
985
+ "epoch": 0.28,
986
+ "grad_norm": 7.185044288635254,
987
+ "learning_rate": 4.522542485937369e-05,
988
+ "loss": 0.7643,
989
+ "step": 1400
990
+ },
991
+ {
992
+ "epoch": 0.282,
993
+ "grad_norm": 2.3547818660736084,
994
+ "learning_rate": 4.512234493389785e-05,
995
+ "loss": 0.8807,
996
+ "step": 1410
997
+ },
998
+ {
999
+ "epoch": 0.284,
1000
+ "grad_norm": 4.687290668487549,
1001
+ "learning_rate": 4.5018284273718336e-05,
1002
+ "loss": 0.8509,
1003
+ "step": 1420
1004
+ },
1005
+ {
1006
+ "epoch": 0.286,
1007
+ "grad_norm": 0.5399876832962036,
1008
+ "learning_rate": 4.491324795060491e-05,
1009
+ "loss": 1.7827,
1010
+ "step": 1430
1011
+ },
1012
+ {
1013
+ "epoch": 0.288,
1014
+ "grad_norm": 3.768984794616699,
1015
+ "learning_rate": 4.480724108387977e-05,
1016
+ "loss": 0.7381,
1017
+ "step": 1440
1018
+ },
1019
+ {
1020
+ "epoch": 0.29,
1021
+ "grad_norm": 4.747011184692383,
1022
+ "learning_rate": 4.4700268840168045e-05,
1023
+ "loss": 1.691,
1024
+ "step": 1450
1025
+ },
1026
+ {
1027
+ "epoch": 0.292,
1028
+ "grad_norm": 1.8903411626815796,
1029
+ "learning_rate": 4.4592336433146e-05,
1030
+ "loss": 2.0687,
1031
+ "step": 1460
1032
+ },
1033
+ {
1034
+ "epoch": 0.294,
1035
+ "grad_norm": 2.1586012840270996,
1036
+ "learning_rate": 4.448344912328686e-05,
1037
+ "loss": 1.8049,
1038
+ "step": 1470
1039
+ },
1040
+ {
1041
+ "epoch": 0.296,
1042
+ "grad_norm": 0.43952643871307373,
1043
+ "learning_rate": 4.4373612217604496e-05,
1044
+ "loss": 0.8578,
1045
+ "step": 1480
1046
+ },
1047
+ {
1048
+ "epoch": 0.298,
1049
+ "grad_norm": 0.4807486832141876,
1050
+ "learning_rate": 4.426283106939474e-05,
1051
+ "loss": 1.169,
1052
+ "step": 1490
1053
+ },
1054
+ {
1055
+ "epoch": 0.3,
1056
+ "grad_norm": 0.5909303426742554,
1057
+ "learning_rate": 4.415111107797445e-05,
1058
+ "loss": 0.3027,
1059
+ "step": 1500
1060
+ },
1061
+ {
1062
+ "epoch": 0.302,
1063
+ "grad_norm": 7.744114398956299,
1064
+ "learning_rate": 4.403845768841842e-05,
1065
+ "loss": 1.0326,
1066
+ "step": 1510
1067
+ },
1068
+ {
1069
+ "epoch": 0.304,
1070
+ "grad_norm": 0.0,
1071
+ "learning_rate": 4.3924876391293915e-05,
1072
+ "loss": 1.0785,
1073
+ "step": 1520
1074
+ },
1075
+ {
1076
+ "epoch": 0.306,
1077
+ "grad_norm": 11.178783416748047,
1078
+ "learning_rate": 4.381037272239311e-05,
1079
+ "loss": 1.4215,
1080
+ "step": 1530
1081
+ },
1082
+ {
1083
+ "epoch": 0.308,
1084
+ "grad_norm": 0.0,
1085
+ "learning_rate": 4.36949522624633e-05,
1086
+ "loss": 0.4891,
1087
+ "step": 1540
1088
+ },
1089
+ {
1090
+ "epoch": 0.31,
1091
+ "grad_norm": 0.0,
1092
+ "learning_rate": 4.357862063693486e-05,
1093
+ "loss": 1.3672,
1094
+ "step": 1550
1095
+ },
1096
+ {
1097
+ "epoch": 0.312,
1098
+ "grad_norm": 8.229108810424805,
1099
+ "learning_rate": 4.3461383515647106e-05,
1100
+ "loss": 1.0202,
1101
+ "step": 1560
1102
+ },
1103
+ {
1104
+ "epoch": 0.314,
1105
+ "grad_norm": 1.5745594501495361,
1106
+ "learning_rate": 4.334324661257191e-05,
1107
+ "loss": 0.9313,
1108
+ "step": 1570
1109
+ },
1110
+ {
1111
+ "epoch": 0.316,
1112
+ "grad_norm": 2.392512559890747,
1113
+ "learning_rate": 4.3224215685535294e-05,
1114
+ "loss": 0.4453,
1115
+ "step": 1580
1116
+ },
1117
+ {
1118
+ "epoch": 0.318,
1119
+ "grad_norm": 0.0,
1120
+ "learning_rate": 4.3104296535936695e-05,
1121
+ "loss": 1.694,
1122
+ "step": 1590
1123
+ },
1124
+ {
1125
+ "epoch": 0.32,
1126
+ "grad_norm": 0.782818615436554,
1127
+ "learning_rate": 4.2983495008466276e-05,
1128
+ "loss": 1.4264,
1129
+ "step": 1600
1130
+ },
1131
+ {
1132
+ "epoch": 0.322,
1133
+ "grad_norm": 2.611419916152954,
1134
+ "learning_rate": 4.2861816990820084e-05,
1135
+ "loss": 0.5798,
1136
+ "step": 1610
1137
+ },
1138
+ {
1139
+ "epoch": 0.324,
1140
+ "grad_norm": 1.1155253648757935,
1141
+ "learning_rate": 4.273926841341302e-05,
1142
+ "loss": 0.6301,
1143
+ "step": 1620
1144
+ },
1145
+ {
1146
+ "epoch": 0.326,
1147
+ "grad_norm": 3.934415102005005,
1148
+ "learning_rate": 4.261585524908987e-05,
1149
+ "loss": 0.9712,
1150
+ "step": 1630
1151
+ },
1152
+ {
1153
+ "epoch": 0.328,
1154
+ "grad_norm": 3.1011881828308105,
1155
+ "learning_rate": 4.249158351283414e-05,
1156
+ "loss": 0.7751,
1157
+ "step": 1640
1158
+ },
1159
+ {
1160
+ "epoch": 0.33,
1161
+ "grad_norm": 5.468195915222168,
1162
+ "learning_rate": 4.2366459261474933e-05,
1163
+ "loss": 0.5724,
1164
+ "step": 1650
1165
+ },
1166
+ {
1167
+ "epoch": 0.332,
1168
+ "grad_norm": 0.4597141742706299,
1169
+ "learning_rate": 4.224048859339175e-05,
1170
+ "loss": 0.9216,
1171
+ "step": 1660
1172
+ },
1173
+ {
1174
+ "epoch": 0.334,
1175
+ "grad_norm": 0.0,
1176
+ "learning_rate": 4.211367764821722e-05,
1177
+ "loss": 0.5461,
1178
+ "step": 1670
1179
+ },
1180
+ {
1181
+ "epoch": 0.336,
1182
+ "grad_norm": 1.2237460613250732,
1183
+ "learning_rate": 4.198603260653792e-05,
1184
+ "loss": 1.0833,
1185
+ "step": 1680
1186
+ },
1187
+ {
1188
+ "epoch": 0.338,
1189
+ "grad_norm": 28.4909725189209,
1190
+ "learning_rate": 4.185755968959308e-05,
1191
+ "loss": 1.6468,
1192
+ "step": 1690
1193
+ },
1194
+ {
1195
+ "epoch": 0.34,
1196
+ "grad_norm": 0.7980864644050598,
1197
+ "learning_rate": 4.172826515897146e-05,
1198
+ "loss": 0.6583,
1199
+ "step": 1700
1200
+ },
1201
+ {
1202
+ "epoch": 0.342,
1203
+ "grad_norm": 1.208788275718689,
1204
+ "learning_rate": 4.1598155316306044e-05,
1205
+ "loss": 1.1721,
1206
+ "step": 1710
1207
+ },
1208
+ {
1209
+ "epoch": 0.344,
1210
+ "grad_norm": 0.0,
1211
+ "learning_rate": 4.146723650296701e-05,
1212
+ "loss": 0.8588,
1213
+ "step": 1720
1214
+ },
1215
+ {
1216
+ "epoch": 0.346,
1217
+ "grad_norm": 0.0,
1218
+ "learning_rate": 4.133551509975264e-05,
1219
+ "loss": 0.4645,
1220
+ "step": 1730
1221
+ },
1222
+ {
1223
+ "epoch": 0.348,
1224
+ "grad_norm": 0.0,
1225
+ "learning_rate": 4.1202997526578276e-05,
1226
+ "loss": 1.4741,
1227
+ "step": 1740
1228
+ },
1229
+ {
1230
+ "epoch": 0.35,
1231
+ "grad_norm": 0.33904269337654114,
1232
+ "learning_rate": 4.1069690242163484e-05,
1233
+ "loss": 0.9873,
1234
+ "step": 1750
1235
+ },
1236
+ {
1237
+ "epoch": 0.352,
1238
+ "grad_norm": 0.7592562437057495,
1239
+ "learning_rate": 4.093559974371725e-05,
1240
+ "loss": 0.6202,
1241
+ "step": 1760
1242
+ },
1243
+ {
1244
+ "epoch": 0.354,
1245
+ "grad_norm": 0.0,
1246
+ "learning_rate": 4.080073256662127e-05,
1247
+ "loss": 0.7872,
1248
+ "step": 1770
1249
+ },
1250
+ {
1251
+ "epoch": 0.356,
1252
+ "grad_norm": 0.42382943630218506,
1253
+ "learning_rate": 4.066509528411152e-05,
1254
+ "loss": 1.9155,
1255
+ "step": 1780
1256
+ },
1257
+ {
1258
+ "epoch": 0.358,
1259
+ "grad_norm": 0.7281541228294373,
1260
+ "learning_rate": 4.052869450695776e-05,
1261
+ "loss": 0.5979,
1262
+ "step": 1790
1263
+ },
1264
+ {
1265
+ "epoch": 0.36,
1266
+ "grad_norm": 2.018998146057129,
1267
+ "learning_rate": 4.039153688314145e-05,
1268
+ "loss": 0.8478,
1269
+ "step": 1800
1270
+ },
1271
+ {
1272
+ "epoch": 0.362,
1273
+ "grad_norm": 0.34330451488494873,
1274
+ "learning_rate": 4.02536290975317e-05,
1275
+ "loss": 0.623,
1276
+ "step": 1810
1277
+ },
1278
+ {
1279
+ "epoch": 0.364,
1280
+ "grad_norm": 0.0,
1281
+ "learning_rate": 4.011497787155938e-05,
1282
+ "loss": 1.4658,
1283
+ "step": 1820
1284
+ },
1285
+ {
1286
+ "epoch": 0.366,
1287
+ "grad_norm": 0.0,
1288
+ "learning_rate": 3.997558996288965e-05,
1289
+ "loss": 1.9824,
1290
+ "step": 1830
1291
+ },
1292
+ {
1293
+ "epoch": 0.368,
1294
+ "grad_norm": 0.0,
1295
+ "learning_rate": 3.983547216509254e-05,
1296
+ "loss": 0.626,
1297
+ "step": 1840
1298
+ },
1299
+ {
1300
+ "epoch": 0.37,
1301
+ "grad_norm": 0.4369350075721741,
1302
+ "learning_rate": 3.969463130731183e-05,
1303
+ "loss": 0.5819,
1304
+ "step": 1850
1305
+ },
1306
+ {
1307
+ "epoch": 0.372,
1308
+ "grad_norm": 0.36127737164497375,
1309
+ "learning_rate": 3.955307425393224e-05,
1310
+ "loss": 0.7553,
1311
+ "step": 1860
1312
+ },
1313
+ {
1314
+ "epoch": 0.374,
1315
+ "grad_norm": 19.533721923828125,
1316
+ "learning_rate": 3.941080790424484e-05,
1317
+ "loss": 1.3426,
1318
+ "step": 1870
1319
+ },
1320
+ {
1321
+ "epoch": 0.376,
1322
+ "grad_norm": 1.5646860599517822,
1323
+ "learning_rate": 3.92678391921108e-05,
1324
+ "loss": 0.8386,
1325
+ "step": 1880
1326
+ },
1327
+ {
1328
+ "epoch": 0.378,
1329
+ "grad_norm": 0.8158350586891174,
1330
+ "learning_rate": 3.912417508562345e-05,
1331
+ "loss": 0.5088,
1332
+ "step": 1890
1333
+ },
1334
+ {
1335
+ "epoch": 0.38,
1336
+ "grad_norm": 9.138282775878906,
1337
+ "learning_rate": 3.897982258676867e-05,
1338
+ "loss": 2.6207,
1339
+ "step": 1900
1340
+ },
1341
+ {
1342
+ "epoch": 0.382,
1343
+ "grad_norm": 1.7073054313659668,
1344
+ "learning_rate": 3.883478873108361e-05,
1345
+ "loss": 0.8114,
1346
+ "step": 1910
1347
+ },
1348
+ {
1349
+ "epoch": 0.384,
1350
+ "grad_norm": 0.0,
1351
+ "learning_rate": 3.868908058731376e-05,
1352
+ "loss": 0.489,
1353
+ "step": 1920
1354
+ },
1355
+ {
1356
+ "epoch": 0.386,
1357
+ "grad_norm": 0.0,
1358
+ "learning_rate": 3.85427052570685e-05,
1359
+ "loss": 0.6035,
1360
+ "step": 1930
1361
+ },
1362
+ {
1363
+ "epoch": 0.388,
1364
+ "grad_norm": 0.5438594222068787,
1365
+ "learning_rate": 3.8395669874474915e-05,
1366
+ "loss": 0.9305,
1367
+ "step": 1940
1368
+ },
1369
+ {
1370
+ "epoch": 0.39,
1371
+ "grad_norm": 0.0,
1372
+ "learning_rate": 3.824798160583012e-05,
1373
+ "loss": 0.5244,
1374
+ "step": 1950
1375
+ },
1376
+ {
1377
+ "epoch": 0.392,
1378
+ "grad_norm": 0.6433933973312378,
1379
+ "learning_rate": 3.8099647649251986e-05,
1380
+ "loss": 1.4947,
1381
+ "step": 1960
1382
+ },
1383
+ {
1384
+ "epoch": 0.394,
1385
+ "grad_norm": 1.229801058769226,
1386
+ "learning_rate": 3.795067523432826e-05,
1387
+ "loss": 0.6931,
1388
+ "step": 1970
1389
+ },
1390
+ {
1391
+ "epoch": 0.396,
1392
+ "grad_norm": 4.736255645751953,
1393
+ "learning_rate": 3.780107162176429e-05,
1394
+ "loss": 0.8547,
1395
+ "step": 1980
1396
+ },
1397
+ {
1398
+ "epoch": 0.398,
1399
+ "grad_norm": 11.904961585998535,
1400
+ "learning_rate": 3.765084410302909e-05,
1401
+ "loss": 2.4946,
1402
+ "step": 1990
1403
+ },
1404
+ {
1405
+ "epoch": 0.4,
1406
+ "grad_norm": 6.879239082336426,
1407
+ "learning_rate": 3.7500000000000003e-05,
1408
+ "loss": 1.2444,
1409
+ "step": 2000
1410
+ },
1411
+ {
1412
+ "epoch": 0.402,
1413
+ "grad_norm": 0.3526920676231384,
1414
+ "learning_rate": 3.7348546664605777e-05,
1415
+ "loss": 1.2603,
1416
+ "step": 2010
1417
+ },
1418
+ {
1419
+ "epoch": 0.404,
1420
+ "grad_norm": 0.6010252237319946,
1421
+ "learning_rate": 3.719649147846832e-05,
1422
+ "loss": 0.5348,
1423
+ "step": 2020
1424
+ },
1425
+ {
1426
+ "epoch": 0.406,
1427
+ "grad_norm": 2.7081878185272217,
1428
+ "learning_rate": 3.704384185254288e-05,
1429
+ "loss": 0.6968,
1430
+ "step": 2030
1431
+ },
1432
+ {
1433
+ "epoch": 0.408,
1434
+ "grad_norm": 79.92549133300781,
1435
+ "learning_rate": 3.689060522675689e-05,
1436
+ "loss": 2.8761,
1437
+ "step": 2040
1438
+ },
1439
+ {
1440
+ "epoch": 0.41,
1441
+ "grad_norm": 1.650651216506958,
1442
+ "learning_rate": 3.673678906964727e-05,
1443
+ "loss": 1.6509,
1444
+ "step": 2050
1445
+ },
1446
+ {
1447
+ "epoch": 0.412,
1448
+ "grad_norm": 7.615381240844727,
1449
+ "learning_rate": 3.6582400877996546e-05,
1450
+ "loss": 0.8742,
1451
+ "step": 2060
1452
+ },
1453
+ {
1454
+ "epoch": 0.414,
1455
+ "grad_norm": 1.0772336721420288,
1456
+ "learning_rate": 3.642744817646736e-05,
1457
+ "loss": 1.241,
1458
+ "step": 2070
1459
+ },
1460
+ {
1461
+ "epoch": 0.416,
1462
+ "grad_norm": 0.4477096498012543,
1463
+ "learning_rate": 3.627193851723577e-05,
1464
+ "loss": 0.6697,
1465
+ "step": 2080
1466
+ },
1467
+ {
1468
+ "epoch": 0.418,
1469
+ "grad_norm": 6.794370174407959,
1470
+ "learning_rate": 3.611587947962319e-05,
1471
+ "loss": 0.8601,
1472
+ "step": 2090
1473
+ },
1474
+ {
1475
+ "epoch": 0.42,
1476
+ "grad_norm": 0.0,
1477
+ "learning_rate": 3.5959278669726935e-05,
1478
+ "loss": 1.6495,
1479
+ "step": 2100
1480
+ },
1481
+ {
1482
+ "epoch": 0.422,
1483
+ "grad_norm": 6.150529384613037,
1484
+ "learning_rate": 3.580214372004956e-05,
1485
+ "loss": 1.1695,
1486
+ "step": 2110
1487
+ },
1488
+ {
1489
+ "epoch": 0.424,
1490
+ "grad_norm": 1.6611673831939697,
1491
+ "learning_rate": 3.564448228912682e-05,
1492
+ "loss": 2.1528,
1493
+ "step": 2120
1494
+ },
1495
+ {
1496
+ "epoch": 0.426,
1497
+ "grad_norm": 20.955806732177734,
1498
+ "learning_rate": 3.548630206115443e-05,
1499
+ "loss": 0.7449,
1500
+ "step": 2130
1501
+ },
1502
+ {
1503
+ "epoch": 0.428,
1504
+ "grad_norm": 11.42070198059082,
1505
+ "learning_rate": 3.532761074561355e-05,
1506
+ "loss": 0.7268,
1507
+ "step": 2140
1508
+ },
1509
+ {
1510
+ "epoch": 0.43,
1511
+ "grad_norm": 3.5283825397491455,
1512
+ "learning_rate": 3.516841607689501e-05,
1513
+ "loss": 0.7794,
1514
+ "step": 2150
1515
+ },
1516
+ {
1517
+ "epoch": 0.432,
1518
+ "grad_norm": 1.6688228845596313,
1519
+ "learning_rate": 3.5008725813922386e-05,
1520
+ "loss": 0.727,
1521
+ "step": 2160
1522
+ },
1523
+ {
1524
+ "epoch": 0.434,
1525
+ "grad_norm": 2.2399706840515137,
1526
+ "learning_rate": 3.484854773977378e-05,
1527
+ "loss": 2.1652,
1528
+ "step": 2170
1529
+ },
1530
+ {
1531
+ "epoch": 0.436,
1532
+ "grad_norm": 1.4572434425354004,
1533
+ "learning_rate": 3.4687889661302576e-05,
1534
+ "loss": 0.4095,
1535
+ "step": 2180
1536
+ },
1537
+ {
1538
+ "epoch": 0.438,
1539
+ "grad_norm": 9.194727897644043,
1540
+ "learning_rate": 3.452675940875686e-05,
1541
+ "loss": 0.8675,
1542
+ "step": 2190
1543
+ },
1544
+ {
1545
+ "epoch": 0.44,
1546
+ "grad_norm": 6.2736053466796875,
1547
+ "learning_rate": 3.436516483539781e-05,
1548
+ "loss": 0.8658,
1549
+ "step": 2200
1550
+ },
1551
+ {
1552
+ "epoch": 0.442,
1553
+ "grad_norm": 0.8128998875617981,
1554
+ "learning_rate": 3.4203113817116957e-05,
1555
+ "loss": 0.3739,
1556
+ "step": 2210
1557
+ },
1558
+ {
1559
+ "epoch": 0.444,
1560
+ "grad_norm": 3.398008346557617,
1561
+ "learning_rate": 3.4040614252052305e-05,
1562
+ "loss": 0.8909,
1563
+ "step": 2220
1564
+ },
1565
+ {
1566
+ "epoch": 0.446,
1567
+ "grad_norm": 0.40850016474723816,
1568
+ "learning_rate": 3.387767406020343e-05,
1569
+ "loss": 1.4721,
1570
+ "step": 2230
1571
+ },
1572
+ {
1573
+ "epoch": 0.448,
1574
+ "grad_norm": 6.545433521270752,
1575
+ "learning_rate": 3.3714301183045385e-05,
1576
+ "loss": 1.001,
1577
+ "step": 2240
1578
+ },
1579
+ {
1580
+ "epoch": 0.45,
1581
+ "grad_norm": 1.5666799545288086,
1582
+ "learning_rate": 3.355050358314172e-05,
1583
+ "loss": 0.7794,
1584
+ "step": 2250
1585
+ },
1586
+ {
1587
+ "epoch": 0.452,
1588
+ "grad_norm": 3.0852999687194824,
1589
+ "learning_rate": 3.338628924375638e-05,
1590
+ "loss": 0.3814,
1591
+ "step": 2260
1592
+ },
1593
+ {
1594
+ "epoch": 0.454,
1595
+ "grad_norm": 0.48218706250190735,
1596
+ "learning_rate": 3.322166616846458e-05,
1597
+ "loss": 0.5824,
1598
+ "step": 2270
1599
+ },
1600
+ {
1601
+ "epoch": 0.456,
1602
+ "grad_norm": 1.3498566150665283,
1603
+ "learning_rate": 3.305664238076278e-05,
1604
+ "loss": 1.3243,
1605
+ "step": 2280
1606
+ },
1607
+ {
1608
+ "epoch": 0.458,
1609
+ "grad_norm": 0.0,
1610
+ "learning_rate": 3.289122592367757e-05,
1611
+ "loss": 0.4481,
1612
+ "step": 2290
1613
+ },
1614
+ {
1615
+ "epoch": 0.46,
1616
+ "grad_norm": 0.0,
1617
+ "learning_rate": 3.272542485937369e-05,
1618
+ "loss": 0.5601,
1619
+ "step": 2300
1620
+ },
1621
+ {
1622
+ "epoch": 0.462,
1623
+ "grad_norm": 0.0,
1624
+ "learning_rate": 3.2559247268761115e-05,
1625
+ "loss": 0.6695,
1626
+ "step": 2310
1627
+ },
1628
+ {
1629
+ "epoch": 0.464,
1630
+ "grad_norm": 11.921672821044922,
1631
+ "learning_rate": 3.239270125110117e-05,
1632
+ "loss": 0.9525,
1633
+ "step": 2320
1634
+ },
1635
+ {
1636
+ "epoch": 0.466,
1637
+ "grad_norm": 1.5229754447937012,
1638
+ "learning_rate": 3.222579492361179e-05,
1639
+ "loss": 0.5488,
1640
+ "step": 2330
1641
+ },
1642
+ {
1643
+ "epoch": 0.468,
1644
+ "grad_norm": 12.575560569763184,
1645
+ "learning_rate": 3.205853642107192e-05,
1646
+ "loss": 0.8862,
1647
+ "step": 2340
1648
+ },
1649
+ {
1650
+ "epoch": 0.47,
1651
+ "grad_norm": 0.0,
1652
+ "learning_rate": 3.1890933895424976e-05,
1653
+ "loss": 1.1218,
1654
+ "step": 2350
1655
+ },
1656
+ {
1657
+ "epoch": 0.472,
1658
+ "grad_norm": 0.77229905128479,
1659
+ "learning_rate": 3.172299551538164e-05,
1660
+ "loss": 1.4365,
1661
+ "step": 2360
1662
+ },
1663
+ {
1664
+ "epoch": 0.474,
1665
+ "grad_norm": 0.7523584961891174,
1666
+ "learning_rate": 3.155472946602162e-05,
1667
+ "loss": 1.4355,
1668
+ "step": 2370
1669
+ },
1670
+ {
1671
+ "epoch": 0.476,
1672
+ "grad_norm": 2.284735918045044,
1673
+ "learning_rate": 3.138614394839476e-05,
1674
+ "loss": 1.6462,
1675
+ "step": 2380
1676
+ },
1677
+ {
1678
+ "epoch": 0.478,
1679
+ "grad_norm": 0.0,
1680
+ "learning_rate": 3.121724717912138e-05,
1681
+ "loss": 0.6112,
1682
+ "step": 2390
1683
+ },
1684
+ {
1685
+ "epoch": 0.48,
1686
+ "grad_norm": 1.5213804244995117,
1687
+ "learning_rate": 3.104804738999169e-05,
1688
+ "loss": 0.2338,
1689
+ "step": 2400
1690
+ },
1691
+ {
1692
+ "epoch": 0.482,
1693
+ "grad_norm": 4.481227874755859,
1694
+ "learning_rate": 3.087855282756475e-05,
1695
+ "loss": 0.5969,
1696
+ "step": 2410
1697
+ },
1698
+ {
1699
+ "epoch": 0.484,
1700
+ "grad_norm": 2.4558937549591064,
1701
+ "learning_rate": 3.0708771752766394e-05,
1702
+ "loss": 0.8462,
1703
+ "step": 2420
1704
+ },
1705
+ {
1706
+ "epoch": 0.486,
1707
+ "grad_norm": 4.428265571594238,
1708
+ "learning_rate": 3.053871244048669e-05,
1709
+ "loss": 0.4652,
1710
+ "step": 2430
1711
+ },
1712
+ {
1713
+ "epoch": 0.488,
1714
+ "grad_norm": 0.458535760641098,
1715
+ "learning_rate": 3.0368383179176585e-05,
1716
+ "loss": 0.212,
1717
+ "step": 2440
1718
+ },
1719
+ {
1720
+ "epoch": 0.49,
1721
+ "grad_norm": 0.5157924294471741,
1722
+ "learning_rate": 3.0197792270443982e-05,
1723
+ "loss": 0.697,
1724
+ "step": 2450
1725
+ },
1726
+ {
1727
+ "epoch": 0.492,
1728
+ "grad_norm": 0.37859025597572327,
1729
+ "learning_rate": 3.002694802864912e-05,
1730
+ "loss": 1.3351,
1731
+ "step": 2460
1732
+ },
1733
+ {
1734
+ "epoch": 0.494,
1735
+ "grad_norm": 4.0266242027282715,
1736
+ "learning_rate": 2.98558587804993e-05,
1737
+ "loss": 1.8147,
1738
+ "step": 2470
1739
+ },
1740
+ {
1741
+ "epoch": 0.496,
1742
+ "grad_norm": 9.819830894470215,
1743
+ "learning_rate": 2.9684532864643122e-05,
1744
+ "loss": 0.9978,
1745
+ "step": 2480
1746
+ },
1747
+ {
1748
+ "epoch": 0.498,
1749
+ "grad_norm": 10.189305305480957,
1750
+ "learning_rate": 2.9512978631264006e-05,
1751
+ "loss": 1.6129,
1752
+ "step": 2490
1753
+ },
1754
+ {
1755
+ "epoch": 0.5,
1756
+ "grad_norm": 0.588488757610321,
1757
+ "learning_rate": 2.9341204441673266e-05,
1758
+ "loss": 0.745,
1759
+ "step": 2500
1760
+ },
1761
+ {
1762
+ "epoch": 0.502,
1763
+ "grad_norm": 0.0,
1764
+ "learning_rate": 2.916921866790256e-05,
1765
+ "loss": 0.5694,
1766
+ "step": 2510
1767
+ },
1768
+ {
1769
+ "epoch": 0.504,
1770
+ "grad_norm": 1.1335722208023071,
1771
+ "learning_rate": 2.8997029692295874e-05,
1772
+ "loss": 0.453,
1773
+ "step": 2520
1774
+ },
1775
+ {
1776
+ "epoch": 0.506,
1777
+ "grad_norm": 3.6554768085479736,
1778
+ "learning_rate": 2.8824645907100954e-05,
1779
+ "loss": 0.7539,
1780
+ "step": 2530
1781
+ },
1782
+ {
1783
+ "epoch": 0.508,
1784
+ "grad_norm": 3.770594596862793,
1785
+ "learning_rate": 2.8652075714060295e-05,
1786
+ "loss": 0.5247,
1787
+ "step": 2540
1788
+ },
1789
+ {
1790
+ "epoch": 0.51,
1791
+ "grad_norm": 0.0,
1792
+ "learning_rate": 2.8479327524001636e-05,
1793
+ "loss": 0.7769,
1794
+ "step": 2550
1795
+ },
1796
+ {
1797
+ "epoch": 0.512,
1798
+ "grad_norm": 1.4518766403198242,
1799
+ "learning_rate": 2.8306409756428064e-05,
1800
+ "loss": 0.7204,
1801
+ "step": 2560
1802
+ },
1803
+ {
1804
+ "epoch": 0.514,
1805
+ "grad_norm": 5.560041427612305,
1806
+ "learning_rate": 2.8133330839107608e-05,
1807
+ "loss": 0.9135,
1808
+ "step": 2570
1809
+ },
1810
+ {
1811
+ "epoch": 0.516,
1812
+ "grad_norm": 0.0,
1813
+ "learning_rate": 2.7960099207662532e-05,
1814
+ "loss": 0.5892,
1815
+ "step": 2580
1816
+ },
1817
+ {
1818
+ "epoch": 0.518,
1819
+ "grad_norm": 7.593757152557373,
1820
+ "learning_rate": 2.7786723305158136e-05,
1821
+ "loss": 0.5568,
1822
+ "step": 2590
1823
+ },
1824
+ {
1825
+ "epoch": 0.52,
1826
+ "grad_norm": 1.4705710411071777,
1827
+ "learning_rate": 2.761321158169134e-05,
1828
+ "loss": 1.3712,
1829
+ "step": 2600
1830
+ },
1831
+ {
1832
+ "epoch": 0.522,
1833
+ "grad_norm": 2.1466057300567627,
1834
+ "learning_rate": 2.7439572493978736e-05,
1835
+ "loss": 0.9695,
1836
+ "step": 2610
1837
+ },
1838
+ {
1839
+ "epoch": 0.524,
1840
+ "grad_norm": 5.329553604125977,
1841
+ "learning_rate": 2.726581450494451e-05,
1842
+ "loss": 0.7138,
1843
+ "step": 2620
1844
+ },
1845
+ {
1846
+ "epoch": 0.526,
1847
+ "grad_norm": 3.9968855381011963,
1848
+ "learning_rate": 2.7091946083307896e-05,
1849
+ "loss": 1.0675,
1850
+ "step": 2630
1851
+ },
1852
+ {
1853
+ "epoch": 0.528,
1854
+ "grad_norm": 4.9435553550720215,
1855
+ "learning_rate": 2.6917975703170466e-05,
1856
+ "loss": 0.8781,
1857
+ "step": 2640
1858
+ },
1859
+ {
1860
+ "epoch": 0.53,
1861
+ "grad_norm": 1.5722167491912842,
1862
+ "learning_rate": 2.674391184360313e-05,
1863
+ "loss": 0.5354,
1864
+ "step": 2650
1865
+ },
1866
+ {
1867
+ "epoch": 0.532,
1868
+ "grad_norm": 0.7694377303123474,
1869
+ "learning_rate": 2.656976298823284e-05,
1870
+ "loss": 0.456,
1871
+ "step": 2660
1872
+ },
1873
+ {
1874
+ "epoch": 0.534,
1875
+ "grad_norm": 0.0,
1876
+ "learning_rate": 2.6395537624829096e-05,
1877
+ "loss": 2.0864,
1878
+ "step": 2670
1879
+ },
1880
+ {
1881
+ "epoch": 0.536,
1882
+ "grad_norm": 0.564000129699707,
1883
+ "learning_rate": 2.6221244244890336e-05,
1884
+ "loss": 0.8204,
1885
+ "step": 2680
1886
+ },
1887
+ {
1888
+ "epoch": 0.538,
1889
+ "grad_norm": 1.3732776641845703,
1890
+ "learning_rate": 2.604689134322999e-05,
1891
+ "loss": 0.6519,
1892
+ "step": 2690
1893
+ },
1894
+ {
1895
+ "epoch": 0.54,
1896
+ "grad_norm": 9.038244247436523,
1897
+ "learning_rate": 2.587248741756253e-05,
1898
+ "loss": 0.7875,
1899
+ "step": 2700
1900
+ },
1901
+ {
1902
+ "epoch": 0.542,
1903
+ "grad_norm": 0.0,
1904
+ "learning_rate": 2.5698040968089225e-05,
1905
+ "loss": 1.4801,
1906
+ "step": 2710
1907
+ },
1908
+ {
1909
+ "epoch": 0.544,
1910
+ "grad_norm": 6.459538459777832,
1911
+ "learning_rate": 2.5523560497083926e-05,
1912
+ "loss": 0.9097,
1913
+ "step": 2720
1914
+ },
1915
+ {
1916
+ "epoch": 0.546,
1917
+ "grad_norm": 1.4535608291625977,
1918
+ "learning_rate": 2.5349054508478637e-05,
1919
+ "loss": 0.5698,
1920
+ "step": 2730
1921
+ },
1922
+ {
1923
+ "epoch": 0.548,
1924
+ "grad_norm": 1.2716917991638184,
1925
+ "learning_rate": 2.517453150744904e-05,
1926
+ "loss": 0.4508,
1927
+ "step": 2740
1928
+ },
1929
+ {
1930
+ "epoch": 0.55,
1931
+ "grad_norm": 0.0,
1932
+ "learning_rate": 2.5e-05,
1933
+ "loss": 0.4544,
1934
+ "step": 2750
1935
+ },
1936
+ {
1937
+ "epoch": 0.552,
1938
+ "grad_norm": 2.410973310470581,
1939
+ "learning_rate": 2.4825468492550964e-05,
1940
+ "loss": 1.0367,
1941
+ "step": 2760
1942
+ },
1943
+ {
1944
+ "epoch": 0.554,
1945
+ "grad_norm": 171.79568481445312,
1946
+ "learning_rate": 2.4650945491521372e-05,
1947
+ "loss": 5.9188,
1948
+ "step": 2770
1949
+ },
1950
+ {
1951
+ "epoch": 0.556,
1952
+ "grad_norm": 0.4065113365650177,
1953
+ "learning_rate": 2.447643950291608e-05,
1954
+ "loss": 0.784,
1955
+ "step": 2780
1956
+ },
1957
+ {
1958
+ "epoch": 0.558,
1959
+ "grad_norm": 8.022871017456055,
1960
+ "learning_rate": 2.4301959031910784e-05,
1961
+ "loss": 1.3575,
1962
+ "step": 2790
1963
+ },
1964
+ {
1965
+ "epoch": 0.56,
1966
+ "grad_norm": 0.0,
1967
+ "learning_rate": 2.4127512582437485e-05,
1968
+ "loss": 1.2835,
1969
+ "step": 2800
1970
+ },
1971
+ {
1972
+ "epoch": 0.562,
1973
+ "grad_norm": 27.728992462158203,
1974
+ "learning_rate": 2.3953108656770016e-05,
1975
+ "loss": 2.173,
1976
+ "step": 2810
1977
+ },
1978
+ {
1979
+ "epoch": 0.564,
1980
+ "grad_norm": 0.0,
1981
+ "learning_rate": 2.377875575510967e-05,
1982
+ "loss": 0.354,
1983
+ "step": 2820
1984
+ },
1985
+ {
1986
+ "epoch": 0.566,
1987
+ "grad_norm": 0.1998305767774582,
1988
+ "learning_rate": 2.3604462375170906e-05,
1989
+ "loss": 0.8905,
1990
+ "step": 2830
1991
+ },
1992
+ {
1993
+ "epoch": 0.568,
1994
+ "grad_norm": 2.29780650138855,
1995
+ "learning_rate": 2.3430237011767167e-05,
1996
+ "loss": 0.9289,
1997
+ "step": 2840
1998
+ },
1999
+ {
2000
+ "epoch": 0.57,
2001
+ "grad_norm": 0.0,
2002
+ "learning_rate": 2.3256088156396868e-05,
2003
+ "loss": 0.512,
2004
+ "step": 2850
2005
+ },
2006
+ {
2007
+ "epoch": 0.572,
2008
+ "grad_norm": 0.30271175503730774,
2009
+ "learning_rate": 2.3082024296829536e-05,
2010
+ "loss": 0.6724,
2011
+ "step": 2860
2012
+ },
2013
+ {
2014
+ "epoch": 0.574,
2015
+ "grad_norm": 16.384737014770508,
2016
+ "learning_rate": 2.2908053916692117e-05,
2017
+ "loss": 0.7876,
2018
+ "step": 2870
2019
+ },
2020
+ {
2021
+ "epoch": 0.576,
2022
+ "grad_norm": 0.38019096851348877,
2023
+ "learning_rate": 2.2734185495055503e-05,
2024
+ "loss": 1.3706,
2025
+ "step": 2880
2026
+ },
2027
+ {
2028
+ "epoch": 0.578,
2029
+ "grad_norm": 0.6822488903999329,
2030
+ "learning_rate": 2.2560427506021266e-05,
2031
+ "loss": 1.0091,
2032
+ "step": 2890
2033
+ },
2034
+ {
2035
+ "epoch": 0.58,
2036
+ "grad_norm": 6.04833984375,
2037
+ "learning_rate": 2.238678841830867e-05,
2038
+ "loss": 0.4665,
2039
+ "step": 2900
2040
+ },
2041
+ {
2042
+ "epoch": 0.582,
2043
+ "grad_norm": 24.140220642089844,
2044
+ "learning_rate": 2.2213276694841866e-05,
2045
+ "loss": 1.412,
2046
+ "step": 2910
2047
+ },
2048
+ {
2049
+ "epoch": 0.584,
2050
+ "grad_norm": 0.0,
2051
+ "learning_rate": 2.2039900792337474e-05,
2052
+ "loss": 1.3582,
2053
+ "step": 2920
2054
+ },
2055
+ {
2056
+ "epoch": 0.586,
2057
+ "grad_norm": 29.037202835083008,
2058
+ "learning_rate": 2.186666916089239e-05,
2059
+ "loss": 0.3297,
2060
+ "step": 2930
2061
+ },
2062
+ {
2063
+ "epoch": 0.588,
2064
+ "grad_norm": 3.5834364891052246,
2065
+ "learning_rate": 2.1693590243571938e-05,
2066
+ "loss": 0.5486,
2067
+ "step": 2940
2068
+ },
2069
+ {
2070
+ "epoch": 0.59,
2071
+ "grad_norm": 0.7133229970932007,
2072
+ "learning_rate": 2.1520672475998373e-05,
2073
+ "loss": 0.341,
2074
+ "step": 2950
2075
+ },
2076
+ {
2077
+ "epoch": 0.592,
2078
+ "grad_norm": 22.81307601928711,
2079
+ "learning_rate": 2.1347924285939714e-05,
2080
+ "loss": 1.8438,
2081
+ "step": 2960
2082
+ },
2083
+ {
2084
+ "epoch": 0.594,
2085
+ "grad_norm": 8.678629875183105,
2086
+ "learning_rate": 2.117535409289905e-05,
2087
+ "loss": 1.0661,
2088
+ "step": 2970
2089
+ },
2090
+ {
2091
+ "epoch": 0.596,
2092
+ "grad_norm": 0.36412814259529114,
2093
+ "learning_rate": 2.1002970307704132e-05,
2094
+ "loss": 0.5445,
2095
+ "step": 2980
2096
+ },
2097
+ {
2098
+ "epoch": 0.598,
2099
+ "grad_norm": 1.426496148109436,
2100
+ "learning_rate": 2.0830781332097446e-05,
2101
+ "loss": 0.735,
2102
+ "step": 2990
2103
+ },
2104
+ {
2105
+ "epoch": 0.6,
2106
+ "grad_norm": 1.3443537950515747,
2107
+ "learning_rate": 2.0658795558326743e-05,
2108
+ "loss": 1.2389,
2109
+ "step": 3000
2110
+ },
2111
+ {
2112
+ "epoch": 0.602,
2113
+ "grad_norm": 0.4442681670188904,
2114
+ "learning_rate": 2.0487021368736003e-05,
2115
+ "loss": 0.6412,
2116
+ "step": 3010
2117
+ },
2118
+ {
2119
+ "epoch": 0.604,
2120
+ "grad_norm": 9.767306327819824,
2121
+ "learning_rate": 2.031546713535688e-05,
2122
+ "loss": 0.6114,
2123
+ "step": 3020
2124
+ },
2125
+ {
2126
+ "epoch": 0.606,
2127
+ "grad_norm": 0.5233049392700195,
2128
+ "learning_rate": 2.0144141219500705e-05,
2129
+ "loss": 0.846,
2130
+ "step": 3030
2131
+ },
2132
+ {
2133
+ "epoch": 0.608,
2134
+ "grad_norm": 7.53090763092041,
2135
+ "learning_rate": 1.9973051971350888e-05,
2136
+ "loss": 1.1015,
2137
+ "step": 3040
2138
+ },
2139
+ {
2140
+ "epoch": 0.61,
2141
+ "grad_norm": 0.44011208415031433,
2142
+ "learning_rate": 1.980220772955602e-05,
2143
+ "loss": 0.2744,
2144
+ "step": 3050
2145
+ },
2146
+ {
2147
+ "epoch": 0.612,
2148
+ "grad_norm": 0.6516274213790894,
2149
+ "learning_rate": 1.963161682082342e-05,
2150
+ "loss": 1.048,
2151
+ "step": 3060
2152
+ },
2153
+ {
2154
+ "epoch": 0.614,
2155
+ "grad_norm": 14.277541160583496,
2156
+ "learning_rate": 1.946128755951332e-05,
2157
+ "loss": 1.5106,
2158
+ "step": 3070
2159
+ },
2160
+ {
2161
+ "epoch": 0.616,
2162
+ "grad_norm": 1.5146019458770752,
2163
+ "learning_rate": 1.9291228247233605e-05,
2164
+ "loss": 1.3313,
2165
+ "step": 3080
2166
+ },
2167
+ {
2168
+ "epoch": 0.618,
2169
+ "grad_norm": 6.09342098236084,
2170
+ "learning_rate": 1.912144717243525e-05,
2171
+ "loss": 1.1983,
2172
+ "step": 3090
2173
+ },
2174
+ {
2175
+ "epoch": 0.62,
2176
+ "grad_norm": 0.4981762170791626,
2177
+ "learning_rate": 1.895195261000831e-05,
2178
+ "loss": 0.8728,
2179
+ "step": 3100
2180
+ },
2181
+ {
2182
+ "epoch": 0.622,
2183
+ "grad_norm": 0.0,
2184
+ "learning_rate": 1.8782752820878634e-05,
2185
+ "loss": 0.6307,
2186
+ "step": 3110
2187
+ },
2188
+ {
2189
+ "epoch": 0.624,
2190
+ "grad_norm": 3.658278226852417,
2191
+ "learning_rate": 1.8613856051605243e-05,
2192
+ "loss": 0.7477,
2193
+ "step": 3120
2194
+ },
2195
+ {
2196
+ "epoch": 0.626,
2197
+ "grad_norm": 2.42810320854187,
2198
+ "learning_rate": 1.8445270533978388e-05,
2199
+ "loss": 1.0535,
2200
+ "step": 3130
2201
+ },
2202
+ {
2203
+ "epoch": 0.628,
2204
+ "grad_norm": 1.3915554285049438,
2205
+ "learning_rate": 1.827700448461836e-05,
2206
+ "loss": 1.0675,
2207
+ "step": 3140
2208
+ },
2209
+ {
2210
+ "epoch": 0.63,
2211
+ "grad_norm": 0.0,
2212
+ "learning_rate": 1.8109066104575023e-05,
2213
+ "loss": 1.6361,
2214
+ "step": 3150
2215
+ },
2216
+ {
2217
+ "epoch": 0.632,
2218
+ "grad_norm": 5.701220989227295,
2219
+ "learning_rate": 1.7941463578928086e-05,
2220
+ "loss": 1.1624,
2221
+ "step": 3160
2222
+ },
2223
+ {
2224
+ "epoch": 0.634,
2225
+ "grad_norm": 1.6050679683685303,
2226
+ "learning_rate": 1.7774205076388206e-05,
2227
+ "loss": 0.8855,
2228
+ "step": 3170
2229
+ },
2230
+ {
2231
+ "epoch": 0.636,
2232
+ "grad_norm": 6.965709209442139,
2233
+ "learning_rate": 1.7607298748898842e-05,
2234
+ "loss": 1.0965,
2235
+ "step": 3180
2236
+ },
2237
+ {
2238
+ "epoch": 0.638,
2239
+ "grad_norm": 4.4240570068359375,
2240
+ "learning_rate": 1.744075273123889e-05,
2241
+ "loss": 0.3191,
2242
+ "step": 3190
2243
+ },
2244
+ {
2245
+ "epoch": 0.64,
2246
+ "grad_norm": 0.0,
2247
+ "learning_rate": 1.7274575140626318e-05,
2248
+ "loss": 0.6535,
2249
+ "step": 3200
2250
+ },
2251
+ {
2252
+ "epoch": 0.642,
2253
+ "grad_norm": 0.0,
2254
+ "learning_rate": 1.7108774076322443e-05,
2255
+ "loss": 0.4069,
2256
+ "step": 3210
2257
+ },
2258
+ {
2259
+ "epoch": 0.644,
2260
+ "grad_norm": 0.47515714168548584,
2261
+ "learning_rate": 1.6943357619237226e-05,
2262
+ "loss": 0.5898,
2263
+ "step": 3220
2264
+ },
2265
+ {
2266
+ "epoch": 0.646,
2267
+ "grad_norm": 0.7474708557128906,
2268
+ "learning_rate": 1.677833383153542e-05,
2269
+ "loss": 0.3429,
2270
+ "step": 3230
2271
+ },
2272
+ {
2273
+ "epoch": 0.648,
2274
+ "grad_norm": 1.0888911485671997,
2275
+ "learning_rate": 1.6613710756243626e-05,
2276
+ "loss": 0.6403,
2277
+ "step": 3240
2278
+ },
2279
+ {
2280
+ "epoch": 0.65,
2281
+ "grad_norm": 0.7075658440589905,
2282
+ "learning_rate": 1.6449496416858284e-05,
2283
+ "loss": 1.1898,
2284
+ "step": 3250
2285
+ },
2286
+ {
2287
+ "epoch": 0.652,
2288
+ "grad_norm": 5.701323509216309,
2289
+ "learning_rate": 1.6285698816954624e-05,
2290
+ "loss": 0.4049,
2291
+ "step": 3260
2292
+ },
2293
+ {
2294
+ "epoch": 0.654,
2295
+ "grad_norm": 14.747136116027832,
2296
+ "learning_rate": 1.612232593979658e-05,
2297
+ "loss": 0.6081,
2298
+ "step": 3270
2299
+ },
2300
+ {
2301
+ "epoch": 0.656,
2302
+ "grad_norm": 0.0,
2303
+ "learning_rate": 1.5959385747947698e-05,
2304
+ "loss": 0.8107,
2305
+ "step": 3280
2306
+ },
2307
+ {
2308
+ "epoch": 0.658,
2309
+ "grad_norm": 0.0,
2310
+ "learning_rate": 1.5796886182883053e-05,
2311
+ "loss": 0.6606,
2312
+ "step": 3290
2313
+ },
2314
+ {
2315
+ "epoch": 0.66,
2316
+ "grad_norm": 0.8380840420722961,
2317
+ "learning_rate": 1.56348351646022e-05,
2318
+ "loss": 0.4767,
2319
+ "step": 3300
2320
+ },
2321
+ {
2322
+ "epoch": 0.662,
2323
+ "grad_norm": 0.0,
2324
+ "learning_rate": 1.547324059124315e-05,
2325
+ "loss": 0.988,
2326
+ "step": 3310
2327
+ },
2328
+ {
2329
+ "epoch": 0.664,
2330
+ "grad_norm": 0.0,
2331
+ "learning_rate": 1.5312110338697426e-05,
2332
+ "loss": 1.3237,
2333
+ "step": 3320
2334
+ },
2335
+ {
2336
+ "epoch": 0.666,
2337
+ "grad_norm": 3.07537841796875,
2338
+ "learning_rate": 1.5151452260226224e-05,
2339
+ "loss": 0.4173,
2340
+ "step": 3330
2341
+ },
2342
+ {
2343
+ "epoch": 0.668,
2344
+ "grad_norm": 2.070822238922119,
2345
+ "learning_rate": 1.4991274186077632e-05,
2346
+ "loss": 0.7819,
2347
+ "step": 3340
2348
+ },
2349
+ {
2350
+ "epoch": 0.67,
2351
+ "grad_norm": 0.3668426275253296,
2352
+ "learning_rate": 1.4831583923104999e-05,
2353
+ "loss": 0.334,
2354
+ "step": 3350
2355
+ },
2356
+ {
2357
+ "epoch": 0.672,
2358
+ "grad_norm": 1.416359543800354,
2359
+ "learning_rate": 1.467238925438646e-05,
2360
+ "loss": 0.8675,
2361
+ "step": 3360
2362
+ },
2363
+ {
2364
+ "epoch": 0.674,
2365
+ "grad_norm": 0.936389148235321,
2366
+ "learning_rate": 1.4513697938845572e-05,
2367
+ "loss": 0.6056,
2368
+ "step": 3370
2369
+ },
2370
+ {
2371
+ "epoch": 0.676,
2372
+ "grad_norm": 0.4283387362957001,
2373
+ "learning_rate": 1.4355517710873184e-05,
2374
+ "loss": 0.2135,
2375
+ "step": 3380
2376
+ },
2377
+ {
2378
+ "epoch": 0.678,
2379
+ "grad_norm": 9.048284530639648,
2380
+ "learning_rate": 1.4197856279950438e-05,
2381
+ "loss": 1.0051,
2382
+ "step": 3390
2383
+ },
2384
+ {
2385
+ "epoch": 0.68,
2386
+ "grad_norm": 4.453339099884033,
2387
+ "learning_rate": 1.4040721330273062e-05,
2388
+ "loss": 0.4415,
2389
+ "step": 3400
2390
+ },
2391
+ {
2392
+ "epoch": 0.682,
2393
+ "grad_norm": 0.0,
2394
+ "learning_rate": 1.388412052037682e-05,
2395
+ "loss": 0.625,
2396
+ "step": 3410
2397
+ },
2398
+ {
2399
+ "epoch": 0.684,
2400
+ "grad_norm": 2.6629364490509033,
2401
+ "learning_rate": 1.3728061482764238e-05,
2402
+ "loss": 1.0217,
2403
+ "step": 3420
2404
+ },
2405
+ {
2406
+ "epoch": 0.686,
2407
+ "grad_norm": 0.0,
2408
+ "learning_rate": 1.3572551823532654e-05,
2409
+ "loss": 1.101,
2410
+ "step": 3430
2411
+ },
2412
+ {
2413
+ "epoch": 0.688,
2414
+ "grad_norm": 3.9471435546875,
2415
+ "learning_rate": 1.3417599122003464e-05,
2416
+ "loss": 0.9139,
2417
+ "step": 3440
2418
+ },
2419
+ {
2420
+ "epoch": 0.69,
2421
+ "grad_norm": 0.805842936038971,
2422
+ "learning_rate": 1.3263210930352737e-05,
2423
+ "loss": 0.7426,
2424
+ "step": 3450
2425
+ },
2426
+ {
2427
+ "epoch": 0.692,
2428
+ "grad_norm": 0.8995339274406433,
2429
+ "learning_rate": 1.3109394773243117e-05,
2430
+ "loss": 1.0954,
2431
+ "step": 3460
2432
+ },
2433
+ {
2434
+ "epoch": 0.694,
2435
+ "grad_norm": 0.29068702459335327,
2436
+ "learning_rate": 1.2956158147457115e-05,
2437
+ "loss": 0.6788,
2438
+ "step": 3470
2439
+ },
2440
+ {
2441
+ "epoch": 0.696,
2442
+ "grad_norm": 0.0,
2443
+ "learning_rate": 1.280350852153168e-05,
2444
+ "loss": 0.8617,
2445
+ "step": 3480
2446
+ },
2447
+ {
2448
+ "epoch": 0.698,
2449
+ "grad_norm": 0.6072128415107727,
2450
+ "learning_rate": 1.2651453335394231e-05,
2451
+ "loss": 0.653,
2452
+ "step": 3490
2453
+ },
2454
+ {
2455
+ "epoch": 0.7,
2456
+ "grad_norm": 0.0,
2457
+ "learning_rate": 1.2500000000000006e-05,
2458
+ "loss": 0.8003,
2459
+ "step": 3500
2460
+ },
2461
+ {
2462
+ "epoch": 0.702,
2463
+ "grad_norm": 0.8060992360115051,
2464
+ "learning_rate": 1.234915589697091e-05,
2465
+ "loss": 0.5579,
2466
+ "step": 3510
2467
+ },
2468
+ {
2469
+ "epoch": 0.704,
2470
+ "grad_norm": 0.0,
2471
+ "learning_rate": 1.2198928378235716e-05,
2472
+ "loss": 1.5354,
2473
+ "step": 3520
2474
+ },
2475
+ {
2476
+ "epoch": 0.706,
2477
+ "grad_norm": 13.488487243652344,
2478
+ "learning_rate": 1.2049324765671749e-05,
2479
+ "loss": 1.6175,
2480
+ "step": 3530
2481
+ },
2482
+ {
2483
+ "epoch": 0.708,
2484
+ "grad_norm": 1.7564411163330078,
2485
+ "learning_rate": 1.1900352350748026e-05,
2486
+ "loss": 0.4771,
2487
+ "step": 3540
2488
+ },
2489
+ {
2490
+ "epoch": 0.71,
2491
+ "grad_norm": 0.0,
2492
+ "learning_rate": 1.175201839416988e-05,
2493
+ "loss": 0.779,
2494
+ "step": 3550
2495
+ },
2496
+ {
2497
+ "epoch": 0.712,
2498
+ "grad_norm": 25.30704689025879,
2499
+ "learning_rate": 1.1604330125525079e-05,
2500
+ "loss": 1.1478,
2501
+ "step": 3560
2502
+ },
2503
+ {
2504
+ "epoch": 0.714,
2505
+ "grad_norm": 0.45064303278923035,
2506
+ "learning_rate": 1.1457294742931507e-05,
2507
+ "loss": 0.7484,
2508
+ "step": 3570
2509
+ },
2510
+ {
2511
+ "epoch": 0.716,
2512
+ "grad_norm": 0.0,
2513
+ "learning_rate": 1.1310919412686247e-05,
2514
+ "loss": 1.0581,
2515
+ "step": 3580
2516
+ },
2517
+ {
2518
+ "epoch": 0.718,
2519
+ "grad_norm": 0.0,
2520
+ "learning_rate": 1.11652112689164e-05,
2521
+ "loss": 0.989,
2522
+ "step": 3590
2523
+ },
2524
+ {
2525
+ "epoch": 0.72,
2526
+ "grad_norm": 0.0,
2527
+ "learning_rate": 1.1020177413231334e-05,
2528
+ "loss": 1.5538,
2529
+ "step": 3600
2530
+ },
2531
+ {
2532
+ "epoch": 0.722,
2533
+ "grad_norm": 0.0,
2534
+ "learning_rate": 1.0875824914376553e-05,
2535
+ "loss": 0.9328,
2536
+ "step": 3610
2537
+ },
2538
+ {
2539
+ "epoch": 0.724,
2540
+ "grad_norm": 2.3531455993652344,
2541
+ "learning_rate": 1.0732160807889211e-05,
2542
+ "loss": 1.4623,
2543
+ "step": 3620
2544
+ },
2545
+ {
2546
+ "epoch": 0.726,
2547
+ "grad_norm": 0.0,
2548
+ "learning_rate": 1.058919209575517e-05,
2549
+ "loss": 1.0274,
2550
+ "step": 3630
2551
+ },
2552
+ {
2553
+ "epoch": 0.728,
2554
+ "grad_norm": 2.038518190383911,
2555
+ "learning_rate": 1.0446925746067768e-05,
2556
+ "loss": 0.5183,
2557
+ "step": 3640
2558
+ },
2559
+ {
2560
+ "epoch": 0.73,
2561
+ "grad_norm": 0.0,
2562
+ "learning_rate": 1.0305368692688174e-05,
2563
+ "loss": 0.8523,
2564
+ "step": 3650
2565
+ },
2566
+ {
2567
+ "epoch": 0.732,
2568
+ "grad_norm": 2.5623738765716553,
2569
+ "learning_rate": 1.0164527834907467e-05,
2570
+ "loss": 0.9334,
2571
+ "step": 3660
2572
+ },
2573
+ {
2574
+ "epoch": 0.734,
2575
+ "grad_norm": 0.0,
2576
+ "learning_rate": 1.0024410037110357e-05,
2577
+ "loss": 1.2435,
2578
+ "step": 3670
2579
+ },
2580
+ {
2581
+ "epoch": 0.736,
2582
+ "grad_norm": 0.3360690474510193,
2583
+ "learning_rate": 9.88502212844063e-06,
2584
+ "loss": 1.5611,
2585
+ "step": 3680
2586
+ },
2587
+ {
2588
+ "epoch": 0.738,
2589
+ "grad_norm": 3.1597137451171875,
2590
+ "learning_rate": 9.746370902468311e-06,
2591
+ "loss": 0.9649,
2592
+ "step": 3690
2593
+ },
2594
+ {
2595
+ "epoch": 0.74,
2596
+ "grad_norm": 0.0,
2597
+ "learning_rate": 9.608463116858542e-06,
2598
+ "loss": 0.6053,
2599
+ "step": 3700
2600
+ },
2601
+ {
2602
+ "epoch": 0.742,
2603
+ "grad_norm": 5.861716270446777,
2604
+ "learning_rate": 9.471305493042243e-06,
2605
+ "loss": 0.6377,
2606
+ "step": 3710
2607
+ },
2608
+ {
2609
+ "epoch": 0.744,
2610
+ "grad_norm": 1.0426483154296875,
2611
+ "learning_rate": 9.334904715888495e-06,
2612
+ "loss": 1.0698,
2613
+ "step": 3720
2614
+ },
2615
+ {
2616
+ "epoch": 0.746,
2617
+ "grad_norm": 3.7929494380950928,
2618
+ "learning_rate": 9.199267433378727e-06,
2619
+ "loss": 0.9858,
2620
+ "step": 3730
2621
+ },
2622
+ {
2623
+ "epoch": 0.748,
2624
+ "grad_norm": 0.8633703589439392,
2625
+ "learning_rate": 9.064400256282757e-06,
2626
+ "loss": 0.3602,
2627
+ "step": 3740
2628
+ },
2629
+ {
2630
+ "epoch": 0.75,
2631
+ "grad_norm": 2.713400363922119,
2632
+ "learning_rate": 8.930309757836517e-06,
2633
+ "loss": 0.6672,
2634
+ "step": 3750
2635
+ },
2636
+ {
2637
+ "epoch": 0.752,
2638
+ "grad_norm": 0.759688138961792,
2639
+ "learning_rate": 8.797002473421728e-06,
2640
+ "loss": 0.5627,
2641
+ "step": 3760
2642
+ },
2643
+ {
2644
+ "epoch": 0.754,
2645
+ "grad_norm": 1.423932671546936,
2646
+ "learning_rate": 8.664484900247363e-06,
2647
+ "loss": 1.0572,
2648
+ "step": 3770
2649
+ },
2650
+ {
2651
+ "epoch": 0.756,
2652
+ "grad_norm": 10.111916542053223,
2653
+ "learning_rate": 8.532763497032987e-06,
2654
+ "loss": 1.2507,
2655
+ "step": 3780
2656
+ },
2657
+ {
2658
+ "epoch": 0.758,
2659
+ "grad_norm": 0.9725751876831055,
2660
+ "learning_rate": 8.40184468369396e-06,
2661
+ "loss": 0.7735,
2662
+ "step": 3790
2663
+ },
2664
+ {
2665
+ "epoch": 0.76,
2666
+ "grad_norm": 16.589933395385742,
2667
+ "learning_rate": 8.271734841028553e-06,
2668
+ "loss": 1.298,
2669
+ "step": 3800
2670
+ },
2671
+ {
2672
+ "epoch": 0.762,
2673
+ "grad_norm": 1.6179834604263306,
2674
+ "learning_rate": 8.142440310406924e-06,
2675
+ "loss": 0.7805,
2676
+ "step": 3810
2677
+ },
2678
+ {
2679
+ "epoch": 0.764,
2680
+ "grad_norm": 0.0,
2681
+ "learning_rate": 8.013967393462094e-06,
2682
+ "loss": 0.7498,
2683
+ "step": 3820
2684
+ },
2685
+ {
2686
+ "epoch": 0.766,
2687
+ "grad_norm": 0.0,
2688
+ "learning_rate": 7.886322351782783e-06,
2689
+ "loss": 1.1696,
2690
+ "step": 3830
2691
+ },
2692
+ {
2693
+ "epoch": 0.768,
2694
+ "grad_norm": 0.0,
2695
+ "learning_rate": 7.759511406608255e-06,
2696
+ "loss": 0.6709,
2697
+ "step": 3840
2698
+ },
2699
+ {
2700
+ "epoch": 0.77,
2701
+ "grad_norm": 0.7475419044494629,
2702
+ "learning_rate": 7.633540738525066e-06,
2703
+ "loss": 0.9733,
2704
+ "step": 3850
2705
+ },
2706
+ {
2707
+ "epoch": 0.772,
2708
+ "grad_norm": 4.253176689147949,
2709
+ "learning_rate": 7.508416487165862e-06,
2710
+ "loss": 0.6711,
2711
+ "step": 3860
2712
+ },
2713
+ {
2714
+ "epoch": 0.774,
2715
+ "grad_norm": 9.327080726623535,
2716
+ "learning_rate": 7.384144750910133e-06,
2717
+ "loss": 0.7134,
2718
+ "step": 3870
2719
+ },
2720
+ {
2721
+ "epoch": 0.776,
2722
+ "grad_norm": 2.4807772636413574,
2723
+ "learning_rate": 7.260731586586983e-06,
2724
+ "loss": 1.0323,
2725
+ "step": 3880
2726
+ },
2727
+ {
2728
+ "epoch": 0.778,
2729
+ "grad_norm": 0.0,
2730
+ "learning_rate": 7.138183009179922e-06,
2731
+ "loss": 0.4097,
2732
+ "step": 3890
2733
+ },
2734
+ {
2735
+ "epoch": 0.78,
2736
+ "grad_norm": 1.1434834003448486,
2737
+ "learning_rate": 7.016504991533726e-06,
2738
+ "loss": 1.0325,
2739
+ "step": 3900
2740
+ },
2741
+ {
2742
+ "epoch": 0.782,
2743
+ "grad_norm": 0.7977542281150818,
2744
+ "learning_rate": 6.895703464063319e-06,
2745
+ "loss": 0.2871,
2746
+ "step": 3910
2747
+ },
2748
+ {
2749
+ "epoch": 0.784,
2750
+ "grad_norm": 3.6492695808410645,
2751
+ "learning_rate": 6.775784314464717e-06,
2752
+ "loss": 0.8634,
2753
+ "step": 3920
2754
+ },
2755
+ {
2756
+ "epoch": 0.786,
2757
+ "grad_norm": 0.0,
2758
+ "learning_rate": 6.656753387428089e-06,
2759
+ "loss": 1.8368,
2760
+ "step": 3930
2761
+ },
2762
+ {
2763
+ "epoch": 0.788,
2764
+ "grad_norm": 1.5954694747924805,
2765
+ "learning_rate": 6.538616484352902e-06,
2766
+ "loss": 0.5746,
2767
+ "step": 3940
2768
+ },
2769
+ {
2770
+ "epoch": 0.79,
2771
+ "grad_norm": 0.0,
2772
+ "learning_rate": 6.421379363065142e-06,
2773
+ "loss": 2.333,
2774
+ "step": 3950
2775
+ },
2776
+ {
2777
+ "epoch": 0.792,
2778
+ "grad_norm": 0.48572126030921936,
2779
+ "learning_rate": 6.305047737536707e-06,
2780
+ "loss": 0.393,
2781
+ "step": 3960
2782
+ },
2783
+ {
2784
+ "epoch": 0.794,
2785
+ "grad_norm": 0.7762076258659363,
2786
+ "learning_rate": 6.189627277606894e-06,
2787
+ "loss": 1.0324,
2788
+ "step": 3970
2789
+ },
2790
+ {
2791
+ "epoch": 0.796,
2792
+ "grad_norm": 0.4900152385234833,
2793
+ "learning_rate": 6.075123608706093e-06,
2794
+ "loss": 0.9407,
2795
+ "step": 3980
2796
+ },
2797
+ {
2798
+ "epoch": 0.798,
2799
+ "grad_norm": 4.45159912109375,
2800
+ "learning_rate": 5.961542311581586e-06,
2801
+ "loss": 1.2251,
2802
+ "step": 3990
2803
+ },
2804
+ {
2805
+ "epoch": 0.8,
2806
+ "grad_norm": 0.44430792331695557,
2807
+ "learning_rate": 5.848888922025553e-06,
2808
+ "loss": 0.635,
2809
+ "step": 4000
2810
+ },
2811
+ {
2812
+ "epoch": 0.802,
2813
+ "grad_norm": 0.338540256023407,
2814
+ "learning_rate": 5.737168930605272e-06,
2815
+ "loss": 0.3039,
2816
+ "step": 4010
2817
+ },
2818
+ {
2819
+ "epoch": 0.804,
2820
+ "grad_norm": 0.0,
2821
+ "learning_rate": 5.626387782395512e-06,
2822
+ "loss": 1.0606,
2823
+ "step": 4020
2824
+ },
2825
+ {
2826
+ "epoch": 0.806,
2827
+ "grad_norm": 47.97066879272461,
2828
+ "learning_rate": 5.5165508767131415e-06,
2829
+ "loss": 2.3978,
2830
+ "step": 4030
2831
+ },
2832
+ {
2833
+ "epoch": 0.808,
2834
+ "grad_norm": 17.087448120117188,
2835
+ "learning_rate": 5.4076635668540075e-06,
2836
+ "loss": 1.7771,
2837
+ "step": 4040
2838
+ },
2839
+ {
2840
+ "epoch": 0.81,
2841
+ "grad_norm": 0.0,
2842
+ "learning_rate": 5.299731159831953e-06,
2843
+ "loss": 1.5544,
2844
+ "step": 4050
2845
+ },
2846
+ {
2847
+ "epoch": 0.812,
2848
+ "grad_norm": 3.899841785430908,
2849
+ "learning_rate": 5.192758916120236e-06,
2850
+ "loss": 0.574,
2851
+ "step": 4060
2852
+ },
2853
+ {
2854
+ "epoch": 0.814,
2855
+ "grad_norm": 0.0,
2856
+ "learning_rate": 5.086752049395094e-06,
2857
+ "loss": 0.9646,
2858
+ "step": 4070
2859
+ },
2860
+ {
2861
+ "epoch": 0.816,
2862
+ "grad_norm": 0.9200013279914856,
2863
+ "learning_rate": 4.981715726281666e-06,
2864
+ "loss": 0.422,
2865
+ "step": 4080
2866
+ },
2867
+ {
2868
+ "epoch": 0.818,
2869
+ "grad_norm": 10.240484237670898,
2870
+ "learning_rate": 4.877655066102149e-06,
2871
+ "loss": 0.6738,
2872
+ "step": 4090
2873
+ },
2874
+ {
2875
+ "epoch": 0.82,
2876
+ "grad_norm": 0.5230764746665955,
2877
+ "learning_rate": 4.7745751406263165e-06,
2878
+ "loss": 0.1991,
2879
+ "step": 4100
2880
+ },
2881
+ {
2882
+ "epoch": 0.822,
2883
+ "grad_norm": 9.851215362548828,
2884
+ "learning_rate": 4.672480973824311e-06,
2885
+ "loss": 0.7171,
2886
+ "step": 4110
2887
+ },
2888
+ {
2889
+ "epoch": 0.824,
2890
+ "grad_norm": 3.019394874572754,
2891
+ "learning_rate": 4.571377541621788e-06,
2892
+ "loss": 0.7478,
2893
+ "step": 4120
2894
+ },
2895
+ {
2896
+ "epoch": 0.826,
2897
+ "grad_norm": 0.4723840355873108,
2898
+ "learning_rate": 4.4712697716574e-06,
2899
+ "loss": 0.6985,
2900
+ "step": 4130
2901
+ },
2902
+ {
2903
+ "epoch": 0.828,
2904
+ "grad_norm": 1.5289415121078491,
2905
+ "learning_rate": 4.372162543042624e-06,
2906
+ "loss": 0.7893,
2907
+ "step": 4140
2908
+ },
2909
+ {
2910
+ "epoch": 0.83,
2911
+ "grad_norm": 1.3816901445388794,
2912
+ "learning_rate": 4.274060686123959e-06,
2913
+ "loss": 0.8127,
2914
+ "step": 4150
2915
+ },
2916
+ {
2917
+ "epoch": 0.832,
2918
+ "grad_norm": 0.4731523394584656,
2919
+ "learning_rate": 4.176968982247514e-06,
2920
+ "loss": 0.5259,
2921
+ "step": 4160
2922
+ },
2923
+ {
2924
+ "epoch": 0.834,
2925
+ "grad_norm": 19.804990768432617,
2926
+ "learning_rate": 4.08089216352596e-06,
2927
+ "loss": 0.9835,
2928
+ "step": 4170
2929
+ },
2930
+ {
2931
+ "epoch": 0.836,
2932
+ "grad_norm": 14.022736549377441,
2933
+ "learning_rate": 3.985834912607894e-06,
2934
+ "loss": 0.6508,
2935
+ "step": 4180
2936
+ },
2937
+ {
2938
+ "epoch": 0.838,
2939
+ "grad_norm": 5.0765485763549805,
2940
+ "learning_rate": 3.891801862449629e-06,
2941
+ "loss": 0.437,
2942
+ "step": 4190
2943
+ },
2944
+ {
2945
+ "epoch": 0.84,
2946
+ "grad_norm": 13.543389320373535,
2947
+ "learning_rate": 3.798797596089351e-06,
2948
+ "loss": 1.6048,
2949
+ "step": 4200
2950
+ },
2951
+ {
2952
+ "epoch": 0.842,
2953
+ "grad_norm": 3.4361746311187744,
2954
+ "learning_rate": 3.7068266464238084e-06,
2955
+ "loss": 0.8315,
2956
+ "step": 4210
2957
+ },
2958
+ {
2959
+ "epoch": 0.844,
2960
+ "grad_norm": 0.3679335415363312,
2961
+ "learning_rate": 3.6158934959873353e-06,
2962
+ "loss": 0.3853,
2963
+ "step": 4220
2964
+ },
2965
+ {
2966
+ "epoch": 0.846,
2967
+ "grad_norm": 0.0,
2968
+ "learning_rate": 3.5260025767333893e-06,
2969
+ "loss": 1.3448,
2970
+ "step": 4230
2971
+ },
2972
+ {
2973
+ "epoch": 0.848,
2974
+ "grad_norm": 4.070678234100342,
2975
+ "learning_rate": 3.4371582698185633e-06,
2976
+ "loss": 0.2405,
2977
+ "step": 4240
2978
+ },
2979
+ {
2980
+ "epoch": 0.85,
2981
+ "grad_norm": 0.0,
2982
+ "learning_rate": 3.3493649053890326e-06,
2983
+ "loss": 0.8112,
2984
+ "step": 4250
2985
+ },
2986
+ {
2987
+ "epoch": 0.852,
2988
+ "grad_norm": 3.341031551361084,
2989
+ "learning_rate": 3.262626762369525e-06,
2990
+ "loss": 1.3852,
2991
+ "step": 4260
2992
+ },
2993
+ {
2994
+ "epoch": 0.854,
2995
+ "grad_norm": 0.310798317193985,
2996
+ "learning_rate": 3.176948068254762e-06,
2997
+ "loss": 0.3188,
2998
+ "step": 4270
2999
+ },
3000
+ {
3001
+ "epoch": 0.856,
3002
+ "grad_norm": 6.217565536499023,
3003
+ "learning_rate": 3.092332998903416e-06,
3004
+ "loss": 0.4712,
3005
+ "step": 4280
3006
+ },
3007
+ {
3008
+ "epoch": 0.858,
3009
+ "grad_norm": 0.0,
3010
+ "learning_rate": 3.0087856783345914e-06,
3011
+ "loss": 0.4124,
3012
+ "step": 4290
3013
+ },
3014
+ {
3015
+ "epoch": 0.86,
3016
+ "grad_norm": 1.9965441226959229,
3017
+ "learning_rate": 2.9263101785268254e-06,
3018
+ "loss": 1.4114,
3019
+ "step": 4300
3020
+ },
3021
+ {
3022
+ "epoch": 0.862,
3023
+ "grad_norm": 28.95586395263672,
3024
+ "learning_rate": 2.8449105192196316e-06,
3025
+ "loss": 1.5525,
3026
+ "step": 4310
3027
+ },
3028
+ {
3029
+ "epoch": 0.864,
3030
+ "grad_norm": 0.0,
3031
+ "learning_rate": 2.764590667717562e-06,
3032
+ "loss": 1.1899,
3033
+ "step": 4320
3034
+ },
3035
+ {
3036
+ "epoch": 0.866,
3037
+ "grad_norm": 3.072131872177124,
3038
+ "learning_rate": 2.6853545386968606e-06,
3039
+ "loss": 0.5857,
3040
+ "step": 4330
3041
+ },
3042
+ {
3043
+ "epoch": 0.868,
3044
+ "grad_norm": 0.0,
3045
+ "learning_rate": 2.6072059940146775e-06,
3046
+ "loss": 1.6246,
3047
+ "step": 4340
3048
+ },
3049
+ {
3050
+ "epoch": 0.87,
3051
+ "grad_norm": 0.0,
3052
+ "learning_rate": 2.5301488425208296e-06,
3053
+ "loss": 0.2832,
3054
+ "step": 4350
3055
+ },
3056
+ {
3057
+ "epoch": 0.872,
3058
+ "grad_norm": 1.8508862257003784,
3059
+ "learning_rate": 2.454186839872158e-06,
3060
+ "loss": 1.5246,
3061
+ "step": 4360
3062
+ },
3063
+ {
3064
+ "epoch": 0.874,
3065
+ "grad_norm": 2.0368053913116455,
3066
+ "learning_rate": 2.379323688349516e-06,
3067
+ "loss": 1.5678,
3068
+ "step": 4370
3069
+ },
3070
+ {
3071
+ "epoch": 0.876,
3072
+ "grad_norm": 11.109952926635742,
3073
+ "learning_rate": 2.3055630366772856e-06,
3074
+ "loss": 1.3038,
3075
+ "step": 4380
3076
+ },
3077
+ {
3078
+ "epoch": 0.878,
3079
+ "grad_norm": 0.0,
3080
+ "learning_rate": 2.2329084798455746e-06,
3081
+ "loss": 0.7802,
3082
+ "step": 4390
3083
+ },
3084
+ {
3085
+ "epoch": 0.88,
3086
+ "grad_norm": 5.772270202636719,
3087
+ "learning_rate": 2.1613635589349756e-06,
3088
+ "loss": 1.1187,
3089
+ "step": 4400
3090
+ },
3091
+ {
3092
+ "epoch": 0.882,
3093
+ "grad_norm": 0.8582921028137207,
3094
+ "learning_rate": 2.0909317609440095e-06,
3095
+ "loss": 1.7619,
3096
+ "step": 4410
3097
+ },
3098
+ {
3099
+ "epoch": 0.884,
3100
+ "grad_norm": 5.193258285522461,
3101
+ "learning_rate": 2.0216165186191407e-06,
3102
+ "loss": 1.2435,
3103
+ "step": 4420
3104
+ },
3105
+ {
3106
+ "epoch": 0.886,
3107
+ "grad_norm": 5.076511383056641,
3108
+ "learning_rate": 1.95342121028749e-06,
3109
+ "loss": 1.7134,
3110
+ "step": 4430
3111
+ },
3112
+ {
3113
+ "epoch": 0.888,
3114
+ "grad_norm": 0.6807665228843689,
3115
+ "learning_rate": 1.8863491596921745e-06,
3116
+ "loss": 0.8158,
3117
+ "step": 4440
3118
+ },
3119
+ {
3120
+ "epoch": 0.89,
3121
+ "grad_norm": 4.553281784057617,
3122
+ "learning_rate": 1.8204036358303173e-06,
3123
+ "loss": 0.7814,
3124
+ "step": 4450
3125
+ },
3126
+ {
3127
+ "epoch": 0.892,
3128
+ "grad_norm": 8.79820728302002,
3129
+ "learning_rate": 1.7555878527937164e-06,
3130
+ "loss": 0.6062,
3131
+ "step": 4460
3132
+ },
3133
+ {
3134
+ "epoch": 0.894,
3135
+ "grad_norm": 0.0,
3136
+ "learning_rate": 1.6919049696121958e-06,
3137
+ "loss": 1.6605,
3138
+ "step": 4470
3139
+ },
3140
+ {
3141
+ "epoch": 0.896,
3142
+ "grad_norm": 1.9376107454299927,
3143
+ "learning_rate": 1.629358090099639e-06,
3144
+ "loss": 0.6406,
3145
+ "step": 4480
3146
+ },
3147
+ {
3148
+ "epoch": 0.898,
3149
+ "grad_norm": 0.6032451391220093,
3150
+ "learning_rate": 1.5679502627027136e-06,
3151
+ "loss": 0.8206,
3152
+ "step": 4490
3153
+ },
3154
+ {
3155
+ "epoch": 0.9,
3156
+ "grad_norm": 6.2024078369140625,
3157
+ "learning_rate": 1.5076844803522922e-06,
3158
+ "loss": 0.3695,
3159
+ "step": 4500
3160
+ },
3161
+ {
3162
+ "epoch": 0.902,
3163
+ "grad_norm": 1.7813353538513184,
3164
+ "learning_rate": 1.4485636803175829e-06,
3165
+ "loss": 1.0353,
3166
+ "step": 4510
3167
+ },
3168
+ {
3169
+ "epoch": 0.904,
3170
+ "grad_norm": 0.858911395072937,
3171
+ "learning_rate": 1.3905907440629752e-06,
3172
+ "loss": 0.8934,
3173
+ "step": 4520
3174
+ },
3175
+ {
3176
+ "epoch": 0.906,
3177
+ "grad_norm": 2.2456307411193848,
3178
+ "learning_rate": 1.333768497107593e-06,
3179
+ "loss": 0.5448,
3180
+ "step": 4530
3181
+ },
3182
+ {
3183
+ "epoch": 0.908,
3184
+ "grad_norm": 6.542331695556641,
3185
+ "learning_rate": 1.2780997088875869e-06,
3186
+ "loss": 0.6171,
3187
+ "step": 4540
3188
+ },
3189
+ {
3190
+ "epoch": 0.91,
3191
+ "grad_norm": 0.3684110641479492,
3192
+ "learning_rate": 1.2235870926211619e-06,
3193
+ "loss": 0.8105,
3194
+ "step": 4550
3195
+ },
3196
+ {
3197
+ "epoch": 0.912,
3198
+ "grad_norm": 1.4687561988830566,
3199
+ "learning_rate": 1.170233305176327e-06,
3200
+ "loss": 0.6286,
3201
+ "step": 4560
3202
+ },
3203
+ {
3204
+ "epoch": 0.914,
3205
+ "grad_norm": 4.056073188781738,
3206
+ "learning_rate": 1.1180409469414094e-06,
3207
+ "loss": 1.37,
3208
+ "step": 4570
3209
+ },
3210
+ {
3211
+ "epoch": 0.916,
3212
+ "grad_norm": 0.0,
3213
+ "learning_rate": 1.067012561698319e-06,
3214
+ "loss": 0.6792,
3215
+ "step": 4580
3216
+ },
3217
+ {
3218
+ "epoch": 0.918,
3219
+ "grad_norm": 10.166934967041016,
3220
+ "learning_rate": 1.0171506364985622e-06,
3221
+ "loss": 3.2842,
3222
+ "step": 4590
3223
+ },
3224
+ {
3225
+ "epoch": 0.92,
3226
+ "grad_norm": 2.7631049156188965,
3227
+ "learning_rate": 9.684576015420278e-07,
3228
+ "loss": 0.7875,
3229
+ "step": 4600
3230
+ },
3231
+ {
3232
+ "epoch": 0.922,
3233
+ "grad_norm": 1.1183360815048218,
3234
+ "learning_rate": 9.209358300585474e-07,
3235
+ "loss": 0.7768,
3236
+ "step": 4610
3237
+ },
3238
+ {
3239
+ "epoch": 0.924,
3240
+ "grad_norm": 0.0,
3241
+ "learning_rate": 8.745876381922147e-07,
3242
+ "loss": 0.9518,
3243
+ "step": 4620
3244
+ },
3245
+ {
3246
+ "epoch": 0.926,
3247
+ "grad_norm": 0.7455951571464539,
3248
+ "learning_rate": 8.294152848885157e-07,
3249
+ "loss": 0.4208,
3250
+ "step": 4630
3251
+ },
3252
+ {
3253
+ "epoch": 0.928,
3254
+ "grad_norm": 0.0,
3255
+ "learning_rate": 7.854209717842231e-07,
3256
+ "loss": 0.2602,
3257
+ "step": 4640
3258
+ },
3259
+ {
3260
+ "epoch": 0.93,
3261
+ "grad_norm": 0.605595052242279,
3262
+ "learning_rate": 7.426068431000882e-07,
3263
+ "loss": 0.8418,
3264
+ "step": 4650
3265
+ },
3266
+ {
3267
+ "epoch": 0.932,
3268
+ "grad_norm": 7.5212554931640625,
3269
+ "learning_rate": 7.009749855363456e-07,
3270
+ "loss": 0.4949,
3271
+ "step": 4660
3272
+ },
3273
+ {
3274
+ "epoch": 0.934,
3275
+ "grad_norm": 0.0,
3276
+ "learning_rate": 6.605274281709928e-07,
3277
+ "loss": 0.5919,
3278
+ "step": 4670
3279
+ },
3280
+ {
3281
+ "epoch": 0.936,
3282
+ "grad_norm": 0.0,
3283
+ "learning_rate": 6.212661423609184e-07,
3284
+ "loss": 0.9009,
3285
+ "step": 4680
3286
+ },
3287
+ {
3288
+ "epoch": 0.938,
3289
+ "grad_norm": 0.7921484708786011,
3290
+ "learning_rate": 5.83193041645802e-07,
3291
+ "loss": 0.4119,
3292
+ "step": 4690
3293
+ },
3294
+ {
3295
+ "epoch": 0.94,
3296
+ "grad_norm": 0.0,
3297
+ "learning_rate": 5.463099816548579e-07,
3298
+ "loss": 0.4794,
3299
+ "step": 4700
3300
+ },
3301
+ {
3302
+ "epoch": 0.942,
3303
+ "grad_norm": 1.862557053565979,
3304
+ "learning_rate": 5.106187600163987e-07,
3305
+ "loss": 0.8312,
3306
+ "step": 4710
3307
+ },
3308
+ {
3309
+ "epoch": 0.944,
3310
+ "grad_norm": 1.6036162376403809,
3311
+ "learning_rate": 4.7612111627021175e-07,
3312
+ "loss": 1.0177,
3313
+ "step": 4720
3314
+ },
3315
+ {
3316
+ "epoch": 0.946,
3317
+ "grad_norm": 0.8620813488960266,
3318
+ "learning_rate": 4.4281873178278475e-07,
3319
+ "loss": 0.7843,
3320
+ "step": 4730
3321
+ },
3322
+ {
3323
+ "epoch": 0.948,
3324
+ "grad_norm": 0.0,
3325
+ "learning_rate": 4.107132296653549e-07,
3326
+ "loss": 0.7314,
3327
+ "step": 4740
3328
+ },
3329
+ {
3330
+ "epoch": 0.95,
3331
+ "grad_norm": 2.933771848678589,
3332
+ "learning_rate": 3.7980617469479953e-07,
3333
+ "loss": 0.5168,
3334
+ "step": 4750
3335
+ },
3336
+ {
3337
+ "epoch": 0.952,
3338
+ "grad_norm": 0.0,
3339
+ "learning_rate": 3.5009907323737825e-07,
3340
+ "loss": 1.4444,
3341
+ "step": 4760
3342
+ },
3343
+ {
3344
+ "epoch": 0.954,
3345
+ "grad_norm": 19.85219955444336,
3346
+ "learning_rate": 3.215933731753024e-07,
3347
+ "loss": 1.1199,
3348
+ "step": 4770
3349
+ },
3350
+ {
3351
+ "epoch": 0.956,
3352
+ "grad_norm": 0.0,
3353
+ "learning_rate": 2.942904638361804e-07,
3354
+ "loss": 0.5519,
3355
+ "step": 4780
3356
+ },
3357
+ {
3358
+ "epoch": 0.958,
3359
+ "grad_norm": 1.450451135635376,
3360
+ "learning_rate": 2.681916759252917e-07,
3361
+ "loss": 0.9452,
3362
+ "step": 4790
3363
+ },
3364
+ {
3365
+ "epoch": 0.96,
3366
+ "grad_norm": 0.0,
3367
+ "learning_rate": 2.4329828146074095e-07,
3368
+ "loss": 3.2854,
3369
+ "step": 4800
3370
+ },
3371
+ {
3372
+ "epoch": 0.962,
3373
+ "grad_norm": 0.326992005109787,
3374
+ "learning_rate": 2.1961149371145795e-07,
3375
+ "loss": 0.5169,
3376
+ "step": 4810
3377
+ },
3378
+ {
3379
+ "epoch": 0.964,
3380
+ "grad_norm": 0.0,
3381
+ "learning_rate": 1.9713246713805588e-07,
3382
+ "loss": 0.8747,
3383
+ "step": 4820
3384
+ },
3385
+ {
3386
+ "epoch": 0.966,
3387
+ "grad_norm": 3.3700320720672607,
3388
+ "learning_rate": 1.7586229733657644e-07,
3389
+ "loss": 1.0667,
3390
+ "step": 4830
3391
+ },
3392
+ {
3393
+ "epoch": 0.968,
3394
+ "grad_norm": 9.731518745422363,
3395
+ "learning_rate": 1.5580202098509077e-07,
3396
+ "loss": 1.4877,
3397
+ "step": 4840
3398
+ },
3399
+ {
3400
+ "epoch": 0.97,
3401
+ "grad_norm": 0.9191303253173828,
3402
+ "learning_rate": 1.3695261579316777e-07,
3403
+ "loss": 0.9893,
3404
+ "step": 4850
3405
+ },
3406
+ {
3407
+ "epoch": 0.972,
3408
+ "grad_norm": 0.6930148005485535,
3409
+ "learning_rate": 1.193150004542204e-07,
3410
+ "loss": 0.9399,
3411
+ "step": 4860
3412
+ },
3413
+ {
3414
+ "epoch": 0.974,
3415
+ "grad_norm": 1.6349282264709473,
3416
+ "learning_rate": 1.0289003460074165e-07,
3417
+ "loss": 0.4823,
3418
+ "step": 4870
3419
+ },
3420
+ {
3421
+ "epoch": 0.976,
3422
+ "grad_norm": 0.8186390399932861,
3423
+ "learning_rate": 8.767851876239074e-08,
3424
+ "loss": 0.999,
3425
+ "step": 4880
3426
+ },
3427
+ {
3428
+ "epoch": 0.978,
3429
+ "grad_norm": 0.3777938485145569,
3430
+ "learning_rate": 7.368119432699383e-08,
3431
+ "loss": 0.6337,
3432
+ "step": 4890
3433
+ },
3434
+ {
3435
+ "epoch": 0.98,
3436
+ "grad_norm": 1.0768874883651733,
3437
+ "learning_rate": 6.089874350439506e-08,
3438
+ "loss": 0.5041,
3439
+ "step": 4900
3440
+ },
3441
+ {
3442
+ "epoch": 0.982,
3443
+ "grad_norm": 1.119691252708435,
3444
+ "learning_rate": 4.9331789293211026e-08,
3445
+ "loss": 0.9211,
3446
+ "step": 4910
3447
+ },
3448
+ {
3449
+ "epoch": 0.984,
3450
+ "grad_norm": 0.404674768447876,
3451
+ "learning_rate": 3.8980895450474455e-08,
3452
+ "loss": 0.5071,
3453
+ "step": 4920
3454
+ },
3455
+ {
3456
+ "epoch": 0.986,
3457
+ "grad_norm": 15.42659854888916,
3458
+ "learning_rate": 2.9846566464150626e-08,
3459
+ "loss": 0.7782,
3460
+ "step": 4930
3461
+ },
3462
+ {
3463
+ "epoch": 0.988,
3464
+ "grad_norm": 0.0,
3465
+ "learning_rate": 2.192924752854042e-08,
3466
+ "loss": 0.9379,
3467
+ "step": 4940
3468
+ },
3469
+ {
3470
+ "epoch": 0.99,
3471
+ "grad_norm": 0.2315896451473236,
3472
+ "learning_rate": 1.522932452260595e-08,
3473
+ "loss": 0.552,
3474
+ "step": 4950
3475
+ },
3476
+ {
3477
+ "epoch": 0.992,
3478
+ "grad_norm": 9.166641235351562,
3479
+ "learning_rate": 9.747123991141194e-09,
3480
+ "loss": 0.5151,
3481
+ "step": 4960
3482
+ },
3483
+ {
3484
+ "epoch": 0.994,
3485
+ "grad_norm": 2.209789752960205,
3486
+ "learning_rate": 5.48291312886251e-09,
3487
+ "loss": 0.7505,
3488
+ "step": 4970
3489
+ },
3490
+ {
3491
+ "epoch": 0.996,
3492
+ "grad_norm": 0.7653072476387024,
3493
+ "learning_rate": 2.4368997673940297e-09,
3494
+ "loss": 0.8377,
3495
+ "step": 4980
3496
+ },
3497
+ {
3498
+ "epoch": 0.998,
3499
+ "grad_norm": 1.3366658687591553,
3500
+ "learning_rate": 6.092323651313292e-10,
3501
+ "loss": 1.0276,
3502
+ "step": 4990
3503
+ },
3504
+ {
3505
+ "epoch": 1.0,
3506
+ "grad_norm": 0.0,
3507
+ "learning_rate": 0.0,
3508
+ "loss": 0.4942,
3509
+ "step": 5000
3510
+ },
3511
+ {
3512
+ "epoch": 1.0,
3513
+ "step": 5000,
3514
+ "total_flos": 3.424475897929728e+16,
3515
+ "train_loss": 1.0425229248046874,
3516
+ "train_runtime": 1241.3169,
3517
+ "train_samples_per_second": 4.028,
3518
+ "train_steps_per_second": 4.028
3519
+ }
3520
+ ],
3521
+ "logging_steps": 10,
3522
+ "max_steps": 5000,
3523
+ "num_input_tokens_seen": 0,
3524
+ "num_train_epochs": 1,
3525
+ "save_steps": 4000,
3526
+ "stateful_callbacks": {
3527
+ "TrainerControl": {
3528
+ "args": {
3529
+ "should_epoch_stop": false,
3530
+ "should_evaluate": false,
3531
+ "should_log": false,
3532
+ "should_save": true,
3533
+ "should_training_stop": true
3534
+ },
3535
+ "attributes": {}
3536
+ }
3537
+ },
3538
+ "total_flos": 3.424475897929728e+16,
3539
+ "train_batch_size": 1,
3540
+ "trial_name": null,
3541
+ "trial_params": null
3542
+ }
Llama-2-13b-chat-hf/DomainBench/Geography/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37b0537f42057321aef63580537318f00e41ce6b989422434a03bdf8f6e599f3
3
+ size 5432
Llama-2-13b-chat-hf/DomainBench/Geography/training_loss.png ADDED
Llama-2-13b-chat-hf/DomainBench/Medicine/README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: /hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: threshold_3-lamb_0.1-lr_5e-5
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # threshold_3-lamb_0.1-lr_5e-5
18
+
19
+ This model is a fine-tuned version of [/hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf](https://huggingface.co//hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf) on the gen_med_gpt dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-05
39
+ - train_batch_size: 1
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
43
+ - lr_scheduler_type: cosine
44
+ - lr_scheduler_warmup_ratio: 0.1
45
+ - num_epochs: 1.0
46
+
47
+ ### Training results
48
+
49
+
50
+
51
+ ### Framework versions
52
+
53
+ - PEFT 0.12.0
54
+ - Transformers 4.46.1
55
+ - Pytorch 2.5.1+cu124
56
+ - Datasets 3.1.0
57
+ - Tokenizers 0.20.3
Llama-2-13b-chat-hf/DomainBench/Medicine/adapter_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/hujinwu/LLM_Assemble/pretrain_model/Llama-2-13b-chat-hf",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "v_proj",
24
+ "q_proj"
25
+ ],
26
+ "task_type": "CAUSAL_LM",
27
+ "use_dora": false,
28
+ "use_rslora": false
29
+ }
Llama-2-13b-chat-hf/DomainBench/Medicine/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90dbf183e9007b63219a2bb1e84166f60b2129f15acf2dda914b4a80058873de
3
+ size 26235704
Llama-2-13b-chat-hf/DomainBench/Medicine/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 2.556923075960832e+16,
4
+ "train_loss": 0.03917525251507759,
5
+ "train_runtime": 757.0692,
6
+ "train_samples_per_second": 6.604,
7
+ "train_steps_per_second": 6.604
8
+ }
Llama-2-13b-chat-hf/DomainBench/Medicine/logfile.txt ADDED
The diff for this file is too large to render. See raw diff
 
Llama-2-13b-chat-hf/DomainBench/Medicine/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
Llama-2-13b-chat-hf/DomainBench/Medicine/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff