sunkim317 commited on
Commit
6f3cff7
·
1 Parent(s): 43493b8

Initial commit

Browse files
adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "models/HF/13B",
3
+ "bias": "none",
4
+ "enable_lora": null,
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "lora_alpha": 16,
9
+ "lora_dropout": 0.05,
10
+ "merge_weights": false,
11
+ "modules_to_save": null,
12
+ "peft_type": "LORA",
13
+ "r": 8,
14
+ "target_modules": [
15
+ "q_proj",
16
+ "v_proj"
17
+ ],
18
+ "task_type": "CAUSAL_LM"
19
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c38f7bcd70b9a3c17444f1986792c6c4b08fab8f742fe038b3193254f0571a98
3
+ size 26271757
checkpoint-6800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ebf2b174d5eca164b2d77471eb09eb6fa5b3ebd2b6ea5ce8fa9ae8e1d7c4272
3
+ size 52564997
checkpoint-6800/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a9119b27e400a0c131e06aa9288c4862ed7f867267b89b6026010a94082c36
3
+ size 26271757
checkpoint-6800/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd14cfdfcf52ce7a2011c09b24954f3a3fed44a81c2d316260bee0058db44d81
3
+ size 14583
checkpoint-6800/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c22e58698a8f013ada7b3ae20f600228bf2e93adf556bd1fe828642e46fd1434
3
+ size 14583
checkpoint-6800/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c728ca09663d7c80b05c36b62dd781693daeb288dc44e59ee59dd4a735e9859
3
+ size 557
checkpoint-6800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:495d435d9d4f076fa037def2c9abdbb0b90ef189d10bd190b532c3a27512d972
3
+ size 627
checkpoint-6800/trainer_state.json ADDED
@@ -0,0 +1,4368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7945918440818787,
3
+ "best_model_checkpoint": "train.out/checkpoint-6800",
4
+ "epoch": 1.8709588664190397,
5
+ "global_step": 6800,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 2.0000000000000003e-06,
13
+ "loss": 1.7549,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.01,
18
+ "learning_rate": 4.000000000000001e-06,
19
+ "loss": 1.7891,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.02,
24
+ "learning_rate": 6e-06,
25
+ "loss": 1.7738,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.02,
30
+ "learning_rate": 8.000000000000001e-06,
31
+ "loss": 1.767,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.03,
36
+ "learning_rate": 1e-05,
37
+ "loss": 1.764,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 0.04,
42
+ "learning_rate": 1.2e-05,
43
+ "loss": 1.7286,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 0.04,
48
+ "learning_rate": 1.4e-05,
49
+ "loss": 1.68,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 0.05,
54
+ "learning_rate": 1.6000000000000003e-05,
55
+ "loss": 1.5793,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 0.06,
60
+ "learning_rate": 1.8e-05,
61
+ "loss": 1.4831,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 0.06,
66
+ "learning_rate": 2e-05,
67
+ "loss": 1.3956,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.07,
72
+ "learning_rate": 1.998750780762024e-05,
73
+ "loss": 1.2915,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 0.07,
78
+ "learning_rate": 1.9975015615240476e-05,
79
+ "loss": 1.2065,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 0.08,
84
+ "learning_rate": 1.9962523422860715e-05,
85
+ "loss": 1.1471,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 0.09,
90
+ "learning_rate": 1.995003123048095e-05,
91
+ "loss": 1.0787,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 0.09,
96
+ "learning_rate": 1.993753903810119e-05,
97
+ "loss": 0.9869,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 0.1,
102
+ "learning_rate": 1.9925046845721424e-05,
103
+ "loss": 0.9633,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 0.11,
108
+ "learning_rate": 1.9912554653341663e-05,
109
+ "loss": 0.9448,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 0.11,
114
+ "learning_rate": 1.9900062460961898e-05,
115
+ "loss": 0.9208,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 0.12,
120
+ "learning_rate": 1.9887570268582137e-05,
121
+ "loss": 0.9104,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 0.12,
126
+ "learning_rate": 1.9875078076202376e-05,
127
+ "loss": 0.9109,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 0.12,
132
+ "eval_loss": 0.8898332118988037,
133
+ "eval_runtime": 90.2434,
134
+ "eval_samples_per_second": 11.081,
135
+ "eval_steps_per_second": 0.698,
136
+ "step": 200
137
+ },
138
+ {
139
+ "epoch": 0.13,
140
+ "learning_rate": 1.986258588382261e-05,
141
+ "loss": 0.9134,
142
+ "step": 210
143
+ },
144
+ {
145
+ "epoch": 0.14,
146
+ "learning_rate": 1.985009369144285e-05,
147
+ "loss": 0.9016,
148
+ "step": 220
149
+ },
150
+ {
151
+ "epoch": 0.14,
152
+ "learning_rate": 1.983760149906309e-05,
153
+ "loss": 0.8977,
154
+ "step": 230
155
+ },
156
+ {
157
+ "epoch": 0.15,
158
+ "learning_rate": 1.9825109306683324e-05,
159
+ "loss": 0.8835,
160
+ "step": 240
161
+ },
162
+ {
163
+ "epoch": 0.16,
164
+ "learning_rate": 1.9812617114303563e-05,
165
+ "loss": 0.8859,
166
+ "step": 250
167
+ },
168
+ {
169
+ "epoch": 0.16,
170
+ "learning_rate": 1.9800124921923802e-05,
171
+ "loss": 0.8884,
172
+ "step": 260
173
+ },
174
+ {
175
+ "epoch": 0.17,
176
+ "learning_rate": 1.9787632729544037e-05,
177
+ "loss": 0.8829,
178
+ "step": 270
179
+ },
180
+ {
181
+ "epoch": 0.17,
182
+ "learning_rate": 1.9775140537164276e-05,
183
+ "loss": 0.8905,
184
+ "step": 280
185
+ },
186
+ {
187
+ "epoch": 0.18,
188
+ "learning_rate": 1.976264834478451e-05,
189
+ "loss": 0.8824,
190
+ "step": 290
191
+ },
192
+ {
193
+ "epoch": 0.19,
194
+ "learning_rate": 1.975015615240475e-05,
195
+ "loss": 0.8844,
196
+ "step": 300
197
+ },
198
+ {
199
+ "epoch": 0.19,
200
+ "learning_rate": 1.9737663960024985e-05,
201
+ "loss": 0.8817,
202
+ "step": 310
203
+ },
204
+ {
205
+ "epoch": 0.2,
206
+ "learning_rate": 1.9725171767645224e-05,
207
+ "loss": 0.8917,
208
+ "step": 320
209
+ },
210
+ {
211
+ "epoch": 0.2,
212
+ "learning_rate": 1.971267957526546e-05,
213
+ "loss": 0.8823,
214
+ "step": 330
215
+ },
216
+ {
217
+ "epoch": 0.21,
218
+ "learning_rate": 1.97001873828857e-05,
219
+ "loss": 0.8832,
220
+ "step": 340
221
+ },
222
+ {
223
+ "epoch": 0.22,
224
+ "learning_rate": 1.9687695190505937e-05,
225
+ "loss": 0.8743,
226
+ "step": 350
227
+ },
228
+ {
229
+ "epoch": 0.22,
230
+ "learning_rate": 1.9675202998126173e-05,
231
+ "loss": 0.8676,
232
+ "step": 360
233
+ },
234
+ {
235
+ "epoch": 0.23,
236
+ "learning_rate": 1.966271080574641e-05,
237
+ "loss": 0.877,
238
+ "step": 370
239
+ },
240
+ {
241
+ "epoch": 0.24,
242
+ "learning_rate": 1.9650218613366647e-05,
243
+ "loss": 0.8779,
244
+ "step": 380
245
+ },
246
+ {
247
+ "epoch": 0.24,
248
+ "learning_rate": 1.9637726420986885e-05,
249
+ "loss": 0.8703,
250
+ "step": 390
251
+ },
252
+ {
253
+ "epoch": 0.25,
254
+ "learning_rate": 1.962523422860712e-05,
255
+ "loss": 0.8693,
256
+ "step": 400
257
+ },
258
+ {
259
+ "epoch": 0.25,
260
+ "eval_loss": 0.8513386249542236,
261
+ "eval_runtime": 90.0667,
262
+ "eval_samples_per_second": 11.103,
263
+ "eval_steps_per_second": 0.699,
264
+ "step": 400
265
+ },
266
+ {
267
+ "epoch": 0.11,
268
+ "learning_rate": 1.942603221625625e-05,
269
+ "loss": 0.8775,
270
+ "step": 410
271
+ },
272
+ {
273
+ "epoch": 0.12,
274
+ "learning_rate": 1.9407517126458065e-05,
275
+ "loss": 0.8583,
276
+ "step": 420
277
+ },
278
+ {
279
+ "epoch": 0.12,
280
+ "learning_rate": 1.938900203665988e-05,
281
+ "loss": 0.8687,
282
+ "step": 430
283
+ },
284
+ {
285
+ "epoch": 0.12,
286
+ "learning_rate": 1.9370486946861694e-05,
287
+ "loss": 0.8633,
288
+ "step": 440
289
+ },
290
+ {
291
+ "epoch": 0.12,
292
+ "learning_rate": 1.9351971857063506e-05,
293
+ "loss": 0.8642,
294
+ "step": 450
295
+ },
296
+ {
297
+ "epoch": 0.13,
298
+ "learning_rate": 1.933345676726532e-05,
299
+ "loss": 0.8546,
300
+ "step": 460
301
+ },
302
+ {
303
+ "epoch": 0.13,
304
+ "learning_rate": 1.931494167746714e-05,
305
+ "loss": 0.87,
306
+ "step": 470
307
+ },
308
+ {
309
+ "epoch": 0.13,
310
+ "learning_rate": 1.9296426587668953e-05,
311
+ "loss": 0.8372,
312
+ "step": 480
313
+ },
314
+ {
315
+ "epoch": 0.13,
316
+ "learning_rate": 1.9277911497870768e-05,
317
+ "loss": 0.8701,
318
+ "step": 490
319
+ },
320
+ {
321
+ "epoch": 0.14,
322
+ "learning_rate": 1.9259396408072583e-05,
323
+ "loss": 0.8682,
324
+ "step": 500
325
+ },
326
+ {
327
+ "epoch": 0.14,
328
+ "learning_rate": 1.9240881318274397e-05,
329
+ "loss": 0.867,
330
+ "step": 510
331
+ },
332
+ {
333
+ "epoch": 0.14,
334
+ "learning_rate": 1.9222366228476212e-05,
335
+ "loss": 0.8455,
336
+ "step": 520
337
+ },
338
+ {
339
+ "epoch": 0.15,
340
+ "learning_rate": 1.9203851138678023e-05,
341
+ "loss": 0.8479,
342
+ "step": 530
343
+ },
344
+ {
345
+ "epoch": 0.15,
346
+ "learning_rate": 1.9185336048879838e-05,
347
+ "loss": 0.8699,
348
+ "step": 540
349
+ },
350
+ {
351
+ "epoch": 0.15,
352
+ "learning_rate": 1.9166820959081653e-05,
353
+ "loss": 0.8575,
354
+ "step": 550
355
+ },
356
+ {
357
+ "epoch": 0.15,
358
+ "learning_rate": 1.9148305869283467e-05,
359
+ "loss": 0.8577,
360
+ "step": 560
361
+ },
362
+ {
363
+ "epoch": 0.16,
364
+ "learning_rate": 1.9129790779485282e-05,
365
+ "loss": 0.8661,
366
+ "step": 570
367
+ },
368
+ {
369
+ "epoch": 0.16,
370
+ "learning_rate": 1.9111275689687097e-05,
371
+ "loss": 0.8581,
372
+ "step": 580
373
+ },
374
+ {
375
+ "epoch": 0.16,
376
+ "learning_rate": 1.909276059988891e-05,
377
+ "loss": 0.8611,
378
+ "step": 590
379
+ },
380
+ {
381
+ "epoch": 0.17,
382
+ "learning_rate": 1.9074245510090726e-05,
383
+ "loss": 0.8427,
384
+ "step": 600
385
+ },
386
+ {
387
+ "epoch": 0.17,
388
+ "eval_loss": 0.8519569635391235,
389
+ "eval_runtime": 98.6826,
390
+ "eval_samples_per_second": 10.133,
391
+ "eval_steps_per_second": 0.638,
392
+ "step": 600
393
+ },
394
+ {
395
+ "epoch": 0.17,
396
+ "learning_rate": 1.905573042029254e-05,
397
+ "loss": 0.8583,
398
+ "step": 610
399
+ },
400
+ {
401
+ "epoch": 0.17,
402
+ "learning_rate": 1.9037215330494355e-05,
403
+ "loss": 0.8474,
404
+ "step": 620
405
+ },
406
+ {
407
+ "epoch": 0.17,
408
+ "learning_rate": 1.901870024069617e-05,
409
+ "loss": 0.8553,
410
+ "step": 630
411
+ },
412
+ {
413
+ "epoch": 0.18,
414
+ "learning_rate": 1.900018515089798e-05,
415
+ "loss": 0.8523,
416
+ "step": 640
417
+ },
418
+ {
419
+ "epoch": 0.18,
420
+ "learning_rate": 1.8981670061099796e-05,
421
+ "loss": 0.8431,
422
+ "step": 650
423
+ },
424
+ {
425
+ "epoch": 0.18,
426
+ "learning_rate": 1.896315497130161e-05,
427
+ "loss": 0.8567,
428
+ "step": 660
429
+ },
430
+ {
431
+ "epoch": 0.18,
432
+ "learning_rate": 1.8944639881503426e-05,
433
+ "loss": 0.846,
434
+ "step": 670
435
+ },
436
+ {
437
+ "epoch": 0.19,
438
+ "learning_rate": 1.892612479170524e-05,
439
+ "loss": 0.8591,
440
+ "step": 680
441
+ },
442
+ {
443
+ "epoch": 0.19,
444
+ "learning_rate": 1.8907609701907055e-05,
445
+ "loss": 0.8469,
446
+ "step": 690
447
+ },
448
+ {
449
+ "epoch": 0.19,
450
+ "learning_rate": 1.888909461210887e-05,
451
+ "loss": 0.8615,
452
+ "step": 700
453
+ },
454
+ {
455
+ "epoch": 0.2,
456
+ "learning_rate": 1.8870579522310684e-05,
457
+ "loss": 0.8413,
458
+ "step": 710
459
+ },
460
+ {
461
+ "epoch": 0.2,
462
+ "learning_rate": 1.88520644325125e-05,
463
+ "loss": 0.8422,
464
+ "step": 720
465
+ },
466
+ {
467
+ "epoch": 0.2,
468
+ "learning_rate": 1.8833549342714314e-05,
469
+ "loss": 0.8535,
470
+ "step": 730
471
+ },
472
+ {
473
+ "epoch": 0.2,
474
+ "learning_rate": 1.881503425291613e-05,
475
+ "loss": 0.8567,
476
+ "step": 740
477
+ },
478
+ {
479
+ "epoch": 0.21,
480
+ "learning_rate": 1.8796519163117943e-05,
481
+ "loss": 0.8578,
482
+ "step": 750
483
+ },
484
+ {
485
+ "epoch": 0.21,
486
+ "learning_rate": 1.8778004073319758e-05,
487
+ "loss": 0.8468,
488
+ "step": 760
489
+ },
490
+ {
491
+ "epoch": 0.21,
492
+ "learning_rate": 1.8759488983521572e-05,
493
+ "loss": 0.8469,
494
+ "step": 770
495
+ },
496
+ {
497
+ "epoch": 0.21,
498
+ "learning_rate": 1.8740973893723387e-05,
499
+ "loss": 0.8312,
500
+ "step": 780
501
+ },
502
+ {
503
+ "epoch": 0.22,
504
+ "learning_rate": 1.8722458803925202e-05,
505
+ "loss": 0.8466,
506
+ "step": 790
507
+ },
508
+ {
509
+ "epoch": 0.22,
510
+ "learning_rate": 1.8703943714127017e-05,
511
+ "loss": 0.8483,
512
+ "step": 800
513
+ },
514
+ {
515
+ "epoch": 0.22,
516
+ "eval_loss": 0.8436271548271179,
517
+ "eval_runtime": 98.5282,
518
+ "eval_samples_per_second": 10.149,
519
+ "eval_steps_per_second": 0.639,
520
+ "step": 800
521
+ },
522
+ {
523
+ "epoch": 0.22,
524
+ "learning_rate": 1.868542862432883e-05,
525
+ "loss": 0.8513,
526
+ "step": 810
527
+ },
528
+ {
529
+ "epoch": 0.23,
530
+ "learning_rate": 1.8666913534530643e-05,
531
+ "loss": 0.8467,
532
+ "step": 820
533
+ },
534
+ {
535
+ "epoch": 0.23,
536
+ "learning_rate": 1.8648398444732457e-05,
537
+ "loss": 0.8416,
538
+ "step": 830
539
+ },
540
+ {
541
+ "epoch": 0.23,
542
+ "learning_rate": 1.8629883354934272e-05,
543
+ "loss": 0.8398,
544
+ "step": 840
545
+ },
546
+ {
547
+ "epoch": 0.23,
548
+ "learning_rate": 1.8611368265136087e-05,
549
+ "loss": 0.8361,
550
+ "step": 850
551
+ },
552
+ {
553
+ "epoch": 0.24,
554
+ "learning_rate": 1.85928531753379e-05,
555
+ "loss": 0.8474,
556
+ "step": 860
557
+ },
558
+ {
559
+ "epoch": 0.24,
560
+ "learning_rate": 1.8574338085539716e-05,
561
+ "loss": 0.8403,
562
+ "step": 870
563
+ },
564
+ {
565
+ "epoch": 0.24,
566
+ "learning_rate": 1.855582299574153e-05,
567
+ "loss": 0.8489,
568
+ "step": 880
569
+ },
570
+ {
571
+ "epoch": 0.24,
572
+ "learning_rate": 1.8537307905943345e-05,
573
+ "loss": 0.8499,
574
+ "step": 890
575
+ },
576
+ {
577
+ "epoch": 0.25,
578
+ "learning_rate": 1.851879281614516e-05,
579
+ "loss": 0.8438,
580
+ "step": 900
581
+ },
582
+ {
583
+ "epoch": 0.25,
584
+ "learning_rate": 1.8500277726346975e-05,
585
+ "loss": 0.839,
586
+ "step": 910
587
+ },
588
+ {
589
+ "epoch": 0.25,
590
+ "learning_rate": 1.848176263654879e-05,
591
+ "loss": 0.8429,
592
+ "step": 920
593
+ },
594
+ {
595
+ "epoch": 0.26,
596
+ "learning_rate": 1.84632475467506e-05,
597
+ "loss": 0.8384,
598
+ "step": 930
599
+ },
600
+ {
601
+ "epoch": 0.26,
602
+ "learning_rate": 1.8444732456952415e-05,
603
+ "loss": 0.8446,
604
+ "step": 940
605
+ },
606
+ {
607
+ "epoch": 0.26,
608
+ "learning_rate": 1.8426217367154234e-05,
609
+ "loss": 0.8485,
610
+ "step": 950
611
+ },
612
+ {
613
+ "epoch": 0.26,
614
+ "learning_rate": 1.8407702277356048e-05,
615
+ "loss": 0.8364,
616
+ "step": 960
617
+ },
618
+ {
619
+ "epoch": 0.27,
620
+ "learning_rate": 1.8389187187557863e-05,
621
+ "loss": 0.8357,
622
+ "step": 970
623
+ },
624
+ {
625
+ "epoch": 0.27,
626
+ "learning_rate": 1.8370672097759678e-05,
627
+ "loss": 0.8379,
628
+ "step": 980
629
+ },
630
+ {
631
+ "epoch": 0.27,
632
+ "learning_rate": 1.8352157007961492e-05,
633
+ "loss": 0.8275,
634
+ "step": 990
635
+ },
636
+ {
637
+ "epoch": 0.28,
638
+ "learning_rate": 1.8333641918163304e-05,
639
+ "loss": 0.8491,
640
+ "step": 1000
641
+ },
642
+ {
643
+ "epoch": 0.28,
644
+ "eval_loss": 0.8377317190170288,
645
+ "eval_runtime": 98.8443,
646
+ "eval_samples_per_second": 10.117,
647
+ "eval_steps_per_second": 0.637,
648
+ "step": 1000
649
+ },
650
+ {
651
+ "epoch": 0.28,
652
+ "learning_rate": 1.831512682836512e-05,
653
+ "loss": 0.8352,
654
+ "step": 1010
655
+ },
656
+ {
657
+ "epoch": 0.28,
658
+ "learning_rate": 1.8296611738566933e-05,
659
+ "loss": 0.8423,
660
+ "step": 1020
661
+ },
662
+ {
663
+ "epoch": 0.28,
664
+ "learning_rate": 1.8278096648768748e-05,
665
+ "loss": 0.8337,
666
+ "step": 1030
667
+ },
668
+ {
669
+ "epoch": 0.29,
670
+ "learning_rate": 1.8259581558970562e-05,
671
+ "loss": 0.8276,
672
+ "step": 1040
673
+ },
674
+ {
675
+ "epoch": 0.29,
676
+ "learning_rate": 1.8241066469172377e-05,
677
+ "loss": 0.8313,
678
+ "step": 1050
679
+ },
680
+ {
681
+ "epoch": 0.29,
682
+ "learning_rate": 1.8222551379374192e-05,
683
+ "loss": 0.8266,
684
+ "step": 1060
685
+ },
686
+ {
687
+ "epoch": 0.29,
688
+ "learning_rate": 1.8204036289576006e-05,
689
+ "loss": 0.838,
690
+ "step": 1070
691
+ },
692
+ {
693
+ "epoch": 0.3,
694
+ "learning_rate": 1.818552119977782e-05,
695
+ "loss": 0.8456,
696
+ "step": 1080
697
+ },
698
+ {
699
+ "epoch": 0.3,
700
+ "learning_rate": 1.8167006109979636e-05,
701
+ "loss": 0.8281,
702
+ "step": 1090
703
+ },
704
+ {
705
+ "epoch": 0.3,
706
+ "learning_rate": 1.814849102018145e-05,
707
+ "loss": 0.835,
708
+ "step": 1100
709
+ },
710
+ {
711
+ "epoch": 0.31,
712
+ "learning_rate": 1.8129975930383262e-05,
713
+ "loss": 0.84,
714
+ "step": 1110
715
+ },
716
+ {
717
+ "epoch": 0.31,
718
+ "learning_rate": 1.8111460840585077e-05,
719
+ "loss": 0.8323,
720
+ "step": 1120
721
+ },
722
+ {
723
+ "epoch": 0.31,
724
+ "learning_rate": 1.809294575078689e-05,
725
+ "loss": 0.8331,
726
+ "step": 1130
727
+ },
728
+ {
729
+ "epoch": 0.31,
730
+ "learning_rate": 1.8074430660988706e-05,
731
+ "loss": 0.8304,
732
+ "step": 1140
733
+ },
734
+ {
735
+ "epoch": 0.32,
736
+ "learning_rate": 1.805591557119052e-05,
737
+ "loss": 0.8329,
738
+ "step": 1150
739
+ },
740
+ {
741
+ "epoch": 0.32,
742
+ "learning_rate": 1.8037400481392335e-05,
743
+ "loss": 0.8291,
744
+ "step": 1160
745
+ },
746
+ {
747
+ "epoch": 0.32,
748
+ "learning_rate": 1.801888539159415e-05,
749
+ "loss": 0.8437,
750
+ "step": 1170
751
+ },
752
+ {
753
+ "epoch": 0.32,
754
+ "learning_rate": 1.8000370301795965e-05,
755
+ "loss": 0.825,
756
+ "step": 1180
757
+ },
758
+ {
759
+ "epoch": 0.33,
760
+ "learning_rate": 1.798185521199778e-05,
761
+ "loss": 0.828,
762
+ "step": 1190
763
+ },
764
+ {
765
+ "epoch": 0.33,
766
+ "learning_rate": 1.7963340122199594e-05,
767
+ "loss": 0.8342,
768
+ "step": 1200
769
+ },
770
+ {
771
+ "epoch": 0.33,
772
+ "eval_loss": 0.8326738476753235,
773
+ "eval_runtime": 98.7519,
774
+ "eval_samples_per_second": 10.126,
775
+ "eval_steps_per_second": 0.638,
776
+ "step": 1200
777
+ },
778
+ {
779
+ "epoch": 0.33,
780
+ "learning_rate": 1.794482503240141e-05,
781
+ "loss": 0.8252,
782
+ "step": 1210
783
+ },
784
+ {
785
+ "epoch": 0.34,
786
+ "learning_rate": 1.7926309942603223e-05,
787
+ "loss": 0.8227,
788
+ "step": 1220
789
+ },
790
+ {
791
+ "epoch": 0.34,
792
+ "learning_rate": 1.7907794852805038e-05,
793
+ "loss": 0.8296,
794
+ "step": 1230
795
+ },
796
+ {
797
+ "epoch": 0.34,
798
+ "learning_rate": 1.7889279763006853e-05,
799
+ "loss": 0.8429,
800
+ "step": 1240
801
+ },
802
+ {
803
+ "epoch": 0.34,
804
+ "learning_rate": 1.7870764673208668e-05,
805
+ "loss": 0.8257,
806
+ "step": 1250
807
+ },
808
+ {
809
+ "epoch": 0.35,
810
+ "learning_rate": 1.7852249583410482e-05,
811
+ "loss": 0.8267,
812
+ "step": 1260
813
+ },
814
+ {
815
+ "epoch": 0.35,
816
+ "learning_rate": 1.7833734493612297e-05,
817
+ "loss": 0.8424,
818
+ "step": 1270
819
+ },
820
+ {
821
+ "epoch": 0.35,
822
+ "learning_rate": 1.781521940381411e-05,
823
+ "loss": 0.8262,
824
+ "step": 1280
825
+ },
826
+ {
827
+ "epoch": 0.35,
828
+ "learning_rate": 1.7796704314015923e-05,
829
+ "loss": 0.8314,
830
+ "step": 1290
831
+ },
832
+ {
833
+ "epoch": 0.36,
834
+ "learning_rate": 1.7778189224217738e-05,
835
+ "loss": 0.8194,
836
+ "step": 1300
837
+ },
838
+ {
839
+ "epoch": 0.36,
840
+ "learning_rate": 1.7759674134419552e-05,
841
+ "loss": 0.8316,
842
+ "step": 1310
843
+ },
844
+ {
845
+ "epoch": 0.36,
846
+ "learning_rate": 1.7741159044621367e-05,
847
+ "loss": 0.8399,
848
+ "step": 1320
849
+ },
850
+ {
851
+ "epoch": 0.37,
852
+ "learning_rate": 1.7722643954823182e-05,
853
+ "loss": 0.8273,
854
+ "step": 1330
855
+ },
856
+ {
857
+ "epoch": 0.37,
858
+ "learning_rate": 1.7704128865024996e-05,
859
+ "loss": 0.8207,
860
+ "step": 1340
861
+ },
862
+ {
863
+ "epoch": 0.37,
864
+ "learning_rate": 1.768561377522681e-05,
865
+ "loss": 0.8346,
866
+ "step": 1350
867
+ },
868
+ {
869
+ "epoch": 0.37,
870
+ "learning_rate": 1.7667098685428626e-05,
871
+ "loss": 0.8356,
872
+ "step": 1360
873
+ },
874
+ {
875
+ "epoch": 0.38,
876
+ "learning_rate": 1.764858359563044e-05,
877
+ "loss": 0.8352,
878
+ "step": 1370
879
+ },
880
+ {
881
+ "epoch": 0.38,
882
+ "learning_rate": 1.7630068505832255e-05,
883
+ "loss": 0.8362,
884
+ "step": 1380
885
+ },
886
+ {
887
+ "epoch": 0.38,
888
+ "learning_rate": 1.761155341603407e-05,
889
+ "loss": 0.8355,
890
+ "step": 1390
891
+ },
892
+ {
893
+ "epoch": 0.39,
894
+ "learning_rate": 1.759303832623588e-05,
895
+ "loss": 0.8203,
896
+ "step": 1400
897
+ },
898
+ {
899
+ "epoch": 0.39,
900
+ "eval_loss": 0.8290709853172302,
901
+ "eval_runtime": 98.8118,
902
+ "eval_samples_per_second": 10.12,
903
+ "eval_steps_per_second": 0.638,
904
+ "step": 1400
905
+ },
906
+ {
907
+ "epoch": 0.39,
908
+ "learning_rate": 1.7574523236437696e-05,
909
+ "loss": 0.8267,
910
+ "step": 1410
911
+ },
912
+ {
913
+ "epoch": 0.39,
914
+ "learning_rate": 1.755600814663951e-05,
915
+ "loss": 0.814,
916
+ "step": 1420
917
+ },
918
+ {
919
+ "epoch": 0.39,
920
+ "learning_rate": 1.753749305684133e-05,
921
+ "loss": 0.8227,
922
+ "step": 1430
923
+ },
924
+ {
925
+ "epoch": 0.4,
926
+ "learning_rate": 1.7518977967043143e-05,
927
+ "loss": 0.8221,
928
+ "step": 1440
929
+ },
930
+ {
931
+ "epoch": 0.4,
932
+ "learning_rate": 1.7500462877244958e-05,
933
+ "loss": 0.8162,
934
+ "step": 1450
935
+ },
936
+ {
937
+ "epoch": 0.4,
938
+ "learning_rate": 1.7481947787446773e-05,
939
+ "loss": 0.8305,
940
+ "step": 1460
941
+ },
942
+ {
943
+ "epoch": 0.4,
944
+ "learning_rate": 1.7463432697648587e-05,
945
+ "loss": 0.8188,
946
+ "step": 1470
947
+ },
948
+ {
949
+ "epoch": 0.41,
950
+ "learning_rate": 1.74449176078504e-05,
951
+ "loss": 0.8265,
952
+ "step": 1480
953
+ },
954
+ {
955
+ "epoch": 0.41,
956
+ "learning_rate": 1.7426402518052213e-05,
957
+ "loss": 0.8265,
958
+ "step": 1490
959
+ },
960
+ {
961
+ "epoch": 0.41,
962
+ "learning_rate": 1.7407887428254028e-05,
963
+ "loss": 0.8215,
964
+ "step": 1500
965
+ },
966
+ {
967
+ "epoch": 0.42,
968
+ "learning_rate": 1.7389372338455843e-05,
969
+ "loss": 0.8311,
970
+ "step": 1510
971
+ },
972
+ {
973
+ "epoch": 0.42,
974
+ "learning_rate": 1.7370857248657657e-05,
975
+ "loss": 0.8315,
976
+ "step": 1520
977
+ },
978
+ {
979
+ "epoch": 0.42,
980
+ "learning_rate": 1.7352342158859472e-05,
981
+ "loss": 0.8306,
982
+ "step": 1530
983
+ },
984
+ {
985
+ "epoch": 0.42,
986
+ "learning_rate": 1.7333827069061287e-05,
987
+ "loss": 0.8212,
988
+ "step": 1540
989
+ },
990
+ {
991
+ "epoch": 0.43,
992
+ "learning_rate": 1.73153119792631e-05,
993
+ "loss": 0.8263,
994
+ "step": 1550
995
+ },
996
+ {
997
+ "epoch": 0.43,
998
+ "learning_rate": 1.7296796889464916e-05,
999
+ "loss": 0.8179,
1000
+ "step": 1560
1001
+ },
1002
+ {
1003
+ "epoch": 0.43,
1004
+ "learning_rate": 1.727828179966673e-05,
1005
+ "loss": 0.8292,
1006
+ "step": 1570
1007
+ },
1008
+ {
1009
+ "epoch": 0.43,
1010
+ "learning_rate": 1.7259766709868546e-05,
1011
+ "loss": 0.8115,
1012
+ "step": 1580
1013
+ },
1014
+ {
1015
+ "epoch": 0.44,
1016
+ "learning_rate": 1.7241251620070357e-05,
1017
+ "loss": 0.8203,
1018
+ "step": 1590
1019
+ },
1020
+ {
1021
+ "epoch": 0.44,
1022
+ "learning_rate": 1.722273653027217e-05,
1023
+ "loss": 0.8342,
1024
+ "step": 1600
1025
+ },
1026
+ {
1027
+ "epoch": 0.44,
1028
+ "eval_loss": 0.825358510017395,
1029
+ "eval_runtime": 98.8719,
1030
+ "eval_samples_per_second": 10.114,
1031
+ "eval_steps_per_second": 0.637,
1032
+ "step": 1600
1033
+ },
1034
+ {
1035
+ "epoch": 0.44,
1036
+ "learning_rate": 1.7204221440473986e-05,
1037
+ "loss": 0.8396,
1038
+ "step": 1610
1039
+ },
1040
+ {
1041
+ "epoch": 0.45,
1042
+ "learning_rate": 1.71857063506758e-05,
1043
+ "loss": 0.8271,
1044
+ "step": 1620
1045
+ },
1046
+ {
1047
+ "epoch": 0.45,
1048
+ "learning_rate": 1.7167191260877616e-05,
1049
+ "loss": 0.8199,
1050
+ "step": 1630
1051
+ },
1052
+ {
1053
+ "epoch": 0.45,
1054
+ "learning_rate": 1.714867617107943e-05,
1055
+ "loss": 0.8315,
1056
+ "step": 1640
1057
+ },
1058
+ {
1059
+ "epoch": 0.45,
1060
+ "learning_rate": 1.7130161081281245e-05,
1061
+ "loss": 0.8279,
1062
+ "step": 1650
1063
+ },
1064
+ {
1065
+ "epoch": 0.46,
1066
+ "learning_rate": 1.711164599148306e-05,
1067
+ "loss": 0.8146,
1068
+ "step": 1660
1069
+ },
1070
+ {
1071
+ "epoch": 0.46,
1072
+ "learning_rate": 1.7093130901684874e-05,
1073
+ "loss": 0.8198,
1074
+ "step": 1670
1075
+ },
1076
+ {
1077
+ "epoch": 0.46,
1078
+ "learning_rate": 1.707461581188669e-05,
1079
+ "loss": 0.8146,
1080
+ "step": 1680
1081
+ },
1082
+ {
1083
+ "epoch": 0.46,
1084
+ "learning_rate": 1.7056100722088504e-05,
1085
+ "loss": 0.8288,
1086
+ "step": 1690
1087
+ },
1088
+ {
1089
+ "epoch": 0.47,
1090
+ "learning_rate": 1.703758563229032e-05,
1091
+ "loss": 0.8208,
1092
+ "step": 1700
1093
+ },
1094
+ {
1095
+ "epoch": 0.47,
1096
+ "learning_rate": 1.7019070542492133e-05,
1097
+ "loss": 0.8262,
1098
+ "step": 1710
1099
+ },
1100
+ {
1101
+ "epoch": 0.47,
1102
+ "learning_rate": 1.7000555452693948e-05,
1103
+ "loss": 0.8196,
1104
+ "step": 1720
1105
+ },
1106
+ {
1107
+ "epoch": 0.48,
1108
+ "learning_rate": 1.6982040362895763e-05,
1109
+ "loss": 0.8191,
1110
+ "step": 1730
1111
+ },
1112
+ {
1113
+ "epoch": 0.48,
1114
+ "learning_rate": 1.6963525273097577e-05,
1115
+ "loss": 0.8258,
1116
+ "step": 1740
1117
+ },
1118
+ {
1119
+ "epoch": 0.48,
1120
+ "learning_rate": 1.6945010183299392e-05,
1121
+ "loss": 0.827,
1122
+ "step": 1750
1123
+ },
1124
+ {
1125
+ "epoch": 0.48,
1126
+ "learning_rate": 1.6926495093501207e-05,
1127
+ "loss": 0.8234,
1128
+ "step": 1760
1129
+ },
1130
+ {
1131
+ "epoch": 0.49,
1132
+ "learning_rate": 1.6907980003703018e-05,
1133
+ "loss": 0.8225,
1134
+ "step": 1770
1135
+ },
1136
+ {
1137
+ "epoch": 0.49,
1138
+ "learning_rate": 1.6889464913904833e-05,
1139
+ "loss": 0.8242,
1140
+ "step": 1780
1141
+ },
1142
+ {
1143
+ "epoch": 0.49,
1144
+ "learning_rate": 1.6870949824106647e-05,
1145
+ "loss": 0.818,
1146
+ "step": 1790
1147
+ },
1148
+ {
1149
+ "epoch": 0.5,
1150
+ "learning_rate": 1.6852434734308462e-05,
1151
+ "loss": 0.8235,
1152
+ "step": 1800
1153
+ },
1154
+ {
1155
+ "epoch": 0.5,
1156
+ "eval_loss": 0.8223171234130859,
1157
+ "eval_runtime": 99.1806,
1158
+ "eval_samples_per_second": 10.083,
1159
+ "eval_steps_per_second": 0.635,
1160
+ "step": 1800
1161
+ },
1162
+ {
1163
+ "epoch": 0.5,
1164
+ "learning_rate": 1.6833919644510277e-05,
1165
+ "loss": 0.8208,
1166
+ "step": 1810
1167
+ },
1168
+ {
1169
+ "epoch": 0.5,
1170
+ "learning_rate": 1.681540455471209e-05,
1171
+ "loss": 0.8241,
1172
+ "step": 1820
1173
+ },
1174
+ {
1175
+ "epoch": 0.5,
1176
+ "learning_rate": 1.6796889464913906e-05,
1177
+ "loss": 0.8141,
1178
+ "step": 1830
1179
+ },
1180
+ {
1181
+ "epoch": 0.51,
1182
+ "learning_rate": 1.677837437511572e-05,
1183
+ "loss": 0.8234,
1184
+ "step": 1840
1185
+ },
1186
+ {
1187
+ "epoch": 0.51,
1188
+ "learning_rate": 1.6759859285317536e-05,
1189
+ "loss": 0.8276,
1190
+ "step": 1850
1191
+ },
1192
+ {
1193
+ "epoch": 0.51,
1194
+ "learning_rate": 1.674134419551935e-05,
1195
+ "loss": 0.8241,
1196
+ "step": 1860
1197
+ },
1198
+ {
1199
+ "epoch": 0.51,
1200
+ "learning_rate": 1.6722829105721165e-05,
1201
+ "loss": 0.8219,
1202
+ "step": 1870
1203
+ },
1204
+ {
1205
+ "epoch": 0.52,
1206
+ "learning_rate": 1.6704314015922976e-05,
1207
+ "loss": 0.8259,
1208
+ "step": 1880
1209
+ },
1210
+ {
1211
+ "epoch": 0.52,
1212
+ "learning_rate": 1.668579892612479e-05,
1213
+ "loss": 0.8142,
1214
+ "step": 1890
1215
+ },
1216
+ {
1217
+ "epoch": 0.52,
1218
+ "learning_rate": 1.6667283836326606e-05,
1219
+ "loss": 0.8186,
1220
+ "step": 1900
1221
+ },
1222
+ {
1223
+ "epoch": 0.53,
1224
+ "learning_rate": 1.6648768746528424e-05,
1225
+ "loss": 0.8128,
1226
+ "step": 1910
1227
+ },
1228
+ {
1229
+ "epoch": 0.53,
1230
+ "learning_rate": 1.663025365673024e-05,
1231
+ "loss": 0.8126,
1232
+ "step": 1920
1233
+ },
1234
+ {
1235
+ "epoch": 0.53,
1236
+ "learning_rate": 1.6611738566932053e-05,
1237
+ "loss": 0.8451,
1238
+ "step": 1930
1239
+ },
1240
+ {
1241
+ "epoch": 0.53,
1242
+ "learning_rate": 1.6593223477133868e-05,
1243
+ "loss": 0.8175,
1244
+ "step": 1940
1245
+ },
1246
+ {
1247
+ "epoch": 0.54,
1248
+ "learning_rate": 1.657470838733568e-05,
1249
+ "loss": 0.8194,
1250
+ "step": 1950
1251
+ },
1252
+ {
1253
+ "epoch": 0.54,
1254
+ "learning_rate": 1.6556193297537494e-05,
1255
+ "loss": 0.8256,
1256
+ "step": 1960
1257
+ },
1258
+ {
1259
+ "epoch": 0.54,
1260
+ "learning_rate": 1.653767820773931e-05,
1261
+ "loss": 0.8086,
1262
+ "step": 1970
1263
+ },
1264
+ {
1265
+ "epoch": 0.54,
1266
+ "learning_rate": 1.6519163117941123e-05,
1267
+ "loss": 0.8267,
1268
+ "step": 1980
1269
+ },
1270
+ {
1271
+ "epoch": 0.55,
1272
+ "learning_rate": 1.6500648028142938e-05,
1273
+ "loss": 0.8115,
1274
+ "step": 1990
1275
+ },
1276
+ {
1277
+ "epoch": 0.55,
1278
+ "learning_rate": 1.6482132938344753e-05,
1279
+ "loss": 0.8103,
1280
+ "step": 2000
1281
+ },
1282
+ {
1283
+ "epoch": 0.55,
1284
+ "eval_loss": 0.8191922903060913,
1285
+ "eval_runtime": 99.2272,
1286
+ "eval_samples_per_second": 10.078,
1287
+ "eval_steps_per_second": 0.635,
1288
+ "step": 2000
1289
+ },
1290
+ {
1291
+ "epoch": 0.55,
1292
+ "learning_rate": 1.6463617848546567e-05,
1293
+ "loss": 0.8309,
1294
+ "step": 2010
1295
+ },
1296
+ {
1297
+ "epoch": 0.56,
1298
+ "learning_rate": 1.6445102758748382e-05,
1299
+ "loss": 0.8347,
1300
+ "step": 2020
1301
+ },
1302
+ {
1303
+ "epoch": 0.56,
1304
+ "learning_rate": 1.6426587668950197e-05,
1305
+ "loss": 0.8317,
1306
+ "step": 2030
1307
+ },
1308
+ {
1309
+ "epoch": 0.56,
1310
+ "learning_rate": 1.640807257915201e-05,
1311
+ "loss": 0.8177,
1312
+ "step": 2040
1313
+ },
1314
+ {
1315
+ "epoch": 0.56,
1316
+ "learning_rate": 1.6389557489353826e-05,
1317
+ "loss": 0.825,
1318
+ "step": 2050
1319
+ },
1320
+ {
1321
+ "epoch": 0.57,
1322
+ "learning_rate": 1.6371042399555637e-05,
1323
+ "loss": 0.8213,
1324
+ "step": 2060
1325
+ },
1326
+ {
1327
+ "epoch": 0.57,
1328
+ "learning_rate": 1.6352527309757452e-05,
1329
+ "loss": 0.8268,
1330
+ "step": 2070
1331
+ },
1332
+ {
1333
+ "epoch": 0.57,
1334
+ "learning_rate": 1.6334012219959267e-05,
1335
+ "loss": 0.8226,
1336
+ "step": 2080
1337
+ },
1338
+ {
1339
+ "epoch": 0.58,
1340
+ "learning_rate": 1.631549713016108e-05,
1341
+ "loss": 0.8218,
1342
+ "step": 2090
1343
+ },
1344
+ {
1345
+ "epoch": 0.58,
1346
+ "learning_rate": 1.6296982040362896e-05,
1347
+ "loss": 0.8081,
1348
+ "step": 2100
1349
+ },
1350
+ {
1351
+ "epoch": 0.58,
1352
+ "learning_rate": 1.627846695056471e-05,
1353
+ "loss": 0.8389,
1354
+ "step": 2110
1355
+ },
1356
+ {
1357
+ "epoch": 0.58,
1358
+ "learning_rate": 1.6259951860766525e-05,
1359
+ "loss": 0.8278,
1360
+ "step": 2120
1361
+ },
1362
+ {
1363
+ "epoch": 0.59,
1364
+ "learning_rate": 1.624143677096834e-05,
1365
+ "loss": 0.8185,
1366
+ "step": 2130
1367
+ },
1368
+ {
1369
+ "epoch": 0.59,
1370
+ "learning_rate": 1.6222921681170155e-05,
1371
+ "loss": 0.8111,
1372
+ "step": 2140
1373
+ },
1374
+ {
1375
+ "epoch": 0.59,
1376
+ "learning_rate": 1.620440659137197e-05,
1377
+ "loss": 0.8192,
1378
+ "step": 2150
1379
+ },
1380
+ {
1381
+ "epoch": 0.59,
1382
+ "learning_rate": 1.6185891501573784e-05,
1383
+ "loss": 0.8197,
1384
+ "step": 2160
1385
+ },
1386
+ {
1387
+ "epoch": 0.6,
1388
+ "learning_rate": 1.61673764117756e-05,
1389
+ "loss": 0.8208,
1390
+ "step": 2170
1391
+ },
1392
+ {
1393
+ "epoch": 0.6,
1394
+ "learning_rate": 1.6148861321977414e-05,
1395
+ "loss": 0.8087,
1396
+ "step": 2180
1397
+ },
1398
+ {
1399
+ "epoch": 0.6,
1400
+ "learning_rate": 1.613034623217923e-05,
1401
+ "loss": 0.8217,
1402
+ "step": 2190
1403
+ },
1404
+ {
1405
+ "epoch": 0.61,
1406
+ "learning_rate": 1.6111831142381043e-05,
1407
+ "loss": 0.8219,
1408
+ "step": 2200
1409
+ },
1410
+ {
1411
+ "epoch": 0.61,
1412
+ "eval_loss": 0.8205804228782654,
1413
+ "eval_runtime": 99.3077,
1414
+ "eval_samples_per_second": 10.07,
1415
+ "eval_steps_per_second": 0.634,
1416
+ "step": 2200
1417
+ },
1418
+ {
1419
+ "epoch": 0.61,
1420
+ "learning_rate": 1.6093316052582858e-05,
1421
+ "loss": 0.8267,
1422
+ "step": 2210
1423
+ },
1424
+ {
1425
+ "epoch": 0.61,
1426
+ "learning_rate": 1.6074800962784672e-05,
1427
+ "loss": 0.8009,
1428
+ "step": 2220
1429
+ },
1430
+ {
1431
+ "epoch": 0.61,
1432
+ "learning_rate": 1.6056285872986487e-05,
1433
+ "loss": 0.8234,
1434
+ "step": 2230
1435
+ },
1436
+ {
1437
+ "epoch": 0.62,
1438
+ "learning_rate": 1.60377707831883e-05,
1439
+ "loss": 0.8112,
1440
+ "step": 2240
1441
+ },
1442
+ {
1443
+ "epoch": 0.62,
1444
+ "learning_rate": 1.6019255693390113e-05,
1445
+ "loss": 0.8181,
1446
+ "step": 2250
1447
+ },
1448
+ {
1449
+ "epoch": 0.62,
1450
+ "learning_rate": 1.6000740603591928e-05,
1451
+ "loss": 0.8234,
1452
+ "step": 2260
1453
+ },
1454
+ {
1455
+ "epoch": 0.62,
1456
+ "learning_rate": 1.5982225513793742e-05,
1457
+ "loss": 0.8142,
1458
+ "step": 2270
1459
+ },
1460
+ {
1461
+ "epoch": 0.63,
1462
+ "learning_rate": 1.5963710423995557e-05,
1463
+ "loss": 0.8083,
1464
+ "step": 2280
1465
+ },
1466
+ {
1467
+ "epoch": 0.63,
1468
+ "learning_rate": 1.5945195334197372e-05,
1469
+ "loss": 0.8099,
1470
+ "step": 2290
1471
+ },
1472
+ {
1473
+ "epoch": 0.63,
1474
+ "learning_rate": 1.5926680244399187e-05,
1475
+ "loss": 0.8164,
1476
+ "step": 2300
1477
+ },
1478
+ {
1479
+ "epoch": 0.64,
1480
+ "learning_rate": 1.5908165154601e-05,
1481
+ "loss": 0.8095,
1482
+ "step": 2310
1483
+ },
1484
+ {
1485
+ "epoch": 0.64,
1486
+ "learning_rate": 1.5889650064802816e-05,
1487
+ "loss": 0.809,
1488
+ "step": 2320
1489
+ },
1490
+ {
1491
+ "epoch": 0.64,
1492
+ "learning_rate": 1.587113497500463e-05,
1493
+ "loss": 0.8144,
1494
+ "step": 2330
1495
+ },
1496
+ {
1497
+ "epoch": 0.64,
1498
+ "learning_rate": 1.5852619885206445e-05,
1499
+ "loss": 0.813,
1500
+ "step": 2340
1501
+ },
1502
+ {
1503
+ "epoch": 0.65,
1504
+ "learning_rate": 1.5834104795408257e-05,
1505
+ "loss": 0.8114,
1506
+ "step": 2350
1507
+ },
1508
+ {
1509
+ "epoch": 0.65,
1510
+ "learning_rate": 1.581558970561007e-05,
1511
+ "loss": 0.8217,
1512
+ "step": 2360
1513
+ },
1514
+ {
1515
+ "epoch": 0.65,
1516
+ "learning_rate": 1.5797074615811886e-05,
1517
+ "loss": 0.7975,
1518
+ "step": 2370
1519
+ },
1520
+ {
1521
+ "epoch": 0.65,
1522
+ "learning_rate": 1.57785595260137e-05,
1523
+ "loss": 0.8268,
1524
+ "step": 2380
1525
+ },
1526
+ {
1527
+ "epoch": 0.66,
1528
+ "learning_rate": 1.5760044436215515e-05,
1529
+ "loss": 0.8188,
1530
+ "step": 2390
1531
+ },
1532
+ {
1533
+ "epoch": 0.66,
1534
+ "learning_rate": 1.5741529346417333e-05,
1535
+ "loss": 0.8103,
1536
+ "step": 2400
1537
+ },
1538
+ {
1539
+ "epoch": 0.66,
1540
+ "eval_loss": 0.8151404857635498,
1541
+ "eval_runtime": 99.0346,
1542
+ "eval_samples_per_second": 10.097,
1543
+ "eval_steps_per_second": 0.636,
1544
+ "step": 2400
1545
+ },
1546
+ {
1547
+ "epoch": 0.66,
1548
+ "learning_rate": 1.5723014256619148e-05,
1549
+ "loss": 0.813,
1550
+ "step": 2410
1551
+ },
1552
+ {
1553
+ "epoch": 0.67,
1554
+ "learning_rate": 1.5704499166820963e-05,
1555
+ "loss": 0.8175,
1556
+ "step": 2420
1557
+ },
1558
+ {
1559
+ "epoch": 0.67,
1560
+ "learning_rate": 1.5685984077022774e-05,
1561
+ "loss": 0.8167,
1562
+ "step": 2430
1563
+ },
1564
+ {
1565
+ "epoch": 0.67,
1566
+ "learning_rate": 1.566746898722459e-05,
1567
+ "loss": 0.8215,
1568
+ "step": 2440
1569
+ },
1570
+ {
1571
+ "epoch": 0.67,
1572
+ "learning_rate": 1.5648953897426404e-05,
1573
+ "loss": 0.8129,
1574
+ "step": 2450
1575
+ },
1576
+ {
1577
+ "epoch": 0.68,
1578
+ "learning_rate": 1.5630438807628218e-05,
1579
+ "loss": 0.8164,
1580
+ "step": 2460
1581
+ },
1582
+ {
1583
+ "epoch": 0.68,
1584
+ "learning_rate": 1.5611923717830033e-05,
1585
+ "loss": 0.7983,
1586
+ "step": 2470
1587
+ },
1588
+ {
1589
+ "epoch": 0.68,
1590
+ "learning_rate": 1.5593408628031848e-05,
1591
+ "loss": 0.8108,
1592
+ "step": 2480
1593
+ },
1594
+ {
1595
+ "epoch": 0.69,
1596
+ "learning_rate": 1.5574893538233662e-05,
1597
+ "loss": 0.8064,
1598
+ "step": 2490
1599
+ },
1600
+ {
1601
+ "epoch": 0.69,
1602
+ "learning_rate": 1.5556378448435477e-05,
1603
+ "loss": 0.8165,
1604
+ "step": 2500
1605
+ },
1606
+ {
1607
+ "epoch": 0.69,
1608
+ "learning_rate": 1.5537863358637292e-05,
1609
+ "loss": 0.8008,
1610
+ "step": 2510
1611
+ },
1612
+ {
1613
+ "epoch": 0.69,
1614
+ "learning_rate": 1.5519348268839106e-05,
1615
+ "loss": 0.8078,
1616
+ "step": 2520
1617
+ },
1618
+ {
1619
+ "epoch": 0.7,
1620
+ "learning_rate": 1.550083317904092e-05,
1621
+ "loss": 0.7909,
1622
+ "step": 2530
1623
+ },
1624
+ {
1625
+ "epoch": 0.7,
1626
+ "learning_rate": 1.5482318089242732e-05,
1627
+ "loss": 0.8132,
1628
+ "step": 2540
1629
+ },
1630
+ {
1631
+ "epoch": 0.7,
1632
+ "learning_rate": 1.5463802999444547e-05,
1633
+ "loss": 0.8092,
1634
+ "step": 2550
1635
+ },
1636
+ {
1637
+ "epoch": 0.7,
1638
+ "learning_rate": 1.5445287909646362e-05,
1639
+ "loss": 0.823,
1640
+ "step": 2560
1641
+ },
1642
+ {
1643
+ "epoch": 0.71,
1644
+ "learning_rate": 1.5426772819848176e-05,
1645
+ "loss": 0.8063,
1646
+ "step": 2570
1647
+ },
1648
+ {
1649
+ "epoch": 0.71,
1650
+ "learning_rate": 1.540825773004999e-05,
1651
+ "loss": 0.8111,
1652
+ "step": 2580
1653
+ },
1654
+ {
1655
+ "epoch": 0.71,
1656
+ "learning_rate": 1.5389742640251806e-05,
1657
+ "loss": 0.8013,
1658
+ "step": 2590
1659
+ },
1660
+ {
1661
+ "epoch": 0.72,
1662
+ "learning_rate": 1.537122755045362e-05,
1663
+ "loss": 0.7979,
1664
+ "step": 2600
1665
+ },
1666
+ {
1667
+ "epoch": 0.72,
1668
+ "eval_loss": 0.8133894801139832,
1669
+ "eval_runtime": 98.8127,
1670
+ "eval_samples_per_second": 10.12,
1671
+ "eval_steps_per_second": 0.638,
1672
+ "step": 2600
1673
+ },
1674
+ {
1675
+ "epoch": 0.72,
1676
+ "learning_rate": 1.5352712460655435e-05,
1677
+ "loss": 0.8191,
1678
+ "step": 2610
1679
+ },
1680
+ {
1681
+ "epoch": 0.72,
1682
+ "learning_rate": 1.533419737085725e-05,
1683
+ "loss": 0.815,
1684
+ "step": 2620
1685
+ },
1686
+ {
1687
+ "epoch": 0.72,
1688
+ "learning_rate": 1.5315682281059065e-05,
1689
+ "loss": 0.8041,
1690
+ "step": 2630
1691
+ },
1692
+ {
1693
+ "epoch": 0.73,
1694
+ "learning_rate": 1.529716719126088e-05,
1695
+ "loss": 0.8123,
1696
+ "step": 2640
1697
+ },
1698
+ {
1699
+ "epoch": 0.73,
1700
+ "learning_rate": 1.5278652101462694e-05,
1701
+ "loss": 0.8097,
1702
+ "step": 2650
1703
+ },
1704
+ {
1705
+ "epoch": 0.73,
1706
+ "learning_rate": 1.526013701166451e-05,
1707
+ "loss": 0.8256,
1708
+ "step": 2660
1709
+ },
1710
+ {
1711
+ "epoch": 0.73,
1712
+ "learning_rate": 1.5241621921866323e-05,
1713
+ "loss": 0.8071,
1714
+ "step": 2670
1715
+ },
1716
+ {
1717
+ "epoch": 0.74,
1718
+ "learning_rate": 1.5223106832068138e-05,
1719
+ "loss": 0.7995,
1720
+ "step": 2680
1721
+ },
1722
+ {
1723
+ "epoch": 0.74,
1724
+ "learning_rate": 1.5204591742269951e-05,
1725
+ "loss": 0.8017,
1726
+ "step": 2690
1727
+ },
1728
+ {
1729
+ "epoch": 0.74,
1730
+ "learning_rate": 1.5186076652471766e-05,
1731
+ "loss": 0.8111,
1732
+ "step": 2700
1733
+ },
1734
+ {
1735
+ "epoch": 0.75,
1736
+ "learning_rate": 1.516756156267358e-05,
1737
+ "loss": 0.8154,
1738
+ "step": 2710
1739
+ },
1740
+ {
1741
+ "epoch": 0.75,
1742
+ "learning_rate": 1.5149046472875395e-05,
1743
+ "loss": 0.8056,
1744
+ "step": 2720
1745
+ },
1746
+ {
1747
+ "epoch": 0.75,
1748
+ "learning_rate": 1.513053138307721e-05,
1749
+ "loss": 0.7996,
1750
+ "step": 2730
1751
+ },
1752
+ {
1753
+ "epoch": 0.75,
1754
+ "learning_rate": 1.5112016293279025e-05,
1755
+ "loss": 0.8149,
1756
+ "step": 2740
1757
+ },
1758
+ {
1759
+ "epoch": 0.76,
1760
+ "learning_rate": 1.5093501203480838e-05,
1761
+ "loss": 0.8108,
1762
+ "step": 2750
1763
+ },
1764
+ {
1765
+ "epoch": 0.76,
1766
+ "learning_rate": 1.5074986113682652e-05,
1767
+ "loss": 0.8093,
1768
+ "step": 2760
1769
+ },
1770
+ {
1771
+ "epoch": 0.76,
1772
+ "learning_rate": 1.5056471023884467e-05,
1773
+ "loss": 0.8063,
1774
+ "step": 2770
1775
+ },
1776
+ {
1777
+ "epoch": 0.76,
1778
+ "learning_rate": 1.5037955934086282e-05,
1779
+ "loss": 0.8089,
1780
+ "step": 2780
1781
+ },
1782
+ {
1783
+ "epoch": 0.77,
1784
+ "learning_rate": 1.5019440844288096e-05,
1785
+ "loss": 0.8059,
1786
+ "step": 2790
1787
+ },
1788
+ {
1789
+ "epoch": 0.77,
1790
+ "learning_rate": 1.500092575448991e-05,
1791
+ "loss": 0.8136,
1792
+ "step": 2800
1793
+ },
1794
+ {
1795
+ "epoch": 0.77,
1796
+ "eval_loss": 0.8110187649726868,
1797
+ "eval_runtime": 98.9424,
1798
+ "eval_samples_per_second": 10.107,
1799
+ "eval_steps_per_second": 0.637,
1800
+ "step": 2800
1801
+ },
1802
+ {
1803
+ "epoch": 0.77,
1804
+ "learning_rate": 1.4982410664691724e-05,
1805
+ "loss": 0.8123,
1806
+ "step": 2810
1807
+ },
1808
+ {
1809
+ "epoch": 0.78,
1810
+ "learning_rate": 1.4963895574893539e-05,
1811
+ "loss": 0.8198,
1812
+ "step": 2820
1813
+ },
1814
+ {
1815
+ "epoch": 0.78,
1816
+ "learning_rate": 1.4945380485095353e-05,
1817
+ "loss": 0.8014,
1818
+ "step": 2830
1819
+ },
1820
+ {
1821
+ "epoch": 0.78,
1822
+ "learning_rate": 1.4926865395297168e-05,
1823
+ "loss": 0.8017,
1824
+ "step": 2840
1825
+ },
1826
+ {
1827
+ "epoch": 0.78,
1828
+ "learning_rate": 1.4908350305498983e-05,
1829
+ "loss": 0.8104,
1830
+ "step": 2850
1831
+ },
1832
+ {
1833
+ "epoch": 0.79,
1834
+ "learning_rate": 1.4889835215700796e-05,
1835
+ "loss": 0.7939,
1836
+ "step": 2860
1837
+ },
1838
+ {
1839
+ "epoch": 0.79,
1840
+ "learning_rate": 1.487132012590261e-05,
1841
+ "loss": 0.8072,
1842
+ "step": 2870
1843
+ },
1844
+ {
1845
+ "epoch": 0.79,
1846
+ "learning_rate": 1.4852805036104427e-05,
1847
+ "loss": 0.818,
1848
+ "step": 2880
1849
+ },
1850
+ {
1851
+ "epoch": 0.8,
1852
+ "learning_rate": 1.4834289946306242e-05,
1853
+ "loss": 0.8024,
1854
+ "step": 2890
1855
+ },
1856
+ {
1857
+ "epoch": 0.8,
1858
+ "learning_rate": 1.4815774856508056e-05,
1859
+ "loss": 0.8166,
1860
+ "step": 2900
1861
+ },
1862
+ {
1863
+ "epoch": 0.8,
1864
+ "learning_rate": 1.4797259766709871e-05,
1865
+ "loss": 0.8047,
1866
+ "step": 2910
1867
+ },
1868
+ {
1869
+ "epoch": 0.8,
1870
+ "learning_rate": 1.4778744676911686e-05,
1871
+ "loss": 0.8186,
1872
+ "step": 2920
1873
+ },
1874
+ {
1875
+ "epoch": 0.81,
1876
+ "learning_rate": 1.4760229587113499e-05,
1877
+ "loss": 0.8089,
1878
+ "step": 2930
1879
+ },
1880
+ {
1881
+ "epoch": 0.81,
1882
+ "learning_rate": 1.4741714497315313e-05,
1883
+ "loss": 0.8053,
1884
+ "step": 2940
1885
+ },
1886
+ {
1887
+ "epoch": 0.81,
1888
+ "learning_rate": 1.4723199407517128e-05,
1889
+ "loss": 0.8016,
1890
+ "step": 2950
1891
+ },
1892
+ {
1893
+ "epoch": 0.81,
1894
+ "learning_rate": 1.4704684317718943e-05,
1895
+ "loss": 0.8004,
1896
+ "step": 2960
1897
+ },
1898
+ {
1899
+ "epoch": 0.82,
1900
+ "learning_rate": 1.4686169227920757e-05,
1901
+ "loss": 0.815,
1902
+ "step": 2970
1903
+ },
1904
+ {
1905
+ "epoch": 0.82,
1906
+ "learning_rate": 1.466765413812257e-05,
1907
+ "loss": 0.8055,
1908
+ "step": 2980
1909
+ },
1910
+ {
1911
+ "epoch": 0.82,
1912
+ "learning_rate": 1.4649139048324385e-05,
1913
+ "loss": 0.8142,
1914
+ "step": 2990
1915
+ },
1916
+ {
1917
+ "epoch": 0.83,
1918
+ "learning_rate": 1.46306239585262e-05,
1919
+ "loss": 0.8149,
1920
+ "step": 3000
1921
+ },
1922
+ {
1923
+ "epoch": 0.83,
1924
+ "eval_loss": 0.809707760810852,
1925
+ "eval_runtime": 98.7818,
1926
+ "eval_samples_per_second": 10.123,
1927
+ "eval_steps_per_second": 0.638,
1928
+ "step": 3000
1929
+ },
1930
+ {
1931
+ "epoch": 0.83,
1932
+ "learning_rate": 1.4612108868728014e-05,
1933
+ "loss": 0.7967,
1934
+ "step": 3010
1935
+ },
1936
+ {
1937
+ "epoch": 0.83,
1938
+ "learning_rate": 1.4593593778929829e-05,
1939
+ "loss": 0.8142,
1940
+ "step": 3020
1941
+ },
1942
+ {
1943
+ "epoch": 0.83,
1944
+ "learning_rate": 1.4575078689131644e-05,
1945
+ "loss": 0.8023,
1946
+ "step": 3030
1947
+ },
1948
+ {
1949
+ "epoch": 0.84,
1950
+ "learning_rate": 1.4556563599333457e-05,
1951
+ "loss": 0.8143,
1952
+ "step": 3040
1953
+ },
1954
+ {
1955
+ "epoch": 0.84,
1956
+ "learning_rate": 1.4538048509535272e-05,
1957
+ "loss": 0.8165,
1958
+ "step": 3050
1959
+ },
1960
+ {
1961
+ "epoch": 0.84,
1962
+ "learning_rate": 1.4519533419737086e-05,
1963
+ "loss": 0.8052,
1964
+ "step": 3060
1965
+ },
1966
+ {
1967
+ "epoch": 0.84,
1968
+ "learning_rate": 1.4501018329938901e-05,
1969
+ "loss": 0.8136,
1970
+ "step": 3070
1971
+ },
1972
+ {
1973
+ "epoch": 0.85,
1974
+ "learning_rate": 1.4482503240140716e-05,
1975
+ "loss": 0.8128,
1976
+ "step": 3080
1977
+ },
1978
+ {
1979
+ "epoch": 0.85,
1980
+ "learning_rate": 1.4463988150342529e-05,
1981
+ "loss": 0.7964,
1982
+ "step": 3090
1983
+ },
1984
+ {
1985
+ "epoch": 0.85,
1986
+ "learning_rate": 1.4445473060544343e-05,
1987
+ "loss": 0.8157,
1988
+ "step": 3100
1989
+ },
1990
+ {
1991
+ "epoch": 0.86,
1992
+ "learning_rate": 1.4426957970746158e-05,
1993
+ "loss": 0.8031,
1994
+ "step": 3110
1995
+ },
1996
+ {
1997
+ "epoch": 0.86,
1998
+ "learning_rate": 1.4408442880947974e-05,
1999
+ "loss": 0.8041,
2000
+ "step": 3120
2001
+ },
2002
+ {
2003
+ "epoch": 0.86,
2004
+ "learning_rate": 1.4389927791149789e-05,
2005
+ "loss": 0.8112,
2006
+ "step": 3130
2007
+ },
2008
+ {
2009
+ "epoch": 0.86,
2010
+ "learning_rate": 1.4371412701351604e-05,
2011
+ "loss": 0.8067,
2012
+ "step": 3140
2013
+ },
2014
+ {
2015
+ "epoch": 0.87,
2016
+ "learning_rate": 1.4352897611553418e-05,
2017
+ "loss": 0.8163,
2018
+ "step": 3150
2019
+ },
2020
+ {
2021
+ "epoch": 0.87,
2022
+ "learning_rate": 1.4334382521755233e-05,
2023
+ "loss": 0.8051,
2024
+ "step": 3160
2025
+ },
2026
+ {
2027
+ "epoch": 0.87,
2028
+ "learning_rate": 1.4315867431957046e-05,
2029
+ "loss": 0.8179,
2030
+ "step": 3170
2031
+ },
2032
+ {
2033
+ "epoch": 0.87,
2034
+ "learning_rate": 1.429735234215886e-05,
2035
+ "loss": 0.7973,
2036
+ "step": 3180
2037
+ },
2038
+ {
2039
+ "epoch": 0.88,
2040
+ "learning_rate": 1.4278837252360676e-05,
2041
+ "loss": 0.811,
2042
+ "step": 3190
2043
+ },
2044
+ {
2045
+ "epoch": 0.88,
2046
+ "learning_rate": 1.426032216256249e-05,
2047
+ "loss": 0.7944,
2048
+ "step": 3200
2049
+ },
2050
+ {
2051
+ "epoch": 0.88,
2052
+ "eval_loss": 0.8083726763725281,
2053
+ "eval_runtime": 99.0291,
2054
+ "eval_samples_per_second": 10.098,
2055
+ "eval_steps_per_second": 0.636,
2056
+ "step": 3200
2057
+ },
2058
+ {
2059
+ "epoch": 0.88,
2060
+ "learning_rate": 1.4241807072764305e-05,
2061
+ "loss": 0.8138,
2062
+ "step": 3210
2063
+ },
2064
+ {
2065
+ "epoch": 0.89,
2066
+ "learning_rate": 1.4223291982966118e-05,
2067
+ "loss": 0.7919,
2068
+ "step": 3220
2069
+ },
2070
+ {
2071
+ "epoch": 0.89,
2072
+ "learning_rate": 1.4204776893167933e-05,
2073
+ "loss": 0.8061,
2074
+ "step": 3230
2075
+ },
2076
+ {
2077
+ "epoch": 0.89,
2078
+ "learning_rate": 1.4186261803369747e-05,
2079
+ "loss": 0.8014,
2080
+ "step": 3240
2081
+ },
2082
+ {
2083
+ "epoch": 0.89,
2084
+ "learning_rate": 1.4167746713571562e-05,
2085
+ "loss": 0.8299,
2086
+ "step": 3250
2087
+ },
2088
+ {
2089
+ "epoch": 0.9,
2090
+ "learning_rate": 1.4149231623773377e-05,
2091
+ "loss": 0.8038,
2092
+ "step": 3260
2093
+ },
2094
+ {
2095
+ "epoch": 0.9,
2096
+ "learning_rate": 1.4130716533975191e-05,
2097
+ "loss": 0.7951,
2098
+ "step": 3270
2099
+ },
2100
+ {
2101
+ "epoch": 0.9,
2102
+ "learning_rate": 1.4112201444177004e-05,
2103
+ "loss": 0.8043,
2104
+ "step": 3280
2105
+ },
2106
+ {
2107
+ "epoch": 0.91,
2108
+ "learning_rate": 1.4093686354378819e-05,
2109
+ "loss": 0.8158,
2110
+ "step": 3290
2111
+ },
2112
+ {
2113
+ "epoch": 0.91,
2114
+ "learning_rate": 1.4075171264580634e-05,
2115
+ "loss": 0.8041,
2116
+ "step": 3300
2117
+ },
2118
+ {
2119
+ "epoch": 0.91,
2120
+ "learning_rate": 1.4056656174782448e-05,
2121
+ "loss": 0.799,
2122
+ "step": 3310
2123
+ },
2124
+ {
2125
+ "epoch": 0.91,
2126
+ "learning_rate": 1.4038141084984263e-05,
2127
+ "loss": 0.8033,
2128
+ "step": 3320
2129
+ },
2130
+ {
2131
+ "epoch": 0.92,
2132
+ "learning_rate": 1.4019625995186076e-05,
2133
+ "loss": 0.8097,
2134
+ "step": 3330
2135
+ },
2136
+ {
2137
+ "epoch": 0.92,
2138
+ "learning_rate": 1.400111090538789e-05,
2139
+ "loss": 0.8047,
2140
+ "step": 3340
2141
+ },
2142
+ {
2143
+ "epoch": 0.92,
2144
+ "learning_rate": 1.3982595815589706e-05,
2145
+ "loss": 0.82,
2146
+ "step": 3350
2147
+ },
2148
+ {
2149
+ "epoch": 0.92,
2150
+ "learning_rate": 1.3964080725791522e-05,
2151
+ "loss": 0.8003,
2152
+ "step": 3360
2153
+ },
2154
+ {
2155
+ "epoch": 0.93,
2156
+ "learning_rate": 1.3945565635993337e-05,
2157
+ "loss": 0.7975,
2158
+ "step": 3370
2159
+ },
2160
+ {
2161
+ "epoch": 0.93,
2162
+ "learning_rate": 1.3927050546195151e-05,
2163
+ "loss": 0.8006,
2164
+ "step": 3380
2165
+ },
2166
+ {
2167
+ "epoch": 0.93,
2168
+ "learning_rate": 1.3908535456396966e-05,
2169
+ "loss": 0.7952,
2170
+ "step": 3390
2171
+ },
2172
+ {
2173
+ "epoch": 0.94,
2174
+ "learning_rate": 1.3890020366598779e-05,
2175
+ "loss": 0.8084,
2176
+ "step": 3400
2177
+ },
2178
+ {
2179
+ "epoch": 0.94,
2180
+ "eval_loss": 0.807253360748291,
2181
+ "eval_runtime": 98.8373,
2182
+ "eval_samples_per_second": 10.118,
2183
+ "eval_steps_per_second": 0.637,
2184
+ "step": 3400
2185
+ },
2186
+ {
2187
+ "epoch": 0.94,
2188
+ "learning_rate": 1.3871505276800594e-05,
2189
+ "loss": 0.8168,
2190
+ "step": 3410
2191
+ },
2192
+ {
2193
+ "epoch": 0.94,
2194
+ "learning_rate": 1.3852990187002408e-05,
2195
+ "loss": 0.8134,
2196
+ "step": 3420
2197
+ },
2198
+ {
2199
+ "epoch": 0.94,
2200
+ "learning_rate": 1.3834475097204223e-05,
2201
+ "loss": 0.7968,
2202
+ "step": 3430
2203
+ },
2204
+ {
2205
+ "epoch": 0.95,
2206
+ "learning_rate": 1.3815960007406038e-05,
2207
+ "loss": 0.8145,
2208
+ "step": 3440
2209
+ },
2210
+ {
2211
+ "epoch": 0.95,
2212
+ "learning_rate": 1.3797444917607852e-05,
2213
+ "loss": 0.7996,
2214
+ "step": 3450
2215
+ },
2216
+ {
2217
+ "epoch": 0.95,
2218
+ "learning_rate": 1.3778929827809665e-05,
2219
+ "loss": 0.8016,
2220
+ "step": 3460
2221
+ },
2222
+ {
2223
+ "epoch": 0.95,
2224
+ "learning_rate": 1.376041473801148e-05,
2225
+ "loss": 0.7863,
2226
+ "step": 3470
2227
+ },
2228
+ {
2229
+ "epoch": 0.96,
2230
+ "learning_rate": 1.3741899648213295e-05,
2231
+ "loss": 0.8193,
2232
+ "step": 3480
2233
+ },
2234
+ {
2235
+ "epoch": 0.96,
2236
+ "learning_rate": 1.372338455841511e-05,
2237
+ "loss": 0.8148,
2238
+ "step": 3490
2239
+ },
2240
+ {
2241
+ "epoch": 0.96,
2242
+ "learning_rate": 1.3704869468616924e-05,
2243
+ "loss": 0.8016,
2244
+ "step": 3500
2245
+ },
2246
+ {
2247
+ "epoch": 0.97,
2248
+ "learning_rate": 1.3686354378818739e-05,
2249
+ "loss": 0.8077,
2250
+ "step": 3510
2251
+ },
2252
+ {
2253
+ "epoch": 0.97,
2254
+ "learning_rate": 1.3667839289020552e-05,
2255
+ "loss": 0.806,
2256
+ "step": 3520
2257
+ },
2258
+ {
2259
+ "epoch": 0.97,
2260
+ "learning_rate": 1.3649324199222367e-05,
2261
+ "loss": 0.8108,
2262
+ "step": 3530
2263
+ },
2264
+ {
2265
+ "epoch": 0.97,
2266
+ "learning_rate": 1.3630809109424181e-05,
2267
+ "loss": 0.8101,
2268
+ "step": 3540
2269
+ },
2270
+ {
2271
+ "epoch": 0.98,
2272
+ "learning_rate": 1.3612294019625996e-05,
2273
+ "loss": 0.8051,
2274
+ "step": 3550
2275
+ },
2276
+ {
2277
+ "epoch": 0.98,
2278
+ "learning_rate": 1.359377892982781e-05,
2279
+ "loss": 0.8079,
2280
+ "step": 3560
2281
+ },
2282
+ {
2283
+ "epoch": 0.98,
2284
+ "learning_rate": 1.3575263840029624e-05,
2285
+ "loss": 0.8078,
2286
+ "step": 3570
2287
+ },
2288
+ {
2289
+ "epoch": 0.99,
2290
+ "learning_rate": 1.3556748750231438e-05,
2291
+ "loss": 0.7974,
2292
+ "step": 3580
2293
+ },
2294
+ {
2295
+ "epoch": 0.99,
2296
+ "learning_rate": 1.3538233660433253e-05,
2297
+ "loss": 0.8152,
2298
+ "step": 3590
2299
+ },
2300
+ {
2301
+ "epoch": 0.99,
2302
+ "learning_rate": 1.351971857063507e-05,
2303
+ "loss": 0.7966,
2304
+ "step": 3600
2305
+ },
2306
+ {
2307
+ "epoch": 0.99,
2308
+ "eval_loss": 0.8059168457984924,
2309
+ "eval_runtime": 98.7407,
2310
+ "eval_samples_per_second": 10.128,
2311
+ "eval_steps_per_second": 0.638,
2312
+ "step": 3600
2313
+ },
2314
+ {
2315
+ "epoch": 0.99,
2316
+ "learning_rate": 1.3501203480836884e-05,
2317
+ "loss": 0.8112,
2318
+ "step": 3610
2319
+ },
2320
+ {
2321
+ "epoch": 1.0,
2322
+ "learning_rate": 1.3482688391038699e-05,
2323
+ "loss": 0.8148,
2324
+ "step": 3620
2325
+ },
2326
+ {
2327
+ "epoch": 1.0,
2328
+ "learning_rate": 1.3464173301240514e-05,
2329
+ "loss": 0.8081,
2330
+ "step": 3630
2331
+ },
2332
+ {
2333
+ "epoch": 1.0,
2334
+ "learning_rate": 1.3445658211442327e-05,
2335
+ "loss": 0.8169,
2336
+ "step": 3640
2337
+ },
2338
+ {
2339
+ "epoch": 1.0,
2340
+ "learning_rate": 1.3427143121644141e-05,
2341
+ "loss": 0.8125,
2342
+ "step": 3650
2343
+ },
2344
+ {
2345
+ "epoch": 1.01,
2346
+ "learning_rate": 1.3408628031845956e-05,
2347
+ "loss": 0.81,
2348
+ "step": 3660
2349
+ },
2350
+ {
2351
+ "epoch": 1.01,
2352
+ "learning_rate": 1.339011294204777e-05,
2353
+ "loss": 0.7986,
2354
+ "step": 3670
2355
+ },
2356
+ {
2357
+ "epoch": 1.01,
2358
+ "learning_rate": 1.3371597852249585e-05,
2359
+ "loss": 0.8034,
2360
+ "step": 3680
2361
+ },
2362
+ {
2363
+ "epoch": 1.02,
2364
+ "learning_rate": 1.33530827624514e-05,
2365
+ "loss": 0.8135,
2366
+ "step": 3690
2367
+ },
2368
+ {
2369
+ "epoch": 1.02,
2370
+ "learning_rate": 1.3334567672653213e-05,
2371
+ "loss": 0.7991,
2372
+ "step": 3700
2373
+ },
2374
+ {
2375
+ "epoch": 1.02,
2376
+ "learning_rate": 1.3316052582855028e-05,
2377
+ "loss": 0.8021,
2378
+ "step": 3710
2379
+ },
2380
+ {
2381
+ "epoch": 1.02,
2382
+ "learning_rate": 1.3297537493056842e-05,
2383
+ "loss": 0.793,
2384
+ "step": 3720
2385
+ },
2386
+ {
2387
+ "epoch": 1.03,
2388
+ "learning_rate": 1.3279022403258657e-05,
2389
+ "loss": 0.7976,
2390
+ "step": 3730
2391
+ },
2392
+ {
2393
+ "epoch": 1.03,
2394
+ "learning_rate": 1.3260507313460472e-05,
2395
+ "loss": 0.7978,
2396
+ "step": 3740
2397
+ },
2398
+ {
2399
+ "epoch": 1.03,
2400
+ "learning_rate": 1.3241992223662285e-05,
2401
+ "loss": 0.8048,
2402
+ "step": 3750
2403
+ },
2404
+ {
2405
+ "epoch": 1.03,
2406
+ "learning_rate": 1.32234771338641e-05,
2407
+ "loss": 0.814,
2408
+ "step": 3760
2409
+ },
2410
+ {
2411
+ "epoch": 1.04,
2412
+ "learning_rate": 1.3204962044065914e-05,
2413
+ "loss": 0.8091,
2414
+ "step": 3770
2415
+ },
2416
+ {
2417
+ "epoch": 1.04,
2418
+ "learning_rate": 1.3186446954267729e-05,
2419
+ "loss": 0.8128,
2420
+ "step": 3780
2421
+ },
2422
+ {
2423
+ "epoch": 1.04,
2424
+ "learning_rate": 1.3167931864469544e-05,
2425
+ "loss": 0.8147,
2426
+ "step": 3790
2427
+ },
2428
+ {
2429
+ "epoch": 1.05,
2430
+ "learning_rate": 1.3149416774671358e-05,
2431
+ "loss": 0.8067,
2432
+ "step": 3800
2433
+ },
2434
+ {
2435
+ "epoch": 1.05,
2436
+ "eval_loss": 0.8047506809234619,
2437
+ "eval_runtime": 98.7957,
2438
+ "eval_samples_per_second": 10.122,
2439
+ "eval_steps_per_second": 0.638,
2440
+ "step": 3800
2441
+ },
2442
+ {
2443
+ "epoch": 1.05,
2444
+ "learning_rate": 1.3130901684873171e-05,
2445
+ "loss": 0.7979,
2446
+ "step": 3810
2447
+ },
2448
+ {
2449
+ "epoch": 1.05,
2450
+ "learning_rate": 1.3112386595074986e-05,
2451
+ "loss": 0.8154,
2452
+ "step": 3820
2453
+ },
2454
+ {
2455
+ "epoch": 1.05,
2456
+ "learning_rate": 1.30938715052768e-05,
2457
+ "loss": 0.8133,
2458
+ "step": 3830
2459
+ },
2460
+ {
2461
+ "epoch": 1.06,
2462
+ "learning_rate": 1.3075356415478617e-05,
2463
+ "loss": 0.7961,
2464
+ "step": 3840
2465
+ },
2466
+ {
2467
+ "epoch": 1.06,
2468
+ "learning_rate": 1.3056841325680432e-05,
2469
+ "loss": 0.8019,
2470
+ "step": 3850
2471
+ },
2472
+ {
2473
+ "epoch": 1.06,
2474
+ "learning_rate": 1.3038326235882246e-05,
2475
+ "loss": 0.7977,
2476
+ "step": 3860
2477
+ },
2478
+ {
2479
+ "epoch": 1.06,
2480
+ "learning_rate": 1.3019811146084061e-05,
2481
+ "loss": 0.7995,
2482
+ "step": 3870
2483
+ },
2484
+ {
2485
+ "epoch": 1.07,
2486
+ "learning_rate": 1.3001296056285874e-05,
2487
+ "loss": 0.7997,
2488
+ "step": 3880
2489
+ },
2490
+ {
2491
+ "epoch": 1.07,
2492
+ "learning_rate": 1.2982780966487689e-05,
2493
+ "loss": 0.8025,
2494
+ "step": 3890
2495
+ },
2496
+ {
2497
+ "epoch": 1.07,
2498
+ "learning_rate": 1.2964265876689503e-05,
2499
+ "loss": 0.7911,
2500
+ "step": 3900
2501
+ },
2502
+ {
2503
+ "epoch": 1.08,
2504
+ "learning_rate": 1.2945750786891318e-05,
2505
+ "loss": 0.8097,
2506
+ "step": 3910
2507
+ },
2508
+ {
2509
+ "epoch": 1.08,
2510
+ "learning_rate": 1.2927235697093133e-05,
2511
+ "loss": 0.7942,
2512
+ "step": 3920
2513
+ },
2514
+ {
2515
+ "epoch": 1.08,
2516
+ "learning_rate": 1.2908720607294948e-05,
2517
+ "loss": 0.7999,
2518
+ "step": 3930
2519
+ },
2520
+ {
2521
+ "epoch": 1.08,
2522
+ "learning_rate": 1.289020551749676e-05,
2523
+ "loss": 0.7935,
2524
+ "step": 3940
2525
+ },
2526
+ {
2527
+ "epoch": 1.09,
2528
+ "learning_rate": 1.2871690427698575e-05,
2529
+ "loss": 0.8069,
2530
+ "step": 3950
2531
+ },
2532
+ {
2533
+ "epoch": 1.09,
2534
+ "learning_rate": 1.285317533790039e-05,
2535
+ "loss": 0.8121,
2536
+ "step": 3960
2537
+ },
2538
+ {
2539
+ "epoch": 1.09,
2540
+ "learning_rate": 1.2834660248102205e-05,
2541
+ "loss": 0.8206,
2542
+ "step": 3970
2543
+ },
2544
+ {
2545
+ "epoch": 1.1,
2546
+ "learning_rate": 1.281614515830402e-05,
2547
+ "loss": 0.7966,
2548
+ "step": 3980
2549
+ },
2550
+ {
2551
+ "epoch": 1.1,
2552
+ "learning_rate": 1.2797630068505832e-05,
2553
+ "loss": 0.7969,
2554
+ "step": 3990
2555
+ },
2556
+ {
2557
+ "epoch": 1.1,
2558
+ "learning_rate": 1.2779114978707647e-05,
2559
+ "loss": 0.8021,
2560
+ "step": 4000
2561
+ },
2562
+ {
2563
+ "epoch": 1.1,
2564
+ "eval_loss": 0.8035591840744019,
2565
+ "eval_runtime": 98.7519,
2566
+ "eval_samples_per_second": 10.126,
2567
+ "eval_steps_per_second": 0.638,
2568
+ "step": 4000
2569
+ },
2570
+ {
2571
+ "epoch": 1.1,
2572
+ "learning_rate": 1.2760599888909462e-05,
2573
+ "loss": 0.8029,
2574
+ "step": 4010
2575
+ },
2576
+ {
2577
+ "epoch": 1.11,
2578
+ "learning_rate": 1.2742084799111276e-05,
2579
+ "loss": 0.8043,
2580
+ "step": 4020
2581
+ },
2582
+ {
2583
+ "epoch": 1.11,
2584
+ "learning_rate": 1.2723569709313091e-05,
2585
+ "loss": 0.8049,
2586
+ "step": 4030
2587
+ },
2588
+ {
2589
+ "epoch": 1.11,
2590
+ "learning_rate": 1.2705054619514906e-05,
2591
+ "loss": 0.8068,
2592
+ "step": 4040
2593
+ },
2594
+ {
2595
+ "epoch": 1.11,
2596
+ "learning_rate": 1.2686539529716719e-05,
2597
+ "loss": 0.7908,
2598
+ "step": 4050
2599
+ },
2600
+ {
2601
+ "epoch": 1.12,
2602
+ "learning_rate": 1.2668024439918533e-05,
2603
+ "loss": 0.7989,
2604
+ "step": 4060
2605
+ },
2606
+ {
2607
+ "epoch": 1.12,
2608
+ "learning_rate": 1.2649509350120348e-05,
2609
+ "loss": 0.792,
2610
+ "step": 4070
2611
+ },
2612
+ {
2613
+ "epoch": 1.12,
2614
+ "learning_rate": 1.2630994260322165e-05,
2615
+ "loss": 0.8028,
2616
+ "step": 4080
2617
+ },
2618
+ {
2619
+ "epoch": 1.13,
2620
+ "learning_rate": 1.261247917052398e-05,
2621
+ "loss": 0.7886,
2622
+ "step": 4090
2623
+ },
2624
+ {
2625
+ "epoch": 1.13,
2626
+ "learning_rate": 1.2593964080725794e-05,
2627
+ "loss": 0.807,
2628
+ "step": 4100
2629
+ },
2630
+ {
2631
+ "epoch": 1.13,
2632
+ "learning_rate": 1.2575448990927609e-05,
2633
+ "loss": 0.8166,
2634
+ "step": 4110
2635
+ },
2636
+ {
2637
+ "epoch": 1.13,
2638
+ "learning_rate": 1.2556933901129422e-05,
2639
+ "loss": 0.802,
2640
+ "step": 4120
2641
+ },
2642
+ {
2643
+ "epoch": 1.14,
2644
+ "learning_rate": 1.2538418811331236e-05,
2645
+ "loss": 0.8019,
2646
+ "step": 4130
2647
+ },
2648
+ {
2649
+ "epoch": 1.14,
2650
+ "learning_rate": 1.2519903721533051e-05,
2651
+ "loss": 0.8099,
2652
+ "step": 4140
2653
+ },
2654
+ {
2655
+ "epoch": 1.14,
2656
+ "learning_rate": 1.2501388631734866e-05,
2657
+ "loss": 0.8052,
2658
+ "step": 4150
2659
+ },
2660
+ {
2661
+ "epoch": 1.14,
2662
+ "learning_rate": 1.248287354193668e-05,
2663
+ "loss": 0.8015,
2664
+ "step": 4160
2665
+ },
2666
+ {
2667
+ "epoch": 1.15,
2668
+ "learning_rate": 1.2464358452138493e-05,
2669
+ "loss": 0.797,
2670
+ "step": 4170
2671
+ },
2672
+ {
2673
+ "epoch": 1.15,
2674
+ "learning_rate": 1.2445843362340308e-05,
2675
+ "loss": 0.7982,
2676
+ "step": 4180
2677
+ },
2678
+ {
2679
+ "epoch": 1.15,
2680
+ "learning_rate": 1.2427328272542123e-05,
2681
+ "loss": 0.7919,
2682
+ "step": 4190
2683
+ },
2684
+ {
2685
+ "epoch": 1.16,
2686
+ "learning_rate": 1.2408813182743937e-05,
2687
+ "loss": 0.7962,
2688
+ "step": 4200
2689
+ },
2690
+ {
2691
+ "epoch": 1.16,
2692
+ "eval_loss": 0.8028302192687988,
2693
+ "eval_runtime": 98.7487,
2694
+ "eval_samples_per_second": 10.127,
2695
+ "eval_steps_per_second": 0.638,
2696
+ "step": 4200
2697
+ },
2698
+ {
2699
+ "epoch": 1.16,
2700
+ "learning_rate": 1.2390298092945752e-05,
2701
+ "loss": 0.7912,
2702
+ "step": 4210
2703
+ },
2704
+ {
2705
+ "epoch": 1.16,
2706
+ "learning_rate": 1.2371783003147567e-05,
2707
+ "loss": 0.7894,
2708
+ "step": 4220
2709
+ },
2710
+ {
2711
+ "epoch": 1.16,
2712
+ "learning_rate": 1.235326791334938e-05,
2713
+ "loss": 0.7847,
2714
+ "step": 4230
2715
+ },
2716
+ {
2717
+ "epoch": 1.17,
2718
+ "learning_rate": 1.2334752823551195e-05,
2719
+ "loss": 0.8126,
2720
+ "step": 4240
2721
+ },
2722
+ {
2723
+ "epoch": 1.17,
2724
+ "learning_rate": 1.231623773375301e-05,
2725
+ "loss": 0.7908,
2726
+ "step": 4250
2727
+ },
2728
+ {
2729
+ "epoch": 1.17,
2730
+ "learning_rate": 1.2297722643954824e-05,
2731
+ "loss": 0.8084,
2732
+ "step": 4260
2733
+ },
2734
+ {
2735
+ "epoch": 1.17,
2736
+ "learning_rate": 1.2279207554156639e-05,
2737
+ "loss": 0.7963,
2738
+ "step": 4270
2739
+ },
2740
+ {
2741
+ "epoch": 1.18,
2742
+ "learning_rate": 1.2260692464358452e-05,
2743
+ "loss": 0.8031,
2744
+ "step": 4280
2745
+ },
2746
+ {
2747
+ "epoch": 1.18,
2748
+ "learning_rate": 1.2242177374560266e-05,
2749
+ "loss": 0.7843,
2750
+ "step": 4290
2751
+ },
2752
+ {
2753
+ "epoch": 1.18,
2754
+ "learning_rate": 1.2223662284762081e-05,
2755
+ "loss": 0.8006,
2756
+ "step": 4300
2757
+ },
2758
+ {
2759
+ "epoch": 1.19,
2760
+ "learning_rate": 1.2205147194963896e-05,
2761
+ "loss": 0.8026,
2762
+ "step": 4310
2763
+ },
2764
+ {
2765
+ "epoch": 1.19,
2766
+ "learning_rate": 1.2186632105165712e-05,
2767
+ "loss": 0.7994,
2768
+ "step": 4320
2769
+ },
2770
+ {
2771
+ "epoch": 1.19,
2772
+ "learning_rate": 1.2168117015367527e-05,
2773
+ "loss": 0.8066,
2774
+ "step": 4330
2775
+ },
2776
+ {
2777
+ "epoch": 1.19,
2778
+ "learning_rate": 1.2149601925569341e-05,
2779
+ "loss": 0.8063,
2780
+ "step": 4340
2781
+ },
2782
+ {
2783
+ "epoch": 1.2,
2784
+ "learning_rate": 1.2131086835771156e-05,
2785
+ "loss": 0.8131,
2786
+ "step": 4350
2787
+ },
2788
+ {
2789
+ "epoch": 1.2,
2790
+ "learning_rate": 1.2112571745972969e-05,
2791
+ "loss": 0.7985,
2792
+ "step": 4360
2793
+ },
2794
+ {
2795
+ "epoch": 1.2,
2796
+ "learning_rate": 1.2094056656174784e-05,
2797
+ "loss": 0.8126,
2798
+ "step": 4370
2799
+ },
2800
+ {
2801
+ "epoch": 1.21,
2802
+ "learning_rate": 1.2075541566376599e-05,
2803
+ "loss": 0.8088,
2804
+ "step": 4380
2805
+ },
2806
+ {
2807
+ "epoch": 1.21,
2808
+ "learning_rate": 1.2057026476578413e-05,
2809
+ "loss": 0.7983,
2810
+ "step": 4390
2811
+ },
2812
+ {
2813
+ "epoch": 1.21,
2814
+ "learning_rate": 1.2038511386780228e-05,
2815
+ "loss": 0.8059,
2816
+ "step": 4400
2817
+ },
2818
+ {
2819
+ "epoch": 1.21,
2820
+ "eval_loss": 0.8016332983970642,
2821
+ "eval_runtime": 98.7551,
2822
+ "eval_samples_per_second": 10.126,
2823
+ "eval_steps_per_second": 0.638,
2824
+ "step": 4400
2825
+ },
2826
+ {
2827
+ "epoch": 1.21,
2828
+ "learning_rate": 1.2019996296982041e-05,
2829
+ "loss": 0.7998,
2830
+ "step": 4410
2831
+ },
2832
+ {
2833
+ "epoch": 1.22,
2834
+ "learning_rate": 1.2001481207183856e-05,
2835
+ "loss": 0.8014,
2836
+ "step": 4420
2837
+ },
2838
+ {
2839
+ "epoch": 1.22,
2840
+ "learning_rate": 1.198296611738567e-05,
2841
+ "loss": 0.7918,
2842
+ "step": 4430
2843
+ },
2844
+ {
2845
+ "epoch": 1.22,
2846
+ "learning_rate": 1.1964451027587485e-05,
2847
+ "loss": 0.8018,
2848
+ "step": 4440
2849
+ },
2850
+ {
2851
+ "epoch": 1.22,
2852
+ "learning_rate": 1.19459359377893e-05,
2853
+ "loss": 0.7924,
2854
+ "step": 4450
2855
+ },
2856
+ {
2857
+ "epoch": 1.23,
2858
+ "learning_rate": 1.1927420847991114e-05,
2859
+ "loss": 0.8068,
2860
+ "step": 4460
2861
+ },
2862
+ {
2863
+ "epoch": 1.23,
2864
+ "learning_rate": 1.1908905758192927e-05,
2865
+ "loss": 0.7883,
2866
+ "step": 4470
2867
+ },
2868
+ {
2869
+ "epoch": 1.23,
2870
+ "learning_rate": 1.1890390668394742e-05,
2871
+ "loss": 0.7981,
2872
+ "step": 4480
2873
+ },
2874
+ {
2875
+ "epoch": 1.24,
2876
+ "learning_rate": 1.1871875578596557e-05,
2877
+ "loss": 0.807,
2878
+ "step": 4490
2879
+ },
2880
+ {
2881
+ "epoch": 1.24,
2882
+ "learning_rate": 1.1853360488798371e-05,
2883
+ "loss": 0.7953,
2884
+ "step": 4500
2885
+ },
2886
+ {
2887
+ "epoch": 1.24,
2888
+ "learning_rate": 1.1834845399000186e-05,
2889
+ "loss": 0.7949,
2890
+ "step": 4510
2891
+ },
2892
+ {
2893
+ "epoch": 1.24,
2894
+ "learning_rate": 1.1816330309201999e-05,
2895
+ "loss": 0.8015,
2896
+ "step": 4520
2897
+ },
2898
+ {
2899
+ "epoch": 1.25,
2900
+ "learning_rate": 1.1797815219403814e-05,
2901
+ "loss": 0.7967,
2902
+ "step": 4530
2903
+ },
2904
+ {
2905
+ "epoch": 1.25,
2906
+ "learning_rate": 1.1779300129605629e-05,
2907
+ "loss": 0.782,
2908
+ "step": 4540
2909
+ },
2910
+ {
2911
+ "epoch": 1.25,
2912
+ "learning_rate": 1.1760785039807443e-05,
2913
+ "loss": 0.8061,
2914
+ "step": 4550
2915
+ },
2916
+ {
2917
+ "epoch": 1.25,
2918
+ "learning_rate": 1.1742269950009258e-05,
2919
+ "loss": 0.7991,
2920
+ "step": 4560
2921
+ },
2922
+ {
2923
+ "epoch": 1.26,
2924
+ "learning_rate": 1.1723754860211074e-05,
2925
+ "loss": 0.8064,
2926
+ "step": 4570
2927
+ },
2928
+ {
2929
+ "epoch": 1.26,
2930
+ "learning_rate": 1.1705239770412889e-05,
2931
+ "loss": 0.8071,
2932
+ "step": 4580
2933
+ },
2934
+ {
2935
+ "epoch": 1.26,
2936
+ "learning_rate": 1.1686724680614702e-05,
2937
+ "loss": 0.8122,
2938
+ "step": 4590
2939
+ },
2940
+ {
2941
+ "epoch": 1.27,
2942
+ "learning_rate": 1.1668209590816517e-05,
2943
+ "loss": 0.8063,
2944
+ "step": 4600
2945
+ },
2946
+ {
2947
+ "epoch": 1.27,
2948
+ "eval_loss": 0.8010302186012268,
2949
+ "eval_runtime": 98.6771,
2950
+ "eval_samples_per_second": 10.134,
2951
+ "eval_steps_per_second": 0.638,
2952
+ "step": 4600
2953
+ },
2954
+ {
2955
+ "epoch": 1.27,
2956
+ "learning_rate": 1.1649694501018331e-05,
2957
+ "loss": 0.8053,
2958
+ "step": 4610
2959
+ },
2960
+ {
2961
+ "epoch": 1.27,
2962
+ "learning_rate": 1.1631179411220146e-05,
2963
+ "loss": 0.8058,
2964
+ "step": 4620
2965
+ },
2966
+ {
2967
+ "epoch": 1.27,
2968
+ "learning_rate": 1.161266432142196e-05,
2969
+ "loss": 0.7906,
2970
+ "step": 4630
2971
+ },
2972
+ {
2973
+ "epoch": 1.28,
2974
+ "learning_rate": 1.1594149231623775e-05,
2975
+ "loss": 0.7955,
2976
+ "step": 4640
2977
+ },
2978
+ {
2979
+ "epoch": 1.28,
2980
+ "learning_rate": 1.1575634141825588e-05,
2981
+ "loss": 0.7967,
2982
+ "step": 4650
2983
+ },
2984
+ {
2985
+ "epoch": 1.28,
2986
+ "learning_rate": 1.1557119052027403e-05,
2987
+ "loss": 0.8047,
2988
+ "step": 4660
2989
+ },
2990
+ {
2991
+ "epoch": 1.28,
2992
+ "learning_rate": 1.1538603962229218e-05,
2993
+ "loss": 0.7835,
2994
+ "step": 4670
2995
+ },
2996
+ {
2997
+ "epoch": 1.29,
2998
+ "learning_rate": 1.1520088872431033e-05,
2999
+ "loss": 0.8087,
3000
+ "step": 4680
3001
+ },
3002
+ {
3003
+ "epoch": 1.29,
3004
+ "learning_rate": 1.1501573782632847e-05,
3005
+ "loss": 0.787,
3006
+ "step": 4690
3007
+ },
3008
+ {
3009
+ "epoch": 1.29,
3010
+ "learning_rate": 1.148305869283466e-05,
3011
+ "loss": 0.804,
3012
+ "step": 4700
3013
+ },
3014
+ {
3015
+ "epoch": 1.3,
3016
+ "learning_rate": 1.1464543603036475e-05,
3017
+ "loss": 0.8023,
3018
+ "step": 4710
3019
+ },
3020
+ {
3021
+ "epoch": 1.3,
3022
+ "learning_rate": 1.144602851323829e-05,
3023
+ "loss": 0.8001,
3024
+ "step": 4720
3025
+ },
3026
+ {
3027
+ "epoch": 1.3,
3028
+ "learning_rate": 1.1427513423440104e-05,
3029
+ "loss": 0.8026,
3030
+ "step": 4730
3031
+ },
3032
+ {
3033
+ "epoch": 1.3,
3034
+ "learning_rate": 1.1408998333641919e-05,
3035
+ "loss": 0.7916,
3036
+ "step": 4740
3037
+ },
3038
+ {
3039
+ "epoch": 1.31,
3040
+ "learning_rate": 1.1390483243843734e-05,
3041
+ "loss": 0.7939,
3042
+ "step": 4750
3043
+ },
3044
+ {
3045
+ "epoch": 1.31,
3046
+ "learning_rate": 1.1371968154045547e-05,
3047
+ "loss": 0.8032,
3048
+ "step": 4760
3049
+ },
3050
+ {
3051
+ "epoch": 1.31,
3052
+ "learning_rate": 1.1353453064247361e-05,
3053
+ "loss": 0.8013,
3054
+ "step": 4770
3055
+ },
3056
+ {
3057
+ "epoch": 1.32,
3058
+ "learning_rate": 1.1334937974449176e-05,
3059
+ "loss": 0.8026,
3060
+ "step": 4780
3061
+ },
3062
+ {
3063
+ "epoch": 1.32,
3064
+ "learning_rate": 1.131642288465099e-05,
3065
+ "loss": 0.793,
3066
+ "step": 4790
3067
+ },
3068
+ {
3069
+ "epoch": 1.32,
3070
+ "learning_rate": 1.1297907794852805e-05,
3071
+ "loss": 0.7993,
3072
+ "step": 4800
3073
+ },
3074
+ {
3075
+ "epoch": 1.32,
3076
+ "eval_loss": 0.8001218438148499,
3077
+ "eval_runtime": 98.8463,
3078
+ "eval_samples_per_second": 10.117,
3079
+ "eval_steps_per_second": 0.637,
3080
+ "step": 4800
3081
+ },
3082
+ {
3083
+ "epoch": 1.32,
3084
+ "learning_rate": 1.1279392705054622e-05,
3085
+ "loss": 0.8054,
3086
+ "step": 4810
3087
+ },
3088
+ {
3089
+ "epoch": 1.33,
3090
+ "learning_rate": 1.1260877615256437e-05,
3091
+ "loss": 0.792,
3092
+ "step": 4820
3093
+ },
3094
+ {
3095
+ "epoch": 1.33,
3096
+ "learning_rate": 1.124236252545825e-05,
3097
+ "loss": 0.7997,
3098
+ "step": 4830
3099
+ },
3100
+ {
3101
+ "epoch": 1.33,
3102
+ "learning_rate": 1.1223847435660064e-05,
3103
+ "loss": 0.7973,
3104
+ "step": 4840
3105
+ },
3106
+ {
3107
+ "epoch": 1.33,
3108
+ "learning_rate": 1.1205332345861879e-05,
3109
+ "loss": 0.7994,
3110
+ "step": 4850
3111
+ },
3112
+ {
3113
+ "epoch": 1.34,
3114
+ "learning_rate": 1.1186817256063694e-05,
3115
+ "loss": 0.7923,
3116
+ "step": 4860
3117
+ },
3118
+ {
3119
+ "epoch": 1.34,
3120
+ "learning_rate": 1.1168302166265508e-05,
3121
+ "loss": 0.7959,
3122
+ "step": 4870
3123
+ },
3124
+ {
3125
+ "epoch": 1.34,
3126
+ "learning_rate": 1.1149787076467323e-05,
3127
+ "loss": 0.794,
3128
+ "step": 4880
3129
+ },
3130
+ {
3131
+ "epoch": 1.35,
3132
+ "learning_rate": 1.1131271986669136e-05,
3133
+ "loss": 0.796,
3134
+ "step": 4890
3135
+ },
3136
+ {
3137
+ "epoch": 1.35,
3138
+ "learning_rate": 1.111275689687095e-05,
3139
+ "loss": 0.8023,
3140
+ "step": 4900
3141
+ },
3142
+ {
3143
+ "epoch": 1.35,
3144
+ "learning_rate": 1.1094241807072765e-05,
3145
+ "loss": 0.7972,
3146
+ "step": 4910
3147
+ },
3148
+ {
3149
+ "epoch": 1.35,
3150
+ "learning_rate": 1.107572671727458e-05,
3151
+ "loss": 0.811,
3152
+ "step": 4920
3153
+ },
3154
+ {
3155
+ "epoch": 1.36,
3156
+ "learning_rate": 1.1057211627476395e-05,
3157
+ "loss": 0.7931,
3158
+ "step": 4930
3159
+ },
3160
+ {
3161
+ "epoch": 1.36,
3162
+ "learning_rate": 1.1038696537678208e-05,
3163
+ "loss": 0.7933,
3164
+ "step": 4940
3165
+ },
3166
+ {
3167
+ "epoch": 1.36,
3168
+ "learning_rate": 1.1020181447880022e-05,
3169
+ "loss": 0.7961,
3170
+ "step": 4950
3171
+ },
3172
+ {
3173
+ "epoch": 1.36,
3174
+ "learning_rate": 1.1001666358081837e-05,
3175
+ "loss": 0.8124,
3176
+ "step": 4960
3177
+ },
3178
+ {
3179
+ "epoch": 1.37,
3180
+ "learning_rate": 1.0983151268283652e-05,
3181
+ "loss": 0.7983,
3182
+ "step": 4970
3183
+ },
3184
+ {
3185
+ "epoch": 1.37,
3186
+ "learning_rate": 1.0964636178485467e-05,
3187
+ "loss": 0.7932,
3188
+ "step": 4980
3189
+ },
3190
+ {
3191
+ "epoch": 1.37,
3192
+ "learning_rate": 1.0946121088687281e-05,
3193
+ "loss": 0.8005,
3194
+ "step": 4990
3195
+ },
3196
+ {
3197
+ "epoch": 1.38,
3198
+ "learning_rate": 1.0927605998889094e-05,
3199
+ "loss": 0.7956,
3200
+ "step": 5000
3201
+ },
3202
+ {
3203
+ "epoch": 1.38,
3204
+ "eval_loss": 0.7999187111854553,
3205
+ "eval_runtime": 98.8229,
3206
+ "eval_samples_per_second": 10.119,
3207
+ "eval_steps_per_second": 0.638,
3208
+ "step": 5000
3209
+ },
3210
+ {
3211
+ "epoch": 1.38,
3212
+ "learning_rate": 1.0909090909090909e-05,
3213
+ "loss": 0.795,
3214
+ "step": 5010
3215
+ },
3216
+ {
3217
+ "epoch": 1.38,
3218
+ "learning_rate": 1.0890575819292724e-05,
3219
+ "loss": 0.7998,
3220
+ "step": 5020
3221
+ },
3222
+ {
3223
+ "epoch": 1.38,
3224
+ "learning_rate": 1.0872060729494538e-05,
3225
+ "loss": 0.8049,
3226
+ "step": 5030
3227
+ },
3228
+ {
3229
+ "epoch": 1.39,
3230
+ "learning_rate": 1.0853545639696353e-05,
3231
+ "loss": 0.7942,
3232
+ "step": 5040
3233
+ },
3234
+ {
3235
+ "epoch": 1.39,
3236
+ "learning_rate": 1.083503054989817e-05,
3237
+ "loss": 0.7899,
3238
+ "step": 5050
3239
+ },
3240
+ {
3241
+ "epoch": 1.39,
3242
+ "learning_rate": 1.0816515460099984e-05,
3243
+ "loss": 0.806,
3244
+ "step": 5060
3245
+ },
3246
+ {
3247
+ "epoch": 1.39,
3248
+ "learning_rate": 1.0798000370301797e-05,
3249
+ "loss": 0.8026,
3250
+ "step": 5070
3251
+ },
3252
+ {
3253
+ "epoch": 1.4,
3254
+ "learning_rate": 1.0779485280503612e-05,
3255
+ "loss": 0.7914,
3256
+ "step": 5080
3257
+ },
3258
+ {
3259
+ "epoch": 1.4,
3260
+ "learning_rate": 1.0760970190705426e-05,
3261
+ "loss": 0.7993,
3262
+ "step": 5090
3263
+ },
3264
+ {
3265
+ "epoch": 1.4,
3266
+ "learning_rate": 1.0742455100907241e-05,
3267
+ "loss": 0.8064,
3268
+ "step": 5100
3269
+ },
3270
+ {
3271
+ "epoch": 1.41,
3272
+ "learning_rate": 1.0723940011109056e-05,
3273
+ "loss": 0.7989,
3274
+ "step": 5110
3275
+ },
3276
+ {
3277
+ "epoch": 1.41,
3278
+ "learning_rate": 1.0705424921310869e-05,
3279
+ "loss": 0.8054,
3280
+ "step": 5120
3281
+ },
3282
+ {
3283
+ "epoch": 1.41,
3284
+ "learning_rate": 1.0686909831512684e-05,
3285
+ "loss": 0.8007,
3286
+ "step": 5130
3287
+ },
3288
+ {
3289
+ "epoch": 1.41,
3290
+ "learning_rate": 1.0668394741714498e-05,
3291
+ "loss": 0.7882,
3292
+ "step": 5140
3293
+ },
3294
+ {
3295
+ "epoch": 1.42,
3296
+ "learning_rate": 1.0649879651916313e-05,
3297
+ "loss": 0.8137,
3298
+ "step": 5150
3299
+ },
3300
+ {
3301
+ "epoch": 1.42,
3302
+ "learning_rate": 1.0631364562118128e-05,
3303
+ "loss": 0.8,
3304
+ "step": 5160
3305
+ },
3306
+ {
3307
+ "epoch": 1.42,
3308
+ "learning_rate": 1.0612849472319942e-05,
3309
+ "loss": 0.7873,
3310
+ "step": 5170
3311
+ },
3312
+ {
3313
+ "epoch": 1.43,
3314
+ "learning_rate": 1.0594334382521755e-05,
3315
+ "loss": 0.8002,
3316
+ "step": 5180
3317
+ },
3318
+ {
3319
+ "epoch": 1.43,
3320
+ "learning_rate": 1.057581929272357e-05,
3321
+ "loss": 0.8025,
3322
+ "step": 5190
3323
+ },
3324
+ {
3325
+ "epoch": 1.43,
3326
+ "learning_rate": 1.0557304202925385e-05,
3327
+ "loss": 0.8128,
3328
+ "step": 5200
3329
+ },
3330
+ {
3331
+ "epoch": 1.43,
3332
+ "eval_loss": 0.7986959218978882,
3333
+ "eval_runtime": 98.7666,
3334
+ "eval_samples_per_second": 10.125,
3335
+ "eval_steps_per_second": 0.638,
3336
+ "step": 5200
3337
+ },
3338
+ {
3339
+ "epoch": 1.43,
3340
+ "learning_rate": 1.05387891131272e-05,
3341
+ "loss": 0.7914,
3342
+ "step": 5210
3343
+ },
3344
+ {
3345
+ "epoch": 1.44,
3346
+ "learning_rate": 1.0520274023329014e-05,
3347
+ "loss": 0.8034,
3348
+ "step": 5220
3349
+ },
3350
+ {
3351
+ "epoch": 1.44,
3352
+ "learning_rate": 1.0501758933530827e-05,
3353
+ "loss": 0.7978,
3354
+ "step": 5230
3355
+ },
3356
+ {
3357
+ "epoch": 1.44,
3358
+ "learning_rate": 1.0483243843732642e-05,
3359
+ "loss": 0.798,
3360
+ "step": 5240
3361
+ },
3362
+ {
3363
+ "epoch": 1.44,
3364
+ "learning_rate": 1.0464728753934456e-05,
3365
+ "loss": 0.7996,
3366
+ "step": 5250
3367
+ },
3368
+ {
3369
+ "epoch": 1.45,
3370
+ "learning_rate": 1.0446213664136271e-05,
3371
+ "loss": 0.7986,
3372
+ "step": 5260
3373
+ },
3374
+ {
3375
+ "epoch": 1.45,
3376
+ "learning_rate": 1.0427698574338086e-05,
3377
+ "loss": 0.8057,
3378
+ "step": 5270
3379
+ },
3380
+ {
3381
+ "epoch": 1.45,
3382
+ "learning_rate": 1.04091834845399e-05,
3383
+ "loss": 0.7974,
3384
+ "step": 5280
3385
+ },
3386
+ {
3387
+ "epoch": 1.46,
3388
+ "learning_rate": 1.0390668394741717e-05,
3389
+ "loss": 0.8033,
3390
+ "step": 5290
3391
+ },
3392
+ {
3393
+ "epoch": 1.46,
3394
+ "learning_rate": 1.0372153304943532e-05,
3395
+ "loss": 0.8002,
3396
+ "step": 5300
3397
+ },
3398
+ {
3399
+ "epoch": 1.46,
3400
+ "learning_rate": 1.0353638215145345e-05,
3401
+ "loss": 0.8013,
3402
+ "step": 5310
3403
+ },
3404
+ {
3405
+ "epoch": 1.46,
3406
+ "learning_rate": 1.033512312534716e-05,
3407
+ "loss": 0.7841,
3408
+ "step": 5320
3409
+ },
3410
+ {
3411
+ "epoch": 1.47,
3412
+ "learning_rate": 1.0316608035548974e-05,
3413
+ "loss": 0.7902,
3414
+ "step": 5330
3415
+ },
3416
+ {
3417
+ "epoch": 1.47,
3418
+ "learning_rate": 1.0298092945750789e-05,
3419
+ "loss": 0.8043,
3420
+ "step": 5340
3421
+ },
3422
+ {
3423
+ "epoch": 1.47,
3424
+ "learning_rate": 1.0279577855952603e-05,
3425
+ "loss": 0.7913,
3426
+ "step": 5350
3427
+ },
3428
+ {
3429
+ "epoch": 1.47,
3430
+ "learning_rate": 1.0261062766154416e-05,
3431
+ "loss": 0.792,
3432
+ "step": 5360
3433
+ },
3434
+ {
3435
+ "epoch": 1.48,
3436
+ "learning_rate": 1.0242547676356231e-05,
3437
+ "loss": 0.7938,
3438
+ "step": 5370
3439
+ },
3440
+ {
3441
+ "epoch": 1.48,
3442
+ "learning_rate": 1.0224032586558046e-05,
3443
+ "loss": 0.8014,
3444
+ "step": 5380
3445
+ },
3446
+ {
3447
+ "epoch": 1.48,
3448
+ "learning_rate": 1.020551749675986e-05,
3449
+ "loss": 0.7939,
3450
+ "step": 5390
3451
+ },
3452
+ {
3453
+ "epoch": 1.49,
3454
+ "learning_rate": 1.0187002406961675e-05,
3455
+ "loss": 0.793,
3456
+ "step": 5400
3457
+ },
3458
+ {
3459
+ "epoch": 1.49,
3460
+ "eval_loss": 0.7980086207389832,
3461
+ "eval_runtime": 98.7281,
3462
+ "eval_samples_per_second": 10.129,
3463
+ "eval_steps_per_second": 0.638,
3464
+ "step": 5400
3465
+ },
3466
+ {
3467
+ "epoch": 1.49,
3468
+ "learning_rate": 1.016848731716349e-05,
3469
+ "loss": 0.7972,
3470
+ "step": 5410
3471
+ },
3472
+ {
3473
+ "epoch": 1.49,
3474
+ "learning_rate": 1.0149972227365303e-05,
3475
+ "loss": 0.7937,
3476
+ "step": 5420
3477
+ },
3478
+ {
3479
+ "epoch": 1.49,
3480
+ "learning_rate": 1.0131457137567118e-05,
3481
+ "loss": 0.7918,
3482
+ "step": 5430
3483
+ },
3484
+ {
3485
+ "epoch": 1.5,
3486
+ "learning_rate": 1.0112942047768932e-05,
3487
+ "loss": 0.816,
3488
+ "step": 5440
3489
+ },
3490
+ {
3491
+ "epoch": 1.5,
3492
+ "learning_rate": 1.0094426957970747e-05,
3493
+ "loss": 0.7963,
3494
+ "step": 5450
3495
+ },
3496
+ {
3497
+ "epoch": 1.5,
3498
+ "learning_rate": 1.0075911868172562e-05,
3499
+ "loss": 0.8012,
3500
+ "step": 5460
3501
+ },
3502
+ {
3503
+ "epoch": 1.51,
3504
+ "learning_rate": 1.0057396778374375e-05,
3505
+ "loss": 0.801,
3506
+ "step": 5470
3507
+ },
3508
+ {
3509
+ "epoch": 1.51,
3510
+ "learning_rate": 1.003888168857619e-05,
3511
+ "loss": 0.7947,
3512
+ "step": 5480
3513
+ },
3514
+ {
3515
+ "epoch": 1.51,
3516
+ "learning_rate": 1.0020366598778004e-05,
3517
+ "loss": 0.8022,
3518
+ "step": 5490
3519
+ },
3520
+ {
3521
+ "epoch": 1.51,
3522
+ "learning_rate": 1.0001851508979819e-05,
3523
+ "loss": 0.7928,
3524
+ "step": 5500
3525
+ },
3526
+ {
3527
+ "epoch": 1.52,
3528
+ "learning_rate": 9.983336419181633e-06,
3529
+ "loss": 0.7971,
3530
+ "step": 5510
3531
+ },
3532
+ {
3533
+ "epoch": 1.52,
3534
+ "learning_rate": 9.964821329383448e-06,
3535
+ "loss": 0.8007,
3536
+ "step": 5520
3537
+ },
3538
+ {
3539
+ "epoch": 1.52,
3540
+ "learning_rate": 9.946306239585263e-06,
3541
+ "loss": 0.796,
3542
+ "step": 5530
3543
+ },
3544
+ {
3545
+ "epoch": 1.52,
3546
+ "learning_rate": 9.927791149787077e-06,
3547
+ "loss": 0.7948,
3548
+ "step": 5540
3549
+ },
3550
+ {
3551
+ "epoch": 1.53,
3552
+ "learning_rate": 9.909276059988892e-06,
3553
+ "loss": 0.7928,
3554
+ "step": 5550
3555
+ },
3556
+ {
3557
+ "epoch": 1.53,
3558
+ "learning_rate": 9.890760970190705e-06,
3559
+ "loss": 0.7943,
3560
+ "step": 5560
3561
+ },
3562
+ {
3563
+ "epoch": 1.53,
3564
+ "learning_rate": 9.87224588039252e-06,
3565
+ "loss": 0.7896,
3566
+ "step": 5570
3567
+ },
3568
+ {
3569
+ "epoch": 1.54,
3570
+ "learning_rate": 9.853730790594335e-06,
3571
+ "loss": 0.7975,
3572
+ "step": 5580
3573
+ },
3574
+ {
3575
+ "epoch": 1.54,
3576
+ "learning_rate": 9.835215700796151e-06,
3577
+ "loss": 0.7944,
3578
+ "step": 5590
3579
+ },
3580
+ {
3581
+ "epoch": 1.54,
3582
+ "learning_rate": 9.816700610997964e-06,
3583
+ "loss": 0.7968,
3584
+ "step": 5600
3585
+ },
3586
+ {
3587
+ "epoch": 1.54,
3588
+ "eval_loss": 0.7978225350379944,
3589
+ "eval_runtime": 98.8162,
3590
+ "eval_samples_per_second": 10.12,
3591
+ "eval_steps_per_second": 0.638,
3592
+ "step": 5600
3593
+ },
3594
+ {
3595
+ "epoch": 1.54,
3596
+ "learning_rate": 9.798185521199779e-06,
3597
+ "loss": 0.7939,
3598
+ "step": 5610
3599
+ },
3600
+ {
3601
+ "epoch": 1.55,
3602
+ "learning_rate": 9.779670431401593e-06,
3603
+ "loss": 0.7958,
3604
+ "step": 5620
3605
+ },
3606
+ {
3607
+ "epoch": 1.55,
3608
+ "learning_rate": 9.761155341603408e-06,
3609
+ "loss": 0.8013,
3610
+ "step": 5630
3611
+ },
3612
+ {
3613
+ "epoch": 1.55,
3614
+ "learning_rate": 9.742640251805223e-06,
3615
+ "loss": 0.8005,
3616
+ "step": 5640
3617
+ },
3618
+ {
3619
+ "epoch": 1.55,
3620
+ "learning_rate": 9.724125162007036e-06,
3621
+ "loss": 0.809,
3622
+ "step": 5650
3623
+ },
3624
+ {
3625
+ "epoch": 1.56,
3626
+ "learning_rate": 9.70561007220885e-06,
3627
+ "loss": 0.8062,
3628
+ "step": 5660
3629
+ },
3630
+ {
3631
+ "epoch": 1.56,
3632
+ "learning_rate": 9.687094982410665e-06,
3633
+ "loss": 0.7933,
3634
+ "step": 5670
3635
+ },
3636
+ {
3637
+ "epoch": 1.56,
3638
+ "learning_rate": 9.66857989261248e-06,
3639
+ "loss": 0.7945,
3640
+ "step": 5680
3641
+ },
3642
+ {
3643
+ "epoch": 1.57,
3644
+ "learning_rate": 9.650064802814294e-06,
3645
+ "loss": 0.8052,
3646
+ "step": 5690
3647
+ },
3648
+ {
3649
+ "epoch": 1.57,
3650
+ "learning_rate": 9.631549713016109e-06,
3651
+ "loss": 0.8031,
3652
+ "step": 5700
3653
+ },
3654
+ {
3655
+ "epoch": 1.57,
3656
+ "learning_rate": 9.613034623217924e-06,
3657
+ "loss": 0.786,
3658
+ "step": 5710
3659
+ },
3660
+ {
3661
+ "epoch": 1.57,
3662
+ "learning_rate": 9.594519533419739e-06,
3663
+ "loss": 0.7982,
3664
+ "step": 5720
3665
+ },
3666
+ {
3667
+ "epoch": 1.58,
3668
+ "learning_rate": 9.576004443621553e-06,
3669
+ "loss": 0.7946,
3670
+ "step": 5730
3671
+ },
3672
+ {
3673
+ "epoch": 1.58,
3674
+ "learning_rate": 9.557489353823368e-06,
3675
+ "loss": 0.7991,
3676
+ "step": 5740
3677
+ },
3678
+ {
3679
+ "epoch": 1.58,
3680
+ "learning_rate": 9.538974264025181e-06,
3681
+ "loss": 0.7918,
3682
+ "step": 5750
3683
+ },
3684
+ {
3685
+ "epoch": 1.58,
3686
+ "learning_rate": 9.520459174226996e-06,
3687
+ "loss": 0.7963,
3688
+ "step": 5760
3689
+ },
3690
+ {
3691
+ "epoch": 1.59,
3692
+ "learning_rate": 9.50194408442881e-06,
3693
+ "loss": 0.7871,
3694
+ "step": 5770
3695
+ },
3696
+ {
3697
+ "epoch": 1.59,
3698
+ "learning_rate": 9.483428994630625e-06,
3699
+ "loss": 0.7944,
3700
+ "step": 5780
3701
+ },
3702
+ {
3703
+ "epoch": 1.59,
3704
+ "learning_rate": 9.46491390483244e-06,
3705
+ "loss": 0.7986,
3706
+ "step": 5790
3707
+ },
3708
+ {
3709
+ "epoch": 1.6,
3710
+ "learning_rate": 9.446398815034253e-06,
3711
+ "loss": 0.7943,
3712
+ "step": 5800
3713
+ },
3714
+ {
3715
+ "epoch": 1.6,
3716
+ "eval_loss": 0.7971388101577759,
3717
+ "eval_runtime": 98.783,
3718
+ "eval_samples_per_second": 10.123,
3719
+ "eval_steps_per_second": 0.638,
3720
+ "step": 5800
3721
+ },
3722
+ {
3723
+ "epoch": 1.6,
3724
+ "learning_rate": 9.427883725236067e-06,
3725
+ "loss": 0.804,
3726
+ "step": 5810
3727
+ },
3728
+ {
3729
+ "epoch": 1.6,
3730
+ "learning_rate": 9.409368635437882e-06,
3731
+ "loss": 0.8041,
3732
+ "step": 5820
3733
+ },
3734
+ {
3735
+ "epoch": 1.6,
3736
+ "learning_rate": 9.390853545639698e-06,
3737
+ "loss": 0.7908,
3738
+ "step": 5830
3739
+ },
3740
+ {
3741
+ "epoch": 1.61,
3742
+ "learning_rate": 9.372338455841511e-06,
3743
+ "loss": 0.8059,
3744
+ "step": 5840
3745
+ },
3746
+ {
3747
+ "epoch": 1.61,
3748
+ "learning_rate": 9.353823366043326e-06,
3749
+ "loss": 0.8027,
3750
+ "step": 5850
3751
+ },
3752
+ {
3753
+ "epoch": 1.61,
3754
+ "learning_rate": 9.33530827624514e-06,
3755
+ "loss": 0.7851,
3756
+ "step": 5860
3757
+ },
3758
+ {
3759
+ "epoch": 1.62,
3760
+ "learning_rate": 9.316793186446956e-06,
3761
+ "loss": 0.7932,
3762
+ "step": 5870
3763
+ },
3764
+ {
3765
+ "epoch": 1.62,
3766
+ "learning_rate": 9.29827809664877e-06,
3767
+ "loss": 0.7911,
3768
+ "step": 5880
3769
+ },
3770
+ {
3771
+ "epoch": 1.62,
3772
+ "learning_rate": 9.279763006850583e-06,
3773
+ "loss": 0.7917,
3774
+ "step": 5890
3775
+ },
3776
+ {
3777
+ "epoch": 1.62,
3778
+ "learning_rate": 9.261247917052398e-06,
3779
+ "loss": 0.7987,
3780
+ "step": 5900
3781
+ },
3782
+ {
3783
+ "epoch": 1.63,
3784
+ "learning_rate": 9.242732827254213e-06,
3785
+ "loss": 0.796,
3786
+ "step": 5910
3787
+ },
3788
+ {
3789
+ "epoch": 1.63,
3790
+ "learning_rate": 9.224217737456027e-06,
3791
+ "loss": 0.7934,
3792
+ "step": 5920
3793
+ },
3794
+ {
3795
+ "epoch": 1.63,
3796
+ "learning_rate": 9.205702647657842e-06,
3797
+ "loss": 0.7984,
3798
+ "step": 5930
3799
+ },
3800
+ {
3801
+ "epoch": 1.63,
3802
+ "learning_rate": 9.187187557859657e-06,
3803
+ "loss": 0.7887,
3804
+ "step": 5940
3805
+ },
3806
+ {
3807
+ "epoch": 1.64,
3808
+ "learning_rate": 9.168672468061471e-06,
3809
+ "loss": 0.8099,
3810
+ "step": 5950
3811
+ },
3812
+ {
3813
+ "epoch": 1.64,
3814
+ "learning_rate": 9.150157378263286e-06,
3815
+ "loss": 0.7949,
3816
+ "step": 5960
3817
+ },
3818
+ {
3819
+ "epoch": 1.64,
3820
+ "learning_rate": 9.1316422884651e-06,
3821
+ "loss": 0.7951,
3822
+ "step": 5970
3823
+ },
3824
+ {
3825
+ "epoch": 1.65,
3826
+ "learning_rate": 9.113127198666914e-06,
3827
+ "loss": 0.79,
3828
+ "step": 5980
3829
+ },
3830
+ {
3831
+ "epoch": 1.65,
3832
+ "learning_rate": 9.094612108868728e-06,
3833
+ "loss": 0.776,
3834
+ "step": 5990
3835
+ },
3836
+ {
3837
+ "epoch": 1.65,
3838
+ "learning_rate": 9.076097019070543e-06,
3839
+ "loss": 0.7867,
3840
+ "step": 6000
3841
+ },
3842
+ {
3843
+ "epoch": 1.65,
3844
+ "eval_loss": 0.7963413596153259,
3845
+ "eval_runtime": 98.6932,
3846
+ "eval_samples_per_second": 10.132,
3847
+ "eval_steps_per_second": 0.638,
3848
+ "step": 6000
3849
+ },
3850
+ {
3851
+ "epoch": 1.65,
3852
+ "learning_rate": 9.057581929272358e-06,
3853
+ "loss": 0.7967,
3854
+ "step": 6010
3855
+ },
3856
+ {
3857
+ "epoch": 1.66,
3858
+ "learning_rate": 9.039066839474173e-06,
3859
+ "loss": 0.7858,
3860
+ "step": 6020
3861
+ },
3862
+ {
3863
+ "epoch": 1.66,
3864
+ "learning_rate": 9.020551749675987e-06,
3865
+ "loss": 0.803,
3866
+ "step": 6030
3867
+ },
3868
+ {
3869
+ "epoch": 1.66,
3870
+ "learning_rate": 9.0020366598778e-06,
3871
+ "loss": 0.8034,
3872
+ "step": 6040
3873
+ },
3874
+ {
3875
+ "epoch": 1.66,
3876
+ "learning_rate": 8.983521570079615e-06,
3877
+ "loss": 0.7844,
3878
+ "step": 6050
3879
+ },
3880
+ {
3881
+ "epoch": 1.67,
3882
+ "learning_rate": 8.96500648028143e-06,
3883
+ "loss": 0.7867,
3884
+ "step": 6060
3885
+ },
3886
+ {
3887
+ "epoch": 1.67,
3888
+ "learning_rate": 8.946491390483244e-06,
3889
+ "loss": 0.8031,
3890
+ "step": 6070
3891
+ },
3892
+ {
3893
+ "epoch": 1.67,
3894
+ "learning_rate": 8.927976300685059e-06,
3895
+ "loss": 0.7979,
3896
+ "step": 6080
3897
+ },
3898
+ {
3899
+ "epoch": 1.68,
3900
+ "learning_rate": 8.909461210886874e-06,
3901
+ "loss": 0.7784,
3902
+ "step": 6090
3903
+ },
3904
+ {
3905
+ "epoch": 1.68,
3906
+ "learning_rate": 8.890946121088688e-06,
3907
+ "loss": 0.7909,
3908
+ "step": 6100
3909
+ },
3910
+ {
3911
+ "epoch": 1.68,
3912
+ "learning_rate": 8.872431031290503e-06,
3913
+ "loss": 0.8128,
3914
+ "step": 6110
3915
+ },
3916
+ {
3917
+ "epoch": 1.68,
3918
+ "learning_rate": 8.853915941492318e-06,
3919
+ "loss": 0.7901,
3920
+ "step": 6120
3921
+ },
3922
+ {
3923
+ "epoch": 1.69,
3924
+ "learning_rate": 8.83540085169413e-06,
3925
+ "loss": 0.808,
3926
+ "step": 6130
3927
+ },
3928
+ {
3929
+ "epoch": 1.69,
3930
+ "learning_rate": 8.816885761895945e-06,
3931
+ "loss": 0.7872,
3932
+ "step": 6140
3933
+ },
3934
+ {
3935
+ "epoch": 1.69,
3936
+ "learning_rate": 8.79837067209776e-06,
3937
+ "loss": 0.7927,
3938
+ "step": 6150
3939
+ },
3940
+ {
3941
+ "epoch": 1.69,
3942
+ "learning_rate": 8.779855582299575e-06,
3943
+ "loss": 0.7958,
3944
+ "step": 6160
3945
+ },
3946
+ {
3947
+ "epoch": 1.7,
3948
+ "learning_rate": 8.76134049250139e-06,
3949
+ "loss": 0.7989,
3950
+ "step": 6170
3951
+ },
3952
+ {
3953
+ "epoch": 1.7,
3954
+ "learning_rate": 8.742825402703203e-06,
3955
+ "loss": 0.8054,
3956
+ "step": 6180
3957
+ },
3958
+ {
3959
+ "epoch": 1.7,
3960
+ "learning_rate": 8.724310312905019e-06,
3961
+ "loss": 0.7951,
3962
+ "step": 6190
3963
+ },
3964
+ {
3965
+ "epoch": 1.71,
3966
+ "learning_rate": 8.705795223106834e-06,
3967
+ "loss": 0.7843,
3968
+ "step": 6200
3969
+ },
3970
+ {
3971
+ "epoch": 1.71,
3972
+ "eval_loss": 0.7958487272262573,
3973
+ "eval_runtime": 98.7031,
3974
+ "eval_samples_per_second": 10.131,
3975
+ "eval_steps_per_second": 0.638,
3976
+ "step": 6200
3977
+ },
3978
+ {
3979
+ "epoch": 1.71,
3980
+ "learning_rate": 8.687280133308648e-06,
3981
+ "loss": 0.7896,
3982
+ "step": 6210
3983
+ },
3984
+ {
3985
+ "epoch": 1.71,
3986
+ "learning_rate": 8.668765043510461e-06,
3987
+ "loss": 0.7814,
3988
+ "step": 6220
3989
+ },
3990
+ {
3991
+ "epoch": 1.71,
3992
+ "learning_rate": 8.650249953712276e-06,
3993
+ "loss": 0.8004,
3994
+ "step": 6230
3995
+ },
3996
+ {
3997
+ "epoch": 1.72,
3998
+ "learning_rate": 8.63173486391409e-06,
3999
+ "loss": 0.7786,
4000
+ "step": 6240
4001
+ },
4002
+ {
4003
+ "epoch": 1.72,
4004
+ "learning_rate": 8.613219774115905e-06,
4005
+ "loss": 0.7991,
4006
+ "step": 6250
4007
+ },
4008
+ {
4009
+ "epoch": 1.72,
4010
+ "learning_rate": 8.59470468431772e-06,
4011
+ "loss": 0.7918,
4012
+ "step": 6260
4013
+ },
4014
+ {
4015
+ "epoch": 1.73,
4016
+ "learning_rate": 8.576189594519535e-06,
4017
+ "loss": 0.7778,
4018
+ "step": 6270
4019
+ },
4020
+ {
4021
+ "epoch": 1.73,
4022
+ "learning_rate": 8.557674504721348e-06,
4023
+ "loss": 0.7909,
4024
+ "step": 6280
4025
+ },
4026
+ {
4027
+ "epoch": 1.73,
4028
+ "learning_rate": 8.539159414923162e-06,
4029
+ "loss": 0.792,
4030
+ "step": 6290
4031
+ },
4032
+ {
4033
+ "epoch": 1.73,
4034
+ "learning_rate": 8.520644325124977e-06,
4035
+ "loss": 0.8067,
4036
+ "step": 6300
4037
+ },
4038
+ {
4039
+ "epoch": 1.74,
4040
+ "learning_rate": 8.502129235326792e-06,
4041
+ "loss": 0.7965,
4042
+ "step": 6310
4043
+ },
4044
+ {
4045
+ "epoch": 1.74,
4046
+ "learning_rate": 8.483614145528607e-06,
4047
+ "loss": 0.7958,
4048
+ "step": 6320
4049
+ },
4050
+ {
4051
+ "epoch": 1.74,
4052
+ "learning_rate": 8.465099055730421e-06,
4053
+ "loss": 0.777,
4054
+ "step": 6330
4055
+ },
4056
+ {
4057
+ "epoch": 1.74,
4058
+ "learning_rate": 8.446583965932236e-06,
4059
+ "loss": 0.7934,
4060
+ "step": 6340
4061
+ },
4062
+ {
4063
+ "epoch": 1.75,
4064
+ "learning_rate": 8.42806887613405e-06,
4065
+ "loss": 0.7899,
4066
+ "step": 6350
4067
+ },
4068
+ {
4069
+ "epoch": 1.75,
4070
+ "learning_rate": 8.409553786335865e-06,
4071
+ "loss": 0.7933,
4072
+ "step": 6360
4073
+ },
4074
+ {
4075
+ "epoch": 1.75,
4076
+ "learning_rate": 8.391038696537678e-06,
4077
+ "loss": 0.7898,
4078
+ "step": 6370
4079
+ },
4080
+ {
4081
+ "epoch": 1.76,
4082
+ "learning_rate": 8.372523606739493e-06,
4083
+ "loss": 0.7749,
4084
+ "step": 6380
4085
+ },
4086
+ {
4087
+ "epoch": 1.76,
4088
+ "learning_rate": 8.354008516941308e-06,
4089
+ "loss": 0.7951,
4090
+ "step": 6390
4091
+ },
4092
+ {
4093
+ "epoch": 1.76,
4094
+ "learning_rate": 8.335493427143122e-06,
4095
+ "loss": 0.79,
4096
+ "step": 6400
4097
+ },
4098
+ {
4099
+ "epoch": 1.76,
4100
+ "eval_loss": 0.7957026362419128,
4101
+ "eval_runtime": 98.7285,
4102
+ "eval_samples_per_second": 10.129,
4103
+ "eval_steps_per_second": 0.638,
4104
+ "step": 6400
4105
+ },
4106
+ {
4107
+ "epoch": 1.76,
4108
+ "learning_rate": 8.316978337344937e-06,
4109
+ "loss": 0.8007,
4110
+ "step": 6410
4111
+ },
4112
+ {
4113
+ "epoch": 1.77,
4114
+ "learning_rate": 8.29846324754675e-06,
4115
+ "loss": 0.791,
4116
+ "step": 6420
4117
+ },
4118
+ {
4119
+ "epoch": 1.77,
4120
+ "learning_rate": 8.279948157748566e-06,
4121
+ "loss": 0.8045,
4122
+ "step": 6430
4123
+ },
4124
+ {
4125
+ "epoch": 1.77,
4126
+ "learning_rate": 8.261433067950381e-06,
4127
+ "loss": 0.7969,
4128
+ "step": 6440
4129
+ },
4130
+ {
4131
+ "epoch": 1.77,
4132
+ "learning_rate": 8.242917978152196e-06,
4133
+ "loss": 0.7826,
4134
+ "step": 6450
4135
+ },
4136
+ {
4137
+ "epoch": 1.78,
4138
+ "learning_rate": 8.224402888354009e-06,
4139
+ "loss": 0.7792,
4140
+ "step": 6460
4141
+ },
4142
+ {
4143
+ "epoch": 1.78,
4144
+ "learning_rate": 8.205887798555824e-06,
4145
+ "loss": 0.7953,
4146
+ "step": 6470
4147
+ },
4148
+ {
4149
+ "epoch": 1.78,
4150
+ "learning_rate": 8.187372708757638e-06,
4151
+ "loss": 0.7822,
4152
+ "step": 6480
4153
+ },
4154
+ {
4155
+ "epoch": 1.79,
4156
+ "learning_rate": 8.168857618959453e-06,
4157
+ "loss": 0.79,
4158
+ "step": 6490
4159
+ },
4160
+ {
4161
+ "epoch": 1.79,
4162
+ "learning_rate": 8.150342529161268e-06,
4163
+ "loss": 0.7902,
4164
+ "step": 6500
4165
+ },
4166
+ {
4167
+ "epoch": 1.79,
4168
+ "learning_rate": 8.13182743936308e-06,
4169
+ "loss": 0.7979,
4170
+ "step": 6510
4171
+ },
4172
+ {
4173
+ "epoch": 1.79,
4174
+ "learning_rate": 8.113312349564895e-06,
4175
+ "loss": 0.8036,
4176
+ "step": 6520
4177
+ },
4178
+ {
4179
+ "epoch": 1.8,
4180
+ "learning_rate": 8.09479725976671e-06,
4181
+ "loss": 0.7917,
4182
+ "step": 6530
4183
+ },
4184
+ {
4185
+ "epoch": 1.8,
4186
+ "learning_rate": 8.076282169968525e-06,
4187
+ "loss": 0.7659,
4188
+ "step": 6540
4189
+ },
4190
+ {
4191
+ "epoch": 1.8,
4192
+ "learning_rate": 8.05776708017034e-06,
4193
+ "loss": 0.8012,
4194
+ "step": 6550
4195
+ },
4196
+ {
4197
+ "epoch": 1.8,
4198
+ "learning_rate": 8.039251990372154e-06,
4199
+ "loss": 0.7836,
4200
+ "step": 6560
4201
+ },
4202
+ {
4203
+ "epoch": 1.81,
4204
+ "learning_rate": 8.020736900573969e-06,
4205
+ "loss": 0.7888,
4206
+ "step": 6570
4207
+ },
4208
+ {
4209
+ "epoch": 1.81,
4210
+ "learning_rate": 8.002221810775783e-06,
4211
+ "loss": 0.7869,
4212
+ "step": 6580
4213
+ },
4214
+ {
4215
+ "epoch": 1.81,
4216
+ "learning_rate": 7.983706720977598e-06,
4217
+ "loss": 0.7913,
4218
+ "step": 6590
4219
+ },
4220
+ {
4221
+ "epoch": 1.82,
4222
+ "learning_rate": 7.965191631179411e-06,
4223
+ "loss": 0.8022,
4224
+ "step": 6600
4225
+ },
4226
+ {
4227
+ "epoch": 1.82,
4228
+ "eval_loss": 0.7951002717018127,
4229
+ "eval_runtime": 98.6725,
4230
+ "eval_samples_per_second": 10.135,
4231
+ "eval_steps_per_second": 0.638,
4232
+ "step": 6600
4233
+ },
4234
+ {
4235
+ "epoch": 1.82,
4236
+ "learning_rate": 7.948528050361045e-06,
4237
+ "loss": 0.7867,
4238
+ "step": 6610
4239
+ },
4240
+ {
4241
+ "epoch": 1.82,
4242
+ "learning_rate": 7.93001296056286e-06,
4243
+ "loss": 0.7974,
4244
+ "step": 6620
4245
+ },
4246
+ {
4247
+ "epoch": 1.82,
4248
+ "learning_rate": 7.911497870764675e-06,
4249
+ "loss": 0.7959,
4250
+ "step": 6630
4251
+ },
4252
+ {
4253
+ "epoch": 1.83,
4254
+ "learning_rate": 7.892982780966488e-06,
4255
+ "loss": 0.79,
4256
+ "step": 6640
4257
+ },
4258
+ {
4259
+ "epoch": 1.83,
4260
+ "learning_rate": 7.874467691168302e-06,
4261
+ "loss": 0.7942,
4262
+ "step": 6650
4263
+ },
4264
+ {
4265
+ "epoch": 1.83,
4266
+ "learning_rate": 7.855952601370117e-06,
4267
+ "loss": 0.7984,
4268
+ "step": 6660
4269
+ },
4270
+ {
4271
+ "epoch": 1.84,
4272
+ "learning_rate": 7.837437511571932e-06,
4273
+ "loss": 0.785,
4274
+ "step": 6670
4275
+ },
4276
+ {
4277
+ "epoch": 1.84,
4278
+ "learning_rate": 7.818922421773747e-06,
4279
+ "loss": 0.7959,
4280
+ "step": 6680
4281
+ },
4282
+ {
4283
+ "epoch": 1.84,
4284
+ "learning_rate": 7.80040733197556e-06,
4285
+ "loss": 0.7774,
4286
+ "step": 6690
4287
+ },
4288
+ {
4289
+ "epoch": 1.84,
4290
+ "learning_rate": 7.781892242177376e-06,
4291
+ "loss": 0.7914,
4292
+ "step": 6700
4293
+ },
4294
+ {
4295
+ "epoch": 1.85,
4296
+ "learning_rate": 7.76337715237919e-06,
4297
+ "loss": 0.7791,
4298
+ "step": 6710
4299
+ },
4300
+ {
4301
+ "epoch": 1.85,
4302
+ "learning_rate": 7.744862062581005e-06,
4303
+ "loss": 0.7986,
4304
+ "step": 6720
4305
+ },
4306
+ {
4307
+ "epoch": 1.85,
4308
+ "learning_rate": 7.726346972782818e-06,
4309
+ "loss": 0.7937,
4310
+ "step": 6730
4311
+ },
4312
+ {
4313
+ "epoch": 1.85,
4314
+ "learning_rate": 7.707831882984633e-06,
4315
+ "loss": 0.7949,
4316
+ "step": 6740
4317
+ },
4318
+ {
4319
+ "epoch": 1.86,
4320
+ "learning_rate": 7.689316793186448e-06,
4321
+ "loss": 0.7932,
4322
+ "step": 6750
4323
+ },
4324
+ {
4325
+ "epoch": 1.86,
4326
+ "learning_rate": 7.670801703388262e-06,
4327
+ "loss": 0.7988,
4328
+ "step": 6760
4329
+ },
4330
+ {
4331
+ "epoch": 1.86,
4332
+ "learning_rate": 7.652286613590077e-06,
4333
+ "loss": 0.7852,
4334
+ "step": 6770
4335
+ },
4336
+ {
4337
+ "epoch": 1.87,
4338
+ "learning_rate": 7.63377152379189e-06,
4339
+ "loss": 0.7922,
4340
+ "step": 6780
4341
+ },
4342
+ {
4343
+ "epoch": 1.87,
4344
+ "learning_rate": 7.615256433993706e-06,
4345
+ "loss": 0.7936,
4346
+ "step": 6790
4347
+ },
4348
+ {
4349
+ "epoch": 1.87,
4350
+ "learning_rate": 7.5967413441955194e-06,
4351
+ "loss": 0.7794,
4352
+ "step": 6800
4353
+ },
4354
+ {
4355
+ "epoch": 1.87,
4356
+ "eval_loss": 0.7945918440818787,
4357
+ "eval_runtime": 98.7262,
4358
+ "eval_samples_per_second": 10.129,
4359
+ "eval_steps_per_second": 0.638,
4360
+ "step": 6800
4361
+ }
4362
+ ],
4363
+ "max_steps": 10902,
4364
+ "num_train_epochs": 3,
4365
+ "total_flos": 2.0184019484077457e+19,
4366
+ "trial_name": null,
4367
+ "trial_params": null
4368
+ }
checkpoint-6800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:061bef34d08900ea9fa9968114112b4c9df622a8ab74fd0a9a3c90bc3e259f6c
3
+ size 3579