alexandertam committed on
Commit
3c9d085
·
verified ·
1 Parent(s): 09b380f

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - generated_from_trainer
5
+ metrics:
6
+ - accuracy
7
+ model-index:
8
+ - name: babylm-base5M-gpt2
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # babylm-base5M-gpt2
16
+
17
+ This model is a fine-tuned version of an unspecified base model (the base-model link was left empty when this card was generated) on an unknown dataset.
18
+ It achieves the following results on the evaluation set:
19
+ - Loss: 3.0628
20
+ - Accuracy: 0.4521
21
+
22
+ ## Model description
23
+
24
+ More information needed
25
+
26
+ ## Intended uses & limitations
27
+
28
+ More information needed
29
+
30
+ ## Training and evaluation data
31
+
32
+ More information needed
33
+
34
+ ## Training procedure
35
+
36
+ ### Training hyperparameters
37
+
38
+ The following hyperparameters were used during training:
39
+ - learning_rate: 5e-05
40
+ - train_batch_size: 16
41
+ - eval_batch_size: 16
42
+ - seed: 42
43
+ - optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
44
+ - lr_scheduler_type: linear
45
+ - lr_scheduler_warmup_steps: 190
46
+ - training_steps: 19000
47
+ - mixed_precision_training: Native AMP
48
+
49
+ ### Training results
50
+
51
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
+ |:-------------:|:------:|:-----:|:---------------:|:--------:|
53
+ | 5.579 | 0.1024 | 200 | 4.7677 | 0.3189 |
54
+ | 4.7716 | 0.2048 | 400 | 4.3385 | 0.3544 |
55
+ | 4.5162 | 0.3072 | 600 | 4.1772 | 0.3593 |
56
+ | 4.4056 | 0.4096 | 800 | 4.0754 | 0.3693 |
57
+ | 4.3138 | 0.5120 | 1000 | 4.0143 | 0.3626 |
58
+ | 4.2148 | 0.6144 | 1200 | 3.9601 | 0.3554 |
59
+ | 4.1925 | 0.7168 | 1400 | 3.9019 | 0.3723 |
60
+ | 4.0293 | 0.8193 | 1600 | 3.8579 | 0.3749 |
61
+ | 3.9407 | 0.9217 | 1800 | 3.8101 | 0.3782 |
62
+ | 3.8371 | 1.0241 | 2000 | 3.7870 | 0.3721 |
63
+ | 3.0659 | 2.0481 | 4000 | 3.4672 | 0.4085 |
64
+ | 2.6866 | 3.0722 | 6000 | 3.2850 | 0.4316 |
65
+ | 2.5063 | 4.0963 | 8000 | 3.1963 | 0.4372 |
66
+ | 2.4139 | 5.1203 | 10000 | 3.1406 | 0.4442 |
67
+ | 2.3246 | 6.1444 | 12000 | 3.1152 | 0.4484 |
68
+ | 2.3111 | 7.1685 | 14000 | 3.0879 | 0.4489 |
69
+ | 2.2761 | 8.1925 | 16000 | 3.0668 | 0.4542 |
70
+ | 2.2231 | 9.2166 | 18000 | 3.0695 | 0.4517 |
71
+
72
+
73
+ ### Framework versions
74
+
75
+ - Transformers 4.50.3
76
+ - Pytorch 2.7.1+cu126
77
+ - Datasets 3.6.0
78
+ - Tokenizers 0.21.4
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.728622631848438,
3
+ "eval_accuracy": 0.45214562944577746,
4
+ "eval_loss": 3.0628035068511963,
5
+ "eval_runtime": 95.1631,
6
+ "eval_samples": 33544,
7
+ "eval_samples_per_second": 352.49,
8
+ "eval_steps_per_second": 22.036,
9
+ "perplexity": 21.38743300660324,
10
+ "total_flos": 7.9413964701696e+16,
11
+ "train_loss": 2.775124670731394,
12
+ "train_runtime": 3271.669,
13
+ "train_samples": 31240,
14
+ "train_samples_per_second": 92.919,
15
+ "train_steps_per_second": 5.807
16
+ }
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 1,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 2,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_embd": 768,
14
+ "n_head": 12,
15
+ "n_inner": null,
16
+ "n_layer": 12,
17
+ "n_positions": 1024,
18
+ "reorder_and_upcast_attn": false,
19
+ "resid_pdrop": 0.1,
20
+ "scale_attn_by_inverse_layer_idx": false,
21
+ "scale_attn_weights": true,
22
+ "summary_activation": null,
23
+ "summary_first_dropout": 0.1,
24
+ "summary_proj_to_labels": true,
25
+ "summary_type": "cls_index",
26
+ "summary_use_proj": true,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.50.3",
29
+ "use_cache": true,
30
+ "vocab_size": 16384
31
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.728622631848438,
3
+ "eval_accuracy": 0.45214562944577746,
4
+ "eval_loss": 3.0628035068511963,
5
+ "eval_runtime": 95.1631,
6
+ "eval_samples": 33544,
7
+ "eval_samples_per_second": 352.49,
8
+ "eval_steps_per_second": 22.036,
9
+ "perplexity": 21.38743300660324
10
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.50.3"
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6ef61138a105c85ceffe6b6f43295e1227d2edb594f4b906933ab8302486203
3
+ size 393716352
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<pad>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "cls_token": null,
39
+ "eos_token": "</s>",
40
+ "extra_special_tokens": {},
41
+ "mask_token": null,
42
+ "model_max_length": 1024,
43
+ "pad_token": "<pad>",
44
+ "sep_token": null,
45
+ "tokenizer_class": "PreTrainedTokenizer",
46
+ "unk_token": "<unk>"
47
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.728622631848438,
3
+ "total_flos": 7.9413964701696e+16,
4
+ "train_loss": 2.775124670731394,
5
+ "train_runtime": 3271.669,
6
+ "train_samples": 31240,
7
+ "train_samples_per_second": 92.919,
8
+ "train_steps_per_second": 5.807
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 9.728622631848438,
6
+ "eval_steps": 99999999,
7
+ "global_step": 19000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.051203277009728626,
14
+ "grad_norm": 4.247892379760742,
15
+ "learning_rate": 2.6315789473684212e-05,
16
+ "loss": 7.4934,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.10240655401945725,
21
+ "grad_norm": 3.517791748046875,
22
+ "learning_rate": 4.997341839447103e-05,
23
+ "loss": 5.579,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.10240655401945725,
28
+ "eval_accuracy": 0.31893968280934165,
29
+ "eval_loss": 4.767695903778076,
30
+ "eval_runtime": 94.8557,
31
+ "eval_samples_per_second": 353.632,
32
+ "eval_steps_per_second": 22.107,
33
+ "step": 200
34
+ },
35
+ {
36
+ "epoch": 0.15360983102918588,
37
+ "grad_norm": 2.3053529262542725,
38
+ "learning_rate": 4.970760233918128e-05,
39
+ "loss": 4.9703,
40
+ "step": 300
41
+ },
42
+ {
43
+ "epoch": 0.2048131080389145,
44
+ "grad_norm": 2.0182411670684814,
45
+ "learning_rate": 4.944178628389155e-05,
46
+ "loss": 4.7716,
47
+ "step": 400
48
+ },
49
+ {
50
+ "epoch": 0.2048131080389145,
51
+ "eval_accuracy": 0.354354102424925,
52
+ "eval_loss": 4.33852481842041,
53
+ "eval_runtime": 95.6535,
54
+ "eval_samples_per_second": 350.682,
55
+ "eval_steps_per_second": 21.923,
56
+ "step": 400
57
+ },
58
+ {
59
+ "epoch": 0.2560163850486431,
60
+ "grad_norm": 1.6917961835861206,
61
+ "learning_rate": 4.917597022860181e-05,
62
+ "loss": 4.5491,
63
+ "step": 500
64
+ },
65
+ {
66
+ "epoch": 0.30721966205837176,
67
+ "grad_norm": 1.3638228178024292,
68
+ "learning_rate": 4.8910154173312074e-05,
69
+ "loss": 4.5162,
70
+ "step": 600
71
+ },
72
+ {
73
+ "epoch": 0.30721966205837176,
74
+ "eval_accuracy": 0.35930259313000934,
75
+ "eval_loss": 4.177200794219971,
76
+ "eval_runtime": 95.6652,
77
+ "eval_samples_per_second": 350.64,
78
+ "eval_steps_per_second": 21.92,
79
+ "step": 600
80
+ },
81
+ {
82
+ "epoch": 0.35842293906810035,
83
+ "grad_norm": 1.5296661853790283,
84
+ "learning_rate": 4.8644338118022334e-05,
85
+ "loss": 4.5039,
86
+ "step": 700
87
+ },
88
+ {
89
+ "epoch": 0.409626216077829,
90
+ "grad_norm": 1.1248077154159546,
91
+ "learning_rate": 4.837852206273259e-05,
92
+ "loss": 4.4056,
93
+ "step": 800
94
+ },
95
+ {
96
+ "epoch": 0.409626216077829,
97
+ "eval_accuracy": 0.36928340870045734,
98
+ "eval_loss": 4.075441837310791,
99
+ "eval_runtime": 96.0011,
100
+ "eval_samples_per_second": 349.412,
101
+ "eval_steps_per_second": 21.843,
102
+ "step": 800
103
+ },
104
+ {
105
+ "epoch": 0.4608294930875576,
106
+ "grad_norm": 1.6162877082824707,
107
+ "learning_rate": 4.811270600744285e-05,
108
+ "loss": 4.3569,
109
+ "step": 900
110
+ },
111
+ {
112
+ "epoch": 0.5120327700972862,
113
+ "grad_norm": 1.6072983741760254,
114
+ "learning_rate": 4.784688995215311e-05,
115
+ "loss": 4.3138,
116
+ "step": 1000
117
+ },
118
+ {
119
+ "epoch": 0.5120327700972862,
120
+ "eval_accuracy": 0.3626070708659433,
121
+ "eval_loss": 4.014294624328613,
122
+ "eval_runtime": 95.7344,
123
+ "eval_samples_per_second": 350.386,
124
+ "eval_steps_per_second": 21.904,
125
+ "step": 1000
126
+ },
127
+ {
128
+ "epoch": 0.5632360471070148,
129
+ "grad_norm": 1.4564248323440552,
130
+ "learning_rate": 4.758107389686337e-05,
131
+ "loss": 4.2611,
132
+ "step": 1100
133
+ },
134
+ {
135
+ "epoch": 0.6144393241167435,
136
+ "grad_norm": 1.2788530588150024,
137
+ "learning_rate": 4.731525784157364e-05,
138
+ "loss": 4.2148,
139
+ "step": 1200
140
+ },
141
+ {
142
+ "epoch": 0.6144393241167435,
143
+ "eval_accuracy": 0.35539686636426476,
144
+ "eval_loss": 3.9600954055786133,
145
+ "eval_runtime": 95.926,
146
+ "eval_samples_per_second": 349.686,
147
+ "eval_steps_per_second": 21.861,
148
+ "step": 1200
149
+ },
150
+ {
151
+ "epoch": 0.6656426011264721,
152
+ "grad_norm": 1.3278919458389282,
153
+ "learning_rate": 4.7049441786283896e-05,
154
+ "loss": 4.2095,
155
+ "step": 1300
156
+ },
157
+ {
158
+ "epoch": 0.7168458781362007,
159
+ "grad_norm": 1.6991857290267944,
160
+ "learning_rate": 4.678362573099415e-05,
161
+ "loss": 4.1925,
162
+ "step": 1400
163
+ },
164
+ {
165
+ "epoch": 0.7168458781362007,
166
+ "eval_accuracy": 0.3723133981106336,
167
+ "eval_loss": 3.901859998703003,
168
+ "eval_runtime": 95.2539,
169
+ "eval_samples_per_second": 352.154,
170
+ "eval_steps_per_second": 22.015,
171
+ "step": 1400
172
+ },
173
+ {
174
+ "epoch": 0.7680491551459293,
175
+ "grad_norm": 1.7155863046646118,
176
+ "learning_rate": 4.6517809675704415e-05,
177
+ "loss": 4.1556,
178
+ "step": 1500
179
+ },
180
+ {
181
+ "epoch": 0.819252432155658,
182
+ "grad_norm": 1.6039901971817017,
183
+ "learning_rate": 4.6251993620414674e-05,
184
+ "loss": 4.0293,
185
+ "step": 1600
186
+ },
187
+ {
188
+ "epoch": 0.819252432155658,
189
+ "eval_accuracy": 0.37494784430112066,
190
+ "eval_loss": 3.857882499694824,
191
+ "eval_runtime": 95.7224,
192
+ "eval_samples_per_second": 350.43,
193
+ "eval_steps_per_second": 21.907,
194
+ "step": 1600
195
+ },
196
+ {
197
+ "epoch": 0.8704557091653866,
198
+ "grad_norm": 1.6892971992492676,
199
+ "learning_rate": 4.598617756512494e-05,
200
+ "loss": 3.939,
201
+ "step": 1700
202
+ },
203
+ {
204
+ "epoch": 0.9216589861751152,
205
+ "grad_norm": 1.5192146301269531,
206
+ "learning_rate": 4.57203615098352e-05,
207
+ "loss": 3.9407,
208
+ "step": 1800
209
+ },
210
+ {
211
+ "epoch": 0.9216589861751152,
212
+ "eval_accuracy": 0.37819625757774467,
213
+ "eval_loss": 3.8101186752319336,
214
+ "eval_runtime": 95.9641,
215
+ "eval_samples_per_second": 349.547,
216
+ "eval_steps_per_second": 21.852,
217
+ "step": 1800
218
+ },
219
+ {
220
+ "epoch": 0.9728622631848438,
221
+ "grad_norm": 1.6878364086151123,
222
+ "learning_rate": 4.545454545454546e-05,
223
+ "loss": 3.9125,
224
+ "step": 1900
225
+ },
226
+ {
227
+ "epoch": 1.0240655401945724,
228
+ "grad_norm": 1.6546193361282349,
229
+ "learning_rate": 4.518872939925572e-05,
230
+ "loss": 3.8371,
231
+ "step": 2000
232
+ },
233
+ {
234
+ "epoch": 1.0240655401945724,
235
+ "eval_accuracy": 0.37209030706755225,
236
+ "eval_loss": 3.787036180496216,
237
+ "eval_runtime": 95.7788,
238
+ "eval_samples_per_second": 350.224,
239
+ "eval_steps_per_second": 21.894,
240
+ "step": 2000
241
+ },
242
+ {
243
+ "epoch": 1.075268817204301,
244
+ "grad_norm": 1.744416356086731,
245
+ "learning_rate": 4.492291334396598e-05,
246
+ "loss": 3.8068,
247
+ "step": 2100
248
+ },
249
+ {
250
+ "epoch": 1.1264720942140296,
251
+ "grad_norm": 1.6214791536331177,
252
+ "learning_rate": 4.4657097288676236e-05,
253
+ "loss": 3.7642,
254
+ "step": 2200
255
+ },
256
+ {
257
+ "epoch": 1.1776753712237582,
258
+ "grad_norm": 1.5377932786941528,
259
+ "learning_rate": 4.43912812333865e-05,
260
+ "loss": 3.7599,
261
+ "step": 2300
262
+ },
263
+ {
264
+ "epoch": 1.228878648233487,
265
+ "grad_norm": 1.8464128971099854,
266
+ "learning_rate": 4.412546517809676e-05,
267
+ "loss": 3.6853,
268
+ "step": 2400
269
+ },
270
+ {
271
+ "epoch": 1.2800819252432156,
272
+ "grad_norm": 1.901739239692688,
273
+ "learning_rate": 4.3859649122807014e-05,
274
+ "loss": 3.6537,
275
+ "step": 2500
276
+ },
277
+ {
278
+ "epoch": 1.3312852022529442,
279
+ "grad_norm": 1.936339020729065,
280
+ "learning_rate": 4.359383306751728e-05,
281
+ "loss": 3.6377,
282
+ "step": 2600
283
+ },
284
+ {
285
+ "epoch": 1.3824884792626728,
286
+ "grad_norm": 1.8641879558563232,
287
+ "learning_rate": 4.332801701222754e-05,
288
+ "loss": 3.5523,
289
+ "step": 2700
290
+ },
291
+ {
292
+ "epoch": 1.4336917562724014,
293
+ "grad_norm": 2.087099075317383,
294
+ "learning_rate": 4.3062200956937806e-05,
295
+ "loss": 3.5007,
296
+ "step": 2800
297
+ },
298
+ {
299
+ "epoch": 1.48489503328213,
300
+ "grad_norm": 1.7824835777282715,
301
+ "learning_rate": 4.2796384901648065e-05,
302
+ "loss": 3.4422,
303
+ "step": 2900
304
+ },
305
+ {
306
+ "epoch": 1.5360983102918588,
307
+ "grad_norm": 2.0298142433166504,
308
+ "learning_rate": 4.253056884635832e-05,
309
+ "loss": 3.4254,
310
+ "step": 3000
311
+ },
312
+ {
313
+ "epoch": 1.5873015873015874,
314
+ "grad_norm": 2.227325439453125,
315
+ "learning_rate": 4.2264752791068584e-05,
316
+ "loss": 3.4087,
317
+ "step": 3100
318
+ },
319
+ {
320
+ "epoch": 1.638504864311316,
321
+ "grad_norm": 2.2700014114379883,
322
+ "learning_rate": 4.199893673577884e-05,
323
+ "loss": 3.4023,
324
+ "step": 3200
325
+ },
326
+ {
327
+ "epoch": 1.6897081413210446,
328
+ "grad_norm": 2.290431022644043,
329
+ "learning_rate": 4.17331206804891e-05,
330
+ "loss": 3.3579,
331
+ "step": 3300
332
+ },
333
+ {
334
+ "epoch": 1.7409114183307732,
335
+ "grad_norm": 2.5916295051574707,
336
+ "learning_rate": 4.146730462519937e-05,
337
+ "loss": 3.2602,
338
+ "step": 3400
339
+ },
340
+ {
341
+ "epoch": 1.7921146953405018,
342
+ "grad_norm": 2.0242862701416016,
343
+ "learning_rate": 4.120148856990963e-05,
344
+ "loss": 3.1941,
345
+ "step": 3500
346
+ },
347
+ {
348
+ "epoch": 1.8433179723502304,
349
+ "grad_norm": 1.8922170400619507,
350
+ "learning_rate": 4.093567251461988e-05,
351
+ "loss": 3.1774,
352
+ "step": 3600
353
+ },
354
+ {
355
+ "epoch": 1.894521249359959,
356
+ "grad_norm": 2.2697248458862305,
357
+ "learning_rate": 4.0669856459330146e-05,
358
+ "loss": 3.2001,
359
+ "step": 3700
360
+ },
361
+ {
362
+ "epoch": 1.9457245263696876,
363
+ "grad_norm": 2.68994402885437,
364
+ "learning_rate": 4.0404040404040405e-05,
365
+ "loss": 3.1232,
366
+ "step": 3800
367
+ },
368
+ {
369
+ "epoch": 1.9969278033794162,
370
+ "grad_norm": 2.16379451751709,
371
+ "learning_rate": 4.0138224348750665e-05,
372
+ "loss": 3.101,
373
+ "step": 3900
374
+ },
375
+ {
376
+ "epoch": 2.048131080389145,
377
+ "grad_norm": 1.908273458480835,
378
+ "learning_rate": 3.987240829346093e-05,
379
+ "loss": 3.0659,
380
+ "step": 4000
381
+ },
382
+ {
383
+ "epoch": 2.048131080389145,
384
+ "eval_accuracy": 0.4085060110901451,
385
+ "eval_loss": 3.467172384262085,
386
+ "eval_runtime": 95.3375,
387
+ "eval_samples_per_second": 351.845,
388
+ "eval_steps_per_second": 21.996,
389
+ "step": 4000
390
+ },
391
+ {
392
+ "epoch": 2.0993343573988734,
393
+ "grad_norm": 2.217175006866455,
394
+ "learning_rate": 3.960659223817118e-05,
395
+ "loss": 3.009,
396
+ "step": 4100
397
+ },
398
+ {
399
+ "epoch": 2.150537634408602,
400
+ "grad_norm": 2.744868755340576,
401
+ "learning_rate": 3.934077618288145e-05,
402
+ "loss": 3.0265,
403
+ "step": 4200
404
+ },
405
+ {
406
+ "epoch": 2.2017409114183306,
407
+ "grad_norm": 2.3460192680358887,
408
+ "learning_rate": 3.907496012759171e-05,
409
+ "loss": 2.9379,
410
+ "step": 4300
411
+ },
412
+ {
413
+ "epoch": 2.252944188428059,
414
+ "grad_norm": 2.120131015777588,
415
+ "learning_rate": 3.880914407230197e-05,
416
+ "loss": 2.9437,
417
+ "step": 4400
418
+ },
419
+ {
420
+ "epoch": 2.3041474654377883,
421
+ "grad_norm": 2.3475334644317627,
422
+ "learning_rate": 3.8543328017012234e-05,
423
+ "loss": 2.933,
424
+ "step": 4500
425
+ },
426
+ {
427
+ "epoch": 2.3553507424475164,
428
+ "grad_norm": 2.450538396835327,
429
+ "learning_rate": 3.8277511961722486e-05,
430
+ "loss": 2.9619,
431
+ "step": 4600
432
+ },
433
+ {
434
+ "epoch": 2.4065540194572455,
435
+ "grad_norm": 2.027897596359253,
436
+ "learning_rate": 3.8011695906432746e-05,
437
+ "loss": 2.9179,
438
+ "step": 4700
439
+ },
440
+ {
441
+ "epoch": 2.457757296466974,
442
+ "grad_norm": 2.3188517093658447,
443
+ "learning_rate": 3.774587985114301e-05,
444
+ "loss": 2.8655,
445
+ "step": 4800
446
+ },
447
+ {
448
+ "epoch": 2.5089605734767026,
449
+ "grad_norm": 1.909847617149353,
450
+ "learning_rate": 3.748006379585327e-05,
451
+ "loss": 2.8649,
452
+ "step": 4900
453
+ },
454
+ {
455
+ "epoch": 2.5601638504864312,
456
+ "grad_norm": 1.8613382577896118,
457
+ "learning_rate": 3.721424774056353e-05,
458
+ "loss": 2.8555,
459
+ "step": 5000
460
+ },
461
+ {
462
+ "epoch": 2.61136712749616,
463
+ "grad_norm": 1.7689636945724487,
464
+ "learning_rate": 3.6948431685273796e-05,
465
+ "loss": 2.8957,
466
+ "step": 5100
467
+ },
468
+ {
469
+ "epoch": 2.6625704045058884,
470
+ "grad_norm": 1.8662793636322021,
471
+ "learning_rate": 3.668261562998405e-05,
472
+ "loss": 2.8132,
473
+ "step": 5200
474
+ },
475
+ {
476
+ "epoch": 2.713773681515617,
477
+ "grad_norm": 1.9725576639175415,
478
+ "learning_rate": 3.6416799574694315e-05,
479
+ "loss": 2.8121,
480
+ "step": 5300
481
+ },
482
+ {
483
+ "epoch": 2.7649769585253456,
484
+ "grad_norm": 1.7306541204452515,
485
+ "learning_rate": 3.6150983519404574e-05,
486
+ "loss": 2.8154,
487
+ "step": 5400
488
+ },
489
+ {
490
+ "epoch": 2.8161802355350742,
491
+ "grad_norm": 1.824539065361023,
492
+ "learning_rate": 3.5885167464114834e-05,
493
+ "loss": 2.776,
494
+ "step": 5500
495
+ },
496
+ {
497
+ "epoch": 2.867383512544803,
498
+ "grad_norm": 1.9677956104278564,
499
+ "learning_rate": 3.56193514088251e-05,
500
+ "loss": 2.8155,
501
+ "step": 5600
502
+ },
503
+ {
504
+ "epoch": 2.9185867895545314,
505
+ "grad_norm": 1.9770982265472412,
506
+ "learning_rate": 3.535353535353535e-05,
507
+ "loss": 2.7638,
508
+ "step": 5700
509
+ },
510
+ {
511
+ "epoch": 2.96979006656426,
512
+ "grad_norm": 2.1280829906463623,
513
+ "learning_rate": 3.508771929824561e-05,
514
+ "loss": 2.7771,
515
+ "step": 5800
516
+ },
517
+ {
518
+ "epoch": 3.0209933435739886,
519
+ "grad_norm": 1.7677674293518066,
520
+ "learning_rate": 3.482190324295588e-05,
521
+ "loss": 2.7382,
522
+ "step": 5900
523
+ },
524
+ {
525
+ "epoch": 3.0721966205837172,
526
+ "grad_norm": 1.6372385025024414,
527
+ "learning_rate": 3.455608718766614e-05,
528
+ "loss": 2.6866,
529
+ "step": 6000
530
+ },
531
+ {
532
+ "epoch": 3.0721966205837172,
533
+ "eval_accuracy": 0.4316442393272171,
534
+ "eval_loss": 3.2849764823913574,
535
+ "eval_runtime": 95.1067,
536
+ "eval_samples_per_second": 352.699,
537
+ "eval_steps_per_second": 22.049,
538
+ "step": 6000
539
+ },
540
+ {
541
+ "epoch": 3.123399897593446,
542
+ "grad_norm": 1.9155375957489014,
543
+ "learning_rate": 3.4290271132376396e-05,
544
+ "loss": 2.6794,
545
+ "step": 6100
546
+ },
547
+ {
548
+ "epoch": 3.1746031746031744,
549
+ "grad_norm": 1.5887531042099,
550
+ "learning_rate": 3.402445507708666e-05,
551
+ "loss": 2.6785,
552
+ "step": 6200
553
+ },
554
+ {
555
+ "epoch": 3.225806451612903,
556
+ "grad_norm": 1.7831392288208008,
557
+ "learning_rate": 3.3758639021796915e-05,
558
+ "loss": 2.682,
559
+ "step": 6300
560
+ },
561
+ {
562
+ "epoch": 3.277009728622632,
563
+ "grad_norm": 1.6347112655639648,
564
+ "learning_rate": 3.349282296650718e-05,
565
+ "loss": 2.6161,
566
+ "step": 6400
567
+ },
568
+ {
569
+ "epoch": 3.32821300563236,
570
+ "grad_norm": 1.774921178817749,
571
+ "learning_rate": 3.322700691121744e-05,
572
+ "loss": 2.6727,
573
+ "step": 6500
574
+ },
575
+ {
576
+ "epoch": 3.3794162826420893,
577
+ "grad_norm": 1.7162151336669922,
578
+ "learning_rate": 3.29611908559277e-05,
579
+ "loss": 2.692,
580
+ "step": 6600
581
+ },
582
+ {
583
+ "epoch": 3.430619559651818,
584
+ "grad_norm": 1.7720268964767456,
585
+ "learning_rate": 3.2695374800637965e-05,
586
+ "loss": 2.6795,
587
+ "step": 6700
588
+ },
589
+ {
590
+ "epoch": 3.4818228366615465,
591
+ "grad_norm": 1.7839425802230835,
592
+ "learning_rate": 3.242955874534822e-05,
593
+ "loss": 2.6487,
594
+ "step": 6800
595
+ },
596
+ {
597
+ "epoch": 3.533026113671275,
598
+ "grad_norm": 1.7593103647232056,
599
+ "learning_rate": 3.216374269005848e-05,
600
+ "loss": 2.6821,
601
+ "step": 6900
602
+ },
603
+ {
604
+ "epoch": 3.5842293906810037,
605
+ "grad_norm": 1.838611125946045,
606
+ "learning_rate": 3.189792663476874e-05,
607
+ "loss": 2.6547,
608
+ "step": 7000
609
+ },
610
+ {
611
+ "epoch": 3.6354326676907323,
612
+ "grad_norm": 1.5734059810638428,
613
+ "learning_rate": 3.1632110579479e-05,
614
+ "loss": 2.6343,
615
+ "step": 7100
616
+ },
617
+ {
618
+ "epoch": 3.686635944700461,
619
+ "grad_norm": 1.7309696674346924,
620
+ "learning_rate": 3.136629452418926e-05,
621
+ "loss": 2.6379,
622
+ "step": 7200
623
+ },
624
+ {
625
+ "epoch": 3.7378392217101895,
626
+ "grad_norm": 1.837202787399292,
627
+ "learning_rate": 3.110047846889952e-05,
628
+ "loss": 2.6094,
629
+ "step": 7300
630
+ },
631
+ {
632
+ "epoch": 3.789042498719918,
633
+ "grad_norm": 1.6872186660766602,
634
+ "learning_rate": 3.083466241360978e-05,
635
+ "loss": 2.5951,
636
+ "step": 7400
637
+ },
638
+ {
639
+ "epoch": 3.8402457757296466,
640
+ "grad_norm": 1.7772648334503174,
641
+ "learning_rate": 3.056884635832004e-05,
642
+ "loss": 2.5935,
643
+ "step": 7500
644
+ },
645
+ {
646
+ "epoch": 3.8914490527393752,
647
+ "grad_norm": 1.7157689332962036,
648
+ "learning_rate": 3.0303030303030306e-05,
649
+ "loss": 2.6076,
650
+ "step": 7600
651
+ },
652
+ {
653
+ "epoch": 3.942652329749104,
654
+ "grad_norm": 1.7484747171401978,
655
+ "learning_rate": 3.0037214247740565e-05,
656
+ "loss": 2.6294,
657
+ "step": 7700
658
+ },
659
+ {
660
+ "epoch": 3.9938556067588324,
661
+ "grad_norm": 1.6567468643188477,
662
+ "learning_rate": 2.9771398192450828e-05,
663
+ "loss": 2.603,
664
+ "step": 7800
665
+ },
666
+ {
667
+ "epoch": 4.0450588837685615,
668
+ "grad_norm": 1.6986713409423828,
669
+ "learning_rate": 2.9505582137161087e-05,
670
+ "loss": 2.5767,
671
+ "step": 7900
672
+ },
673
+ {
674
+ "epoch": 4.09626216077829,
675
+ "grad_norm": 1.9400880336761475,
676
+ "learning_rate": 2.9239766081871346e-05,
677
+ "loss": 2.5063,
678
+ "step": 8000
679
+ },
680
+ {
681
+ "epoch": 4.09626216077829,
682
+ "eval_accuracy": 0.4372182483806064,
683
+ "eval_loss": 3.1963303089141846,
684
+ "eval_runtime": 95.395,
685
+ "eval_samples_per_second": 351.633,
686
+ "eval_steps_per_second": 21.982,
687
+ "step": 8000
688
+ },
689
+ {
690
+ "epoch": 4.147465437788019,
691
+ "grad_norm": 1.7278245687484741,
692
+ "learning_rate": 2.8976608187134503e-05,
693
+ "loss": 2.5088,
694
+ "step": 8100
695
+ },
696
+ {
697
+ "epoch": 4.198668714797747,
698
+ "grad_norm": 1.778096318244934,
699
+ "learning_rate": 2.8710792131844765e-05,
700
+ "loss": 2.5284,
701
+ "step": 8200
702
+ },
703
+ {
704
+ "epoch": 4.249871991807476,
705
+ "grad_norm": 1.990983009338379,
706
+ "learning_rate": 2.8444976076555025e-05,
707
+ "loss": 2.5181,
708
+ "step": 8300
709
+ },
710
+ {
711
+ "epoch": 4.301075268817204,
712
+ "grad_norm": 1.577682375907898,
713
+ "learning_rate": 2.8179160021265287e-05,
714
+ "loss": 2.4916,
715
+ "step": 8400
716
+ },
717
+ {
718
+ "epoch": 4.352278545826933,
719
+ "grad_norm": 1.6727075576782227,
720
+ "learning_rate": 2.7913343965975547e-05,
721
+ "loss": 2.5346,
722
+ "step": 8500
723
+ },
724
+ {
725
+ "epoch": 4.403481822836661,
726
+ "grad_norm": 1.6889344453811646,
727
+ "learning_rate": 2.7647527910685806e-05,
728
+ "loss": 2.5164,
729
+ "step": 8600
730
+ },
731
+ {
732
+ "epoch": 4.45468509984639,
733
+ "grad_norm": 1.763092041015625,
734
+ "learning_rate": 2.738171185539607e-05,
735
+ "loss": 2.5073,
736
+ "step": 8700
737
+ },
738
+ {
739
+ "epoch": 4.505888376856118,
740
+ "grad_norm": 1.6232457160949707,
741
+ "learning_rate": 2.7115895800106328e-05,
742
+ "loss": 2.515,
743
+ "step": 8800
744
+ },
745
+ {
746
+ "epoch": 4.5570916538658475,
747
+ "grad_norm": 1.7539464235305786,
748
+ "learning_rate": 2.6850079744816587e-05,
749
+ "loss": 2.538,
750
+ "step": 8900
751
+ },
752
+ {
753
+ "epoch": 4.6082949308755765,
754
+ "grad_norm": 1.729683518409729,
755
+ "learning_rate": 2.658426368952685e-05,
756
+ "loss": 2.5301,
757
+ "step": 9000
758
+ },
759
+ {
760
+ "epoch": 4.659498207885305,
761
+ "grad_norm": 1.753625750541687,
762
+ "learning_rate": 2.631844763423711e-05,
763
+ "loss": 2.5267,
764
+ "step": 9100
765
+ },
766
+ {
767
+ "epoch": 4.710701484895033,
768
+ "grad_norm": 1.7453761100769043,
769
+ "learning_rate": 2.605263157894737e-05,
770
+ "loss": 2.5207,
771
+ "step": 9200
772
+ },
773
+ {
774
+ "epoch": 4.761904761904762,
775
+ "grad_norm": 1.9901796579360962,
776
+ "learning_rate": 2.578681552365763e-05,
777
+ "loss": 2.4987,
778
+ "step": 9300
779
+ },
780
+ {
781
+ "epoch": 4.813108038914491,
782
+ "grad_norm": 1.6312249898910522,
783
+ "learning_rate": 2.552099946836789e-05,
784
+ "loss": 2.4943,
785
+ "step": 9400
786
+ },
787
+ {
788
+ "epoch": 4.864311315924219,
789
+ "grad_norm": 1.7069945335388184,
790
+ "learning_rate": 2.525518341307815e-05,
791
+ "loss": 2.4666,
792
+ "step": 9500
793
+ },
794
+ {
795
+ "epoch": 4.915514592933948,
796
+ "grad_norm": 1.7387930154800415,
797
+ "learning_rate": 2.4989367357788412e-05,
798
+ "loss": 2.4932,
799
+ "step": 9600
800
+ },
801
+ {
802
+ "epoch": 4.966717869943676,
803
+ "grad_norm": 1.6212061643600464,
804
+ "learning_rate": 2.472355130249867e-05,
805
+ "loss": 2.52,
806
+ "step": 9700
807
+ },
808
+ {
809
+ "epoch": 5.017921146953405,
810
+ "grad_norm": 1.7624274492263794,
811
+ "learning_rate": 2.445773524720893e-05,
812
+ "loss": 2.4938,
813
+ "step": 9800
814
+ },
815
+ {
816
+ "epoch": 5.0691244239631335,
817
+ "grad_norm": 1.824257254600525,
818
+ "learning_rate": 2.4191919191919194e-05,
819
+ "loss": 2.4275,
820
+ "step": 9900
821
+ },
822
+ {
823
+ "epoch": 5.1203277009728625,
824
+ "grad_norm": 1.616184115409851,
825
+ "learning_rate": 2.3926103136629456e-05,
826
+ "loss": 2.4139,
827
+ "step": 10000
828
+ },
829
+ {
830
+ "epoch": 5.1203277009728625,
831
+ "eval_accuracy": 0.4442466080127022,
832
+ "eval_loss": 3.1405584812164307,
833
+ "eval_runtime": 95.5986,
834
+ "eval_samples_per_second": 350.884,
835
+ "eval_steps_per_second": 21.935,
836
+ "step": 10000
837
+ },
838
+ {
839
+ "epoch": 5.171530977982591,
840
+ "grad_norm": 1.764162302017212,
841
+ "learning_rate": 2.3660287081339712e-05,
842
+ "loss": 2.4878,
843
+ "step": 10100
844
+ },
845
+ {
846
+ "epoch": 5.22273425499232,
847
+ "grad_norm": 1.6120530366897583,
848
+ "learning_rate": 2.3394471026049975e-05,
849
+ "loss": 2.4391,
850
+ "step": 10200
851
+ },
852
+ {
853
+ "epoch": 5.273937532002048,
854
+ "grad_norm": 1.6949564218521118,
855
+ "learning_rate": 2.3128654970760234e-05,
856
+ "loss": 2.4131,
857
+ "step": 10300
858
+ },
859
+ {
860
+ "epoch": 5.325140809011777,
861
+ "grad_norm": 1.667853832244873,
862
+ "learning_rate": 2.2862838915470493e-05,
863
+ "loss": 2.4112,
864
+ "step": 10400
865
+ },
866
+ {
867
+ "epoch": 5.376344086021505,
868
+ "grad_norm": 1.427404522895813,
869
+ "learning_rate": 2.2597022860180756e-05,
870
+ "loss": 2.4242,
871
+ "step": 10500
872
+ },
873
+ {
874
+ "epoch": 5.427547363031234,
875
+ "grad_norm": 1.6098235845565796,
876
+ "learning_rate": 2.2331206804891015e-05,
877
+ "loss": 2.429,
878
+ "step": 10600
879
+ },
880
+ {
881
+ "epoch": 5.478750640040962,
882
+ "grad_norm": 1.8480441570281982,
883
+ "learning_rate": 2.2065390749601278e-05,
884
+ "loss": 2.4439,
885
+ "step": 10700
886
+ },
887
+ {
888
+ "epoch": 5.529953917050691,
889
+ "grad_norm": 1.8656944036483765,
890
+ "learning_rate": 2.1799574694311537e-05,
891
+ "loss": 2.4164,
892
+ "step": 10800
893
+ },
894
+ {
895
+ "epoch": 5.58115719406042,
896
+ "grad_norm": 1.9880622625350952,
897
+ "learning_rate": 2.1533758639021797e-05,
898
+ "loss": 2.465,
899
+ "step": 10900
900
+ },
901
+ {
902
+ "epoch": 5.6323604710701485,
903
+ "grad_norm": 1.8432066440582275,
904
+ "learning_rate": 2.126794258373206e-05,
905
+ "loss": 2.4216,
906
+ "step": 11000
907
+ },
908
+ {
909
+ "epoch": 5.683563748079877,
910
+ "grad_norm": 1.7552499771118164,
911
+ "learning_rate": 2.100212652844232e-05,
912
+ "loss": 2.379,
913
+ "step": 11100
914
+ },
915
+ {
916
+ "epoch": 5.734767025089606,
917
+ "grad_norm": 1.7043852806091309,
918
+ "learning_rate": 2.0736310473152578e-05,
919
+ "loss": 2.4308,
920
+ "step": 11200
921
+ },
922
+ {
923
+ "epoch": 5.785970302099335,
924
+ "grad_norm": 1.8842682838439941,
925
+ "learning_rate": 2.047049441786284e-05,
926
+ "loss": 2.412,
927
+ "step": 11300
928
+ },
929
+ {
930
+ "epoch": 5.837173579109063,
931
+ "grad_norm": 1.6765940189361572,
932
+ "learning_rate": 2.02046783625731e-05,
933
+ "loss": 2.4299,
934
+ "step": 11400
935
+ },
936
+ {
937
+ "epoch": 5.888376856118792,
938
+ "grad_norm": 1.7585973739624023,
939
+ "learning_rate": 1.993886230728336e-05,
940
+ "loss": 2.4078,
941
+ "step": 11500
942
+ },
943
+ {
944
+ "epoch": 5.93958013312852,
945
+ "grad_norm": 1.788201928138733,
946
+ "learning_rate": 1.9673046251993622e-05,
947
+ "loss": 2.4186,
948
+ "step": 11600
949
+ },
950
+ {
951
+ "epoch": 5.990783410138249,
952
+ "grad_norm": 1.6128724813461304,
953
+ "learning_rate": 1.940723019670388e-05,
954
+ "loss": 2.3844,
955
+ "step": 11700
956
+ },
957
+ {
958
+ "epoch": 6.041986687147977,
959
+ "grad_norm": 1.7053714990615845,
960
+ "learning_rate": 1.9141414141414144e-05,
961
+ "loss": 2.3299,
962
+ "step": 11800
963
+ },
964
+ {
965
+ "epoch": 6.093189964157706,
966
+ "grad_norm": 2.06257963180542,
967
+ "learning_rate": 1.8875598086124403e-05,
968
+ "loss": 2.3666,
969
+ "step": 11900
970
+ },
971
+ {
972
+ "epoch": 6.1443932411674345,
973
+ "grad_norm": 1.8419615030288696,
974
+ "learning_rate": 1.8609782030834662e-05,
975
+ "loss": 2.3246,
976
+ "step": 12000
977
+ },
978
+ {
979
+ "epoch": 6.1443932411674345,
980
+ "eval_accuracy": 0.44839520298251256,
981
+ "eval_loss": 3.115219831466675,
982
+ "eval_runtime": 95.2063,
983
+ "eval_samples_per_second": 352.329,
984
+ "eval_steps_per_second": 22.026,
985
+ "step": 12000
986
+ },
987
+ {
988
+ "epoch": 6.1955965181771635,
989
+ "grad_norm": 1.4872585535049438,
990
+ "learning_rate": 1.834662413609782e-05,
991
+ "loss": 2.3234,
992
+ "step": 12100
993
+ },
994
+ {
995
+ "epoch": 6.246799795186892,
996
+ "grad_norm": 1.6765344142913818,
997
+ "learning_rate": 1.808080808080808e-05,
998
+ "loss": 2.353,
999
+ "step": 12200
1000
+ },
1001
+ {
1002
+ "epoch": 6.298003072196621,
1003
+ "grad_norm": 1.611560583114624,
1004
+ "learning_rate": 1.7814992025518344e-05,
1005
+ "loss": 2.3758,
1006
+ "step": 12300
1007
+ },
1008
+ {
1009
+ "epoch": 6.349206349206349,
1010
+ "grad_norm": 1.7853387594223022,
1011
+ "learning_rate": 1.75491759702286e-05,
1012
+ "loss": 2.3232,
1013
+ "step": 12400
1014
+ },
1015
+ {
1016
+ "epoch": 6.400409626216078,
1017
+ "grad_norm": 1.614193320274353,
1018
+ "learning_rate": 1.7283359914938863e-05,
1019
+ "loss": 2.3759,
1020
+ "step": 12500
1021
+ },
1022
+ {
1023
+ "epoch": 6.451612903225806,
1024
+ "grad_norm": 1.8242110013961792,
1025
+ "learning_rate": 1.7017543859649125e-05,
1026
+ "loss": 2.4106,
1027
+ "step": 12600
1028
+ },
1029
+ {
1030
+ "epoch": 6.502816180235535,
1031
+ "grad_norm": 1.674735188484192,
1032
+ "learning_rate": 1.6751727804359385e-05,
1033
+ "loss": 2.3834,
1034
+ "step": 12700
1035
+ },
1036
+ {
1037
+ "epoch": 6.554019457245264,
1038
+ "grad_norm": 1.8332616090774536,
1039
+ "learning_rate": 1.6485911749069644e-05,
1040
+ "loss": 2.3631,
1041
+ "step": 12800
1042
+ },
1043
+ {
1044
+ "epoch": 6.605222734254992,
1045
+ "grad_norm": 1.8917430639266968,
1046
+ "learning_rate": 1.6220095693779903e-05,
1047
+ "loss": 2.3549,
1048
+ "step": 12900
1049
+ },
1050
+ {
1051
+ "epoch": 6.65642601126472,
1052
+ "grad_norm": 1.9428924322128296,
1053
+ "learning_rate": 1.5954279638490166e-05,
1054
+ "loss": 2.3205,
1055
+ "step": 13000
1056
+ },
1057
+ {
1058
+ "epoch": 6.7076292882744495,
1059
+ "grad_norm": 1.639167308807373,
1060
+ "learning_rate": 1.568846358320043e-05,
1061
+ "loss": 2.3646,
1062
+ "step": 13100
1063
+ },
1064
+ {
1065
+ "epoch": 6.7588325652841785,
1066
+ "grad_norm": 1.6185379028320312,
1067
+ "learning_rate": 1.5422647527910685e-05,
1068
+ "loss": 2.3757,
1069
+ "step": 13200
1070
+ },
1071
+ {
1072
+ "epoch": 6.810035842293907,
1073
+ "grad_norm": 1.6808964014053345,
1074
+ "learning_rate": 1.5156831472620947e-05,
1075
+ "loss": 2.3542,
1076
+ "step": 13300
1077
+ },
1078
+ {
1079
+ "epoch": 6.861239119303636,
1080
+ "grad_norm": 1.4692785739898682,
1081
+ "learning_rate": 1.4891015417331208e-05,
1082
+ "loss": 2.3476,
1083
+ "step": 13400
1084
+ },
1085
+ {
1086
+ "epoch": 6.912442396313364,
1087
+ "grad_norm": 1.6999351978302002,
1088
+ "learning_rate": 1.4625199362041468e-05,
1089
+ "loss": 2.3441,
1090
+ "step": 13500
1091
+ },
1092
+ {
1093
+ "epoch": 6.963645673323093,
1094
+ "grad_norm": 1.7041444778442383,
1095
+ "learning_rate": 1.4359383306751729e-05,
1096
+ "loss": 2.3725,
1097
+ "step": 13600
1098
+ },
1099
+ {
1100
+ "epoch": 7.014848950332821,
1101
+ "grad_norm": 1.773606538772583,
1102
+ "learning_rate": 1.409356725146199e-05,
1103
+ "loss": 2.3379,
1104
+ "step": 13700
1105
+ },
1106
+ {
1107
+ "epoch": 7.06605222734255,
1108
+ "grad_norm": 1.7857049703598022,
1109
+ "learning_rate": 1.382775119617225e-05,
1110
+ "loss": 2.3471,
1111
+ "step": 13800
1112
+ },
1113
+ {
1114
+ "epoch": 7.117255504352278,
1115
+ "grad_norm": 1.7172592878341675,
1116
+ "learning_rate": 1.356193514088251e-05,
1117
+ "loss": 2.2639,
1118
+ "step": 13900
1119
+ },
1120
+ {
1121
+ "epoch": 7.168458781362007,
1122
+ "grad_norm": 2.0828044414520264,
1123
+ "learning_rate": 1.329611908559277e-05,
1124
+ "loss": 2.3111,
1125
+ "step": 14000
1126
+ },
1127
+ {
1128
+ "epoch": 7.168458781362007,
1129
+ "eval_accuracy": 0.44894773835621105,
1130
+ "eval_loss": 3.0879344940185547,
1131
+ "eval_runtime": 95.1844,
1132
+ "eval_samples_per_second": 352.411,
1133
+ "eval_steps_per_second": 22.031,
1134
+ "step": 14000
1135
+ },
1136
+ {
1137
+ "epoch": 7.2196620583717355,
1138
+ "grad_norm": 1.7171036005020142,
1139
+ "learning_rate": 1.3032961190855927e-05,
1140
+ "loss": 2.3203,
1141
+ "step": 14100
1142
+ },
1143
+ {
1144
+ "epoch": 7.2708653353814645,
1145
+ "grad_norm": 1.8851195573806763,
1146
+ "learning_rate": 1.2767145135566188e-05,
1147
+ "loss": 2.3111,
1148
+ "step": 14200
1149
+ },
1150
+ {
1151
+ "epoch": 7.322068612391193,
1152
+ "grad_norm": 1.8158164024353027,
1153
+ "learning_rate": 1.250132908027645e-05,
1154
+ "loss": 2.3227,
1155
+ "step": 14300
1156
+ },
1157
+ {
1158
+ "epoch": 7.373271889400922,
1159
+ "grad_norm": 1.833143711090088,
1160
+ "learning_rate": 1.223551302498671e-05,
1161
+ "loss": 2.2796,
1162
+ "step": 14400
1163
+ },
1164
+ {
1165
+ "epoch": 7.42447516641065,
1166
+ "grad_norm": 1.5996191501617432,
1167
+ "learning_rate": 1.196969696969697e-05,
1168
+ "loss": 2.2753,
1169
+ "step": 14500
1170
+ },
1171
+ {
1172
+ "epoch": 7.475678443420379,
1173
+ "grad_norm": 1.7543715238571167,
1174
+ "learning_rate": 1.170388091440723e-05,
1175
+ "loss": 2.3191,
1176
+ "step": 14600
1177
+ },
1178
+ {
1179
+ "epoch": 7.526881720430108,
1180
+ "grad_norm": 1.8224458694458008,
1181
+ "learning_rate": 1.1438064859117491e-05,
1182
+ "loss": 2.3362,
1183
+ "step": 14700
1184
+ },
1185
+ {
1186
+ "epoch": 7.578084997439836,
1187
+ "grad_norm": 1.825265884399414,
1188
+ "learning_rate": 1.1172248803827752e-05,
1189
+ "loss": 2.2788,
1190
+ "step": 14800
1191
+ },
1192
+ {
1193
+ "epoch": 7.629288274449565,
1194
+ "grad_norm": 1.6064096689224243,
1195
+ "learning_rate": 1.0906432748538012e-05,
1196
+ "loss": 2.3287,
1197
+ "step": 14900
1198
+ },
1199
+ {
1200
+ "epoch": 7.680491551459293,
1201
+ "grad_norm": 1.9973961114883423,
1202
+ "learning_rate": 1.0640616693248273e-05,
1203
+ "loss": 2.3054,
1204
+ "step": 15000
1205
+ },
1206
+ {
1207
+ "epoch": 7.731694828469022,
1208
+ "grad_norm": 1.9129321575164795,
1209
+ "learning_rate": 1.0374800637958534e-05,
1210
+ "loss": 2.2897,
1211
+ "step": 15100
1212
+ },
1213
+ {
1214
+ "epoch": 7.7828981054787505,
1215
+ "grad_norm": 1.7142568826675415,
1216
+ "learning_rate": 1.0108984582668795e-05,
1217
+ "loss": 2.2763,
1218
+ "step": 15200
1219
+ },
1220
+ {
1221
+ "epoch": 7.8341013824884795,
1222
+ "grad_norm": 1.8948873281478882,
1223
+ "learning_rate": 9.843168527379054e-06,
1224
+ "loss": 2.2831,
1225
+ "step": 15300
1226
+ },
1227
+ {
1228
+ "epoch": 7.885304659498208,
1229
+ "grad_norm": 1.671781301498413,
1230
+ "learning_rate": 9.577352472089313e-06,
1231
+ "loss": 2.301,
1232
+ "step": 15400
1233
+ },
1234
+ {
1235
+ "epoch": 7.936507936507937,
1236
+ "grad_norm": 1.8178682327270508,
1237
+ "learning_rate": 9.311536416799576e-06,
1238
+ "loss": 2.3297,
1239
+ "step": 15500
1240
+ },
1241
+ {
1242
+ "epoch": 7.987711213517665,
1243
+ "grad_norm": 1.7934144735336304,
1244
+ "learning_rate": 9.045720361509835e-06,
1245
+ "loss": 2.2703,
1246
+ "step": 15600
1247
+ },
1248
+ {
1249
+ "epoch": 8.038914490527393,
1250
+ "grad_norm": 1.7798469066619873,
1251
+ "learning_rate": 8.779904306220096e-06,
1252
+ "loss": 2.3071,
1253
+ "step": 15700
1254
+ },
1255
+ {
1256
+ "epoch": 8.090117767537123,
1257
+ "grad_norm": 2.005934238433838,
1258
+ "learning_rate": 8.514088250930355e-06,
1259
+ "loss": 2.2579,
1260
+ "step": 15800
1261
+ },
1262
+ {
1263
+ "epoch": 8.141321044546851,
1264
+ "grad_norm": 1.7841377258300781,
1265
+ "learning_rate": 8.248272195640618e-06,
1266
+ "loss": 2.2942,
1267
+ "step": 15900
1268
+ },
1269
+ {
1270
+ "epoch": 8.19252432155658,
1271
+ "grad_norm": 1.814175009727478,
1272
+ "learning_rate": 7.982456140350877e-06,
1273
+ "loss": 2.2761,
1274
+ "step": 16000
1275
+ },
1276
+ {
1277
+ "epoch": 8.19252432155658,
1278
+ "eval_accuracy": 0.45419183636131977,
1279
+ "eval_loss": 3.0668067932128906,
1280
+ "eval_runtime": 95.2855,
1281
+ "eval_samples_per_second": 352.037,
1282
+ "eval_steps_per_second": 22.008,
1283
+ "step": 16000
1284
+ },
1285
+ {
1286
+ "epoch": 8.243727598566307,
1287
+ "grad_norm": 1.7196694612503052,
1288
+ "learning_rate": 7.719298245614036e-06,
1289
+ "loss": 2.2475,
1290
+ "step": 16100
1291
+ },
1292
+ {
1293
+ "epoch": 8.294930875576037,
1294
+ "grad_norm": 1.780168890953064,
1295
+ "learning_rate": 7.453482190324296e-06,
1296
+ "loss": 2.2623,
1297
+ "step": 16200
1298
+ },
1299
+ {
1300
+ "epoch": 8.346134152585766,
1301
+ "grad_norm": 1.7353187799453735,
1302
+ "learning_rate": 7.187666135034557e-06,
1303
+ "loss": 2.2436,
1304
+ "step": 16300
1305
+ },
1306
+ {
1307
+ "epoch": 8.397337429595494,
1308
+ "grad_norm": 1.8766587972640991,
1309
+ "learning_rate": 6.921850079744817e-06,
1310
+ "loss": 2.2514,
1311
+ "step": 16400
1312
+ },
1313
+ {
1314
+ "epoch": 8.448540706605222,
1315
+ "grad_norm": 1.8156400918960571,
1316
+ "learning_rate": 6.656034024455077e-06,
1317
+ "loss": 2.2647,
1318
+ "step": 16500
1319
+ },
1320
+ {
1321
+ "epoch": 8.499743983614952,
1322
+ "grad_norm": 1.7604610919952393,
1323
+ "learning_rate": 6.390217969165338e-06,
1324
+ "loss": 2.2633,
1325
+ "step": 16600
1326
+ },
1327
+ {
1328
+ "epoch": 8.55094726062468,
1329
+ "grad_norm": 1.9719775915145874,
1330
+ "learning_rate": 6.124401913875599e-06,
1331
+ "loss": 2.2729,
1332
+ "step": 16700
1333
+ },
1334
+ {
1335
+ "epoch": 8.602150537634408,
1336
+ "grad_norm": 1.7817537784576416,
1337
+ "learning_rate": 5.858585858585859e-06,
1338
+ "loss": 2.2605,
1339
+ "step": 16800
1340
+ },
1341
+ {
1342
+ "epoch": 8.653353814644138,
1343
+ "grad_norm": 2.052873373031616,
1344
+ "learning_rate": 5.59276980329612e-06,
1345
+ "loss": 2.2612,
1346
+ "step": 16900
1347
+ },
1348
+ {
1349
+ "epoch": 8.704557091653866,
1350
+ "grad_norm": 1.7299405336380005,
1351
+ "learning_rate": 5.326953748006379e-06,
1352
+ "loss": 2.2544,
1353
+ "step": 17000
1354
+ },
1355
+ {
1356
+ "epoch": 8.755760368663594,
1357
+ "grad_norm": 1.68231999874115,
1358
+ "learning_rate": 5.06113769271664e-06,
1359
+ "loss": 2.2786,
1360
+ "step": 17100
1361
+ },
1362
+ {
1363
+ "epoch": 8.806963645673322,
1364
+ "grad_norm": 1.740615725517273,
1365
+ "learning_rate": 4.7953216374269005e-06,
1366
+ "loss": 2.2514,
1367
+ "step": 17200
1368
+ },
1369
+ {
1370
+ "epoch": 8.858166922683052,
1371
+ "grad_norm": 1.6444553136825562,
1372
+ "learning_rate": 4.5295055821371614e-06,
1373
+ "loss": 2.2666,
1374
+ "step": 17300
1375
+ },
1376
+ {
1377
+ "epoch": 8.90937019969278,
1378
+ "grad_norm": 1.6717356443405151,
1379
+ "learning_rate": 4.263689526847422e-06,
1380
+ "loss": 2.2459,
1381
+ "step": 17400
1382
+ },
1383
+ {
1384
+ "epoch": 8.960573476702509,
1385
+ "grad_norm": 1.9801234006881714,
1386
+ "learning_rate": 3.9978734715576826e-06,
1387
+ "loss": 2.2494,
1388
+ "step": 17500
1389
+ },
1390
+ {
1391
+ "epoch": 9.011776753712237,
1392
+ "grad_norm": 1.8575870990753174,
1393
+ "learning_rate": 3.732057416267943e-06,
1394
+ "loss": 2.2438,
1395
+ "step": 17600
1396
+ },
1397
+ {
1398
+ "epoch": 9.062980030721967,
1399
+ "grad_norm": 1.7382246255874634,
1400
+ "learning_rate": 3.4662413609782037e-06,
1401
+ "loss": 2.2627,
1402
+ "step": 17700
1403
+ },
1404
+ {
1405
+ "epoch": 9.114183307731695,
1406
+ "grad_norm": 1.739556074142456,
1407
+ "learning_rate": 3.2004253056884642e-06,
1408
+ "loss": 2.249,
1409
+ "step": 17800
1410
+ },
1411
+ {
1412
+ "epoch": 9.165386584741423,
1413
+ "grad_norm": 1.9629456996917725,
1414
+ "learning_rate": 2.9346092503987244e-06,
1415
+ "loss": 2.2321,
1416
+ "step": 17900
1417
+ },
1418
+ {
1419
+ "epoch": 9.216589861751151,
1420
+ "grad_norm": 1.9510672092437744,
1421
+ "learning_rate": 2.668793195108985e-06,
1422
+ "loss": 2.2231,
1423
+ "step": 18000
1424
+ },
1425
+ {
1426
+ "epoch": 9.216589861751151,
1427
+ "eval_accuracy": 0.4517146740233816,
1428
+ "eval_loss": 3.069502830505371,
1429
+ "eval_runtime": 95.8923,
1430
+ "eval_samples_per_second": 349.809,
1431
+ "eval_steps_per_second": 21.868,
1432
+ "step": 18000
1433
+ },
1434
+ {
1435
+ "epoch": 9.267793138760881,
1436
+ "grad_norm": 1.7088594436645508,
1437
+ "learning_rate": 2.4056353003721427e-06,
1438
+ "loss": 2.2545,
1439
+ "step": 18100
1440
+ },
1441
+ {
1442
+ "epoch": 9.31899641577061,
1443
+ "grad_norm": 1.7437766790390015,
1444
+ "learning_rate": 2.1398192450824032e-06,
1445
+ "loss": 2.232,
1446
+ "step": 18200
1447
+ },
1448
+ {
1449
+ "epoch": 9.370199692780337,
1450
+ "grad_norm": 1.936566948890686,
1451
+ "learning_rate": 1.8740031897926634e-06,
1452
+ "loss": 2.2572,
1453
+ "step": 18300
1454
+ },
1455
+ {
1456
+ "epoch": 9.421402969790067,
1457
+ "grad_norm": 1.9586554765701294,
1458
+ "learning_rate": 1.608187134502924e-06,
1459
+ "loss": 2.2504,
1460
+ "step": 18400
1461
+ },
1462
+ {
1463
+ "epoch": 9.472606246799796,
1464
+ "grad_norm": 1.8036390542984009,
1465
+ "learning_rate": 1.3423710792131845e-06,
1466
+ "loss": 2.2402,
1467
+ "step": 18500
1468
+ },
1469
+ {
1470
+ "epoch": 9.523809523809524,
1471
+ "grad_norm": 1.866513967514038,
1472
+ "learning_rate": 1.076555023923445e-06,
1473
+ "loss": 2.2471,
1474
+ "step": 18600
1475
+ },
1476
+ {
1477
+ "epoch": 9.575012800819252,
1478
+ "grad_norm": 1.9675647020339966,
1479
+ "learning_rate": 8.107389686337055e-07,
1480
+ "loss": 2.2286,
1481
+ "step": 18700
1482
+ },
1483
+ {
1484
+ "epoch": 9.626216077828982,
1485
+ "grad_norm": 1.8157950639724731,
1486
+ "learning_rate": 5.44922913343966e-07,
1487
+ "loss": 2.2222,
1488
+ "step": 18800
1489
+ },
1490
+ {
1491
+ "epoch": 9.67741935483871,
1492
+ "grad_norm": 1.8483563661575317,
1493
+ "learning_rate": 2.7910685805422647e-07,
1494
+ "loss": 2.2518,
1495
+ "step": 18900
1496
+ },
1497
+ {
1498
+ "epoch": 9.728622631848438,
1499
+ "grad_norm": 1.9589660167694092,
1500
+ "learning_rate": 1.3290802764486977e-08,
1501
+ "loss": 2.2022,
1502
+ "step": 19000
1503
+ },
1504
+ {
1505
+ "epoch": 9.728622631848438,
1506
+ "step": 19000,
1507
+ "total_flos": 7.9413964701696e+16,
1508
+ "train_loss": 2.775124670731394,
1509
+ "train_runtime": 3271.669,
1510
+ "train_samples_per_second": 92.919,
1511
+ "train_steps_per_second": 5.807
1512
+ }
1513
+ ],
1514
+ "logging_steps": 100,
1515
+ "max_steps": 19000,
1516
+ "num_input_tokens_seen": 0,
1517
+ "num_train_epochs": 10,
1518
+ "save_steps": 99999999,
1519
+ "stateful_callbacks": {
1520
+ "TrainerControl": {
1521
+ "args": {
1522
+ "should_epoch_stop": false,
1523
+ "should_evaluate": false,
1524
+ "should_log": false,
1525
+ "should_save": true,
1526
+ "should_training_stop": true
1527
+ },
1528
+ "attributes": {}
1529
+ }
1530
+ },
1531
+ "total_flos": 7.9413964701696e+16,
1532
+ "train_batch_size": 16,
1533
+ "trial_name": null,
1534
+ "trial_params": null
1535
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d250537b2585c6d06b8b0dcfae2c77def32b4b994aeb860086cca09095186307
3
+ size 5841