HenryZhang committed on
Commit
7f5f327
·
verified ·
1 Parent(s): ea3a4e2

upload rewind preference model

Browse files
README.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: rewind_scale_transformer
4
+ tags:
5
+ - reward_model
6
+ - rfm
7
+ - preference_comparisons
8
+ library_name: transformers
9
+ ---
10
+
11
+ # rewardfm/rewind_scale_preference_model
12
+
13
+ ## Model Details
14
+
15
+ - **Base Model**: rewind_scale_transformer
16
+ - **Model Type**: rewind_scale_transformer
17
+
18
+ ## Training Run
19
+
20
+ - **Wandb Run**: [rewind_scale_Progress_Pref_test_save](https://wandb.ai/clvr/rfm/runs/co2bhwuf)
21
+ - **Wandb ID**: `co2bhwuf`
22
+ - **Project**: rfm
23
+ - **Notes**: training RFM
24
+
25
+ ## Citation
26
+
27
+ If you use this model, please cite:
config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ReWiNDScaleTransformer"
4
+ ],
5
+ "causal_mask": false,
6
+ "dropout": 0.1,
7
+ "dtype": "float32",
8
+ "hidden_dim": 512,
9
+ "max_len": 16,
10
+ "model_type": "rewind_scale_transformer",
11
+ "num_attention_heads": 8,
12
+ "num_layers": 4,
13
+ "text_feature_dim": 384,
14
+ "transformers_version": "4.57.2",
15
+ "video_feature_dim": 768
16
+ }
metrics.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "step": 250,
3
+ "metrics": {
4
+ "eval_rew_align/mse_roboarena_eval": 0.0572185566740965,
5
+ "eval_rew_align/pearson_roboarena_eval": -0.20521791406560833,
6
+ "eval_p_rank/spearman_roboarena_eval": 0.1486367653896934,
7
+ "eval_p_rank/spearman_rewind_roboarena_eval": 0.285297764198399,
8
+ "eval_p_rank/ranking_acc_roboarena_eval": 0.5525760135135135,
9
+ "eval_p_rank/ranking_total_pairs_roboarena_eval": 4736.0,
10
+ "eval_rew_align/mse_mw_eval": 0.028775711768743873,
11
+ "eval_rew_align/pearson_mw_eval": -0.2622550899752058,
12
+ "eval_p_rank/spearman_mw_eval": 0.21948339897935923,
13
+ "eval_p_rank/spearman_rewind_mw_eval": 0.23529411764705882,
14
+ "eval_p_rank/avg_succ_subopt_diff_mw_eval": 0.00025275735294118495,
15
+ "eval_p_rank/min_succ_subopt_diff_mw_eval": -0.013867187499999989,
16
+ "eval_p_rank/max_succ_subopt_diff_mw_eval": 0.017187500000000022,
17
+ "eval_p_rank/avg_subopt_fail_diff_mw_eval": 0.0027573529411764708,
18
+ "eval_p_rank/min_subopt_fail_diff_mw_eval": -0.0166015625,
19
+ "eval_p_rank/max_subopt_fail_diff_mw_eval": 0.0234375,
20
+ "eval_p_rank/avg_succ_fail_diff_mw_eval": 0.0030101102941176554,
21
+ "eval_p_rank/min_succ_fail_diff_mw_eval": -0.012890624999999989,
22
+ "eval_p_rank/max_succ_fail_diff_mw_eval": 0.02128906250000001,
23
+ "eval_p_rank/ranking_acc_mw_eval": 0.6078431372549019,
24
+ "eval_p_rank/ranking_total_pairs_mw_eval": 51.0,
25
+ "eval_p_rank/spearman_utd_so101": -0.02834936490538903,
26
+ "eval_p_rank/spearman_rewind_utd_so101": -0.1,
27
+ "eval_p_rank/avg_succ_subopt_diff_utd_so101": -0.00234375,
28
+ "eval_p_rank/min_succ_subopt_diff_utd_so101": -0.029296875,
29
+ "eval_p_rank/max_succ_subopt_diff_utd_so101": 0.029296875,
30
+ "eval_p_rank/avg_subopt_fail_diff_utd_so101": 0.0,
31
+ "eval_p_rank/min_subopt_fail_diff_utd_so101": -0.0361328125,
32
+ "eval_p_rank/max_subopt_fail_diff_utd_so101": 0.0341796875,
33
+ "eval_p_rank/avg_succ_fail_diff_utd_so101": -0.00234375,
34
+ "eval_p_rank/min_succ_fail_diff_utd_so101": -0.0224609375,
35
+ "eval_p_rank/max_succ_fail_diff_utd_so101": 0.021484375,
36
+ "eval_p_rank/ranking_acc_utd_so101": 0.4666666666666667,
37
+ "eval_p_rank/ranking_total_pairs_utd_so101": 30.0,
38
+ "eval_p_rank/spearman_usc_franka": 0.359375,
39
+ "eval_p_rank/spearman_rewind_usc_franka": 0.25,
40
+ "eval_p_rank/avg_succ_subopt_diff_usc_franka": 0.003173828125,
41
+ "eval_p_rank/min_succ_subopt_diff_usc_franka": -0.001953125,
42
+ "eval_p_rank/max_succ_subopt_diff_usc_franka": 0.0107421875,
43
+ "eval_p_rank/avg_subopt_fail_diff_usc_franka": 0.00244140625,
44
+ "eval_p_rank/min_subopt_fail_diff_usc_franka": -0.0107421875,
45
+ "eval_p_rank/max_subopt_fail_diff_usc_franka": 0.017578125,
46
+ "eval_p_rank/avg_succ_fail_diff_usc_franka": 0.005615234375,
47
+ "eval_p_rank/min_succ_fail_diff_usc_franka": -0.0126953125,
48
+ "eval_p_rank/max_succ_fail_diff_usc_franka": 0.0166015625,
49
+ "eval_p_rank/ranking_acc_usc_franka": 0.5833333333333334,
50
+ "eval_p_rank/ranking_total_pairs_usc_franka": 12.0,
51
+ "eval_p_rank/spearman_usc_xarm": 0.05691772515768497,
52
+ "eval_p_rank/spearman_rewind_usc_xarm": 0.08333333333333333,
53
+ "eval_p_rank/avg_succ_subopt_diff_usc_xarm": 0.005045572916666667,
54
+ "eval_p_rank/min_succ_subopt_diff_usc_xarm": -0.015625,
55
+ "eval_p_rank/max_succ_subopt_diff_usc_xarm": 0.0185546875,
56
+ "eval_p_rank/avg_subopt_fail_diff_usc_xarm": -0.004069010416666667,
57
+ "eval_p_rank/min_subopt_fail_diff_usc_xarm": -0.0205078125,
58
+ "eval_p_rank/max_subopt_fail_diff_usc_xarm": 0.01171875,
59
+ "eval_p_rank/avg_succ_fail_diff_usc_xarm": 0.0009765625,
60
+ "eval_p_rank/min_succ_fail_diff_usc_xarm": -0.01171875,
61
+ "eval_p_rank/max_succ_fail_diff_usc_xarm": 0.013671875,
62
+ "eval_p_rank/ranking_acc_usc_xarm": 0.5555555555555556,
63
+ "eval_p_rank/ranking_total_pairs_usc_xarm": 18.0,
64
+ "time/custom_evaluations": 45.65803809789941
65
+ }
66
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a11b484e4d687462e1fdaef9d2866fb0ba7785f390b774ede6303e89196faf66
3
+ size 534874736
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "max_length": 128,
51
+ "model_max_length": 512,
52
+ "never_split": null,
53
+ "pad_to_multiple_of": null,
54
+ "pad_token": "[PAD]",
55
+ "pad_token_type_id": 0,
56
+ "padding_side": "right",
57
+ "sep_token": "[SEP]",
58
+ "stride": 0,
59
+ "strip_accents": null,
60
+ "tokenize_chinese_chars": true,
61
+ "tokenizer_class": "BertTokenizer",
62
+ "truncation_side": "right",
63
+ "truncation_strategy": "longest_first",
64
+ "unk_token": "[UNK]"
65
+ }
trainer_state.json ADDED
@@ -0,0 +1,1784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.25588536335721596,
6
+ "eval_steps": 250,
7
+ "global_step": 250,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0010235414534288639,
14
+ "grad_norm": 2.0027730464935303,
15
+ "learning_rate": 0.0,
16
+ "loss": 0.7807,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.0020470829068577278,
21
+ "grad_norm": 2.1167027950286865,
22
+ "learning_rate": 2e-09,
23
+ "loss": 0.7864,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.0030706243602865915,
28
+ "grad_norm": 2.759631395339966,
29
+ "learning_rate": 4e-09,
30
+ "loss": 0.7935,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.0040941658137154556,
35
+ "grad_norm": 1.614827036857605,
36
+ "learning_rate": 6e-09,
37
+ "loss": 0.7812,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.00511770726714432,
42
+ "grad_norm": 2.2924065589904785,
43
+ "learning_rate": 8e-09,
44
+ "loss": 0.7841,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.006141248720573183,
49
+ "grad_norm": 2.6293888092041016,
50
+ "learning_rate": 1e-08,
51
+ "loss": 0.7944,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.007164790174002047,
56
+ "grad_norm": 1.7844728231430054,
57
+ "learning_rate": 1.2e-08,
58
+ "loss": 0.7793,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.008188331627430911,
63
+ "grad_norm": 1.4052098989486694,
64
+ "learning_rate": 1.4000000000000001e-08,
65
+ "loss": 0.7671,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.009211873080859774,
70
+ "grad_norm": 2.682335376739502,
71
+ "learning_rate": 1.6e-08,
72
+ "loss": 0.795,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 0.01023541453428864,
77
+ "grad_norm": 2.45478892326355,
78
+ "learning_rate": 1.8000000000000002e-08,
79
+ "loss": 0.7918,
80
+ "step": 10
81
+ },
82
+ {
83
+ "epoch": 0.011258955987717503,
84
+ "grad_norm": 2.523688316345215,
85
+ "learning_rate": 2e-08,
86
+ "loss": 0.799,
87
+ "step": 11
88
+ },
89
+ {
90
+ "epoch": 0.012282497441146366,
91
+ "grad_norm": 2.3234951496124268,
92
+ "learning_rate": 2.2000000000000002e-08,
93
+ "loss": 0.7835,
94
+ "step": 12
95
+ },
96
+ {
97
+ "epoch": 0.01330603889457523,
98
+ "grad_norm": 2.2566919326782227,
99
+ "learning_rate": 2.4e-08,
100
+ "loss": 0.7926,
101
+ "step": 13
102
+ },
103
+ {
104
+ "epoch": 0.014329580348004094,
105
+ "grad_norm": 2.3038065433502197,
106
+ "learning_rate": 2.6e-08,
107
+ "loss": 0.7921,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.015353121801432957,
112
+ "grad_norm": 2.107079267501831,
113
+ "learning_rate": 2.8000000000000003e-08,
114
+ "loss": 0.7864,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.016376663254861822,
119
+ "grad_norm": 2.129422426223755,
120
+ "learning_rate": 3.0000000000000004e-08,
121
+ "loss": 0.7756,
122
+ "step": 16
123
+ },
124
+ {
125
+ "epoch": 0.017400204708290685,
126
+ "grad_norm": 1.5430221557617188,
127
+ "learning_rate": 3.2e-08,
128
+ "loss": 0.7695,
129
+ "step": 17
130
+ },
131
+ {
132
+ "epoch": 0.01842374616171955,
133
+ "grad_norm": 2.061033248901367,
134
+ "learning_rate": 3.4e-08,
135
+ "loss": 0.7812,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.019447287615148412,
140
+ "grad_norm": 2.001169443130493,
141
+ "learning_rate": 3.6000000000000005e-08,
142
+ "loss": 0.7859,
143
+ "step": 19
144
+ },
145
+ {
146
+ "epoch": 0.02047082906857728,
147
+ "grad_norm": 1.981490135192871,
148
+ "learning_rate": 3.8e-08,
149
+ "loss": 0.788,
150
+ "step": 20
151
+ },
152
+ {
153
+ "epoch": 0.021494370522006142,
154
+ "grad_norm": 1.7497801780700684,
155
+ "learning_rate": 4e-08,
156
+ "loss": 0.7762,
157
+ "step": 21
158
+ },
159
+ {
160
+ "epoch": 0.022517911975435005,
161
+ "grad_norm": 1.8900872468948364,
162
+ "learning_rate": 4.2e-08,
163
+ "loss": 0.7835,
164
+ "step": 22
165
+ },
166
+ {
167
+ "epoch": 0.02354145342886387,
168
+ "grad_norm": 2.0690395832061768,
169
+ "learning_rate": 4.4000000000000004e-08,
170
+ "loss": 0.7795,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.02456499488229273,
175
+ "grad_norm": 1.251330852508545,
176
+ "learning_rate": 4.6e-08,
177
+ "loss": 0.7675,
178
+ "step": 24
179
+ },
180
+ {
181
+ "epoch": 0.0255885363357216,
182
+ "grad_norm": 1.5707719326019287,
183
+ "learning_rate": 4.8e-08,
184
+ "loss": 0.7677,
185
+ "step": 25
186
+ },
187
+ {
188
+ "epoch": 0.02661207778915046,
189
+ "grad_norm": 1.831811785697937,
190
+ "learning_rate": 5.0000000000000004e-08,
191
+ "loss": 0.7677,
192
+ "step": 26
193
+ },
194
+ {
195
+ "epoch": 0.027635619242579325,
196
+ "grad_norm": 1.8244602680206299,
197
+ "learning_rate": 5.2e-08,
198
+ "loss": 0.7714,
199
+ "step": 27
200
+ },
201
+ {
202
+ "epoch": 0.028659160696008188,
203
+ "grad_norm": 2.0211703777313232,
204
+ "learning_rate": 5.400000000000001e-08,
205
+ "loss": 0.7848,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 0.02968270214943705,
210
+ "grad_norm": 1.8793987035751343,
211
+ "learning_rate": 5.6000000000000005e-08,
212
+ "loss": 0.7846,
213
+ "step": 29
214
+ },
215
+ {
216
+ "epoch": 0.030706243602865915,
217
+ "grad_norm": 1.2707512378692627,
218
+ "learning_rate": 5.8e-08,
219
+ "loss": 0.7723,
220
+ "step": 30
221
+ },
222
+ {
223
+ "epoch": 0.03172978505629478,
224
+ "grad_norm": 1.710810899734497,
225
+ "learning_rate": 6.000000000000001e-08,
226
+ "loss": 0.7788,
227
+ "step": 31
228
+ },
229
+ {
230
+ "epoch": 0.032753326509723645,
231
+ "grad_norm": 1.5903525352478027,
232
+ "learning_rate": 6.2e-08,
233
+ "loss": 0.7775,
234
+ "step": 32
235
+ },
236
+ {
237
+ "epoch": 0.033776867963152504,
238
+ "grad_norm": 2.2208263874053955,
239
+ "learning_rate": 6.4e-08,
240
+ "loss": 0.78,
241
+ "step": 33
242
+ },
243
+ {
244
+ "epoch": 0.03480040941658137,
245
+ "grad_norm": 2.3763325214385986,
246
+ "learning_rate": 6.600000000000001e-08,
247
+ "loss": 0.7939,
248
+ "step": 34
249
+ },
250
+ {
251
+ "epoch": 0.03582395087001024,
252
+ "grad_norm": 1.6977214813232422,
253
+ "learning_rate": 6.8e-08,
254
+ "loss": 0.7745,
255
+ "step": 35
256
+ },
257
+ {
258
+ "epoch": 0.0368474923234391,
259
+ "grad_norm": 2.1862456798553467,
260
+ "learning_rate": 7e-08,
261
+ "loss": 0.7844,
262
+ "step": 36
263
+ },
264
+ {
265
+ "epoch": 0.037871033776867964,
266
+ "grad_norm": 1.8891886472702026,
267
+ "learning_rate": 7.200000000000001e-08,
268
+ "loss": 0.775,
269
+ "step": 37
270
+ },
271
+ {
272
+ "epoch": 0.038894575230296824,
273
+ "grad_norm": 2.0394537448883057,
274
+ "learning_rate": 7.400000000000001e-08,
275
+ "loss": 0.7771,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 0.03991811668372569,
280
+ "grad_norm": 2.105576992034912,
281
+ "learning_rate": 7.6e-08,
282
+ "loss": 0.7862,
283
+ "step": 39
284
+ },
285
+ {
286
+ "epoch": 0.04094165813715456,
287
+ "grad_norm": 2.123842477798462,
288
+ "learning_rate": 7.8e-08,
289
+ "loss": 0.7804,
290
+ "step": 40
291
+ },
292
+ {
293
+ "epoch": 0.04196519959058342,
294
+ "grad_norm": 1.3442330360412598,
295
+ "learning_rate": 8e-08,
296
+ "loss": 0.7676,
297
+ "step": 41
298
+ },
299
+ {
300
+ "epoch": 0.042988741044012284,
301
+ "grad_norm": 2.431581735610962,
302
+ "learning_rate": 8.200000000000002e-08,
303
+ "loss": 0.7785,
304
+ "step": 42
305
+ },
306
+ {
307
+ "epoch": 0.044012282497441144,
308
+ "grad_norm": 2.3671112060546875,
309
+ "learning_rate": 8.4e-08,
310
+ "loss": 0.7996,
311
+ "step": 43
312
+ },
313
+ {
314
+ "epoch": 0.04503582395087001,
315
+ "grad_norm": 2.2737319469451904,
316
+ "learning_rate": 8.6e-08,
317
+ "loss": 0.7794,
318
+ "step": 44
319
+ },
320
+ {
321
+ "epoch": 0.04605936540429888,
322
+ "grad_norm": 1.5326226949691772,
323
+ "learning_rate": 8.800000000000001e-08,
324
+ "loss": 0.774,
325
+ "step": 45
326
+ },
327
+ {
328
+ "epoch": 0.04708290685772774,
329
+ "grad_norm": 1.845044493675232,
330
+ "learning_rate": 9e-08,
331
+ "loss": 0.7875,
332
+ "step": 46
333
+ },
334
+ {
335
+ "epoch": 0.048106448311156604,
336
+ "grad_norm": 2.3159587383270264,
337
+ "learning_rate": 9.2e-08,
338
+ "loss": 0.7916,
339
+ "step": 47
340
+ },
341
+ {
342
+ "epoch": 0.04912998976458546,
343
+ "grad_norm": 1.6539689302444458,
344
+ "learning_rate": 9.400000000000001e-08,
345
+ "loss": 0.775,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 0.05015353121801433,
350
+ "grad_norm": 1.5031073093414307,
351
+ "learning_rate": 9.6e-08,
352
+ "loss": 0.777,
353
+ "step": 49
354
+ },
355
+ {
356
+ "epoch": 0.0511770726714432,
357
+ "grad_norm": 1.9059022665023804,
358
+ "learning_rate": 9.8e-08,
359
+ "loss": 0.7836,
360
+ "step": 50
361
+ },
362
+ {
363
+ "epoch": 0.052200614124872056,
364
+ "grad_norm": 2.2032649517059326,
365
+ "learning_rate": 1.0000000000000001e-07,
366
+ "loss": 0.7862,
367
+ "step": 51
368
+ },
369
+ {
370
+ "epoch": 0.05322415557830092,
371
+ "grad_norm": 1.2774498462677002,
372
+ "learning_rate": 1.0200000000000001e-07,
373
+ "loss": 0.7691,
374
+ "step": 52
375
+ },
376
+ {
377
+ "epoch": 0.05424769703172978,
378
+ "grad_norm": 1.7091227769851685,
379
+ "learning_rate": 1.04e-07,
380
+ "loss": 0.7843,
381
+ "step": 53
382
+ },
383
+ {
384
+ "epoch": 0.05527123848515865,
385
+ "grad_norm": 1.3407061100006104,
386
+ "learning_rate": 1.0600000000000001e-07,
387
+ "loss": 0.7704,
388
+ "step": 54
389
+ },
390
+ {
391
+ "epoch": 0.05629477993858751,
392
+ "grad_norm": 1.7370580434799194,
393
+ "learning_rate": 1.0800000000000001e-07,
394
+ "loss": 0.7803,
395
+ "step": 55
396
+ },
397
+ {
398
+ "epoch": 0.057318321392016376,
399
+ "grad_norm": 2.144951105117798,
400
+ "learning_rate": 1.1e-07,
401
+ "loss": 0.7816,
402
+ "step": 56
403
+ },
404
+ {
405
+ "epoch": 0.05834186284544524,
406
+ "grad_norm": 1.5752339363098145,
407
+ "learning_rate": 1.1200000000000001e-07,
408
+ "loss": 0.7723,
409
+ "step": 57
410
+ },
411
+ {
412
+ "epoch": 0.0593654042988741,
413
+ "grad_norm": 1.595261573791504,
414
+ "learning_rate": 1.1400000000000001e-07,
415
+ "loss": 0.7785,
416
+ "step": 58
417
+ },
418
+ {
419
+ "epoch": 0.06038894575230297,
420
+ "grad_norm": 2.0418097972869873,
421
+ "learning_rate": 1.16e-07,
422
+ "loss": 0.7864,
423
+ "step": 59
424
+ },
425
+ {
426
+ "epoch": 0.06141248720573183,
427
+ "grad_norm": 1.6113048791885376,
428
+ "learning_rate": 1.1800000000000001e-07,
429
+ "loss": 0.7719,
430
+ "step": 60
431
+ },
432
+ {
433
+ "epoch": 0.062436028659160696,
434
+ "grad_norm": 1.533006191253662,
435
+ "learning_rate": 1.2000000000000002e-07,
436
+ "loss": 0.7743,
437
+ "step": 61
438
+ },
439
+ {
440
+ "epoch": 0.06345957011258956,
441
+ "grad_norm": 0.9601923823356628,
442
+ "learning_rate": 1.22e-07,
443
+ "loss": 0.7737,
444
+ "step": 62
445
+ },
446
+ {
447
+ "epoch": 0.06448311156601842,
448
+ "grad_norm": 1.3146189451217651,
449
+ "learning_rate": 1.24e-07,
450
+ "loss": 0.7683,
451
+ "step": 63
452
+ },
453
+ {
454
+ "epoch": 0.06550665301944729,
455
+ "grad_norm": 1.5222556591033936,
456
+ "learning_rate": 1.2600000000000002e-07,
457
+ "loss": 0.7739,
458
+ "step": 64
459
+ },
460
+ {
461
+ "epoch": 0.06653019447287616,
462
+ "grad_norm": 1.4367096424102783,
463
+ "learning_rate": 1.28e-07,
464
+ "loss": 0.7726,
465
+ "step": 65
466
+ },
467
+ {
468
+ "epoch": 0.06755373592630501,
469
+ "grad_norm": 1.03805673122406,
470
+ "learning_rate": 1.3e-07,
471
+ "loss": 0.7638,
472
+ "step": 66
473
+ },
474
+ {
475
+ "epoch": 0.06857727737973388,
476
+ "grad_norm": 1.9511936902999878,
477
+ "learning_rate": 1.3200000000000002e-07,
478
+ "loss": 0.7815,
479
+ "step": 67
480
+ },
481
+ {
482
+ "epoch": 0.06960081883316274,
483
+ "grad_norm": 1.5974836349487305,
484
+ "learning_rate": 1.34e-07,
485
+ "loss": 0.7713,
486
+ "step": 68
487
+ },
488
+ {
489
+ "epoch": 0.07062436028659161,
490
+ "grad_norm": 1.4479100704193115,
491
+ "learning_rate": 1.36e-07,
492
+ "loss": 0.7761,
493
+ "step": 69
494
+ },
495
+ {
496
+ "epoch": 0.07164790174002048,
497
+ "grad_norm": 1.4680284261703491,
498
+ "learning_rate": 1.3800000000000002e-07,
499
+ "loss": 0.766,
500
+ "step": 70
501
+ },
502
+ {
503
+ "epoch": 0.07267144319344933,
504
+ "grad_norm": 1.4755054712295532,
505
+ "learning_rate": 1.4e-07,
506
+ "loss": 0.7713,
507
+ "step": 71
508
+ },
509
+ {
510
+ "epoch": 0.0736949846468782,
511
+ "grad_norm": 1.3230453729629517,
512
+ "learning_rate": 1.4200000000000003e-07,
513
+ "loss": 0.7755,
514
+ "step": 72
515
+ },
516
+ {
517
+ "epoch": 0.07471852610030706,
518
+ "grad_norm": 1.8009718656539917,
519
+ "learning_rate": 1.4400000000000002e-07,
520
+ "loss": 0.7833,
521
+ "step": 73
522
+ },
523
+ {
524
+ "epoch": 0.07574206755373593,
525
+ "grad_norm": 1.5481526851654053,
526
+ "learning_rate": 1.46e-07,
527
+ "loss": 0.7806,
528
+ "step": 74
529
+ },
530
+ {
531
+ "epoch": 0.0767656090071648,
532
+ "grad_norm": 0.9285205602645874,
533
+ "learning_rate": 1.4800000000000003e-07,
534
+ "loss": 0.767,
535
+ "step": 75
536
+ },
537
+ {
538
+ "epoch": 0.07778915046059365,
539
+ "grad_norm": 1.6149866580963135,
540
+ "learning_rate": 1.5000000000000002e-07,
541
+ "loss": 0.7802,
542
+ "step": 76
543
+ },
544
+ {
545
+ "epoch": 0.07881269191402251,
546
+ "grad_norm": 1.667980670928955,
547
+ "learning_rate": 1.52e-07,
548
+ "loss": 0.7757,
549
+ "step": 77
550
+ },
551
+ {
552
+ "epoch": 0.07983623336745138,
553
+ "grad_norm": 1.862239956855774,
554
+ "learning_rate": 1.5400000000000003e-07,
555
+ "loss": 0.771,
556
+ "step": 78
557
+ },
558
+ {
559
+ "epoch": 0.08085977482088025,
560
+ "grad_norm": 1.3772761821746826,
561
+ "learning_rate": 1.56e-07,
562
+ "loss": 0.782,
563
+ "step": 79
564
+ },
565
+ {
566
+ "epoch": 0.08188331627430911,
567
+ "grad_norm": 1.9442654848098755,
568
+ "learning_rate": 1.5800000000000004e-07,
569
+ "loss": 0.7746,
570
+ "step": 80
571
+ },
572
+ {
573
+ "epoch": 0.08290685772773797,
574
+ "grad_norm": 1.420366644859314,
575
+ "learning_rate": 1.6e-07,
576
+ "loss": 0.7749,
577
+ "step": 81
578
+ },
579
+ {
580
+ "epoch": 0.08393039918116683,
581
+ "grad_norm": 1.7965872287750244,
582
+ "learning_rate": 1.62e-07,
583
+ "loss": 0.776,
584
+ "step": 82
585
+ },
586
+ {
587
+ "epoch": 0.0849539406345957,
588
+ "grad_norm": 1.811287522315979,
589
+ "learning_rate": 1.6400000000000004e-07,
590
+ "loss": 0.7794,
591
+ "step": 83
592
+ },
593
+ {
594
+ "epoch": 0.08597748208802457,
595
+ "grad_norm": 1.489158034324646,
596
+ "learning_rate": 1.66e-07,
597
+ "loss": 0.7785,
598
+ "step": 84
599
+ },
600
+ {
601
+ "epoch": 0.08700102354145343,
602
+ "grad_norm": 1.527341604232788,
603
+ "learning_rate": 1.68e-07,
604
+ "loss": 0.7804,
605
+ "step": 85
606
+ },
607
+ {
608
+ "epoch": 0.08802456499488229,
609
+ "grad_norm": 1.0682123899459839,
610
+ "learning_rate": 1.7000000000000001e-07,
611
+ "loss": 0.7717,
612
+ "step": 86
613
+ },
614
+ {
615
+ "epoch": 0.08904810644831115,
616
+ "grad_norm": 0.7309737205505371,
617
+ "learning_rate": 1.72e-07,
618
+ "loss": 0.7644,
619
+ "step": 87
620
+ },
621
+ {
622
+ "epoch": 0.09007164790174002,
623
+ "grad_norm": 1.156342625617981,
624
+ "learning_rate": 1.74e-07,
625
+ "loss": 0.7703,
626
+ "step": 88
627
+ },
628
+ {
629
+ "epoch": 0.09109518935516889,
630
+ "grad_norm": 1.4098986387252808,
631
+ "learning_rate": 1.7600000000000001e-07,
632
+ "loss": 0.774,
633
+ "step": 89
634
+ },
635
+ {
636
+ "epoch": 0.09211873080859775,
637
+ "grad_norm": 1.740477204322815,
638
+ "learning_rate": 1.78e-07,
639
+ "loss": 0.7737,
640
+ "step": 90
641
+ },
642
+ {
643
+ "epoch": 0.0931422722620266,
644
+ "grad_norm": 1.291332721710205,
645
+ "learning_rate": 1.8e-07,
646
+ "loss": 0.769,
647
+ "step": 91
648
+ },
649
+ {
650
+ "epoch": 0.09416581371545547,
651
+ "grad_norm": 1.403385877609253,
652
+ "learning_rate": 1.8200000000000002e-07,
653
+ "loss": 0.777,
654
+ "step": 92
655
+ },
656
+ {
657
+ "epoch": 0.09518935516888434,
658
+ "grad_norm": 1.0460313558578491,
659
+ "learning_rate": 1.84e-07,
660
+ "loss": 0.7654,
661
+ "step": 93
662
+ },
663
+ {
664
+ "epoch": 0.09621289662231321,
665
+ "grad_norm": 0.8738810420036316,
666
+ "learning_rate": 1.86e-07,
667
+ "loss": 0.7645,
668
+ "step": 94
669
+ },
670
+ {
671
+ "epoch": 0.09723643807574207,
672
+ "grad_norm": 1.2067152261734009,
673
+ "learning_rate": 1.8800000000000002e-07,
674
+ "loss": 0.771,
675
+ "step": 95
676
+ },
677
+ {
678
+ "epoch": 0.09825997952917093,
679
+ "grad_norm": 0.7921656370162964,
680
+ "learning_rate": 1.9e-07,
681
+ "loss": 0.7673,
682
+ "step": 96
683
+ },
684
+ {
685
+ "epoch": 0.0992835209825998,
686
+ "grad_norm": 1.2949451208114624,
687
+ "learning_rate": 1.92e-07,
688
+ "loss": 0.771,
689
+ "step": 97
690
+ },
691
+ {
692
+ "epoch": 0.10030706243602866,
693
+ "grad_norm": 1.0598716735839844,
694
+ "learning_rate": 1.9400000000000002e-07,
695
+ "loss": 0.7661,
696
+ "step": 98
697
+ },
698
+ {
699
+ "epoch": 0.10133060388945753,
700
+ "grad_norm": 0.5756526589393616,
701
+ "learning_rate": 1.96e-07,
702
+ "loss": 0.7629,
703
+ "step": 99
704
+ },
705
+ {
706
+ "epoch": 0.1023541453428864,
707
+ "grad_norm": 0.9098894596099854,
708
+ "learning_rate": 1.9800000000000003e-07,
709
+ "loss": 0.7613,
710
+ "step": 100
711
+ },
712
+ {
713
+ "epoch": 0.10337768679631525,
714
+ "grad_norm": 1.3523019552230835,
715
+ "learning_rate": 2.0000000000000002e-07,
716
+ "loss": 0.7753,
717
+ "step": 101
718
+ },
719
+ {
720
+ "epoch": 0.10440122824974411,
721
+ "grad_norm": 0.9900273680686951,
722
+ "learning_rate": 2.02e-07,
723
+ "loss": 0.7761,
724
+ "step": 102
725
+ },
726
+ {
727
+ "epoch": 0.10542476970317298,
728
+ "grad_norm": 0.7855979204177856,
729
+ "learning_rate": 2.0400000000000003e-07,
730
+ "loss": 0.7651,
731
+ "step": 103
732
+ },
733
+ {
734
+ "epoch": 0.10644831115660185,
735
+ "grad_norm": 1.0023835897445679,
736
+ "learning_rate": 2.0600000000000002e-07,
737
+ "loss": 0.7677,
738
+ "step": 104
739
+ },
740
+ {
741
+ "epoch": 0.10747185261003071,
742
+ "grad_norm": 0.6019173860549927,
743
+ "learning_rate": 2.08e-07,
744
+ "loss": 0.7606,
745
+ "step": 105
746
+ },
747
+ {
748
+ "epoch": 0.10849539406345957,
749
+ "grad_norm": 0.9002220630645752,
750
+ "learning_rate": 2.1000000000000003e-07,
751
+ "loss": 0.7701,
752
+ "step": 106
753
+ },
754
+ {
755
+ "epoch": 0.10951893551688843,
756
+ "grad_norm": 0.8429995775222778,
757
+ "learning_rate": 2.1200000000000002e-07,
758
+ "loss": 0.7656,
759
+ "step": 107
760
+ },
761
+ {
762
+ "epoch": 0.1105424769703173,
763
+ "grad_norm": 0.5726915001869202,
764
+ "learning_rate": 2.14e-07,
765
+ "loss": 0.7667,
766
+ "step": 108
767
+ },
768
+ {
769
+ "epoch": 0.11156601842374617,
770
+ "grad_norm": 0.49694034457206726,
771
+ "learning_rate": 2.1600000000000003e-07,
772
+ "loss": 0.7614,
773
+ "step": 109
774
+ },
775
+ {
776
+ "epoch": 0.11258955987717502,
777
+ "grad_norm": 0.5831499099731445,
778
+ "learning_rate": 2.1800000000000002e-07,
779
+ "loss": 0.7646,
780
+ "step": 110
781
+ },
782
+ {
783
+ "epoch": 0.11361310133060389,
784
+ "grad_norm": 0.49775540828704834,
785
+ "learning_rate": 2.2e-07,
786
+ "loss": 0.7611,
787
+ "step": 111
788
+ },
789
+ {
790
+ "epoch": 0.11463664278403275,
791
+ "grad_norm": 1.5634126663208008,
792
+ "learning_rate": 2.2200000000000003e-07,
793
+ "loss": 0.7824,
794
+ "step": 112
795
+ },
796
+ {
797
+ "epoch": 0.11566018423746162,
798
+ "grad_norm": 0.9528007507324219,
799
+ "learning_rate": 2.2400000000000002e-07,
800
+ "loss": 0.7595,
801
+ "step": 113
802
+ },
803
+ {
804
+ "epoch": 0.11668372569089049,
805
+ "grad_norm": 0.5957873463630676,
806
+ "learning_rate": 2.26e-07,
807
+ "loss": 0.761,
808
+ "step": 114
809
+ },
810
+ {
811
+ "epoch": 0.11770726714431934,
812
+ "grad_norm": 0.6020492911338806,
813
+ "learning_rate": 2.2800000000000003e-07,
814
+ "loss": 0.7605,
815
+ "step": 115
816
+ },
817
+ {
818
+ "epoch": 0.1187308085977482,
819
+ "grad_norm": 0.578925371170044,
820
+ "learning_rate": 2.3000000000000002e-07,
821
+ "loss": 0.768,
822
+ "step": 116
823
+ },
824
+ {
825
+ "epoch": 0.11975435005117707,
826
+ "grad_norm": 0.5749679207801819,
827
+ "learning_rate": 2.32e-07,
828
+ "loss": 0.7636,
829
+ "step": 117
830
+ },
831
+ {
832
+ "epoch": 0.12077789150460594,
833
+ "grad_norm": 1.0057039260864258,
834
+ "learning_rate": 2.3400000000000003e-07,
835
+ "loss": 0.7591,
836
+ "step": 118
837
+ },
838
+ {
839
+ "epoch": 0.1218014329580348,
840
+ "grad_norm": 0.5447669625282288,
841
+ "learning_rate": 2.3600000000000002e-07,
842
+ "loss": 0.7671,
843
+ "step": 119
844
+ },
845
+ {
846
+ "epoch": 0.12282497441146366,
847
+ "grad_norm": 0.5045008063316345,
848
+ "learning_rate": 2.3800000000000004e-07,
849
+ "loss": 0.7614,
850
+ "step": 120
851
+ },
852
+ {
853
+ "epoch": 0.12384851586489252,
854
+ "grad_norm": 1.2467900514602661,
855
+ "learning_rate": 2.4000000000000003e-07,
856
+ "loss": 0.7683,
857
+ "step": 121
858
+ },
859
+ {
860
+ "epoch": 0.12487205731832139,
861
+ "grad_norm": 0.49536195397377014,
862
+ "learning_rate": 2.42e-07,
863
+ "loss": 0.7626,
864
+ "step": 122
865
+ },
866
+ {
867
+ "epoch": 0.12589559877175024,
868
+ "grad_norm": 1.1078747510910034,
869
+ "learning_rate": 2.44e-07,
870
+ "loss": 0.7679,
871
+ "step": 123
872
+ },
873
+ {
874
+ "epoch": 0.1269191402251791,
875
+ "grad_norm": 0.5230876207351685,
876
+ "learning_rate": 2.46e-07,
877
+ "loss": 0.7579,
878
+ "step": 124
879
+ },
880
+ {
881
+ "epoch": 0.12794268167860798,
882
+ "grad_norm": 0.7761886119842529,
883
+ "learning_rate": 2.48e-07,
884
+ "loss": 0.768,
885
+ "step": 125
886
+ },
887
+ {
888
+ "epoch": 0.12896622313203684,
889
+ "grad_norm": 0.4926793575286865,
890
+ "learning_rate": 2.5000000000000004e-07,
891
+ "loss": 0.7639,
892
+ "step": 126
893
+ },
894
+ {
895
+ "epoch": 0.1299897645854657,
896
+ "grad_norm": 0.7263102531433105,
897
+ "learning_rate": 2.5200000000000003e-07,
898
+ "loss": 0.7619,
899
+ "step": 127
900
+ },
901
+ {
902
+ "epoch": 0.13101330603889458,
903
+ "grad_norm": 0.846560537815094,
904
+ "learning_rate": 2.54e-07,
905
+ "loss": 0.7656,
906
+ "step": 128
907
+ },
908
+ {
909
+ "epoch": 0.13203684749232344,
910
+ "grad_norm": 1.296027421951294,
911
+ "learning_rate": 2.56e-07,
912
+ "loss": 0.7713,
913
+ "step": 129
914
+ },
915
+ {
916
+ "epoch": 0.1330603889457523,
917
+ "grad_norm": 1.0898621082305908,
918
+ "learning_rate": 2.58e-07,
919
+ "loss": 0.7644,
920
+ "step": 130
921
+ },
922
+ {
923
+ "epoch": 0.13408393039918118,
924
+ "grad_norm": 0.9062381982803345,
925
+ "learning_rate": 2.6e-07,
926
+ "loss": 0.7606,
927
+ "step": 131
928
+ },
929
+ {
930
+ "epoch": 0.13510747185261002,
931
+ "grad_norm": 0.9521744847297668,
932
+ "learning_rate": 2.6200000000000004e-07,
933
+ "loss": 0.7563,
934
+ "step": 132
935
+ },
936
+ {
937
+ "epoch": 0.13613101330603888,
938
+ "grad_norm": 0.5530220866203308,
939
+ "learning_rate": 2.6400000000000003e-07,
940
+ "loss": 0.7647,
941
+ "step": 133
942
+ },
943
+ {
944
+ "epoch": 0.13715455475946775,
945
+ "grad_norm": 0.6682366132736206,
946
+ "learning_rate": 2.66e-07,
947
+ "loss": 0.7605,
948
+ "step": 134
949
+ },
950
+ {
951
+ "epoch": 0.13817809621289662,
952
+ "grad_norm": 0.647662341594696,
953
+ "learning_rate": 2.68e-07,
954
+ "loss": 0.7725,
955
+ "step": 135
956
+ },
957
+ {
958
+ "epoch": 0.13920163766632548,
959
+ "grad_norm": 0.4923568665981293,
960
+ "learning_rate": 2.7e-07,
961
+ "loss": 0.7535,
962
+ "step": 136
963
+ },
964
+ {
965
+ "epoch": 0.14022517911975435,
966
+ "grad_norm": 0.661147952079773,
967
+ "learning_rate": 2.72e-07,
968
+ "loss": 0.7634,
969
+ "step": 137
970
+ },
971
+ {
972
+ "epoch": 0.14124872057318322,
973
+ "grad_norm": 0.5441955924034119,
974
+ "learning_rate": 2.7400000000000004e-07,
975
+ "loss": 0.7635,
976
+ "step": 138
977
+ },
978
+ {
979
+ "epoch": 0.14227226202661208,
980
+ "grad_norm": 0.4902069568634033,
981
+ "learning_rate": 2.7600000000000004e-07,
982
+ "loss": 0.7602,
983
+ "step": 139
984
+ },
985
+ {
986
+ "epoch": 0.14329580348004095,
987
+ "grad_norm": 0.9088981747627258,
988
+ "learning_rate": 2.7800000000000003e-07,
989
+ "loss": 0.7597,
990
+ "step": 140
991
+ },
992
+ {
993
+ "epoch": 0.14431934493346982,
994
+ "grad_norm": 0.8669309020042419,
995
+ "learning_rate": 2.8e-07,
996
+ "loss": 0.763,
997
+ "step": 141
998
+ },
999
+ {
1000
+ "epoch": 0.14534288638689866,
1001
+ "grad_norm": 1.1272106170654297,
1002
+ "learning_rate": 2.82e-07,
1003
+ "loss": 0.7685,
1004
+ "step": 142
1005
+ },
1006
+ {
1007
+ "epoch": 0.14636642784032752,
1008
+ "grad_norm": 0.9253846406936646,
1009
+ "learning_rate": 2.8400000000000005e-07,
1010
+ "loss": 0.761,
1011
+ "step": 143
1012
+ },
1013
+ {
1014
+ "epoch": 0.1473899692937564,
1015
+ "grad_norm": 0.6668679118156433,
1016
+ "learning_rate": 2.8600000000000005e-07,
1017
+ "loss": 0.7636,
1018
+ "step": 144
1019
+ },
1020
+ {
1021
+ "epoch": 0.14841351074718526,
1022
+ "grad_norm": 0.6984054446220398,
1023
+ "learning_rate": 2.8800000000000004e-07,
1024
+ "loss": 0.7639,
1025
+ "step": 145
1026
+ },
1027
+ {
1028
+ "epoch": 0.14943705220061412,
1029
+ "grad_norm": 0.5088668465614319,
1030
+ "learning_rate": 2.9000000000000003e-07,
1031
+ "loss": 0.7575,
1032
+ "step": 146
1033
+ },
1034
+ {
1035
+ "epoch": 0.150460593654043,
1036
+ "grad_norm": 0.8230961561203003,
1037
+ "learning_rate": 2.92e-07,
1038
+ "loss": 0.763,
1039
+ "step": 147
1040
+ },
1041
+ {
1042
+ "epoch": 0.15148413510747186,
1043
+ "grad_norm": 0.5158461332321167,
1044
+ "learning_rate": 2.94e-07,
1045
+ "loss": 0.7626,
1046
+ "step": 148
1047
+ },
1048
+ {
1049
+ "epoch": 0.15250767656090072,
1050
+ "grad_norm": 1.0442547798156738,
1051
+ "learning_rate": 2.9600000000000006e-07,
1052
+ "loss": 0.7615,
1053
+ "step": 149
1054
+ },
1055
+ {
1056
+ "epoch": 0.1535312180143296,
1057
+ "grad_norm": 0.5645190477371216,
1058
+ "learning_rate": 2.9800000000000005e-07,
1059
+ "loss": 0.7549,
1060
+ "step": 150
1061
+ },
1062
+ {
1063
+ "epoch": 0.15455475946775846,
1064
+ "grad_norm": 0.4772261083126068,
1065
+ "learning_rate": 3.0000000000000004e-07,
1066
+ "loss": 0.7645,
1067
+ "step": 151
1068
+ },
1069
+ {
1070
+ "epoch": 0.1555783009211873,
1071
+ "grad_norm": 0.5313320159912109,
1072
+ "learning_rate": 3.0200000000000003e-07,
1073
+ "loss": 0.7589,
1074
+ "step": 152
1075
+ },
1076
+ {
1077
+ "epoch": 0.15660184237461616,
1078
+ "grad_norm": 0.5733660459518433,
1079
+ "learning_rate": 3.04e-07,
1080
+ "loss": 0.7556,
1081
+ "step": 153
1082
+ },
1083
+ {
1084
+ "epoch": 0.15762538382804503,
1085
+ "grad_norm": 0.7178833484649658,
1086
+ "learning_rate": 3.06e-07,
1087
+ "loss": 0.7575,
1088
+ "step": 154
1089
+ },
1090
+ {
1091
+ "epoch": 0.1586489252814739,
1092
+ "grad_norm": 0.4526512026786804,
1093
+ "learning_rate": 3.0800000000000006e-07,
1094
+ "loss": 0.7578,
1095
+ "step": 155
1096
+ },
1097
+ {
1098
+ "epoch": 0.15967246673490276,
1099
+ "grad_norm": 0.5467422008514404,
1100
+ "learning_rate": 3.1000000000000005e-07,
1101
+ "loss": 0.7574,
1102
+ "step": 156
1103
+ },
1104
+ {
1105
+ "epoch": 0.16069600818833163,
1106
+ "grad_norm": 0.6100950241088867,
1107
+ "learning_rate": 3.12e-07,
1108
+ "loss": 0.7575,
1109
+ "step": 157
1110
+ },
1111
+ {
1112
+ "epoch": 0.1617195496417605,
1113
+ "grad_norm": 0.44614189863204956,
1114
+ "learning_rate": 3.14e-07,
1115
+ "loss": 0.7581,
1116
+ "step": 158
1117
+ },
1118
+ {
1119
+ "epoch": 0.16274309109518936,
1120
+ "grad_norm": 0.5598356127738953,
1121
+ "learning_rate": 3.160000000000001e-07,
1122
+ "loss": 0.7617,
1123
+ "step": 159
1124
+ },
1125
+ {
1126
+ "epoch": 0.16376663254861823,
1127
+ "grad_norm": 0.5831120610237122,
1128
+ "learning_rate": 3.1800000000000007e-07,
1129
+ "loss": 0.7619,
1130
+ "step": 160
1131
+ },
1132
+ {
1133
+ "epoch": 0.1647901740020471,
1134
+ "grad_norm": 0.47710853815078735,
1135
+ "learning_rate": 3.2e-07,
1136
+ "loss": 0.7576,
1137
+ "step": 161
1138
+ },
1139
+ {
1140
+ "epoch": 0.16581371545547594,
1141
+ "grad_norm": 0.4452659487724304,
1142
+ "learning_rate": 3.22e-07,
1143
+ "loss": 0.763,
1144
+ "step": 162
1145
+ },
1146
+ {
1147
+ "epoch": 0.1668372569089048,
1148
+ "grad_norm": 0.5317380428314209,
1149
+ "learning_rate": 3.24e-07,
1150
+ "loss": 0.7594,
1151
+ "step": 163
1152
+ },
1153
+ {
1154
+ "epoch": 0.16786079836233367,
1155
+ "grad_norm": 0.5500309467315674,
1156
+ "learning_rate": 3.26e-07,
1157
+ "loss": 0.7523,
1158
+ "step": 164
1159
+ },
1160
+ {
1161
+ "epoch": 0.16888433981576254,
1162
+ "grad_norm": 0.5401707291603088,
1163
+ "learning_rate": 3.280000000000001e-07,
1164
+ "loss": 0.7635,
1165
+ "step": 165
1166
+ },
1167
+ {
1168
+ "epoch": 0.1699078812691914,
1169
+ "grad_norm": 0.6862873435020447,
1170
+ "learning_rate": 3.3e-07,
1171
+ "loss": 0.7547,
1172
+ "step": 166
1173
+ },
1174
+ {
1175
+ "epoch": 0.17093142272262027,
1176
+ "grad_norm": 0.4394344687461853,
1177
+ "learning_rate": 3.32e-07,
1178
+ "loss": 0.7542,
1179
+ "step": 167
1180
+ },
1181
+ {
1182
+ "epoch": 0.17195496417604914,
1183
+ "grad_norm": 0.46235671639442444,
1184
+ "learning_rate": 3.34e-07,
1185
+ "loss": 0.7493,
1186
+ "step": 168
1187
+ },
1188
+ {
1189
+ "epoch": 0.172978505629478,
1190
+ "grad_norm": 0.5838847756385803,
1191
+ "learning_rate": 3.36e-07,
1192
+ "loss": 0.7618,
1193
+ "step": 169
1194
+ },
1195
+ {
1196
+ "epoch": 0.17400204708290687,
1197
+ "grad_norm": 0.760307788848877,
1198
+ "learning_rate": 3.38e-07,
1199
+ "loss": 0.75,
1200
+ "step": 170
1201
+ },
1202
+ {
1203
+ "epoch": 0.1750255885363357,
1204
+ "grad_norm": 0.5808561444282532,
1205
+ "learning_rate": 3.4000000000000003e-07,
1206
+ "loss": 0.7603,
1207
+ "step": 171
1208
+ },
1209
+ {
1210
+ "epoch": 0.17604912998976457,
1211
+ "grad_norm": 0.7596063613891602,
1212
+ "learning_rate": 3.42e-07,
1213
+ "loss": 0.7549,
1214
+ "step": 172
1215
+ },
1216
+ {
1217
+ "epoch": 0.17707267144319344,
1218
+ "grad_norm": 0.6428470015525818,
1219
+ "learning_rate": 3.44e-07,
1220
+ "loss": 0.7548,
1221
+ "step": 173
1222
+ },
1223
+ {
1224
+ "epoch": 0.1780962128966223,
1225
+ "grad_norm": 0.47784337401390076,
1226
+ "learning_rate": 3.46e-07,
1227
+ "loss": 0.7647,
1228
+ "step": 174
1229
+ },
1230
+ {
1231
+ "epoch": 0.17911975435005117,
1232
+ "grad_norm": 0.42269936203956604,
1233
+ "learning_rate": 3.48e-07,
1234
+ "loss": 0.7577,
1235
+ "step": 175
1236
+ },
1237
+ {
1238
+ "epoch": 0.18014329580348004,
1239
+ "grad_norm": 0.6663862466812134,
1240
+ "learning_rate": 3.5000000000000004e-07,
1241
+ "loss": 0.7593,
1242
+ "step": 176
1243
+ },
1244
+ {
1245
+ "epoch": 0.1811668372569089,
1246
+ "grad_norm": 0.443998783826828,
1247
+ "learning_rate": 3.5200000000000003e-07,
1248
+ "loss": 0.7524,
1249
+ "step": 177
1250
+ },
1251
+ {
1252
+ "epoch": 0.18219037871033777,
1253
+ "grad_norm": 0.5712008476257324,
1254
+ "learning_rate": 3.54e-07,
1255
+ "loss": 0.7624,
1256
+ "step": 178
1257
+ },
1258
+ {
1259
+ "epoch": 0.18321392016376664,
1260
+ "grad_norm": 0.5632140636444092,
1261
+ "learning_rate": 3.56e-07,
1262
+ "loss": 0.7631,
1263
+ "step": 179
1264
+ },
1265
+ {
1266
+ "epoch": 0.1842374616171955,
1267
+ "grad_norm": 0.5184634327888489,
1268
+ "learning_rate": 3.58e-07,
1269
+ "loss": 0.7572,
1270
+ "step": 180
1271
+ },
1272
+ {
1273
+ "epoch": 0.18526100307062435,
1274
+ "grad_norm": 0.5643100142478943,
1275
+ "learning_rate": 3.6e-07,
1276
+ "loss": 0.7588,
1277
+ "step": 181
1278
+ },
1279
+ {
1280
+ "epoch": 0.1862845445240532,
1281
+ "grad_norm": 0.4550904333591461,
1282
+ "learning_rate": 3.6200000000000004e-07,
1283
+ "loss": 0.7603,
1284
+ "step": 182
1285
+ },
1286
+ {
1287
+ "epoch": 0.18730808597748208,
1288
+ "grad_norm": 0.6727386713027954,
1289
+ "learning_rate": 3.6400000000000003e-07,
1290
+ "loss": 0.755,
1291
+ "step": 183
1292
+ },
1293
+ {
1294
+ "epoch": 0.18833162743091095,
1295
+ "grad_norm": 0.4629902243614197,
1296
+ "learning_rate": 3.66e-07,
1297
+ "loss": 0.764,
1298
+ "step": 184
1299
+ },
1300
+ {
1301
+ "epoch": 0.18935516888433981,
1302
+ "grad_norm": 0.5423149466514587,
1303
+ "learning_rate": 3.68e-07,
1304
+ "loss": 0.7583,
1305
+ "step": 185
1306
+ },
1307
+ {
1308
+ "epoch": 0.19037871033776868,
1309
+ "grad_norm": 0.5308339595794678,
1310
+ "learning_rate": 3.7e-07,
1311
+ "loss": 0.7535,
1312
+ "step": 186
1313
+ },
1314
+ {
1315
+ "epoch": 0.19140225179119755,
1316
+ "grad_norm": 0.497243732213974,
1317
+ "learning_rate": 3.72e-07,
1318
+ "loss": 0.7566,
1319
+ "step": 187
1320
+ },
1321
+ {
1322
+ "epoch": 0.19242579324462641,
1323
+ "grad_norm": 0.5698720216751099,
1324
+ "learning_rate": 3.7400000000000004e-07,
1325
+ "loss": 0.7576,
1326
+ "step": 188
1327
+ },
1328
+ {
1329
+ "epoch": 0.19344933469805528,
1330
+ "grad_norm": 0.546074628829956,
1331
+ "learning_rate": 3.7600000000000003e-07,
1332
+ "loss": 0.7543,
1333
+ "step": 189
1334
+ },
1335
+ {
1336
+ "epoch": 0.19447287615148415,
1337
+ "grad_norm": 0.6073132157325745,
1338
+ "learning_rate": 3.78e-07,
1339
+ "loss": 0.7473,
1340
+ "step": 190
1341
+ },
1342
+ {
1343
+ "epoch": 0.195496417604913,
1344
+ "grad_norm": 0.5039142370223999,
1345
+ "learning_rate": 3.8e-07,
1346
+ "loss": 0.7528,
1347
+ "step": 191
1348
+ },
1349
+ {
1350
+ "epoch": 0.19651995905834185,
1351
+ "grad_norm": 0.5228015780448914,
1352
+ "learning_rate": 3.82e-07,
1353
+ "loss": 0.7657,
1354
+ "step": 192
1355
+ },
1356
+ {
1357
+ "epoch": 0.19754350051177072,
1358
+ "grad_norm": 0.43683943152427673,
1359
+ "learning_rate": 3.84e-07,
1360
+ "loss": 0.7541,
1361
+ "step": 193
1362
+ },
1363
+ {
1364
+ "epoch": 0.1985670419651996,
1365
+ "grad_norm": 0.42550554871559143,
1366
+ "learning_rate": 3.8600000000000004e-07,
1367
+ "loss": 0.752,
1368
+ "step": 194
1369
+ },
1370
+ {
1371
+ "epoch": 0.19959058341862845,
1372
+ "grad_norm": 0.5053896307945251,
1373
+ "learning_rate": 3.8800000000000003e-07,
1374
+ "loss": 0.7558,
1375
+ "step": 195
1376
+ },
1377
+ {
1378
+ "epoch": 0.20061412487205732,
1379
+ "grad_norm": 0.6906818151473999,
1380
+ "learning_rate": 3.9e-07,
1381
+ "loss": 0.7576,
1382
+ "step": 196
1383
+ },
1384
+ {
1385
+ "epoch": 0.2016376663254862,
1386
+ "grad_norm": 0.46059173345565796,
1387
+ "learning_rate": 3.92e-07,
1388
+ "loss": 0.7549,
1389
+ "step": 197
1390
+ },
1391
+ {
1392
+ "epoch": 0.20266120777891505,
1393
+ "grad_norm": 0.4829753339290619,
1394
+ "learning_rate": 3.94e-07,
1395
+ "loss": 0.7594,
1396
+ "step": 198
1397
+ },
1398
+ {
1399
+ "epoch": 0.20368474923234392,
1400
+ "grad_norm": 0.504260241985321,
1401
+ "learning_rate": 3.9600000000000005e-07,
1402
+ "loss": 0.7564,
1403
+ "step": 199
1404
+ },
1405
+ {
1406
+ "epoch": 0.2047082906857728,
1407
+ "grad_norm": 0.6687684655189514,
1408
+ "learning_rate": 3.9800000000000004e-07,
1409
+ "loss": 0.7524,
1410
+ "step": 200
1411
+ },
1412
+ {
1413
+ "epoch": 0.20573183213920163,
1414
+ "grad_norm": 0.5243176817893982,
1415
+ "learning_rate": 4.0000000000000003e-07,
1416
+ "loss": 0.7576,
1417
+ "step": 201
1418
+ },
1419
+ {
1420
+ "epoch": 0.2067553735926305,
1421
+ "grad_norm": 0.6751272082328796,
1422
+ "learning_rate": 4.02e-07,
1423
+ "loss": 0.7609,
1424
+ "step": 202
1425
+ },
1426
+ {
1427
+ "epoch": 0.20777891504605936,
1428
+ "grad_norm": 0.5937652587890625,
1429
+ "learning_rate": 4.04e-07,
1430
+ "loss": 0.7561,
1431
+ "step": 203
1432
+ },
1433
+ {
1434
+ "epoch": 0.20880245649948823,
1435
+ "grad_norm": 0.6328868269920349,
1436
+ "learning_rate": 4.06e-07,
1437
+ "loss": 0.7544,
1438
+ "step": 204
1439
+ },
1440
+ {
1441
+ "epoch": 0.2098259979529171,
1442
+ "grad_norm": 0.46846815943717957,
1443
+ "learning_rate": 4.0800000000000005e-07,
1444
+ "loss": 0.7582,
1445
+ "step": 205
1446
+ },
1447
+ {
1448
+ "epoch": 0.21084953940634596,
1449
+ "grad_norm": 0.4920537769794464,
1450
+ "learning_rate": 4.1000000000000004e-07,
1451
+ "loss": 0.7542,
1452
+ "step": 206
1453
+ },
1454
+ {
1455
+ "epoch": 0.21187308085977483,
1456
+ "grad_norm": 0.455229789018631,
1457
+ "learning_rate": 4.1200000000000004e-07,
1458
+ "loss": 0.7571,
1459
+ "step": 207
1460
+ },
1461
+ {
1462
+ "epoch": 0.2128966223132037,
1463
+ "grad_norm": 0.4706554114818573,
1464
+ "learning_rate": 4.1400000000000003e-07,
1465
+ "loss": 0.7583,
1466
+ "step": 208
1467
+ },
1468
+ {
1469
+ "epoch": 0.21392016376663256,
1470
+ "grad_norm": 0.5521472096443176,
1471
+ "learning_rate": 4.16e-07,
1472
+ "loss": 0.7564,
1473
+ "step": 209
1474
+ },
1475
+ {
1476
+ "epoch": 0.21494370522006143,
1477
+ "grad_norm": 0.45866256952285767,
1478
+ "learning_rate": 4.18e-07,
1479
+ "loss": 0.755,
1480
+ "step": 210
1481
+ },
1482
+ {
1483
+ "epoch": 0.21596724667349027,
1484
+ "grad_norm": 0.5188294649124146,
1485
+ "learning_rate": 4.2000000000000006e-07,
1486
+ "loss": 0.7544,
1487
+ "step": 211
1488
+ },
1489
+ {
1490
+ "epoch": 0.21699078812691913,
1491
+ "grad_norm": 0.413843035697937,
1492
+ "learning_rate": 4.2200000000000005e-07,
1493
+ "loss": 0.7508,
1494
+ "step": 212
1495
+ },
1496
+ {
1497
+ "epoch": 0.218014329580348,
1498
+ "grad_norm": 0.4665946066379547,
1499
+ "learning_rate": 4.2400000000000004e-07,
1500
+ "loss": 0.7557,
1501
+ "step": 213
1502
+ },
1503
+ {
1504
+ "epoch": 0.21903787103377687,
1505
+ "grad_norm": 0.5579046010971069,
1506
+ "learning_rate": 4.2600000000000003e-07,
1507
+ "loss": 0.7511,
1508
+ "step": 214
1509
+ },
1510
+ {
1511
+ "epoch": 0.22006141248720573,
1512
+ "grad_norm": 0.4209223687648773,
1513
+ "learning_rate": 4.28e-07,
1514
+ "loss": 0.7516,
1515
+ "step": 215
1516
+ },
1517
+ {
1518
+ "epoch": 0.2210849539406346,
1519
+ "grad_norm": 0.7076475024223328,
1520
+ "learning_rate": 4.3e-07,
1521
+ "loss": 0.7507,
1522
+ "step": 216
1523
+ },
1524
+ {
1525
+ "epoch": 0.22210849539406347,
1526
+ "grad_norm": 0.5781177282333374,
1527
+ "learning_rate": 4.3200000000000006e-07,
1528
+ "loss": 0.7475,
1529
+ "step": 217
1530
+ },
1531
+ {
1532
+ "epoch": 0.22313203684749233,
1533
+ "grad_norm": 0.8091248273849487,
1534
+ "learning_rate": 4.3400000000000005e-07,
1535
+ "loss": 0.7503,
1536
+ "step": 218
1537
+ },
1538
+ {
1539
+ "epoch": 0.2241555783009212,
1540
+ "grad_norm": 0.46331843733787537,
1541
+ "learning_rate": 4.3600000000000004e-07,
1542
+ "loss": 0.756,
1543
+ "step": 219
1544
+ },
1545
+ {
1546
+ "epoch": 0.22517911975435004,
1547
+ "grad_norm": 0.4318588674068451,
1548
+ "learning_rate": 4.3800000000000003e-07,
1549
+ "loss": 0.7534,
1550
+ "step": 220
1551
+ },
1552
+ {
1553
+ "epoch": 0.2262026612077789,
1554
+ "grad_norm": 0.5794275999069214,
1555
+ "learning_rate": 4.4e-07,
1556
+ "loss": 0.7563,
1557
+ "step": 221
1558
+ },
1559
+ {
1560
+ "epoch": 0.22722620266120777,
1561
+ "grad_norm": 0.5097821354866028,
1562
+ "learning_rate": 4.4200000000000007e-07,
1563
+ "loss": 0.7553,
1564
+ "step": 222
1565
+ },
1566
+ {
1567
+ "epoch": 0.22824974411463664,
1568
+ "grad_norm": 0.4925571084022522,
1569
+ "learning_rate": 4.4400000000000006e-07,
1570
+ "loss": 0.7549,
1571
+ "step": 223
1572
+ },
1573
+ {
1574
+ "epoch": 0.2292732855680655,
1575
+ "grad_norm": 0.5328956842422485,
1576
+ "learning_rate": 4.4600000000000005e-07,
1577
+ "loss": 0.757,
1578
+ "step": 224
1579
+ },
1580
+ {
1581
+ "epoch": 0.23029682702149437,
1582
+ "grad_norm": 0.4485833942890167,
1583
+ "learning_rate": 4.4800000000000004e-07,
1584
+ "loss": 0.7535,
1585
+ "step": 225
1586
+ },
1587
+ {
1588
+ "epoch": 0.23132036847492324,
1589
+ "grad_norm": 0.538770854473114,
1590
+ "learning_rate": 4.5000000000000003e-07,
1591
+ "loss": 0.7551,
1592
+ "step": 226
1593
+ },
1594
+ {
1595
+ "epoch": 0.2323439099283521,
1596
+ "grad_norm": 0.5424289703369141,
1597
+ "learning_rate": 4.52e-07,
1598
+ "loss": 0.7528,
1599
+ "step": 227
1600
+ },
1601
+ {
1602
+ "epoch": 0.23336745138178097,
1603
+ "grad_norm": 0.5607970356941223,
1604
+ "learning_rate": 4.5400000000000007e-07,
1605
+ "loss": 0.7534,
1606
+ "step": 228
1607
+ },
1608
+ {
1609
+ "epoch": 0.23439099283520984,
1610
+ "grad_norm": 0.5806226134300232,
1611
+ "learning_rate": 4.5600000000000006e-07,
1612
+ "loss": 0.7516,
1613
+ "step": 229
1614
+ },
1615
+ {
1616
+ "epoch": 0.23541453428863868,
1617
+ "grad_norm": 0.47338593006134033,
1618
+ "learning_rate": 4.5800000000000005e-07,
1619
+ "loss": 0.75,
1620
+ "step": 230
1621
+ },
1622
+ {
1623
+ "epoch": 0.23643807574206754,
1624
+ "grad_norm": 0.805225670337677,
1625
+ "learning_rate": 4.6000000000000004e-07,
1626
+ "loss": 0.7498,
1627
+ "step": 231
1628
+ },
1629
+ {
1630
+ "epoch": 0.2374616171954964,
1631
+ "grad_norm": 0.5711566209793091,
1632
+ "learning_rate": 4.6200000000000003e-07,
1633
+ "loss": 0.7527,
1634
+ "step": 232
1635
+ },
1636
+ {
1637
+ "epoch": 0.23848515864892528,
1638
+ "grad_norm": 0.4319440722465515,
1639
+ "learning_rate": 4.64e-07,
1640
+ "loss": 0.7493,
1641
+ "step": 233
1642
+ },
1643
+ {
1644
+ "epoch": 0.23950870010235414,
1645
+ "grad_norm": 0.49590882658958435,
1646
+ "learning_rate": 4.6600000000000007e-07,
1647
+ "loss": 0.752,
1648
+ "step": 234
1649
+ },
1650
+ {
1651
+ "epoch": 0.240532241555783,
1652
+ "grad_norm": 0.6124209761619568,
1653
+ "learning_rate": 4.6800000000000006e-07,
1654
+ "loss": 0.7517,
1655
+ "step": 235
1656
+ },
1657
+ {
1658
+ "epoch": 0.24155578300921188,
1659
+ "grad_norm": 0.4623505771160126,
1660
+ "learning_rate": 4.7000000000000005e-07,
1661
+ "loss": 0.7558,
1662
+ "step": 236
1663
+ },
1664
+ {
1665
+ "epoch": 0.24257932446264074,
1666
+ "grad_norm": 0.43490421772003174,
1667
+ "learning_rate": 4.7200000000000004e-07,
1668
+ "loss": 0.754,
1669
+ "step": 237
1670
+ },
1671
+ {
1672
+ "epoch": 0.2436028659160696,
1673
+ "grad_norm": 0.41201770305633545,
1674
+ "learning_rate": 4.7400000000000004e-07,
1675
+ "loss": 0.7554,
1676
+ "step": 238
1677
+ },
1678
+ {
1679
+ "epoch": 0.24462640736949848,
1680
+ "grad_norm": 0.5332440137863159,
1681
+ "learning_rate": 4.760000000000001e-07,
1682
+ "loss": 0.7523,
1683
+ "step": 239
1684
+ },
1685
+ {
1686
+ "epoch": 0.24564994882292732,
1687
+ "grad_norm": 0.45751631259918213,
1688
+ "learning_rate": 4.78e-07,
1689
+ "loss": 0.7501,
1690
+ "step": 240
1691
+ },
1692
+ {
1693
+ "epoch": 0.24667349027635618,
1694
+ "grad_norm": 0.46563875675201416,
1695
+ "learning_rate": 4.800000000000001e-07,
1696
+ "loss": 0.7475,
1697
+ "step": 241
1698
+ },
1699
+ {
1700
+ "epoch": 0.24769703172978505,
1701
+ "grad_norm": 0.4648246765136719,
1702
+ "learning_rate": 4.82e-07,
1703
+ "loss": 0.7607,
1704
+ "step": 242
1705
+ },
1706
+ {
1707
+ "epoch": 0.24872057318321392,
1708
+ "grad_norm": 0.42473065853118896,
1709
+ "learning_rate": 4.84e-07,
1710
+ "loss": 0.7521,
1711
+ "step": 243
1712
+ },
1713
+ {
1714
+ "epoch": 0.24974411463664278,
1715
+ "grad_norm": 0.4116007387638092,
1716
+ "learning_rate": 4.86e-07,
1717
+ "loss": 0.753,
1718
+ "step": 244
1719
+ },
1720
+ {
1721
+ "epoch": 0.2507676560900716,
1722
+ "grad_norm": 0.4429609477519989,
1723
+ "learning_rate": 4.88e-07,
1724
+ "loss": 0.7513,
1725
+ "step": 245
1726
+ },
1727
+ {
1728
+ "epoch": 0.2517911975435005,
1729
+ "grad_norm": 0.40569669008255005,
1730
+ "learning_rate": 4.900000000000001e-07,
1731
+ "loss": 0.7466,
1732
+ "step": 246
1733
+ },
1734
+ {
1735
+ "epoch": 0.25281473899692936,
1736
+ "grad_norm": 0.6898592114448547,
1737
+ "learning_rate": 4.92e-07,
1738
+ "loss": 0.7488,
1739
+ "step": 247
1740
+ },
1741
+ {
1742
+ "epoch": 0.2538382804503582,
1743
+ "grad_norm": 0.5965420603752136,
1744
+ "learning_rate": 4.940000000000001e-07,
1745
+ "loss": 0.7496,
1746
+ "step": 248
1747
+ },
1748
+ {
1749
+ "epoch": 0.2548618219037871,
1750
+ "grad_norm": 0.4469277858734131,
1751
+ "learning_rate": 4.96e-07,
1752
+ "loss": 0.7554,
1753
+ "step": 249
1754
+ },
1755
+ {
1756
+ "epoch": 0.25588536335721596,
1757
+ "grad_norm": 0.5688977837562561,
1758
+ "learning_rate": 4.98e-07,
1759
+ "loss": 0.761,
1760
+ "step": 250
1761
+ }
1762
+ ],
1763
+ "logging_steps": 1,
1764
+ "max_steps": 100000,
1765
+ "num_input_tokens_seen": 0,
1766
+ "num_train_epochs": 103,
1767
+ "save_steps": 250,
1768
+ "stateful_callbacks": {
1769
+ "TrainerControl": {
1770
+ "args": {
1771
+ "should_epoch_stop": false,
1772
+ "should_evaluate": false,
1773
+ "should_log": false,
1774
+ "should_save": false,
1775
+ "should_training_stop": false
1776
+ },
1777
+ "attributes": {}
1778
+ }
1779
+ },
1780
+ "total_flos": 0.0,
1781
+ "train_batch_size": 1024,
1782
+ "trial_name": null,
1783
+ "trial_params": null
1784
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:651726ac923849545c67ac45f6b85a4f2f97ef4b97ca89f50de722eb24f917ad
3
+ size 5841
vocab.txt ADDED
The diff for this file is too large to render. See raw diff