johnomeara commited on
Commit
3e7119d
·
verified ·
1 Parent(s): 21de8eb

Upload 8 files

Browse files
config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/dinov2-large",
3
+ "apply_layernorm": true,
4
+ "architectures": [
5
+ "Dinov2ForImageClassification"
6
+ ],
7
+ "attention_probs_dropout_prob": 0.0,
8
+ "drop_path_rate": 0.0,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.0,
11
+ "hidden_size": 1024,
12
+ "id2label": {
13
+ "0": "correct",
14
+ "1": "incorrect"
15
+ },
16
+ "image_size": 518,
17
+ "initializer_range": 0.02,
18
+ "label2id": {
19
+ "correct": 0,
20
+ "incorrect": 1
21
+ },
22
+ "layer_norm_eps": 1e-06,
23
+ "layerscale_value": 1.0,
24
+ "mlp_ratio": 4,
25
+ "model_type": "dinov2",
26
+ "num_attention_heads": 16,
27
+ "num_channels": 3,
28
+ "num_hidden_layers": 24,
29
+ "out_features": [
30
+ "stage24"
31
+ ],
32
+ "out_indices": [
33
+ 24
34
+ ],
35
+ "patch_size": 14,
36
+ "problem_type": "single_label_classification",
37
+ "qkv_bias": true,
38
+ "reshape_hidden_states": true,
39
+ "stage_names": [
40
+ "stem",
41
+ "stage1",
42
+ "stage2",
43
+ "stage3",
44
+ "stage4",
45
+ "stage5",
46
+ "stage6",
47
+ "stage7",
48
+ "stage8",
49
+ "stage9",
50
+ "stage10",
51
+ "stage11",
52
+ "stage12",
53
+ "stage13",
54
+ "stage14",
55
+ "stage15",
56
+ "stage16",
57
+ "stage17",
58
+ "stage18",
59
+ "stage19",
60
+ "stage20",
61
+ "stage21",
62
+ "stage22",
63
+ "stage23",
64
+ "stage24"
65
+ ],
66
+ "torch_dtype": "float32",
67
+ "transformers_version": "4.47.1",
68
+ "use_swiglu_ffn": false
69
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6959497928ab28564b34d956a334baee012c6a52af5ed46fd6e33dd6999246a
3
+ size 1217542512
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:068f0ef49469943ce4e61797f2c8dd1435b87edb8111a3f8f05d754152d12ca0
3
+ size 2435341946
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.485,
13
+ 0.456,
14
+ 0.406
15
+ ],
16
+ "image_processor_type": "BitImageProcessor",
17
+ "image_std": [
18
+ 0.229,
19
+ 0.224,
20
+ 0.225
21
+ ],
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "shortest_edge": 256
26
+ }
27
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e89cbde481fa889fb095dccd0e27796b9df4d9d6e2b39417c22fb2c0ec6285b
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e26293a4ac99ad99306eb23f6a8d8c5e13b975b27036f16e65252fbc3ffff2ff
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,2218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8017572519758047,
3
+ "best_model_checkpoint": "Obstacle/dinov2/checkpoint-2800",
4
+ "epoch": 24.783964365256125,
5
+ "eval_steps": 500,
6
+ "global_step": 2800,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08908685968819599,
13
+ "grad_norm": 105.50035095214844,
14
+ "learning_rate": 3.5714285714285716e-07,
15
+ "loss": 3.6297,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.17817371937639198,
20
+ "grad_norm": 120.46121978759766,
21
+ "learning_rate": 7.142857142857143e-07,
22
+ "loss": 3.5465,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.267260579064588,
27
+ "grad_norm": 201.12693786621094,
28
+ "learning_rate": 1.0714285714285714e-06,
29
+ "loss": 3.1817,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.35634743875278396,
34
+ "grad_norm": 125.14132690429688,
35
+ "learning_rate": 1.4285714285714286e-06,
36
+ "loss": 3.1466,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.44543429844098,
41
+ "grad_norm": 125.6826400756836,
42
+ "learning_rate": 1.7857142857142859e-06,
43
+ "loss": 2.6557,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.534521158129176,
48
+ "grad_norm": 85.71980285644531,
49
+ "learning_rate": 2.1428571428571427e-06,
50
+ "loss": 2.5632,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.623608017817372,
55
+ "grad_norm": 109.28083801269531,
56
+ "learning_rate": 2.5e-06,
57
+ "loss": 2.8634,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.7126948775055679,
62
+ "grad_norm": 151.45330810546875,
63
+ "learning_rate": 2.8571428571428573e-06,
64
+ "loss": 2.6854,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.8017817371937639,
69
+ "grad_norm": 85.33807373046875,
70
+ "learning_rate": 3.2142857142857147e-06,
71
+ "loss": 2.6114,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.89086859688196,
76
+ "grad_norm": 186.34913635253906,
77
+ "learning_rate": 3.5714285714285718e-06,
78
+ "loss": 2.5097,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.9799554565701559,
83
+ "grad_norm": 64.78919219970703,
84
+ "learning_rate": 3.928571428571429e-06,
85
+ "loss": 2.5137,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 1.0,
90
+ "eval_loss": 0.5945847630500793,
91
+ "eval_macro_f1": 0.6702382389633386,
92
+ "eval_runtime": 38.7104,
93
+ "eval_samples_per_second": 23.198,
94
+ "eval_steps_per_second": 2.919,
95
+ "step": 113
96
+ },
97
+ {
98
+ "epoch": 1.0623608017817372,
99
+ "grad_norm": 70.03704071044922,
100
+ "learning_rate": 4.2857142857142855e-06,
101
+ "loss": 2.0547,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 1.1514476614699332,
106
+ "grad_norm": 60.801490783691406,
107
+ "learning_rate": 4.642857142857144e-06,
108
+ "loss": 2.2826,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 1.2405345211581291,
113
+ "grad_norm": 316.7510070800781,
114
+ "learning_rate": 5e-06,
115
+ "loss": 2.4794,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 1.329621380846325,
120
+ "grad_norm": 58.09334945678711,
121
+ "learning_rate": 5.357142857142857e-06,
122
+ "loss": 2.1821,
123
+ "step": 150
124
+ },
125
+ {
126
+ "epoch": 1.4187082405345213,
127
+ "grad_norm": 189.68980407714844,
128
+ "learning_rate": 5.7142857142857145e-06,
129
+ "loss": 2.4679,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 1.507795100222717,
134
+ "grad_norm": 3325.044189453125,
135
+ "learning_rate": 6.071428571428571e-06,
136
+ "loss": 2.1725,
137
+ "step": 170
138
+ },
139
+ {
140
+ "epoch": 1.5968819599109132,
141
+ "grad_norm": 189.6019287109375,
142
+ "learning_rate": 6.4285714285714295e-06,
143
+ "loss": 2.2086,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 1.6859688195991092,
148
+ "grad_norm": 46.3530387878418,
149
+ "learning_rate": 6.785714285714287e-06,
150
+ "loss": 2.27,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 1.7750556792873051,
155
+ "grad_norm": 311.6956787109375,
156
+ "learning_rate": 7.1428571428571436e-06,
157
+ "loss": 2.2687,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 1.864142538975501,
162
+ "grad_norm": 59.799678802490234,
163
+ "learning_rate": 7.500000000000001e-06,
164
+ "loss": 2.219,
165
+ "step": 210
166
+ },
167
+ {
168
+ "epoch": 1.953229398663697,
169
+ "grad_norm": 110.08448791503906,
170
+ "learning_rate": 7.857142857142858e-06,
171
+ "loss": 2.6144,
172
+ "step": 220
173
+ },
174
+ {
175
+ "epoch": 2.0,
176
+ "eval_loss": 0.5521021485328674,
177
+ "eval_macro_f1": 0.7138090898678897,
178
+ "eval_runtime": 25.7637,
179
+ "eval_samples_per_second": 34.855,
180
+ "eval_steps_per_second": 4.386,
181
+ "step": 226
182
+ },
183
+ {
184
+ "epoch": 2.035634743875278,
185
+ "grad_norm": 102.7503433227539,
186
+ "learning_rate": 8.214285714285714e-06,
187
+ "loss": 2.4624,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 2.1247216035634744,
192
+ "grad_norm": 145.72938537597656,
193
+ "learning_rate": 8.571428571428571e-06,
194
+ "loss": 2.0923,
195
+ "step": 240
196
+ },
197
+ {
198
+ "epoch": 2.2138084632516706,
199
+ "grad_norm": 55.424922943115234,
200
+ "learning_rate": 8.92857142857143e-06,
201
+ "loss": 1.8513,
202
+ "step": 250
203
+ },
204
+ {
205
+ "epoch": 2.3028953229398663,
206
+ "grad_norm": 60.43095779418945,
207
+ "learning_rate": 9.285714285714288e-06,
208
+ "loss": 2.278,
209
+ "step": 260
210
+ },
211
+ {
212
+ "epoch": 2.3919821826280625,
213
+ "grad_norm": 51.914947509765625,
214
+ "learning_rate": 9.642857142857144e-06,
215
+ "loss": 2.2194,
216
+ "step": 270
217
+ },
218
+ {
219
+ "epoch": 2.4810690423162582,
220
+ "grad_norm": 72.88397979736328,
221
+ "learning_rate": 1e-05,
222
+ "loss": 2.1676,
223
+ "step": 280
224
+ },
225
+ {
226
+ "epoch": 2.5701559020044544,
227
+ "grad_norm": 70.16185760498047,
228
+ "learning_rate": 9.960317460317462e-06,
229
+ "loss": 2.0797,
230
+ "step": 290
231
+ },
232
+ {
233
+ "epoch": 2.65924276169265,
234
+ "grad_norm": 42.27340316772461,
235
+ "learning_rate": 9.920634920634922e-06,
236
+ "loss": 2.0556,
237
+ "step": 300
238
+ },
239
+ {
240
+ "epoch": 2.7483296213808464,
241
+ "grad_norm": 147.33399963378906,
242
+ "learning_rate": 9.880952380952381e-06,
243
+ "loss": 1.7408,
244
+ "step": 310
245
+ },
246
+ {
247
+ "epoch": 2.8374164810690425,
248
+ "grad_norm": 101.6998519897461,
249
+ "learning_rate": 9.841269841269842e-06,
250
+ "loss": 2.1287,
251
+ "step": 320
252
+ },
253
+ {
254
+ "epoch": 2.9265033407572383,
255
+ "grad_norm": 62.05525207519531,
256
+ "learning_rate": 9.801587301587301e-06,
257
+ "loss": 2.2084,
258
+ "step": 330
259
+ },
260
+ {
261
+ "epoch": 3.0,
262
+ "eval_loss": 0.9312511682510376,
263
+ "eval_macro_f1": 0.5517047201798773,
264
+ "eval_runtime": 30.6876,
265
+ "eval_samples_per_second": 29.263,
266
+ "eval_steps_per_second": 3.682,
267
+ "step": 339
268
+ },
269
+ {
270
+ "epoch": 3.0089086859688194,
271
+ "grad_norm": 223.84640502929688,
272
+ "learning_rate": 9.761904761904762e-06,
273
+ "loss": 1.9799,
274
+ "step": 340
275
+ },
276
+ {
277
+ "epoch": 3.0979955456570156,
278
+ "grad_norm": 96.64312744140625,
279
+ "learning_rate": 9.722222222222223e-06,
280
+ "loss": 2.769,
281
+ "step": 350
282
+ },
283
+ {
284
+ "epoch": 3.187082405345212,
285
+ "grad_norm": 45.61930465698242,
286
+ "learning_rate": 9.682539682539683e-06,
287
+ "loss": 1.853,
288
+ "step": 360
289
+ },
290
+ {
291
+ "epoch": 3.2761692650334076,
292
+ "grad_norm": 66.43023681640625,
293
+ "learning_rate": 9.642857142857144e-06,
294
+ "loss": 1.4603,
295
+ "step": 370
296
+ },
297
+ {
298
+ "epoch": 3.3652561247216037,
299
+ "grad_norm": 123.35740661621094,
300
+ "learning_rate": 9.603174603174605e-06,
301
+ "loss": 2.124,
302
+ "step": 380
303
+ },
304
+ {
305
+ "epoch": 3.4543429844097995,
306
+ "grad_norm": 46.054222106933594,
307
+ "learning_rate": 9.563492063492064e-06,
308
+ "loss": 1.8356,
309
+ "step": 390
310
+ },
311
+ {
312
+ "epoch": 3.5434298440979957,
313
+ "grad_norm": 202.58151245117188,
314
+ "learning_rate": 9.523809523809525e-06,
315
+ "loss": 1.6048,
316
+ "step": 400
317
+ },
318
+ {
319
+ "epoch": 3.6325167037861914,
320
+ "grad_norm": 80.48709869384766,
321
+ "learning_rate": 9.484126984126984e-06,
322
+ "loss": 2.1942,
323
+ "step": 410
324
+ },
325
+ {
326
+ "epoch": 3.7216035634743876,
327
+ "grad_norm": 73.6802978515625,
328
+ "learning_rate": 9.444444444444445e-06,
329
+ "loss": 1.7107,
330
+ "step": 420
331
+ },
332
+ {
333
+ "epoch": 3.8106904231625833,
334
+ "grad_norm": 80.23323822021484,
335
+ "learning_rate": 9.404761904761905e-06,
336
+ "loss": 1.5238,
337
+ "step": 430
338
+ },
339
+ {
340
+ "epoch": 3.8997772828507795,
341
+ "grad_norm": 62.258705139160156,
342
+ "learning_rate": 9.365079365079366e-06,
343
+ "loss": 1.7122,
344
+ "step": 440
345
+ },
346
+ {
347
+ "epoch": 3.9888641425389757,
348
+ "grad_norm": 76.88375854492188,
349
+ "learning_rate": 9.325396825396827e-06,
350
+ "loss": 1.6925,
351
+ "step": 450
352
+ },
353
+ {
354
+ "epoch": 4.0,
355
+ "eval_loss": 0.49176380038261414,
356
+ "eval_macro_f1": 0.7682737403294986,
357
+ "eval_runtime": 27.8235,
358
+ "eval_samples_per_second": 32.275,
359
+ "eval_steps_per_second": 4.061,
360
+ "step": 452
361
+ },
362
+ {
363
+ "epoch": 4.071269487750556,
364
+ "grad_norm": 46.56773376464844,
365
+ "learning_rate": 9.285714285714288e-06,
366
+ "loss": 1.1574,
367
+ "step": 460
368
+ },
369
+ {
370
+ "epoch": 4.160356347438753,
371
+ "grad_norm": 53.96033477783203,
372
+ "learning_rate": 9.246031746031747e-06,
373
+ "loss": 1.5659,
374
+ "step": 470
375
+ },
376
+ {
377
+ "epoch": 4.249443207126949,
378
+ "grad_norm": 61.506927490234375,
379
+ "learning_rate": 9.206349206349207e-06,
380
+ "loss": 1.3966,
381
+ "step": 480
382
+ },
383
+ {
384
+ "epoch": 4.338530066815145,
385
+ "grad_norm": 60.22172546386719,
386
+ "learning_rate": 9.166666666666666e-06,
387
+ "loss": 1.1754,
388
+ "step": 490
389
+ },
390
+ {
391
+ "epoch": 4.427616926503341,
392
+ "grad_norm": 56.39080810546875,
393
+ "learning_rate": 9.126984126984127e-06,
394
+ "loss": 1.6733,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 4.5167037861915365,
399
+ "grad_norm": 66.48773193359375,
400
+ "learning_rate": 9.087301587301588e-06,
401
+ "loss": 1.1759,
402
+ "step": 510
403
+ },
404
+ {
405
+ "epoch": 4.605790645879733,
406
+ "grad_norm": 75.4937973022461,
407
+ "learning_rate": 9.047619047619049e-06,
408
+ "loss": 1.3468,
409
+ "step": 520
410
+ },
411
+ {
412
+ "epoch": 4.694877505567929,
413
+ "grad_norm": 58.68571090698242,
414
+ "learning_rate": 9.00793650793651e-06,
415
+ "loss": 1.3929,
416
+ "step": 530
417
+ },
418
+ {
419
+ "epoch": 4.783964365256125,
420
+ "grad_norm": 52.5469970703125,
421
+ "learning_rate": 8.968253968253968e-06,
422
+ "loss": 1.3248,
423
+ "step": 540
424
+ },
425
+ {
426
+ "epoch": 4.873051224944321,
427
+ "grad_norm": 138.5091094970703,
428
+ "learning_rate": 8.92857142857143e-06,
429
+ "loss": 1.5388,
430
+ "step": 550
431
+ },
432
+ {
433
+ "epoch": 4.9621380846325165,
434
+ "grad_norm": 66.50778198242188,
435
+ "learning_rate": 8.888888888888888e-06,
436
+ "loss": 1.4344,
437
+ "step": 560
438
+ },
439
+ {
440
+ "epoch": 5.0,
441
+ "eval_loss": 0.49091583490371704,
442
+ "eval_macro_f1": 0.7592655412453324,
443
+ "eval_runtime": 27.4798,
444
+ "eval_samples_per_second": 32.679,
445
+ "eval_steps_per_second": 4.112,
446
+ "step": 565
447
+ },
448
+ {
449
+ "epoch": 5.044543429844098,
450
+ "grad_norm": 99.18921661376953,
451
+ "learning_rate": 8.849206349206349e-06,
452
+ "loss": 1.3262,
453
+ "step": 570
454
+ },
455
+ {
456
+ "epoch": 5.133630289532294,
457
+ "grad_norm": 73.59988403320312,
458
+ "learning_rate": 8.80952380952381e-06,
459
+ "loss": 1.2284,
460
+ "step": 580
461
+ },
462
+ {
463
+ "epoch": 5.22271714922049,
464
+ "grad_norm": 258.84918212890625,
465
+ "learning_rate": 8.76984126984127e-06,
466
+ "loss": 1.4667,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 5.311804008908686,
471
+ "grad_norm": 69.11897277832031,
472
+ "learning_rate": 8.730158730158731e-06,
473
+ "loss": 1.3145,
474
+ "step": 600
475
+ },
476
+ {
477
+ "epoch": 5.400890868596882,
478
+ "grad_norm": 33.039669036865234,
479
+ "learning_rate": 8.690476190476192e-06,
480
+ "loss": 0.9991,
481
+ "step": 610
482
+ },
483
+ {
484
+ "epoch": 5.489977728285078,
485
+ "grad_norm": 78.3748779296875,
486
+ "learning_rate": 8.650793650793651e-06,
487
+ "loss": 1.1708,
488
+ "step": 620
489
+ },
490
+ {
491
+ "epoch": 5.579064587973274,
492
+ "grad_norm": 61.205711364746094,
493
+ "learning_rate": 8.611111111111112e-06,
494
+ "loss": 1.0329,
495
+ "step": 630
496
+ },
497
+ {
498
+ "epoch": 5.66815144766147,
499
+ "grad_norm": 62.874446868896484,
500
+ "learning_rate": 8.571428571428571e-06,
501
+ "loss": 1.1144,
502
+ "step": 640
503
+ },
504
+ {
505
+ "epoch": 5.757238307349666,
506
+ "grad_norm": 86.9574203491211,
507
+ "learning_rate": 8.531746031746032e-06,
508
+ "loss": 0.9814,
509
+ "step": 650
510
+ },
511
+ {
512
+ "epoch": 5.846325167037862,
513
+ "grad_norm": 55.24412536621094,
514
+ "learning_rate": 8.492063492063492e-06,
515
+ "loss": 1.0457,
516
+ "step": 660
517
+ },
518
+ {
519
+ "epoch": 5.935412026726058,
520
+ "grad_norm": 96.79296112060547,
521
+ "learning_rate": 8.452380952380953e-06,
522
+ "loss": 1.2971,
523
+ "step": 670
524
+ },
525
+ {
526
+ "epoch": 6.0,
527
+ "eval_loss": 0.6005875468254089,
528
+ "eval_macro_f1": 0.7454914303614315,
529
+ "eval_runtime": 29.9288,
530
+ "eval_samples_per_second": 30.004,
531
+ "eval_steps_per_second": 3.776,
532
+ "step": 678
533
+ },
534
+ {
535
+ "epoch": 6.017817371937639,
536
+ "grad_norm": 124.40799713134766,
537
+ "learning_rate": 8.412698412698414e-06,
538
+ "loss": 1.3457,
539
+ "step": 680
540
+ },
541
+ {
542
+ "epoch": 6.106904231625835,
543
+ "grad_norm": 75.6956787109375,
544
+ "learning_rate": 8.373015873015875e-06,
545
+ "loss": 1.1006,
546
+ "step": 690
547
+ },
548
+ {
549
+ "epoch": 6.195991091314031,
550
+ "grad_norm": 94.5655746459961,
551
+ "learning_rate": 8.333333333333334e-06,
552
+ "loss": 1.0031,
553
+ "step": 700
554
+ },
555
+ {
556
+ "epoch": 6.285077951002227,
557
+ "grad_norm": 84.76609802246094,
558
+ "learning_rate": 8.293650793650794e-06,
559
+ "loss": 0.9654,
560
+ "step": 710
561
+ },
562
+ {
563
+ "epoch": 6.374164810690424,
564
+ "grad_norm": 44.62492752075195,
565
+ "learning_rate": 8.253968253968254e-06,
566
+ "loss": 1.0131,
567
+ "step": 720
568
+ },
569
+ {
570
+ "epoch": 6.463251670378619,
571
+ "grad_norm": 65.90361022949219,
572
+ "learning_rate": 8.214285714285714e-06,
573
+ "loss": 0.8061,
574
+ "step": 730
575
+ },
576
+ {
577
+ "epoch": 6.552338530066815,
578
+ "grad_norm": 81.76494598388672,
579
+ "learning_rate": 8.174603174603175e-06,
580
+ "loss": 0.9876,
581
+ "step": 740
582
+ },
583
+ {
584
+ "epoch": 6.641425389755011,
585
+ "grad_norm": 116.49346923828125,
586
+ "learning_rate": 8.134920634920636e-06,
587
+ "loss": 1.208,
588
+ "step": 750
589
+ },
590
+ {
591
+ "epoch": 6.7305122494432075,
592
+ "grad_norm": 50.356597900390625,
593
+ "learning_rate": 8.095238095238097e-06,
594
+ "loss": 0.9099,
595
+ "step": 760
596
+ },
597
+ {
598
+ "epoch": 6.819599109131403,
599
+ "grad_norm": 98.01029205322266,
600
+ "learning_rate": 8.055555555555557e-06,
601
+ "loss": 0.7744,
602
+ "step": 770
603
+ },
604
+ {
605
+ "epoch": 6.908685968819599,
606
+ "grad_norm": 72.4462661743164,
607
+ "learning_rate": 8.015873015873016e-06,
608
+ "loss": 1.1695,
609
+ "step": 780
610
+ },
611
+ {
612
+ "epoch": 6.997772828507795,
613
+ "grad_norm": 67.33780670166016,
614
+ "learning_rate": 7.976190476190477e-06,
615
+ "loss": 0.8807,
616
+ "step": 790
617
+ },
618
+ {
619
+ "epoch": 7.0,
620
+ "eval_loss": 0.5682073831558228,
621
+ "eval_macro_f1": 0.7816147859922179,
622
+ "eval_runtime": 24.4148,
623
+ "eval_samples_per_second": 36.781,
624
+ "eval_steps_per_second": 4.628,
625
+ "step": 791
626
+ },
627
+ {
628
+ "epoch": 7.080178173719377,
629
+ "grad_norm": 44.45954513549805,
630
+ "learning_rate": 7.936507936507936e-06,
631
+ "loss": 0.591,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 7.169265033407572,
636
+ "grad_norm": 36.26639175415039,
637
+ "learning_rate": 7.896825396825397e-06,
638
+ "loss": 0.6219,
639
+ "step": 810
640
+ },
641
+ {
642
+ "epoch": 7.258351893095768,
643
+ "grad_norm": 40.69953155517578,
644
+ "learning_rate": 7.857142857142858e-06,
645
+ "loss": 0.6987,
646
+ "step": 820
647
+ },
648
+ {
649
+ "epoch": 7.347438752783964,
650
+ "grad_norm": 64.46514129638672,
651
+ "learning_rate": 7.817460317460318e-06,
652
+ "loss": 0.7194,
653
+ "step": 830
654
+ },
655
+ {
656
+ "epoch": 7.436525612472161,
657
+ "grad_norm": 63.40292739868164,
658
+ "learning_rate": 7.77777777777778e-06,
659
+ "loss": 0.7534,
660
+ "step": 840
661
+ },
662
+ {
663
+ "epoch": 7.525612472160356,
664
+ "grad_norm": 42.322147369384766,
665
+ "learning_rate": 7.738095238095238e-06,
666
+ "loss": 0.8983,
667
+ "step": 850
668
+ },
669
+ {
670
+ "epoch": 7.614699331848552,
671
+ "grad_norm": 70.19657135009766,
672
+ "learning_rate": 7.698412698412699e-06,
673
+ "loss": 0.9567,
674
+ "step": 860
675
+ },
676
+ {
677
+ "epoch": 7.703786191536748,
678
+ "grad_norm": 122.25581359863281,
679
+ "learning_rate": 7.65873015873016e-06,
680
+ "loss": 0.8068,
681
+ "step": 870
682
+ },
683
+ {
684
+ "epoch": 7.7928730512249444,
685
+ "grad_norm": 71.20338439941406,
686
+ "learning_rate": 7.61904761904762e-06,
687
+ "loss": 0.8394,
688
+ "step": 880
689
+ },
690
+ {
691
+ "epoch": 7.881959910913141,
692
+ "grad_norm": 79.9009017944336,
693
+ "learning_rate": 7.5793650793650795e-06,
694
+ "loss": 0.8389,
695
+ "step": 890
696
+ },
697
+ {
698
+ "epoch": 7.971046770601336,
699
+ "grad_norm": 113.75267791748047,
700
+ "learning_rate": 7.53968253968254e-06,
701
+ "loss": 0.9274,
702
+ "step": 900
703
+ },
704
+ {
705
+ "epoch": 8.0,
706
+ "eval_loss": 0.6042653322219849,
707
+ "eval_macro_f1": 0.7506470612211624,
708
+ "eval_runtime": 23.2579,
709
+ "eval_samples_per_second": 38.611,
710
+ "eval_steps_per_second": 4.859,
711
+ "step": 904
712
+ },
713
+ {
714
+ "epoch": 8.053452115812918,
715
+ "grad_norm": 67.79409790039062,
716
+ "learning_rate": 7.500000000000001e-06,
717
+ "loss": 0.7653,
718
+ "step": 910
719
+ },
720
+ {
721
+ "epoch": 8.142538975501113,
722
+ "grad_norm": 45.01826095581055,
723
+ "learning_rate": 7.460317460317461e-06,
724
+ "loss": 0.4532,
725
+ "step": 920
726
+ },
727
+ {
728
+ "epoch": 8.231625835189309,
729
+ "grad_norm": 66.2216796875,
730
+ "learning_rate": 7.420634920634922e-06,
731
+ "loss": 0.4336,
732
+ "step": 930
733
+ },
734
+ {
735
+ "epoch": 8.320712694877505,
736
+ "grad_norm": 82.00172424316406,
737
+ "learning_rate": 7.380952380952382e-06,
738
+ "loss": 0.771,
739
+ "step": 940
740
+ },
741
+ {
742
+ "epoch": 8.409799554565701,
743
+ "grad_norm": 37.49010467529297,
744
+ "learning_rate": 7.3412698412698415e-06,
745
+ "loss": 0.7434,
746
+ "step": 950
747
+ },
748
+ {
749
+ "epoch": 8.498886414253898,
750
+ "grad_norm": 52.00114440917969,
751
+ "learning_rate": 7.301587301587301e-06,
752
+ "loss": 0.5471,
753
+ "step": 960
754
+ },
755
+ {
756
+ "epoch": 8.587973273942094,
757
+ "grad_norm": 58.92430114746094,
758
+ "learning_rate": 7.261904761904762e-06,
759
+ "loss": 0.95,
760
+ "step": 970
761
+ },
762
+ {
763
+ "epoch": 8.67706013363029,
764
+ "grad_norm": 48.648902893066406,
765
+ "learning_rate": 7.222222222222223e-06,
766
+ "loss": 0.9863,
767
+ "step": 980
768
+ },
769
+ {
770
+ "epoch": 8.766146993318486,
771
+ "grad_norm": 50.49428176879883,
772
+ "learning_rate": 7.182539682539683e-06,
773
+ "loss": 0.7554,
774
+ "step": 990
775
+ },
776
+ {
777
+ "epoch": 8.855233853006682,
778
+ "grad_norm": 24.385663986206055,
779
+ "learning_rate": 7.1428571428571436e-06,
780
+ "loss": 0.5002,
781
+ "step": 1000
782
+ },
783
+ {
784
+ "epoch": 8.944320712694877,
785
+ "grad_norm": 54.94889450073242,
786
+ "learning_rate": 7.103174603174604e-06,
787
+ "loss": 0.6093,
788
+ "step": 1010
789
+ },
790
+ {
791
+ "epoch": 9.0,
792
+ "eval_loss": 0.6689905524253845,
793
+ "eval_macro_f1": 0.7741357801118758,
794
+ "eval_runtime": 19.7981,
795
+ "eval_samples_per_second": 45.358,
796
+ "eval_steps_per_second": 5.708,
797
+ "step": 1017
798
+ },
799
+ {
800
+ "epoch": 9.026726057906458,
801
+ "grad_norm": 60.106876373291016,
802
+ "learning_rate": 7.063492063492064e-06,
803
+ "loss": 0.6911,
804
+ "step": 1020
805
+ },
806
+ {
807
+ "epoch": 9.115812917594655,
808
+ "grad_norm": 39.45969772338867,
809
+ "learning_rate": 7.023809523809524e-06,
810
+ "loss": 0.5881,
811
+ "step": 1030
812
+ },
813
+ {
814
+ "epoch": 9.20489977728285,
815
+ "grad_norm": 60.09040069580078,
816
+ "learning_rate": 6.984126984126984e-06,
817
+ "loss": 0.4936,
818
+ "step": 1040
819
+ },
820
+ {
821
+ "epoch": 9.293986636971047,
822
+ "grad_norm": 53.53305435180664,
823
+ "learning_rate": 6.944444444444445e-06,
824
+ "loss": 0.6178,
825
+ "step": 1050
826
+ },
827
+ {
828
+ "epoch": 9.383073496659243,
829
+ "grad_norm": 23.2546443939209,
830
+ "learning_rate": 6.9047619047619055e-06,
831
+ "loss": 0.6101,
832
+ "step": 1060
833
+ },
834
+ {
835
+ "epoch": 9.47216035634744,
836
+ "grad_norm": 53.65586471557617,
837
+ "learning_rate": 6.8650793650793654e-06,
838
+ "loss": 0.5035,
839
+ "step": 1070
840
+ },
841
+ {
842
+ "epoch": 9.561247216035635,
843
+ "grad_norm": 48.904598236083984,
844
+ "learning_rate": 6.825396825396826e-06,
845
+ "loss": 0.5098,
846
+ "step": 1080
847
+ },
848
+ {
849
+ "epoch": 9.65033407572383,
850
+ "grad_norm": 30.416839599609375,
851
+ "learning_rate": 6.785714285714287e-06,
852
+ "loss": 0.5424,
853
+ "step": 1090
854
+ },
855
+ {
856
+ "epoch": 9.739420935412026,
857
+ "grad_norm": 66.94727325439453,
858
+ "learning_rate": 6.746031746031747e-06,
859
+ "loss": 0.5678,
860
+ "step": 1100
861
+ },
862
+ {
863
+ "epoch": 9.828507795100222,
864
+ "grad_norm": 46.05035400390625,
865
+ "learning_rate": 6.706349206349207e-06,
866
+ "loss": 0.4964,
867
+ "step": 1110
868
+ },
869
+ {
870
+ "epoch": 9.917594654788418,
871
+ "grad_norm": 137.4243927001953,
872
+ "learning_rate": 6.666666666666667e-06,
873
+ "loss": 0.6045,
874
+ "step": 1120
875
+ },
876
+ {
877
+ "epoch": 10.0,
878
+ "grad_norm": 7.9022393226623535,
879
+ "learning_rate": 6.626984126984127e-06,
880
+ "loss": 0.7078,
881
+ "step": 1130
882
+ },
883
+ {
884
+ "epoch": 10.0,
885
+ "eval_loss": 0.6592049598693848,
886
+ "eval_macro_f1": 0.7537628898597082,
887
+ "eval_runtime": 22.1013,
888
+ "eval_samples_per_second": 40.631,
889
+ "eval_steps_per_second": 5.113,
890
+ "step": 1130
891
+ },
892
+ {
893
+ "epoch": 10.089086859688196,
894
+ "grad_norm": 52.17204284667969,
895
+ "learning_rate": 6.587301587301588e-06,
896
+ "loss": 0.6119,
897
+ "step": 1140
898
+ },
899
+ {
900
+ "epoch": 10.178173719376392,
901
+ "grad_norm": 21.41937255859375,
902
+ "learning_rate": 6.547619047619048e-06,
903
+ "loss": 0.6984,
904
+ "step": 1150
905
+ },
906
+ {
907
+ "epoch": 10.267260579064589,
908
+ "grad_norm": 68.02864837646484,
909
+ "learning_rate": 6.507936507936509e-06,
910
+ "loss": 0.6225,
911
+ "step": 1160
912
+ },
913
+ {
914
+ "epoch": 10.356347438752785,
915
+ "grad_norm": 125.40614318847656,
916
+ "learning_rate": 6.468253968253969e-06,
917
+ "loss": 0.6566,
918
+ "step": 1170
919
+ },
920
+ {
921
+ "epoch": 10.44543429844098,
922
+ "grad_norm": 37.92251205444336,
923
+ "learning_rate": 6.4285714285714295e-06,
924
+ "loss": 0.5528,
925
+ "step": 1180
926
+ },
927
+ {
928
+ "epoch": 10.534521158129175,
929
+ "grad_norm": 51.56312561035156,
930
+ "learning_rate": 6.3888888888888885e-06,
931
+ "loss": 0.3961,
932
+ "step": 1190
933
+ },
934
+ {
935
+ "epoch": 10.623608017817372,
936
+ "grad_norm": 56.06890869140625,
937
+ "learning_rate": 6.349206349206349e-06,
938
+ "loss": 0.4413,
939
+ "step": 1200
940
+ },
941
+ {
942
+ "epoch": 10.712694877505568,
943
+ "grad_norm": 396.573974609375,
944
+ "learning_rate": 6.30952380952381e-06,
945
+ "loss": 0.625,
946
+ "step": 1210
947
+ },
948
+ {
949
+ "epoch": 10.801781737193764,
950
+ "grad_norm": 28.54755401611328,
951
+ "learning_rate": 6.26984126984127e-06,
952
+ "loss": 0.2236,
953
+ "step": 1220
954
+ },
955
+ {
956
+ "epoch": 10.89086859688196,
957
+ "grad_norm": 90.92909240722656,
958
+ "learning_rate": 6.230158730158731e-06,
959
+ "loss": 0.6644,
960
+ "step": 1230
961
+ },
962
+ {
963
+ "epoch": 10.979955456570156,
964
+ "grad_norm": 42.810672760009766,
965
+ "learning_rate": 6.1904761904761914e-06,
966
+ "loss": 0.5941,
967
+ "step": 1240
968
+ },
969
+ {
970
+ "epoch": 11.0,
971
+ "eval_loss": 0.8170925974845886,
972
+ "eval_macro_f1": 0.7553111396076517,
973
+ "eval_runtime": 21.8437,
974
+ "eval_samples_per_second": 41.11,
975
+ "eval_steps_per_second": 5.173,
976
+ "step": 1243
977
+ },
978
+ {
979
+ "epoch": 11.062360801781738,
980
+ "grad_norm": 45.25354766845703,
981
+ "learning_rate": 6.150793650793651e-06,
982
+ "loss": 0.6691,
983
+ "step": 1250
984
+ },
985
+ {
986
+ "epoch": 11.151447661469934,
987
+ "grad_norm": 27.80849266052246,
988
+ "learning_rate": 6.111111111111112e-06,
989
+ "loss": 0.5407,
990
+ "step": 1260
991
+ },
992
+ {
993
+ "epoch": 11.240534521158128,
994
+ "grad_norm": 43.23250961303711,
995
+ "learning_rate": 6.071428571428571e-06,
996
+ "loss": 0.4531,
997
+ "step": 1270
998
+ },
999
+ {
1000
+ "epoch": 11.329621380846325,
1001
+ "grad_norm": 91.41048431396484,
1002
+ "learning_rate": 6.031746031746032e-06,
1003
+ "loss": 0.3653,
1004
+ "step": 1280
1005
+ },
1006
+ {
1007
+ "epoch": 11.41870824053452,
1008
+ "grad_norm": 62.16683578491211,
1009
+ "learning_rate": 5.992063492063493e-06,
1010
+ "loss": 0.6402,
1011
+ "step": 1290
1012
+ },
1013
+ {
1014
+ "epoch": 11.507795100222717,
1015
+ "grad_norm": 29.885025024414062,
1016
+ "learning_rate": 5.9523809523809525e-06,
1017
+ "loss": 0.3978,
1018
+ "step": 1300
1019
+ },
1020
+ {
1021
+ "epoch": 11.596881959910913,
1022
+ "grad_norm": 35.54779815673828,
1023
+ "learning_rate": 5.912698412698413e-06,
1024
+ "loss": 0.5595,
1025
+ "step": 1310
1026
+ },
1027
+ {
1028
+ "epoch": 11.68596881959911,
1029
+ "grad_norm": 111.557373046875,
1030
+ "learning_rate": 5.873015873015874e-06,
1031
+ "loss": 0.4358,
1032
+ "step": 1320
1033
+ },
1034
+ {
1035
+ "epoch": 11.775055679287306,
1036
+ "grad_norm": 63.33297348022461,
1037
+ "learning_rate": 5.833333333333334e-06,
1038
+ "loss": 0.5095,
1039
+ "step": 1330
1040
+ },
1041
+ {
1042
+ "epoch": 11.864142538975502,
1043
+ "grad_norm": 96.7313232421875,
1044
+ "learning_rate": 5.793650793650795e-06,
1045
+ "loss": 0.6317,
1046
+ "step": 1340
1047
+ },
1048
+ {
1049
+ "epoch": 11.953229398663698,
1050
+ "grad_norm": 92.20706176757812,
1051
+ "learning_rate": 5.753968253968254e-06,
1052
+ "loss": 0.6447,
1053
+ "step": 1350
1054
+ },
1055
+ {
1056
+ "epoch": 12.0,
1057
+ "eval_loss": 0.6678270101547241,
1058
+ "eval_macro_f1": 0.7712242502141149,
1059
+ "eval_runtime": 21.8974,
1060
+ "eval_samples_per_second": 41.009,
1061
+ "eval_steps_per_second": 5.16,
1062
+ "step": 1356
1063
+ },
1064
+ {
1065
+ "epoch": 12.035634743875278,
1066
+ "grad_norm": 42.24013900756836,
1067
+ "learning_rate": 5.7142857142857145e-06,
1068
+ "loss": 0.2802,
1069
+ "step": 1360
1070
+ },
1071
+ {
1072
+ "epoch": 12.124721603563474,
1073
+ "grad_norm": 64.52838134765625,
1074
+ "learning_rate": 5.674603174603175e-06,
1075
+ "loss": 0.5235,
1076
+ "step": 1370
1077
+ },
1078
+ {
1079
+ "epoch": 12.21380846325167,
1080
+ "grad_norm": 26.648761749267578,
1081
+ "learning_rate": 5.634920634920635e-06,
1082
+ "loss": 0.2894,
1083
+ "step": 1380
1084
+ },
1085
+ {
1086
+ "epoch": 12.302895322939866,
1087
+ "grad_norm": 38.63190841674805,
1088
+ "learning_rate": 5.595238095238096e-06,
1089
+ "loss": 0.3347,
1090
+ "step": 1390
1091
+ },
1092
+ {
1093
+ "epoch": 12.391982182628063,
1094
+ "grad_norm": 38.316375732421875,
1095
+ "learning_rate": 5.555555555555557e-06,
1096
+ "loss": 0.408,
1097
+ "step": 1400
1098
+ },
1099
+ {
1100
+ "epoch": 12.481069042316259,
1101
+ "grad_norm": 9.889010429382324,
1102
+ "learning_rate": 5.5158730158730166e-06,
1103
+ "loss": 0.4411,
1104
+ "step": 1410
1105
+ },
1106
+ {
1107
+ "epoch": 12.570155902004455,
1108
+ "grad_norm": 53.189300537109375,
1109
+ "learning_rate": 5.476190476190477e-06,
1110
+ "loss": 0.6681,
1111
+ "step": 1420
1112
+ },
1113
+ {
1114
+ "epoch": 12.659242761692651,
1115
+ "grad_norm": 30.18885612487793,
1116
+ "learning_rate": 5.436507936507936e-06,
1117
+ "loss": 0.5419,
1118
+ "step": 1430
1119
+ },
1120
+ {
1121
+ "epoch": 12.748329621380847,
1122
+ "grad_norm": 40.49357604980469,
1123
+ "learning_rate": 5.396825396825397e-06,
1124
+ "loss": 0.5676,
1125
+ "step": 1440
1126
+ },
1127
+ {
1128
+ "epoch": 12.837416481069042,
1129
+ "grad_norm": 87.46019744873047,
1130
+ "learning_rate": 5.357142857142857e-06,
1131
+ "loss": 0.7195,
1132
+ "step": 1450
1133
+ },
1134
+ {
1135
+ "epoch": 12.926503340757238,
1136
+ "grad_norm": 58.786277770996094,
1137
+ "learning_rate": 5.317460317460318e-06,
1138
+ "loss": 0.3799,
1139
+ "step": 1460
1140
+ },
1141
+ {
1142
+ "epoch": 13.0,
1143
+ "eval_loss": 0.6582129001617432,
1144
+ "eval_macro_f1": 0.7994080527168146,
1145
+ "eval_runtime": 22.0321,
1146
+ "eval_samples_per_second": 40.759,
1147
+ "eval_steps_per_second": 5.129,
1148
+ "step": 1469
1149
+ },
1150
+ {
1151
+ "epoch": 13.00890868596882,
1152
+ "grad_norm": 23.018239974975586,
1153
+ "learning_rate": 5.2777777777777785e-06,
1154
+ "loss": 0.4031,
1155
+ "step": 1470
1156
+ },
1157
+ {
1158
+ "epoch": 13.097995545657016,
1159
+ "grad_norm": 47.901573181152344,
1160
+ "learning_rate": 5.2380952380952384e-06,
1161
+ "loss": 0.2202,
1162
+ "step": 1480
1163
+ },
1164
+ {
1165
+ "epoch": 13.187082405345212,
1166
+ "grad_norm": 95.25432586669922,
1167
+ "learning_rate": 5.198412698412699e-06,
1168
+ "loss": 0.58,
1169
+ "step": 1490
1170
+ },
1171
+ {
1172
+ "epoch": 13.276169265033408,
1173
+ "grad_norm": 52.78740692138672,
1174
+ "learning_rate": 5.15873015873016e-06,
1175
+ "loss": 0.399,
1176
+ "step": 1500
1177
+ },
1178
+ {
1179
+ "epoch": 13.365256124721604,
1180
+ "grad_norm": 45.429832458496094,
1181
+ "learning_rate": 5.119047619047619e-06,
1182
+ "loss": 0.4761,
1183
+ "step": 1510
1184
+ },
1185
+ {
1186
+ "epoch": 13.4543429844098,
1187
+ "grad_norm": 64.204833984375,
1188
+ "learning_rate": 5.07936507936508e-06,
1189
+ "loss": 0.3218,
1190
+ "step": 1520
1191
+ },
1192
+ {
1193
+ "epoch": 13.543429844097995,
1194
+ "grad_norm": 20.63060188293457,
1195
+ "learning_rate": 5.03968253968254e-06,
1196
+ "loss": 0.3994,
1197
+ "step": 1530
1198
+ },
1199
+ {
1200
+ "epoch": 13.632516703786191,
1201
+ "grad_norm": 35.755001068115234,
1202
+ "learning_rate": 5e-06,
1203
+ "loss": 0.5116,
1204
+ "step": 1540
1205
+ },
1206
+ {
1207
+ "epoch": 13.721603563474387,
1208
+ "grad_norm": 64.8443603515625,
1209
+ "learning_rate": 4.960317460317461e-06,
1210
+ "loss": 0.2796,
1211
+ "step": 1550
1212
+ },
1213
+ {
1214
+ "epoch": 13.810690423162583,
1215
+ "grad_norm": 66.51502990722656,
1216
+ "learning_rate": 4.920634920634921e-06,
1217
+ "loss": 0.4389,
1218
+ "step": 1560
1219
+ },
1220
+ {
1221
+ "epoch": 13.89977728285078,
1222
+ "grad_norm": 31.428354263305664,
1223
+ "learning_rate": 4.880952380952381e-06,
1224
+ "loss": 0.4831,
1225
+ "step": 1570
1226
+ },
1227
+ {
1228
+ "epoch": 13.988864142538976,
1229
+ "grad_norm": 17.072656631469727,
1230
+ "learning_rate": 4.841269841269842e-06,
1231
+ "loss": 0.353,
1232
+ "step": 1580
1233
+ },
1234
+ {
1235
+ "epoch": 14.0,
1236
+ "eval_loss": 0.7872561812400818,
1237
+ "eval_macro_f1": 0.7766969045913902,
1238
+ "eval_runtime": 22.1801,
1239
+ "eval_samples_per_second": 40.487,
1240
+ "eval_steps_per_second": 5.095,
1241
+ "step": 1582
1242
+ },
1243
+ {
1244
+ "epoch": 14.071269487750557,
1245
+ "grad_norm": 54.113372802734375,
1246
+ "learning_rate": 4.8015873015873025e-06,
1247
+ "loss": 0.553,
1248
+ "step": 1590
1249
+ },
1250
+ {
1251
+ "epoch": 14.160356347438753,
1252
+ "grad_norm": 47.31705093383789,
1253
+ "learning_rate": 4.761904761904762e-06,
1254
+ "loss": 0.3445,
1255
+ "step": 1600
1256
+ },
1257
+ {
1258
+ "epoch": 14.249443207126948,
1259
+ "grad_norm": 69.48124694824219,
1260
+ "learning_rate": 4.722222222222222e-06,
1261
+ "loss": 0.3443,
1262
+ "step": 1610
1263
+ },
1264
+ {
1265
+ "epoch": 14.338530066815144,
1266
+ "grad_norm": 46.73830032348633,
1267
+ "learning_rate": 4.682539682539683e-06,
1268
+ "loss": 0.5484,
1269
+ "step": 1620
1270
+ },
1271
+ {
1272
+ "epoch": 14.42761692650334,
1273
+ "grad_norm": 41.929805755615234,
1274
+ "learning_rate": 4.642857142857144e-06,
1275
+ "loss": 0.2983,
1276
+ "step": 1630
1277
+ },
1278
+ {
1279
+ "epoch": 14.516703786191536,
1280
+ "grad_norm": 25.568206787109375,
1281
+ "learning_rate": 4.603174603174604e-06,
1282
+ "loss": 0.3027,
1283
+ "step": 1640
1284
+ },
1285
+ {
1286
+ "epoch": 14.605790645879733,
1287
+ "grad_norm": 77.96745300292969,
1288
+ "learning_rate": 4.563492063492064e-06,
1289
+ "loss": 0.6002,
1290
+ "step": 1650
1291
+ },
1292
+ {
1293
+ "epoch": 14.694877505567929,
1294
+ "grad_norm": 32.5285758972168,
1295
+ "learning_rate": 4.523809523809524e-06,
1296
+ "loss": 0.3205,
1297
+ "step": 1660
1298
+ },
1299
+ {
1300
+ "epoch": 14.783964365256125,
1301
+ "grad_norm": 68.35356903076172,
1302
+ "learning_rate": 4.484126984126984e-06,
1303
+ "loss": 0.2666,
1304
+ "step": 1670
1305
+ },
1306
+ {
1307
+ "epoch": 14.873051224944321,
1308
+ "grad_norm": 2.4230611324310303,
1309
+ "learning_rate": 4.444444444444444e-06,
1310
+ "loss": 0.2947,
1311
+ "step": 1680
1312
+ },
1313
+ {
1314
+ "epoch": 14.962138084632517,
1315
+ "grad_norm": 18.08596420288086,
1316
+ "learning_rate": 4.404761904761905e-06,
1317
+ "loss": 0.4441,
1318
+ "step": 1690
1319
+ },
1320
+ {
1321
+ "epoch": 15.0,
1322
+ "eval_loss": 0.8163199424743652,
1323
+ "eval_macro_f1": 0.7852367846635664,
1324
+ "eval_runtime": 22.2418,
1325
+ "eval_samples_per_second": 40.374,
1326
+ "eval_steps_per_second": 5.081,
1327
+ "step": 1695
1328
+ },
1329
+ {
1330
+ "epoch": 15.044543429844097,
1331
+ "grad_norm": 11.731039047241211,
1332
+ "learning_rate": 4.365079365079366e-06,
1333
+ "loss": 0.239,
1334
+ "step": 1700
1335
+ },
1336
+ {
1337
+ "epoch": 15.133630289532293,
1338
+ "grad_norm": 44.82026672363281,
1339
+ "learning_rate": 4.3253968253968256e-06,
1340
+ "loss": 0.1949,
1341
+ "step": 1710
1342
+ },
1343
+ {
1344
+ "epoch": 15.22271714922049,
1345
+ "grad_norm": 24.14330291748047,
1346
+ "learning_rate": 4.2857142857142855e-06,
1347
+ "loss": 0.2407,
1348
+ "step": 1720
1349
+ },
1350
+ {
1351
+ "epoch": 15.311804008908686,
1352
+ "grad_norm": 71.19681549072266,
1353
+ "learning_rate": 4.246031746031746e-06,
1354
+ "loss": 0.6281,
1355
+ "step": 1730
1356
+ },
1357
+ {
1358
+ "epoch": 15.400890868596882,
1359
+ "grad_norm": 86.13356018066406,
1360
+ "learning_rate": 4.206349206349207e-06,
1361
+ "loss": 0.4672,
1362
+ "step": 1740
1363
+ },
1364
+ {
1365
+ "epoch": 15.489977728285078,
1366
+ "grad_norm": 6.331802845001221,
1367
+ "learning_rate": 4.166666666666667e-06,
1368
+ "loss": 0.2816,
1369
+ "step": 1750
1370
+ },
1371
+ {
1372
+ "epoch": 15.579064587973274,
1373
+ "grad_norm": 6.564174652099609,
1374
+ "learning_rate": 4.126984126984127e-06,
1375
+ "loss": 0.2649,
1376
+ "step": 1760
1377
+ },
1378
+ {
1379
+ "epoch": 15.66815144766147,
1380
+ "grad_norm": 43.143985748291016,
1381
+ "learning_rate": 4.0873015873015875e-06,
1382
+ "loss": 0.3499,
1383
+ "step": 1770
1384
+ },
1385
+ {
1386
+ "epoch": 15.757238307349667,
1387
+ "grad_norm": 11.393004417419434,
1388
+ "learning_rate": 4.047619047619048e-06,
1389
+ "loss": 0.2171,
1390
+ "step": 1780
1391
+ },
1392
+ {
1393
+ "epoch": 15.846325167037861,
1394
+ "grad_norm": 119.17877960205078,
1395
+ "learning_rate": 4.007936507936508e-06,
1396
+ "loss": 0.3459,
1397
+ "step": 1790
1398
+ },
1399
+ {
1400
+ "epoch": 15.935412026726057,
1401
+ "grad_norm": 135.7924041748047,
1402
+ "learning_rate": 3.968253968253968e-06,
1403
+ "loss": 0.3436,
1404
+ "step": 1800
1405
+ },
1406
+ {
1407
+ "epoch": 16.0,
1408
+ "eval_loss": 0.7593511343002319,
1409
+ "eval_macro_f1": 0.782747836115098,
1410
+ "eval_runtime": 22.0369,
1411
+ "eval_samples_per_second": 40.75,
1412
+ "eval_steps_per_second": 5.128,
1413
+ "step": 1808
1414
+ },
1415
+ {
1416
+ "epoch": 16.01781737193764,
1417
+ "grad_norm": 32.96919250488281,
1418
+ "learning_rate": 3.928571428571429e-06,
1419
+ "loss": 0.1912,
1420
+ "step": 1810
1421
+ },
1422
+ {
1423
+ "epoch": 16.106904231625837,
1424
+ "grad_norm": 54.140220642089844,
1425
+ "learning_rate": 3.88888888888889e-06,
1426
+ "loss": 0.3197,
1427
+ "step": 1820
1428
+ },
1429
+ {
1430
+ "epoch": 16.19599109131403,
1431
+ "grad_norm": 122.20573425292969,
1432
+ "learning_rate": 3.8492063492063495e-06,
1433
+ "loss": 0.4163,
1434
+ "step": 1830
1435
+ },
1436
+ {
1437
+ "epoch": 16.285077951002226,
1438
+ "grad_norm": 26.77570915222168,
1439
+ "learning_rate": 3.80952380952381e-06,
1440
+ "loss": 0.3443,
1441
+ "step": 1840
1442
+ },
1443
+ {
1444
+ "epoch": 16.374164810690424,
1445
+ "grad_norm": 41.13142395019531,
1446
+ "learning_rate": 3.76984126984127e-06,
1447
+ "loss": 0.3533,
1448
+ "step": 1850
1449
+ },
1450
+ {
1451
+ "epoch": 16.463251670378618,
1452
+ "grad_norm": 42.146873474121094,
1453
+ "learning_rate": 3.7301587301587305e-06,
1454
+ "loss": 0.4773,
1455
+ "step": 1860
1456
+ },
1457
+ {
1458
+ "epoch": 16.552338530066816,
1459
+ "grad_norm": 21.7828426361084,
1460
+ "learning_rate": 3.690476190476191e-06,
1461
+ "loss": 0.3173,
1462
+ "step": 1870
1463
+ },
1464
+ {
1465
+ "epoch": 16.64142538975501,
1466
+ "grad_norm": 43.677886962890625,
1467
+ "learning_rate": 3.6507936507936507e-06,
1468
+ "loss": 0.2824,
1469
+ "step": 1880
1470
+ },
1471
+ {
1472
+ "epoch": 16.73051224944321,
1473
+ "grad_norm": 24.393888473510742,
1474
+ "learning_rate": 3.6111111111111115e-06,
1475
+ "loss": 0.2087,
1476
+ "step": 1890
1477
+ },
1478
+ {
1479
+ "epoch": 16.819599109131403,
1480
+ "grad_norm": 281.832275390625,
1481
+ "learning_rate": 3.5714285714285718e-06,
1482
+ "loss": 0.4357,
1483
+ "step": 1900
1484
+ },
1485
+ {
1486
+ "epoch": 16.9086859688196,
1487
+ "grad_norm": 42.33871078491211,
1488
+ "learning_rate": 3.531746031746032e-06,
1489
+ "loss": 0.2472,
1490
+ "step": 1910
1491
+ },
1492
+ {
1493
+ "epoch": 16.997772828507795,
1494
+ "grad_norm": 65.45475769042969,
1495
+ "learning_rate": 3.492063492063492e-06,
1496
+ "loss": 0.478,
1497
+ "step": 1920
1498
+ },
1499
+ {
1500
+ "epoch": 17.0,
1501
+ "eval_loss": 0.8115331530570984,
1502
+ "eval_macro_f1": 0.7822360888195559,
1503
+ "eval_runtime": 21.7537,
1504
+ "eval_samples_per_second": 41.28,
1505
+ "eval_steps_per_second": 5.195,
1506
+ "step": 1921
1507
+ },
1508
+ {
1509
+ "epoch": 17.080178173719375,
1510
+ "grad_norm": 14.766924858093262,
1511
+ "learning_rate": 3.4523809523809528e-06,
1512
+ "loss": 0.3372,
1513
+ "step": 1930
1514
+ },
1515
+ {
1516
+ "epoch": 17.169265033407573,
1517
+ "grad_norm": 15.446084022521973,
1518
+ "learning_rate": 3.412698412698413e-06,
1519
+ "loss": 0.2074,
1520
+ "step": 1940
1521
+ },
1522
+ {
1523
+ "epoch": 17.258351893095767,
1524
+ "grad_norm": 61.16306686401367,
1525
+ "learning_rate": 3.3730158730158734e-06,
1526
+ "loss": 0.1885,
1527
+ "step": 1950
1528
+ },
1529
+ {
1530
+ "epoch": 17.347438752783965,
1531
+ "grad_norm": 23.0779972076416,
1532
+ "learning_rate": 3.3333333333333333e-06,
1533
+ "loss": 0.1597,
1534
+ "step": 1960
1535
+ },
1536
+ {
1537
+ "epoch": 17.43652561247216,
1538
+ "grad_norm": 76.70426940917969,
1539
+ "learning_rate": 3.293650793650794e-06,
1540
+ "loss": 0.2077,
1541
+ "step": 1970
1542
+ },
1543
+ {
1544
+ "epoch": 17.525612472160358,
1545
+ "grad_norm": 33.99974822998047,
1546
+ "learning_rate": 3.2539682539682544e-06,
1547
+ "loss": 0.353,
1548
+ "step": 1980
1549
+ },
1550
+ {
1551
+ "epoch": 17.614699331848552,
1552
+ "grad_norm": 4.826251029968262,
1553
+ "learning_rate": 3.2142857142857147e-06,
1554
+ "loss": 0.3525,
1555
+ "step": 1990
1556
+ },
1557
+ {
1558
+ "epoch": 17.70378619153675,
1559
+ "grad_norm": 13.412989616394043,
1560
+ "learning_rate": 3.1746031746031746e-06,
1561
+ "loss": 0.2875,
1562
+ "step": 2000
1563
+ },
1564
+ {
1565
+ "epoch": 17.792873051224944,
1566
+ "grad_norm": 18.342254638671875,
1567
+ "learning_rate": 3.134920634920635e-06,
1568
+ "loss": 0.4062,
1569
+ "step": 2010
1570
+ },
1571
+ {
1572
+ "epoch": 17.88195991091314,
1573
+ "grad_norm": 1.9873216152191162,
1574
+ "learning_rate": 3.0952380952380957e-06,
1575
+ "loss": 0.1457,
1576
+ "step": 2020
1577
+ },
1578
+ {
1579
+ "epoch": 17.971046770601337,
1580
+ "grad_norm": 68.14187622070312,
1581
+ "learning_rate": 3.055555555555556e-06,
1582
+ "loss": 0.3032,
1583
+ "step": 2030
1584
+ },
1585
+ {
1586
+ "epoch": 18.0,
1587
+ "eval_loss": 0.9094008803367615,
1588
+ "eval_macro_f1": 0.7945872168335913,
1589
+ "eval_runtime": 22.2168,
1590
+ "eval_samples_per_second": 40.42,
1591
+ "eval_steps_per_second": 5.086,
1592
+ "step": 2034
1593
+ },
1594
+ {
1595
+ "epoch": 18.053452115812917,
1596
+ "grad_norm": 36.161376953125,
1597
+ "learning_rate": 3.015873015873016e-06,
1598
+ "loss": 0.2413,
1599
+ "step": 2040
1600
+ },
1601
+ {
1602
+ "epoch": 18.142538975501115,
1603
+ "grad_norm": 23.837491989135742,
1604
+ "learning_rate": 2.9761904761904763e-06,
1605
+ "loss": 0.5601,
1606
+ "step": 2050
1607
+ },
1608
+ {
1609
+ "epoch": 18.23162583518931,
1610
+ "grad_norm": 338.7473449707031,
1611
+ "learning_rate": 2.936507936507937e-06,
1612
+ "loss": 0.4942,
1613
+ "step": 2060
1614
+ },
1615
+ {
1616
+ "epoch": 18.320712694877507,
1617
+ "grad_norm": 114.42424774169922,
1618
+ "learning_rate": 2.8968253968253974e-06,
1619
+ "loss": 0.4017,
1620
+ "step": 2070
1621
+ },
1622
+ {
1623
+ "epoch": 18.4097995545657,
1624
+ "grad_norm": 56.394432067871094,
1625
+ "learning_rate": 2.8571428571428573e-06,
1626
+ "loss": 0.2997,
1627
+ "step": 2080
1628
+ },
1629
+ {
1630
+ "epoch": 18.498886414253896,
1631
+ "grad_norm": 38.77593994140625,
1632
+ "learning_rate": 2.8174603174603176e-06,
1633
+ "loss": 0.4311,
1634
+ "step": 2090
1635
+ },
1636
+ {
1637
+ "epoch": 18.587973273942094,
1638
+ "grad_norm": 32.89288330078125,
1639
+ "learning_rate": 2.7777777777777783e-06,
1640
+ "loss": 0.3228,
1641
+ "step": 2100
1642
+ },
1643
+ {
1644
+ "epoch": 18.677060133630288,
1645
+ "grad_norm": 17.314373016357422,
1646
+ "learning_rate": 2.7380952380952387e-06,
1647
+ "loss": 0.2455,
1648
+ "step": 2110
1649
+ },
1650
+ {
1651
+ "epoch": 18.766146993318486,
1652
+ "grad_norm": 79.07820892333984,
1653
+ "learning_rate": 2.6984126984126986e-06,
1654
+ "loss": 0.2307,
1655
+ "step": 2120
1656
+ },
1657
+ {
1658
+ "epoch": 18.85523385300668,
1659
+ "grad_norm": 61.03816223144531,
1660
+ "learning_rate": 2.658730158730159e-06,
1661
+ "loss": 0.3474,
1662
+ "step": 2130
1663
+ },
1664
+ {
1665
+ "epoch": 18.94432071269488,
1666
+ "grad_norm": 14.016550064086914,
1667
+ "learning_rate": 2.6190476190476192e-06,
1668
+ "loss": 0.46,
1669
+ "step": 2140
1670
+ },
1671
+ {
1672
+ "epoch": 19.0,
1673
+ "eval_loss": 0.8602247834205627,
1674
+ "eval_macro_f1": 0.7783843611181741,
1675
+ "eval_runtime": 23.5346,
1676
+ "eval_samples_per_second": 38.157,
1677
+ "eval_steps_per_second": 4.801,
1678
+ "step": 2147
1679
+ },
1680
+ {
1681
+ "epoch": 19.02672605790646,
1682
+ "grad_norm": 6.988764762878418,
1683
+ "learning_rate": 2.57936507936508e-06,
1684
+ "loss": 0.2748,
1685
+ "step": 2150
1686
+ },
1687
+ {
1688
+ "epoch": 19.115812917594656,
1689
+ "grad_norm": 101.83765411376953,
1690
+ "learning_rate": 2.53968253968254e-06,
1691
+ "loss": 0.2884,
1692
+ "step": 2160
1693
+ },
1694
+ {
1695
+ "epoch": 19.20489977728285,
1696
+ "grad_norm": 43.04463577270508,
1697
+ "learning_rate": 2.5e-06,
1698
+ "loss": 0.248,
1699
+ "step": 2170
1700
+ },
1701
+ {
1702
+ "epoch": 19.293986636971045,
1703
+ "grad_norm": 37.474327087402344,
1704
+ "learning_rate": 2.4603174603174605e-06,
1705
+ "loss": 0.1679,
1706
+ "step": 2180
1707
+ },
1708
+ {
1709
+ "epoch": 19.383073496659243,
1710
+ "grad_norm": 33.84281539916992,
1711
+ "learning_rate": 2.420634920634921e-06,
1712
+ "loss": 0.1555,
1713
+ "step": 2190
1714
+ },
1715
+ {
1716
+ "epoch": 19.472160356347437,
1717
+ "grad_norm": 7.630841255187988,
1718
+ "learning_rate": 2.380952380952381e-06,
1719
+ "loss": 0.2762,
1720
+ "step": 2200
1721
+ },
1722
+ {
1723
+ "epoch": 19.561247216035635,
1724
+ "grad_norm": 34.77897262573242,
1725
+ "learning_rate": 2.3412698412698415e-06,
1726
+ "loss": 0.3175,
1727
+ "step": 2210
1728
+ },
1729
+ {
1730
+ "epoch": 19.65033407572383,
1731
+ "grad_norm": 24.479206085205078,
1732
+ "learning_rate": 2.301587301587302e-06,
1733
+ "loss": 0.2666,
1734
+ "step": 2220
1735
+ },
1736
+ {
1737
+ "epoch": 19.739420935412028,
1738
+ "grad_norm": 54.75190734863281,
1739
+ "learning_rate": 2.261904761904762e-06,
1740
+ "loss": 0.2699,
1741
+ "step": 2230
1742
+ },
1743
+ {
1744
+ "epoch": 19.828507795100222,
1745
+ "grad_norm": 27.636972427368164,
1746
+ "learning_rate": 2.222222222222222e-06,
1747
+ "loss": 0.4514,
1748
+ "step": 2240
1749
+ },
1750
+ {
1751
+ "epoch": 19.91759465478842,
1752
+ "grad_norm": 61.61141586303711,
1753
+ "learning_rate": 2.182539682539683e-06,
1754
+ "loss": 0.4043,
1755
+ "step": 2250
1756
+ },
1757
+ {
1758
+ "epoch": 20.0,
1759
+ "grad_norm": 48.8805046081543,
1760
+ "learning_rate": 2.1428571428571427e-06,
1761
+ "loss": 0.1551,
1762
+ "step": 2260
1763
+ },
1764
+ {
1765
+ "epoch": 20.0,
1766
+ "eval_loss": 0.9069338440895081,
1767
+ "eval_macro_f1": 0.7952638618677043,
1768
+ "eval_runtime": 22.5334,
1769
+ "eval_samples_per_second": 39.852,
1770
+ "eval_steps_per_second": 5.015,
1771
+ "step": 2260
1772
+ },
1773
+ {
1774
+ "epoch": 20.089086859688194,
1775
+ "grad_norm": 34.62713623046875,
1776
+ "learning_rate": 2.1031746031746035e-06,
1777
+ "loss": 0.2323,
1778
+ "step": 2270
1779
+ },
1780
+ {
1781
+ "epoch": 20.178173719376392,
1782
+ "grad_norm": 55.7051887512207,
1783
+ "learning_rate": 2.0634920634920634e-06,
1784
+ "loss": 0.239,
1785
+ "step": 2280
1786
+ },
1787
+ {
1788
+ "epoch": 20.267260579064587,
1789
+ "grad_norm": 11.973965644836426,
1790
+ "learning_rate": 2.023809523809524e-06,
1791
+ "loss": 0.221,
1792
+ "step": 2290
1793
+ },
1794
+ {
1795
+ "epoch": 20.356347438752785,
1796
+ "grad_norm": 18.674625396728516,
1797
+ "learning_rate": 1.984126984126984e-06,
1798
+ "loss": 0.1411,
1799
+ "step": 2300
1800
+ },
1801
+ {
1802
+ "epoch": 20.44543429844098,
1803
+ "grad_norm": 64.24128723144531,
1804
+ "learning_rate": 1.944444444444445e-06,
1805
+ "loss": 0.3944,
1806
+ "step": 2310
1807
+ },
1808
+ {
1809
+ "epoch": 20.534521158129177,
1810
+ "grad_norm": 19.434814453125,
1811
+ "learning_rate": 1.904761904761905e-06,
1812
+ "loss": 0.3017,
1813
+ "step": 2320
1814
+ },
1815
+ {
1816
+ "epoch": 20.62360801781737,
1817
+ "grad_norm": 76.87932586669922,
1818
+ "learning_rate": 1.8650793650793652e-06,
1819
+ "loss": 0.2937,
1820
+ "step": 2330
1821
+ },
1822
+ {
1823
+ "epoch": 20.71269487750557,
1824
+ "grad_norm": 79.9623794555664,
1825
+ "learning_rate": 1.8253968253968254e-06,
1826
+ "loss": 0.162,
1827
+ "step": 2340
1828
+ },
1829
+ {
1830
+ "epoch": 20.801781737193764,
1831
+ "grad_norm": 0.5564860105514526,
1832
+ "learning_rate": 1.7857142857142859e-06,
1833
+ "loss": 0.1174,
1834
+ "step": 2350
1835
+ },
1836
+ {
1837
+ "epoch": 20.89086859688196,
1838
+ "grad_norm": 63.84634017944336,
1839
+ "learning_rate": 1.746031746031746e-06,
1840
+ "loss": 0.3684,
1841
+ "step": 2360
1842
+ },
1843
+ {
1844
+ "epoch": 20.979955456570156,
1845
+ "grad_norm": 159.61268615722656,
1846
+ "learning_rate": 1.7063492063492065e-06,
1847
+ "loss": 0.3583,
1848
+ "step": 2370
1849
+ },
1850
+ {
1851
+ "epoch": 21.0,
1852
+ "eval_loss": 0.9667259454727173,
1853
+ "eval_macro_f1": 0.7853648934851267,
1854
+ "eval_runtime": 21.6042,
1855
+ "eval_samples_per_second": 41.566,
1856
+ "eval_steps_per_second": 5.23,
1857
+ "step": 2373
1858
+ },
1859
+ {
1860
+ "epoch": 21.062360801781736,
1861
+ "grad_norm": 3.4698734283447266,
1862
+ "learning_rate": 1.6666666666666667e-06,
1863
+ "loss": 0.0825,
1864
+ "step": 2380
1865
+ },
1866
+ {
1867
+ "epoch": 21.151447661469934,
1868
+ "grad_norm": 30.75046730041504,
1869
+ "learning_rate": 1.6269841269841272e-06,
1870
+ "loss": 0.2367,
1871
+ "step": 2390
1872
+ },
1873
+ {
1874
+ "epoch": 21.24053452115813,
1875
+ "grad_norm": 58.869632720947266,
1876
+ "learning_rate": 1.5873015873015873e-06,
1877
+ "loss": 0.2491,
1878
+ "step": 2400
1879
+ },
1880
+ {
1881
+ "epoch": 21.329621380846326,
1882
+ "grad_norm": 78.80653381347656,
1883
+ "learning_rate": 1.5476190476190479e-06,
1884
+ "loss": 0.3368,
1885
+ "step": 2410
1886
+ },
1887
+ {
1888
+ "epoch": 21.41870824053452,
1889
+ "grad_norm": 42.70677947998047,
1890
+ "learning_rate": 1.507936507936508e-06,
1891
+ "loss": 0.2251,
1892
+ "step": 2420
1893
+ },
1894
+ {
1895
+ "epoch": 21.50779510022272,
1896
+ "grad_norm": 24.363916397094727,
1897
+ "learning_rate": 1.4682539682539685e-06,
1898
+ "loss": 0.154,
1899
+ "step": 2430
1900
+ },
1901
+ {
1902
+ "epoch": 21.596881959910913,
1903
+ "grad_norm": 70.08970642089844,
1904
+ "learning_rate": 1.4285714285714286e-06,
1905
+ "loss": 0.3436,
1906
+ "step": 2440
1907
+ },
1908
+ {
1909
+ "epoch": 21.685968819599108,
1910
+ "grad_norm": 59.4857292175293,
1911
+ "learning_rate": 1.3888888888888892e-06,
1912
+ "loss": 0.2631,
1913
+ "step": 2450
1914
+ },
1915
+ {
1916
+ "epoch": 21.775055679287306,
1917
+ "grad_norm": 2.151820421218872,
1918
+ "learning_rate": 1.3492063492063493e-06,
1919
+ "loss": 0.2338,
1920
+ "step": 2460
1921
+ },
1922
+ {
1923
+ "epoch": 21.8641425389755,
1924
+ "grad_norm": 32.367698669433594,
1925
+ "learning_rate": 1.3095238095238096e-06,
1926
+ "loss": 0.3898,
1927
+ "step": 2470
1928
+ },
1929
+ {
1930
+ "epoch": 21.953229398663698,
1931
+ "grad_norm": 62.534358978271484,
1932
+ "learning_rate": 1.26984126984127e-06,
1933
+ "loss": 0.3501,
1934
+ "step": 2480
1935
+ },
1936
+ {
1937
+ "epoch": 22.0,
1938
+ "eval_loss": 1.012130618095398,
1939
+ "eval_macro_f1": 0.7793243771478484,
1940
+ "eval_runtime": 21.887,
1941
+ "eval_samples_per_second": 41.029,
1942
+ "eval_steps_per_second": 5.163,
1943
+ "step": 2486
1944
+ },
1945
+ {
1946
+ "epoch": 22.035634743875278,
1947
+ "grad_norm": 69.29458618164062,
1948
+ "learning_rate": 1.2301587301587303e-06,
1949
+ "loss": 0.2882,
1950
+ "step": 2490
1951
+ },
1952
+ {
1953
+ "epoch": 22.124721603563476,
1954
+ "grad_norm": 89.39969635009766,
1955
+ "learning_rate": 1.1904761904761906e-06,
1956
+ "loss": 0.2838,
1957
+ "step": 2500
1958
+ },
1959
+ {
1960
+ "epoch": 22.21380846325167,
1961
+ "grad_norm": 356.38787841796875,
1962
+ "learning_rate": 1.150793650793651e-06,
1963
+ "loss": 0.2001,
1964
+ "step": 2510
1965
+ },
1966
+ {
1967
+ "epoch": 22.302895322939868,
1968
+ "grad_norm": 4.26671028137207,
1969
+ "learning_rate": 1.111111111111111e-06,
1970
+ "loss": 0.1658,
1971
+ "step": 2520
1972
+ },
1973
+ {
1974
+ "epoch": 22.391982182628063,
1975
+ "grad_norm": 70.33140563964844,
1976
+ "learning_rate": 1.0714285714285714e-06,
1977
+ "loss": 0.2195,
1978
+ "step": 2530
1979
+ },
1980
+ {
1981
+ "epoch": 22.481069042316257,
1982
+ "grad_norm": 57.502559661865234,
1983
+ "learning_rate": 1.0317460317460317e-06,
1984
+ "loss": 0.1085,
1985
+ "step": 2540
1986
+ },
1987
+ {
1988
+ "epoch": 22.570155902004455,
1989
+ "grad_norm": 76.61205291748047,
1990
+ "learning_rate": 9.92063492063492e-07,
1991
+ "loss": 0.1872,
1992
+ "step": 2550
1993
+ },
1994
+ {
1995
+ "epoch": 22.65924276169265,
1996
+ "grad_norm": 72.69236755371094,
1997
+ "learning_rate": 9.523809523809525e-07,
1998
+ "loss": 0.2837,
1999
+ "step": 2560
2000
+ },
2001
+ {
2002
+ "epoch": 22.748329621380847,
2003
+ "grad_norm": 112.57559967041016,
2004
+ "learning_rate": 9.126984126984127e-07,
2005
+ "loss": 0.4063,
2006
+ "step": 2570
2007
+ },
2008
+ {
2009
+ "epoch": 22.83741648106904,
2010
+ "grad_norm": 102.09916687011719,
2011
+ "learning_rate": 8.73015873015873e-07,
2012
+ "loss": 0.3085,
2013
+ "step": 2580
2014
+ },
2015
+ {
2016
+ "epoch": 22.92650334075724,
2017
+ "grad_norm": 4.140219211578369,
2018
+ "learning_rate": 8.333333333333333e-07,
2019
+ "loss": 0.0432,
2020
+ "step": 2590
2021
+ },
2022
+ {
2023
+ "epoch": 23.0,
2024
+ "eval_loss": 0.9713873863220215,
2025
+ "eval_macro_f1": 0.7901658291457286,
2026
+ "eval_runtime": 21.8489,
2027
+ "eval_samples_per_second": 41.1,
2028
+ "eval_steps_per_second": 5.172,
2029
+ "step": 2599
2030
+ },
2031
+ {
2032
+ "epoch": 23.00890868596882,
2033
+ "grad_norm": 81.31112670898438,
2034
+ "learning_rate": 7.936507936507937e-07,
2035
+ "loss": 0.2301,
2036
+ "step": 2600
2037
+ },
2038
+ {
2039
+ "epoch": 23.097995545657014,
2040
+ "grad_norm": 133.33106994628906,
2041
+ "learning_rate": 7.53968253968254e-07,
2042
+ "loss": 0.3902,
2043
+ "step": 2610
2044
+ },
2045
+ {
2046
+ "epoch": 23.187082405345212,
2047
+ "grad_norm": 75.14102935791016,
2048
+ "learning_rate": 7.142857142857143e-07,
2049
+ "loss": 0.4124,
2050
+ "step": 2620
2051
+ },
2052
+ {
2053
+ "epoch": 23.276169265033406,
2054
+ "grad_norm": 56.319427490234375,
2055
+ "learning_rate": 6.746031746031746e-07,
2056
+ "loss": 0.2647,
2057
+ "step": 2630
2058
+ },
2059
+ {
2060
+ "epoch": 23.365256124721604,
2061
+ "grad_norm": 0.3156695067882538,
2062
+ "learning_rate": 6.34920634920635e-07,
2063
+ "loss": 0.2229,
2064
+ "step": 2640
2065
+ },
2066
+ {
2067
+ "epoch": 23.4543429844098,
2068
+ "grad_norm": 0.1939859390258789,
2069
+ "learning_rate": 5.952380952380953e-07,
2070
+ "loss": 0.11,
2071
+ "step": 2650
2072
+ },
2073
+ {
2074
+ "epoch": 23.543429844097997,
2075
+ "grad_norm": 27.532583236694336,
2076
+ "learning_rate": 5.555555555555555e-07,
2077
+ "loss": 0.269,
2078
+ "step": 2660
2079
+ },
2080
+ {
2081
+ "epoch": 23.63251670378619,
2082
+ "grad_norm": 5.016575336456299,
2083
+ "learning_rate": 5.158730158730158e-07,
2084
+ "loss": 0.2089,
2085
+ "step": 2670
2086
+ },
2087
+ {
2088
+ "epoch": 23.72160356347439,
2089
+ "grad_norm": 82.43734741210938,
2090
+ "learning_rate": 4.7619047619047623e-07,
2091
+ "loss": 0.3519,
2092
+ "step": 2680
2093
+ },
2094
+ {
2095
+ "epoch": 23.810690423162583,
2096
+ "grad_norm": 12.222328186035156,
2097
+ "learning_rate": 4.365079365079365e-07,
2098
+ "loss": 0.2363,
2099
+ "step": 2690
2100
+ },
2101
+ {
2102
+ "epoch": 23.899777282850778,
2103
+ "grad_norm": 56.74919509887695,
2104
+ "learning_rate": 3.9682539682539683e-07,
2105
+ "loss": 0.0908,
2106
+ "step": 2700
2107
+ },
2108
+ {
2109
+ "epoch": 23.988864142538976,
2110
+ "grad_norm": 77.4910659790039,
2111
+ "learning_rate": 3.5714285714285716e-07,
2112
+ "loss": 0.3777,
2113
+ "step": 2710
2114
+ },
2115
+ {
2116
+ "epoch": 24.0,
2117
+ "eval_loss": 0.9040295481681824,
2118
+ "eval_macro_f1": 0.7973446094765495,
2119
+ "eval_runtime": 21.8468,
2120
+ "eval_samples_per_second": 41.104,
2121
+ "eval_steps_per_second": 5.172,
2122
+ "step": 2712
2123
+ },
2124
+ {
2125
+ "epoch": 24.071269487750556,
2126
+ "grad_norm": 23.00518798828125,
2127
+ "learning_rate": 3.174603174603175e-07,
2128
+ "loss": 0.1797,
2129
+ "step": 2720
2130
+ },
2131
+ {
2132
+ "epoch": 24.160356347438753,
2133
+ "grad_norm": 81.12632751464844,
2134
+ "learning_rate": 2.7777777777777776e-07,
2135
+ "loss": 0.2499,
2136
+ "step": 2730
2137
+ },
2138
+ {
2139
+ "epoch": 24.249443207126948,
2140
+ "grad_norm": 124.60951232910156,
2141
+ "learning_rate": 2.3809523809523811e-07,
2142
+ "loss": 0.2462,
2143
+ "step": 2740
2144
+ },
2145
+ {
2146
+ "epoch": 24.338530066815146,
2147
+ "grad_norm": 90.49283599853516,
2148
+ "learning_rate": 1.9841269841269841e-07,
2149
+ "loss": 0.3619,
2150
+ "step": 2750
2151
+ },
2152
+ {
2153
+ "epoch": 24.42761692650334,
2154
+ "grad_norm": 4.63253927230835,
2155
+ "learning_rate": 1.5873015873015874e-07,
2156
+ "loss": 0.2901,
2157
+ "step": 2760
2158
+ },
2159
+ {
2160
+ "epoch": 24.51670378619154,
2161
+ "grad_norm": 8.05462646484375,
2162
+ "learning_rate": 1.1904761904761906e-07,
2163
+ "loss": 0.1431,
2164
+ "step": 2770
2165
+ },
2166
+ {
2167
+ "epoch": 24.605790645879733,
2168
+ "grad_norm": 10.225213050842285,
2169
+ "learning_rate": 7.936507936507937e-08,
2170
+ "loss": 0.1209,
2171
+ "step": 2780
2172
+ },
2173
+ {
2174
+ "epoch": 24.694877505567927,
2175
+ "grad_norm": 23.83371925354004,
2176
+ "learning_rate": 3.9682539682539686e-08,
2177
+ "loss": 0.2688,
2178
+ "step": 2790
2179
+ },
2180
+ {
2181
+ "epoch": 24.783964365256125,
2182
+ "grad_norm": 0.8638786673545837,
2183
+ "learning_rate": 0.0,
2184
+ "loss": 0.1488,
2185
+ "step": 2800
2186
+ },
2187
+ {
2188
+ "epoch": 24.783964365256125,
2189
+ "eval_loss": 0.9561133980751038,
2190
+ "eval_macro_f1": 0.8017572519758047,
2191
+ "eval_runtime": 22.04,
2192
+ "eval_samples_per_second": 40.744,
2193
+ "eval_steps_per_second": 5.127,
2194
+ "step": 2800
2195
+ }
2196
+ ],
2197
+ "logging_steps": 10,
2198
+ "max_steps": 2800,
2199
+ "num_input_tokens_seen": 0,
2200
+ "num_train_epochs": 25,
2201
+ "save_steps": 500,
2202
+ "stateful_callbacks": {
2203
+ "TrainerControl": {
2204
+ "args": {
2205
+ "should_epoch_stop": false,
2206
+ "should_evaluate": false,
2207
+ "should_log": false,
2208
+ "should_save": true,
2209
+ "should_training_stop": true
2210
+ },
2211
+ "attributes": {}
2212
+ }
2213
+ },
2214
+ "total_flos": 3.1947072291024667e+19,
2215
+ "train_batch_size": 8,
2216
+ "trial_name": null,
2217
+ "trial_params": null
2218
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2aa91e9fb5f6fb80530bd2f0be3b57b5dc44f26b993ccf79d89846501db4b5a
3
+ size 5304