JonWen commited on
Commit
4b3e037
·
verified ·
1 Parent(s): 011259f

Upload google/gemma-3-270m-it-mmlu_pro/checkpoint-39500

Browse files
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3TextForSequenceClassification"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "dtype": "bfloat16",
11
+ "eos_token_id": 1,
12
+ "final_logit_softcapping": null,
13
+ "head_dim": 256,
14
+ "hidden_activation": "gelu_pytorch_tanh",
15
+ "hidden_size": 640,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 2048,
18
+ "layer_types": [
19
+ "sliding_attention",
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention"
37
+ ],
38
+ "max_position_embeddings": 32768,
39
+ "model_type": "gemma3_text",
40
+ "num_attention_heads": 4,
41
+ "num_hidden_layers": 18,
42
+ "num_key_value_heads": 1,
43
+ "pad_token_id": 0,
44
+ "problem_type": "single_label_classification",
45
+ "query_pre_attn_scalar": 256,
46
+ "rms_norm_eps": 1e-06,
47
+ "rope_local_base_freq": 10000.0,
48
+ "rope_scaling": null,
49
+ "rope_theta": 1000000.0,
50
+ "sliding_window": 512,
51
+ "transformers_version": "4.57.0",
52
+ "use_bidirectional_attention": false,
53
+ "use_cache": true,
54
+ "vocab_size": 262144
55
+ }
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:461d2b9411e96d9dafc990f32533120ea3b8bbba0640a14bb70c32f6fe676be7
3
+ size 536225696
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a57fbc99f48442fb93536edd9ed816c803b98b8d8753d048ed3d51e7c9f9ac51
3
+ size 1072600395
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc8125017ac64aaaf6b7b4d9fec3c97b36247e6f90303f4f65e7da22752a6021
3
+ size 16389
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e6feb4e71a30bbd30fb817d57cf38086ff8afd570f5fb462a4d672dc5b9c7f4
3
+ size 16389
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a783a95b392b2b425b72256e899035dee284c9de0ad7d762036e94693dc4f06
3
+ size 16389
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84676a652b2e127a0f26e9e98aa79f894b6a28b744b6c6af7db9b1981b7e8118
3
+ size 16389
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e404f18b3aced4d62989184b5ba490b18f3cfd416c392cac83f54a7cd7234908
3
+ size 16389
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e808e76e90d6a465ee4ea62c07518ee045190927828be150142f6f13cb63efa
3
+ size 16389
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b4ff2843553f32d6779fa593223bb2bc6507b59dc0801501e96de5709554f0b
3
+ size 16389
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a00cbd0fa2276ef03d7e54235bff23ff65e5559b2a4d871c31f6ccca7fc9a594
3
+ size 16389
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3494c805e517ad939ba54ad38ea90f5051d6f814cec2a5b0f1d515ca634f2d50
3
+ size 1465
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/trainer_state.json ADDED
@@ -0,0 +1,1094 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.5034989993754063,
6
+ "eval_steps": 1000,
7
+ "global_step": 39500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00637340505538489,
14
+ "grad_norm": 280.0,
15
+ "learning_rate": 6.359928626051491e-06,
16
+ "loss": 5.4594,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.01274681011076978,
21
+ "grad_norm": 100.0,
22
+ "learning_rate": 1.2732602600050984e-05,
23
+ "loss": 4.709,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.01274681011076978,
28
+ "eval_accuracy": 0.7175561479002666,
29
+ "eval_auc": 0.7208985686302185,
30
+ "eval_f1": 0.6381847876292697,
31
+ "eval_loss": 4.532001972198486,
32
+ "eval_precision": 0.6933309621576436,
33
+ "eval_recall": 0.6326105710782597,
34
+ "eval_runtime": 236.3818,
35
+ "eval_samples_per_second": 1901.322,
36
+ "eval_steps_per_second": 3.714,
37
+ "step": 1000
38
+ },
39
+ {
40
+ "epoch": 0.01912021516615467,
41
+ "grad_norm": 368.0,
42
+ "learning_rate": 1.9105276574050474e-05,
43
+ "loss": 4.5852,
44
+ "step": 1500
45
+ },
46
+ {
47
+ "epoch": 0.02549362022153956,
48
+ "grad_norm": 170.0,
49
+ "learning_rate": 2.547795054804996e-05,
50
+ "loss": 4.2857,
51
+ "step": 2000
52
+ },
53
+ {
54
+ "epoch": 0.02549362022153956,
55
+ "eval_accuracy": 0.7647461941357874,
56
+ "eval_auc": 0.7973974347114563,
57
+ "eval_f1": 0.701763515452194,
58
+ "eval_loss": 4.041171550750732,
59
+ "eval_precision": 0.7651343630299938,
60
+ "eval_recall": 0.6882129391743368,
61
+ "eval_runtime": 235.9253,
62
+ "eval_samples_per_second": 1905.002,
63
+ "eval_steps_per_second": 3.722,
64
+ "step": 2000
65
+ },
66
+ {
67
+ "epoch": 0.03186702527692445,
68
+ "grad_norm": 109.0,
69
+ "learning_rate": 3.1850624522049453e-05,
70
+ "loss": 3.8736,
71
+ "step": 2500
72
+ },
73
+ {
74
+ "epoch": 0.03824043033230934,
75
+ "grad_norm": 44.5,
76
+ "learning_rate": 3.822329849604894e-05,
77
+ "loss": 3.6724,
78
+ "step": 3000
79
+ },
80
+ {
81
+ "epoch": 0.03824043033230934,
82
+ "eval_accuracy": 0.8030896363903364,
83
+ "eval_auc": 0.8465206623077393,
84
+ "eval_f1": 0.7667088375827027,
85
+ "eval_loss": 3.5329034328460693,
86
+ "eval_precision": 0.793541421184558,
87
+ "eval_recall": 0.7536982564982242,
88
+ "eval_runtime": 235.8325,
89
+ "eval_samples_per_second": 1905.751,
90
+ "eval_steps_per_second": 3.723,
91
+ "step": 3000
92
+ },
93
+ {
94
+ "epoch": 0.04461383538769423,
95
+ "grad_norm": 36.0,
96
+ "learning_rate": 4.459597247004843e-05,
97
+ "loss": 3.5882,
98
+ "step": 3500
99
+ },
100
+ {
101
+ "epoch": 0.05098724044307912,
102
+ "grad_norm": 37.75,
103
+ "learning_rate": 5.0968646444047927e-05,
104
+ "loss": 3.5063,
105
+ "step": 4000
106
+ },
107
+ {
108
+ "epoch": 0.05098724044307912,
109
+ "eval_accuracy": 0.8067920380564171,
110
+ "eval_auc": 0.8545748591423035,
111
+ "eval_f1": 0.7629704100530446,
112
+ "eval_loss": 3.482598304748535,
113
+ "eval_precision": 0.8125964117429191,
114
+ "eval_recall": 0.7454246487633156,
115
+ "eval_runtime": 235.9569,
116
+ "eval_samples_per_second": 1904.746,
117
+ "eval_steps_per_second": 3.721,
118
+ "step": 4000
119
+ },
120
+ {
121
+ "epoch": 0.05736064549846401,
122
+ "grad_norm": 66.0,
123
+ "learning_rate": 5.734132041804742e-05,
124
+ "loss": 3.4417,
125
+ "step": 4500
126
+ },
127
+ {
128
+ "epoch": 0.0637340505538489,
129
+ "grad_norm": 48.0,
130
+ "learning_rate": 6.37139943920469e-05,
131
+ "loss": 3.3849,
132
+ "step": 5000
133
+ },
134
+ {
135
+ "epoch": 0.0637340505538489,
136
+ "eval_accuracy": 0.8106501897925854,
137
+ "eval_auc": 0.8611223697662354,
138
+ "eval_f1": 0.7847092912487569,
139
+ "eval_loss": 3.420646905899048,
140
+ "eval_precision": 0.7922936032572139,
141
+ "eval_recall": 0.778987144356641,
142
+ "eval_runtime": 236.031,
143
+ "eval_samples_per_second": 1904.148,
144
+ "eval_steps_per_second": 3.72,
145
+ "step": 5000
146
+ },
147
+ {
148
+ "epoch": 0.07010745560923379,
149
+ "grad_norm": 23.5,
150
+ "learning_rate": 7.008666836604639e-05,
151
+ "loss": 3.3603,
152
+ "step": 5500
153
+ },
154
+ {
155
+ "epoch": 0.07648086066461868,
156
+ "grad_norm": 27.75,
157
+ "learning_rate": 7.645934234004589e-05,
158
+ "loss": 3.344,
159
+ "step": 6000
160
+ },
161
+ {
162
+ "epoch": 0.07648086066461868,
163
+ "eval_accuracy": 0.8188337434751846,
164
+ "eval_auc": 0.8691284656524658,
165
+ "eval_f1": 0.7824235286935783,
166
+ "eval_loss": 3.3333048820495605,
167
+ "eval_precision": 0.8193413944376746,
168
+ "eval_recall": 0.7662003507186271,
169
+ "eval_runtime": 235.9449,
170
+ "eval_samples_per_second": 1904.843,
171
+ "eval_steps_per_second": 3.721,
172
+ "step": 6000
173
+ },
174
+ {
175
+ "epoch": 0.08285426572000357,
176
+ "grad_norm": 20.5,
177
+ "learning_rate": 8.283201631404538e-05,
178
+ "loss": 3.2919,
179
+ "step": 6500
180
+ },
181
+ {
182
+ "epoch": 0.08922767077538846,
183
+ "grad_norm": 17.25,
184
+ "learning_rate": 8.920469028804487e-05,
185
+ "loss": 3.27,
186
+ "step": 7000
187
+ },
188
+ {
189
+ "epoch": 0.08922767077538846,
190
+ "eval_accuracy": 0.8203645441640449,
191
+ "eval_auc": 0.870448887348175,
192
+ "eval_f1": 0.7899083336048272,
193
+ "eval_loss": 3.292984962463379,
194
+ "eval_precision": 0.8111738927346904,
195
+ "eval_recall": 0.7779126528172351,
196
+ "eval_runtime": 235.9365,
197
+ "eval_samples_per_second": 1904.911,
198
+ "eval_steps_per_second": 3.721,
199
+ "step": 7000
200
+ },
201
+ {
202
+ "epoch": 0.09560107583077335,
203
+ "grad_norm": 35.5,
204
+ "learning_rate": 9.557736426204435e-05,
205
+ "loss": 3.2578,
206
+ "step": 7500
207
+ },
208
+ {
209
+ "epoch": 0.10197448088615824,
210
+ "grad_norm": 22.75,
211
+ "learning_rate": 0.0001,
212
+ "loss": 3.2406,
213
+ "step": 8000
214
+ },
215
+ {
216
+ "epoch": 0.10197448088615824,
217
+ "eval_accuracy": 0.8261206217542798,
218
+ "eval_auc": 0.8779715895652771,
219
+ "eval_f1": 0.7968208595461465,
220
+ "eval_loss": 3.1795294284820557,
221
+ "eval_precision": 0.8180364024794908,
222
+ "eval_recall": 0.7846676093864756,
223
+ "eval_runtime": 235.8958,
224
+ "eval_samples_per_second": 1905.239,
225
+ "eval_steps_per_second": 3.722,
226
+ "step": 8000
227
+ },
228
+ {
229
+ "epoch": 0.10834788594154313,
230
+ "grad_norm": 26.25,
231
+ "learning_rate": 0.0001,
232
+ "loss": 3.2028,
233
+ "step": 8500
234
+ },
235
+ {
236
+ "epoch": 0.11472129099692802,
237
+ "grad_norm": 14.625,
238
+ "learning_rate": 0.0001,
239
+ "loss": 3.1689,
240
+ "step": 9000
241
+ },
242
+ {
243
+ "epoch": 0.11472129099692802,
244
+ "eval_accuracy": 0.8285325228396353,
245
+ "eval_auc": 0.8811337947845459,
246
+ "eval_f1": 0.8013452045114392,
247
+ "eval_loss": 3.1494317054748535,
248
+ "eval_precision": 0.8182039479298111,
249
+ "eval_recall": 0.7908135604045534,
250
+ "eval_runtime": 235.9141,
251
+ "eval_samples_per_second": 1905.092,
252
+ "eval_steps_per_second": 3.722,
253
+ "step": 9000
254
+ },
255
+ {
256
+ "epoch": 0.1210946960523129,
257
+ "grad_norm": 13.4375,
258
+ "learning_rate": 0.0001,
259
+ "loss": 3.1133,
260
+ "step": 9500
261
+ },
262
+ {
263
+ "epoch": 0.1274681011076978,
264
+ "grad_norm": 36.0,
265
+ "learning_rate": 0.0001,
266
+ "loss": 3.0911,
267
+ "step": 10000
268
+ },
269
+ {
270
+ "epoch": 0.1274681011076978,
271
+ "eval_accuracy": 0.8282343727054677,
272
+ "eval_auc": 0.8881419897079468,
273
+ "eval_f1": 0.8074358690960415,
274
+ "eval_loss": 3.1354596614837646,
275
+ "eval_precision": 0.8100602141990728,
276
+ "eval_recall": 0.8050712691013812,
277
+ "eval_runtime": 235.8981,
278
+ "eval_samples_per_second": 1905.221,
279
+ "eval_steps_per_second": 3.722,
280
+ "step": 10000
281
+ },
282
+ {
283
+ "epoch": 0.13384150616308269,
284
+ "grad_norm": 14.75,
285
+ "learning_rate": 0.0001,
286
+ "loss": 3.0308,
287
+ "step": 10500
288
+ },
289
+ {
290
+ "epoch": 0.14021491121846757,
291
+ "grad_norm": 12.75,
292
+ "learning_rate": 0.0001,
293
+ "loss": 3.015,
294
+ "step": 11000
295
+ },
296
+ {
297
+ "epoch": 0.14021491121846757,
298
+ "eval_accuracy": 0.8361620512729231,
299
+ "eval_auc": 0.8948841691017151,
300
+ "eval_f1": 0.8145356177908325,
301
+ "eval_loss": 3.019545555114746,
302
+ "eval_precision": 0.8209786628512243,
303
+ "eval_recall": 0.8093553565236841,
304
+ "eval_runtime": 235.8842,
305
+ "eval_samples_per_second": 1905.333,
306
+ "eval_steps_per_second": 3.722,
307
+ "step": 11000
308
+ },
309
+ {
310
+ "epoch": 0.14658831627385246,
311
+ "grad_norm": 20.25,
312
+ "learning_rate": 0.0001,
313
+ "loss": 2.9743,
314
+ "step": 11500
315
+ },
316
+ {
317
+ "epoch": 0.15296172132923735,
318
+ "grad_norm": 12.3125,
319
+ "learning_rate": 0.0001,
320
+ "loss": 2.9401,
321
+ "step": 12000
322
+ },
323
+ {
324
+ "epoch": 0.15296172132923735,
325
+ "eval_accuracy": 0.8361954262879419,
326
+ "eval_auc": 0.8988780379295349,
327
+ "eval_f1": 0.8173352478602913,
328
+ "eval_loss": 3.0004172325134277,
329
+ "eval_precision": 0.8181878263931783,
330
+ "eval_recall": 0.8165118432863682,
331
+ "eval_runtime": 235.9073,
332
+ "eval_samples_per_second": 1905.147,
333
+ "eval_steps_per_second": 3.722,
334
+ "step": 12000
335
+ },
336
+ {
337
+ "epoch": 0.15933512638462224,
338
+ "grad_norm": 15.25,
339
+ "learning_rate": 0.0001,
340
+ "loss": 2.908,
341
+ "step": 12500
342
+ },
343
+ {
344
+ "epoch": 0.16570853144000713,
345
+ "grad_norm": 13.875,
346
+ "learning_rate": 0.0001,
347
+ "loss": 2.8787,
348
+ "step": 13000
349
+ },
350
+ {
351
+ "epoch": 0.16570853144000713,
352
+ "eval_accuracy": 0.8437960297082133,
353
+ "eval_auc": 0.9022700190544128,
354
+ "eval_f1": 0.8193442741933041,
355
+ "eval_loss": 2.874938726425171,
356
+ "eval_precision": 0.8361642928355996,
357
+ "eval_recall": 0.8085115166264416,
358
+ "eval_runtime": 235.9543,
359
+ "eval_samples_per_second": 1904.767,
360
+ "eval_steps_per_second": 3.721,
361
+ "step": 13000
362
+ },
363
+ {
364
+ "epoch": 0.17208193649539202,
365
+ "grad_norm": 16.375,
366
+ "learning_rate": 0.0001,
367
+ "loss": 2.8756,
368
+ "step": 13500
369
+ },
370
+ {
371
+ "epoch": 0.1784553415507769,
372
+ "grad_norm": 12.125,
373
+ "learning_rate": 0.0001,
374
+ "loss": 2.8368,
375
+ "step": 14000
376
+ },
377
+ {
378
+ "epoch": 0.1784553415507769,
379
+ "eval_accuracy": 0.8476319314343691,
380
+ "eval_auc": 0.9060380458831787,
381
+ "eval_f1": 0.8228087161610238,
382
+ "eval_loss": 2.818519353866577,
383
+ "eval_precision": 0.8428622503963941,
384
+ "eval_recall": 0.8105203055669887,
385
+ "eval_runtime": 235.8665,
386
+ "eval_samples_per_second": 1905.476,
387
+ "eval_steps_per_second": 3.722,
388
+ "step": 14000
389
+ },
390
+ {
391
+ "epoch": 0.1848287466061618,
392
+ "grad_norm": 17.625,
393
+ "learning_rate": 0.0001,
394
+ "loss": 2.815,
395
+ "step": 14500
396
+ },
397
+ {
398
+ "epoch": 0.1912021516615467,
399
+ "grad_norm": 14.625,
400
+ "learning_rate": 0.0001,
401
+ "loss": 2.7892,
402
+ "step": 15000
403
+ },
404
+ {
405
+ "epoch": 0.1912021516615467,
406
+ "eval_accuracy": 0.8496433323394995,
407
+ "eval_auc": 0.9092991948127747,
408
+ "eval_f1": 0.8246497759245118,
409
+ "eval_loss": 2.790006637573242,
410
+ "eval_precision": 0.8464656761081797,
411
+ "eval_recall": 0.8116097713844176,
412
+ "eval_runtime": 235.9319,
413
+ "eval_samples_per_second": 1904.948,
414
+ "eval_steps_per_second": 3.721,
415
+ "step": 15000
416
+ },
417
+ {
418
+ "epoch": 0.19757555671693158,
419
+ "grad_norm": 13.875,
420
+ "learning_rate": 0.0001,
421
+ "loss": 2.7762,
422
+ "step": 15500
423
+ },
424
+ {
425
+ "epoch": 0.20394896177231647,
426
+ "grad_norm": 14.1875,
427
+ "learning_rate": 0.0001,
428
+ "loss": 2.7388,
429
+ "step": 16000
430
+ },
431
+ {
432
+ "epoch": 0.20394896177231647,
433
+ "eval_accuracy": 0.8512942830824274,
434
+ "eval_auc": 0.9118760228157043,
435
+ "eval_f1": 0.8312896382532227,
436
+ "eval_loss": 2.753647565841675,
437
+ "eval_precision": 0.8390426673615496,
438
+ "eval_recall": 0.8251724666807414,
439
+ "eval_runtime": 235.9135,
440
+ "eval_samples_per_second": 1905.096,
441
+ "eval_steps_per_second": 3.722,
442
+ "step": 16000
443
+ },
444
+ {
445
+ "epoch": 0.21032236682770136,
446
+ "grad_norm": 14.4375,
447
+ "learning_rate": 0.0001,
448
+ "loss": 2.739,
449
+ "step": 16500
450
+ },
451
+ {
452
+ "epoch": 0.21669577188308625,
453
+ "grad_norm": 19.25,
454
+ "learning_rate": 0.0001,
455
+ "loss": 2.7192,
456
+ "step": 17000
457
+ },
458
+ {
459
+ "epoch": 0.21669577188308625,
460
+ "eval_accuracy": 0.854524984536243,
461
+ "eval_auc": 0.9146767854690552,
462
+ "eval_f1": 0.8303229470241833,
463
+ "eval_loss": 2.7026894092559814,
464
+ "eval_precision": 0.8525518966758392,
465
+ "eval_recall": 0.8169920703438669,
466
+ "eval_runtime": 235.9956,
467
+ "eval_samples_per_second": 1904.434,
468
+ "eval_steps_per_second": 3.72,
469
+ "step": 17000
470
+ },
471
+ {
472
+ "epoch": 0.22306917693847114,
473
+ "grad_norm": 16.875,
474
+ "learning_rate": 0.0001,
475
+ "loss": 2.7023,
476
+ "step": 17500
477
+ },
478
+ {
479
+ "epoch": 0.22944258199385603,
480
+ "grad_norm": 13.125,
481
+ "learning_rate": 0.0001,
482
+ "loss": 2.6888,
483
+ "step": 18000
484
+ },
485
+ {
486
+ "epoch": 0.22944258199385603,
487
+ "eval_accuracy": 0.8558866851490083,
488
+ "eval_auc": 0.916471540927887,
489
+ "eval_f1": 0.8312201448847177,
490
+ "eval_loss": 2.6786839962005615,
491
+ "eval_precision": 0.8560204841421162,
492
+ "eval_recall": 0.8168747132432559,
493
+ "eval_runtime": 235.8807,
494
+ "eval_samples_per_second": 1905.361,
495
+ "eval_steps_per_second": 3.722,
496
+ "step": 18000
497
+ },
498
+ {
499
+ "epoch": 0.23581598704924092,
500
+ "grad_norm": 13.25,
501
+ "learning_rate": 0.0001,
502
+ "loss": 2.6453,
503
+ "step": 18500
504
+ },
505
+ {
506
+ "epoch": 0.2421893921046258,
507
+ "grad_norm": 17.75,
508
+ "learning_rate": 0.0001,
509
+ "loss": 2.6318,
510
+ "step": 19000
511
+ },
512
+ {
513
+ "epoch": 0.2421893921046258,
514
+ "eval_accuracy": 0.8580071111032,
515
+ "eval_auc": 0.9181417226791382,
516
+ "eval_f1": 0.8353620807604387,
517
+ "eval_loss": 2.646692991256714,
518
+ "eval_precision": 0.854392404869657,
519
+ "eval_recall": 0.8232679476079143,
520
+ "eval_runtime": 236.966,
521
+ "eval_samples_per_second": 1896.635,
522
+ "eval_steps_per_second": 3.705,
523
+ "step": 19000
524
+ },
525
+ {
526
+ "epoch": 0.2485627971600107,
527
+ "grad_norm": 11.5625,
528
+ "learning_rate": 0.0001,
529
+ "loss": 2.6293,
530
+ "step": 19500
531
+ },
532
+ {
533
+ "epoch": 0.2549362022153956,
534
+ "grad_norm": 12.875,
535
+ "learning_rate": 0.0001,
536
+ "loss": 2.6017,
537
+ "step": 20000
538
+ },
539
+ {
540
+ "epoch": 0.2549362022153956,
541
+ "eval_accuracy": 0.8599740119883054,
542
+ "eval_auc": 0.9203953146934509,
543
+ "eval_f1": 0.8392022322375958,
544
+ "eval_loss": 2.6171152591705322,
545
+ "eval_precision": 0.8530874017696329,
546
+ "eval_recall": 0.8295022195226356,
547
+ "eval_runtime": 235.9321,
548
+ "eval_samples_per_second": 1904.946,
549
+ "eval_steps_per_second": 3.721,
550
+ "step": 20000
551
+ },
552
+ {
553
+ "epoch": 0.2613096072707805,
554
+ "grad_norm": 17.625,
555
+ "learning_rate": 0.0001,
556
+ "loss": 2.6217,
557
+ "step": 20500
558
+ },
559
+ {
560
+ "epoch": 0.26768301232616537,
561
+ "grad_norm": 14.3125,
562
+ "learning_rate": 0.0001,
563
+ "loss": 2.5889,
564
+ "step": 21000
565
+ },
566
+ {
567
+ "epoch": 0.26768301232616537,
568
+ "eval_accuracy": 0.8611532625189682,
569
+ "eval_auc": 0.9224462509155273,
570
+ "eval_f1": 0.8408396987376525,
571
+ "eval_loss": 2.5873217582702637,
572
+ "eval_precision": 0.853855816993957,
573
+ "eval_recall": 0.8315778560709257,
574
+ "eval_runtime": 235.8945,
575
+ "eval_samples_per_second": 1905.25,
576
+ "eval_steps_per_second": 3.722,
577
+ "step": 21000
578
+ },
579
+ {
580
+ "epoch": 0.2740564173815503,
581
+ "grad_norm": 11.1875,
582
+ "learning_rate": 0.0001,
583
+ "loss": 2.5848,
584
+ "step": 21500
585
+ },
586
+ {
587
+ "epoch": 0.28042982243693515,
588
+ "grad_norm": 15.0625,
589
+ "learning_rate": 0.0001,
590
+ "loss": 2.5618,
591
+ "step": 22000
592
+ },
593
+ {
594
+ "epoch": 0.28042982243693515,
595
+ "eval_accuracy": 0.8609263124168406,
596
+ "eval_auc": 0.9236695170402527,
597
+ "eval_f1": 0.8428097613168799,
598
+ "eval_loss": 2.586594820022583,
599
+ "eval_precision": 0.8491212608075118,
600
+ "eval_recall": 0.8376013596126513,
601
+ "eval_runtime": 235.8797,
602
+ "eval_samples_per_second": 1905.37,
603
+ "eval_steps_per_second": 3.722,
604
+ "step": 22000
605
+ },
606
+ {
607
+ "epoch": 0.28680322749232007,
608
+ "grad_norm": 15.875,
609
+ "learning_rate": 0.0001,
610
+ "loss": 2.5587,
611
+ "step": 22500
612
+ },
613
+ {
614
+ "epoch": 0.29317663254770493,
615
+ "grad_norm": 16.0,
616
+ "learning_rate": 0.0001,
617
+ "loss": 2.5198,
618
+ "step": 23000
619
+ },
620
+ {
621
+ "epoch": 0.29317663254770493,
622
+ "eval_accuracy": 0.8641258638566387,
623
+ "eval_auc": 0.9258681535720825,
624
+ "eval_f1": 0.8433215002829988,
625
+ "eval_loss": 2.530731439590454,
626
+ "eval_precision": 0.8596180420798043,
627
+ "eval_recall": 0.8323607774926369,
628
+ "eval_runtime": 236.896,
629
+ "eval_samples_per_second": 1897.195,
630
+ "eval_steps_per_second": 3.706,
631
+ "step": 23000
632
+ },
633
+ {
634
+ "epoch": 0.29955003760308985,
635
+ "grad_norm": 16.125,
636
+ "learning_rate": 0.0001,
637
+ "loss": 2.5324,
638
+ "step": 23500
639
+ },
640
+ {
641
+ "epoch": 0.3059234426584747,
642
+ "grad_norm": 14.9375,
643
+ "learning_rate": 0.0001,
644
+ "loss": 2.522,
645
+ "step": 24000
646
+ },
647
+ {
648
+ "epoch": 0.3059234426584747,
649
+ "eval_accuracy": 0.8645819890618951,
650
+ "eval_auc": 0.9265353083610535,
651
+ "eval_f1": 0.8449232507125215,
652
+ "eval_loss": 2.515869379043579,
653
+ "eval_precision": 0.8575695182961542,
654
+ "eval_recall": 0.8358181801726667,
655
+ "eval_runtime": 235.8798,
656
+ "eval_samples_per_second": 1905.369,
657
+ "eval_steps_per_second": 3.722,
658
+ "step": 24000
659
+ },
660
+ {
661
+ "epoch": 0.3122968477138596,
662
+ "grad_norm": 13.75,
663
+ "learning_rate": 0.0001,
664
+ "loss": 2.498,
665
+ "step": 24500
666
+ },
667
+ {
668
+ "epoch": 0.3186702527692445,
669
+ "grad_norm": 16.25,
670
+ "learning_rate": 0.0001,
671
+ "loss": 2.486,
672
+ "step": 25000
673
+ },
674
+ {
675
+ "epoch": 0.3186702527692445,
676
+ "eval_accuracy": 0.8657857146035716,
677
+ "eval_auc": 0.9282439947128296,
678
+ "eval_f1": 0.8459563365344265,
679
+ "eval_loss": 2.492276191711426,
680
+ "eval_precision": 0.8598189954610617,
681
+ "eval_recall": 0.8361893429769411,
682
+ "eval_runtime": 235.8558,
683
+ "eval_samples_per_second": 1905.563,
684
+ "eval_steps_per_second": 3.723,
685
+ "step": 25000
686
+ },
687
+ {
688
+ "epoch": 0.3250436578246294,
689
+ "grad_norm": 15.6875,
690
+ "learning_rate": 0.0001,
691
+ "loss": 2.4922,
692
+ "step": 25500
693
+ },
694
+ {
695
+ "epoch": 0.33141706288001427,
696
+ "grad_norm": 14.875,
697
+ "learning_rate": 0.0001,
698
+ "loss": 2.4656,
699
+ "step": 26000
700
+ },
701
+ {
702
+ "epoch": 0.33141706288001427,
703
+ "eval_accuracy": 0.8665488899470005,
704
+ "eval_auc": 0.929278552532196,
705
+ "eval_f1": 0.8470851473256837,
706
+ "eval_loss": 2.4791793823242188,
707
+ "eval_precision": 0.8601149529519857,
708
+ "eval_recall": 0.8377495469299366,
709
+ "eval_runtime": 235.9794,
710
+ "eval_samples_per_second": 1904.565,
711
+ "eval_steps_per_second": 3.721,
712
+ "step": 26000
713
+ },
714
+ {
715
+ "epoch": 0.3377904679353992,
716
+ "grad_norm": 16.875,
717
+ "learning_rate": 0.0001,
718
+ "loss": 2.4566,
719
+ "step": 26500
720
+ },
721
+ {
722
+ "epoch": 0.34416387299078405,
723
+ "grad_norm": 14.875,
724
+ "learning_rate": 0.0001,
725
+ "loss": 2.4475,
726
+ "step": 27000
727
+ },
728
+ {
729
+ "epoch": 0.34416387299078405,
730
+ "eval_accuracy": 0.8678304905237207,
731
+ "eval_auc": 0.930943489074707,
732
+ "eval_f1": 0.847395770631417,
733
+ "eval_loss": 2.4564414024353027,
734
+ "eval_precision": 0.8646021959492661,
735
+ "eval_recall": 0.8359379618427593,
736
+ "eval_runtime": 235.96,
737
+ "eval_samples_per_second": 1904.721,
738
+ "eval_steps_per_second": 3.721,
739
+ "step": 27000
740
+ },
741
+ {
742
+ "epoch": 0.35053727804616897,
743
+ "grad_norm": 16.625,
744
+ "learning_rate": 0.0001,
745
+ "loss": 2.4558,
746
+ "step": 27500
747
+ },
748
+ {
749
+ "epoch": 0.3569106831015538,
750
+ "grad_norm": 12.6875,
751
+ "learning_rate": 0.0001,
752
+ "loss": 2.4269,
753
+ "step": 28000
754
+ },
755
+ {
756
+ "epoch": 0.3569106831015538,
757
+ "eval_accuracy": 0.8688740159933072,
758
+ "eval_auc": 0.931858241558075,
759
+ "eval_f1": 0.850913963051127,
760
+ "eval_loss": 2.438432216644287,
761
+ "eval_precision": 0.860130532609005,
762
+ "eval_recall": 0.8437573238621774,
763
+ "eval_runtime": 235.7882,
764
+ "eval_samples_per_second": 1906.109,
765
+ "eval_steps_per_second": 3.724,
766
+ "step": 28000
767
+ },
768
+ {
769
+ "epoch": 0.36328408815693874,
770
+ "grad_norm": 14.75,
771
+ "learning_rate": 0.0001,
772
+ "loss": 2.4417,
773
+ "step": 28500
774
+ },
775
+ {
776
+ "epoch": 0.3696574932123236,
777
+ "grad_norm": 14.625,
778
+ "learning_rate": 0.0001,
779
+ "loss": 2.4049,
780
+ "step": 29000
781
+ },
782
+ {
783
+ "epoch": 0.3696574932123236,
784
+ "eval_accuracy": 0.8687049159172122,
785
+ "eval_auc": 0.9325141310691833,
786
+ "eval_f1": 0.8503633017719338,
787
+ "eval_loss": 2.4242045879364014,
788
+ "eval_precision": 0.8607433942000131,
789
+ "eval_recall": 0.8425017750352706,
790
+ "eval_runtime": 235.9336,
791
+ "eval_samples_per_second": 1904.934,
792
+ "eval_steps_per_second": 3.721,
793
+ "step": 29000
794
+ },
795
+ {
796
+ "epoch": 0.3760308982677085,
797
+ "grad_norm": 13.5625,
798
+ "learning_rate": 0.0001,
799
+ "loss": 2.4041,
800
+ "step": 29500
801
+ },
802
+ {
803
+ "epoch": 0.3824043033230934,
804
+ "grad_norm": 16.75,
805
+ "learning_rate": 0.0001,
806
+ "loss": 2.4019,
807
+ "step": 30000
808
+ },
809
+ {
810
+ "epoch": 0.3824043033230934,
811
+ "eval_accuracy": 0.8692478161615173,
812
+ "eval_auc": 0.9345240592956543,
813
+ "eval_f1": 0.853530809841845,
814
+ "eval_loss": 2.423635959625244,
815
+ "eval_precision": 0.8562227161992152,
816
+ "eval_recall": 0.8510652656840554,
817
+ "eval_runtime": 235.9233,
818
+ "eval_samples_per_second": 1905.017,
819
+ "eval_steps_per_second": 3.722,
820
+ "step": 30000
821
+ },
822
+ {
823
+ "epoch": 0.3887777083784783,
824
+ "grad_norm": 18.5,
825
+ "learning_rate": 0.0001,
826
+ "loss": 2.3713,
827
+ "step": 30500
828
+ },
829
+ {
830
+ "epoch": 0.39515111343386317,
831
+ "grad_norm": 14.875,
832
+ "learning_rate": 0.0001,
833
+ "loss": 2.3798,
834
+ "step": 31000
835
+ },
836
+ {
837
+ "epoch": 0.39515111343386317,
838
+ "eval_accuracy": 0.8717932173069478,
839
+ "eval_auc": 0.9350019097328186,
840
+ "eval_f1": 0.8538149679524644,
841
+ "eval_loss": 2.3812272548675537,
842
+ "eval_precision": 0.8645258042401223,
843
+ "eval_recall": 0.8457306918961859,
844
+ "eval_runtime": 236.9275,
845
+ "eval_samples_per_second": 1896.943,
846
+ "eval_steps_per_second": 3.706,
847
+ "step": 31000
848
+ },
849
+ {
850
+ "epoch": 0.4015245184892481,
851
+ "grad_norm": 14.875,
852
+ "learning_rate": 0.0001,
853
+ "loss": 2.3885,
854
+ "step": 31500
855
+ },
856
+ {
857
+ "epoch": 0.40789792354463295,
858
+ "grad_norm": 14.9375,
859
+ "learning_rate": 0.0001,
860
+ "loss": 2.3697,
861
+ "step": 32000
862
+ },
863
+ {
864
+ "epoch": 0.40789792354463295,
865
+ "eval_accuracy": 0.8725185676333554,
866
+ "eval_auc": 0.9362517595291138,
867
+ "eval_f1": 0.8536967960754542,
868
+ "eval_loss": 2.366279363632202,
869
+ "eval_precision": 0.8677993339914021,
870
+ "eval_recall": 0.8437154760316758,
871
+ "eval_runtime": 235.8823,
872
+ "eval_samples_per_second": 1905.349,
873
+ "eval_steps_per_second": 3.722,
874
+ "step": 32000
875
+ },
876
+ {
877
+ "epoch": 0.41427132860001786,
878
+ "grad_norm": 15.3125,
879
+ "learning_rate": 0.0001,
880
+ "loss": 2.3609,
881
+ "step": 32500
882
+ },
883
+ {
884
+ "epoch": 0.4206447336554027,
885
+ "grad_norm": 14.6875,
886
+ "learning_rate": 0.0001,
887
+ "loss": 2.3457,
888
+ "step": 33000
889
+ },
890
+ {
891
+ "epoch": 0.4206447336554027,
892
+ "eval_accuracy": 0.8736199431289744,
893
+ "eval_auc": 0.937017023563385,
894
+ "eval_f1": 0.8555537856349047,
895
+ "eval_loss": 2.3477745056152344,
896
+ "eval_precision": 0.8675405385181937,
897
+ "eval_recall": 0.8467145886295767,
898
+ "eval_runtime": 236.0095,
899
+ "eval_samples_per_second": 1904.321,
900
+ "eval_steps_per_second": 3.72,
901
+ "step": 33000
902
+ },
903
+ {
904
+ "epoch": 0.42701813871078764,
905
+ "grad_norm": 15.9375,
906
+ "learning_rate": 0.0001,
907
+ "loss": 2.3372,
908
+ "step": 33500
909
+ },
910
+ {
911
+ "epoch": 0.4333915437661725,
912
+ "grad_norm": 15.125,
913
+ "learning_rate": 0.0001,
914
+ "loss": 2.3387,
915
+ "step": 34000
916
+ },
917
+ {
918
+ "epoch": 0.4333915437661725,
919
+ "eval_accuracy": 0.8744187184884233,
920
+ "eval_auc": 0.9379249811172485,
921
+ "eval_f1": 0.8563329267012663,
922
+ "eval_loss": 2.3327484130859375,
923
+ "eval_precision": 0.8688335161622546,
924
+ "eval_recall": 0.8471966069466921,
925
+ "eval_runtime": 235.9082,
926
+ "eval_samples_per_second": 1905.139,
927
+ "eval_steps_per_second": 3.722,
928
+ "step": 34000
929
+ },
930
+ {
931
+ "epoch": 0.4397649488215574,
932
+ "grad_norm": 13.5625,
933
+ "learning_rate": 0.0001,
934
+ "loss": 2.3124,
935
+ "step": 34500
936
+ },
937
+ {
938
+ "epoch": 0.4461383538769423,
939
+ "grad_norm": 15.0,
940
+ "learning_rate": 0.0001,
941
+ "loss": 2.3108,
942
+ "step": 35000
943
+ },
944
+ {
945
+ "epoch": 0.4461383538769423,
946
+ "eval_accuracy": 0.8752486438618897,
947
+ "eval_auc": 0.9388136267662048,
948
+ "eval_f1": 0.857440432905537,
949
+ "eval_loss": 2.3170506954193115,
950
+ "eval_precision": 0.8693945403391463,
951
+ "eval_recall": 0.8486030934280269,
952
+ "eval_runtime": 237.1783,
953
+ "eval_samples_per_second": 1894.937,
954
+ "eval_steps_per_second": 3.702,
955
+ "step": 35000
956
+ },
957
+ {
958
+ "epoch": 0.4525117589323272,
959
+ "grad_norm": 16.5,
960
+ "learning_rate": 0.0001,
961
+ "loss": 2.3104,
962
+ "step": 35500
963
+ },
964
+ {
965
+ "epoch": 0.45888516398771206,
966
+ "grad_norm": 21.0,
967
+ "learning_rate": 0.0001,
968
+ "loss": 2.3093,
969
+ "step": 36000
970
+ },
971
+ {
972
+ "epoch": 0.45888516398771206,
973
+ "eval_accuracy": 0.8749304687187109,
974
+ "eval_auc": 0.9399862289428711,
975
+ "eval_f1": 0.859832853201925,
976
+ "eval_loss": 2.3292479515075684,
977
+ "eval_precision": 0.8627566786638952,
978
+ "eval_recall": 0.8571670902820068,
979
+ "eval_runtime": 235.9848,
980
+ "eval_samples_per_second": 1904.521,
981
+ "eval_steps_per_second": 3.721,
982
+ "step": 36000
983
+ },
984
+ {
985
+ "epoch": 0.465258569043097,
986
+ "grad_norm": 15.8125,
987
+ "learning_rate": 0.0001,
988
+ "loss": 2.3037,
989
+ "step": 36500
990
+ },
991
+ {
992
+ "epoch": 0.47163197409848184,
993
+ "grad_norm": 15.3125,
994
+ "learning_rate": 0.0001,
995
+ "loss": 2.2886,
996
+ "step": 37000
997
+ },
998
+ {
999
+ "epoch": 0.47163197409848184,
1000
+ "eval_accuracy": 0.8750083437537547,
1001
+ "eval_auc": 0.9404018521308899,
1002
+ "eval_f1": 0.855230925034759,
1003
+ "eval_loss": 2.3024775981903076,
1004
+ "eval_precision": 0.8746376825597328,
1005
+ "eval_recall": 0.8426211541705337,
1006
+ "eval_runtime": 236.0698,
1007
+ "eval_samples_per_second": 1903.835,
1008
+ "eval_steps_per_second": 3.719,
1009
+ "step": 37000
1010
+ },
1011
+ {
1012
+ "epoch": 0.47800537915386676,
1013
+ "grad_norm": 14.8125,
1014
+ "learning_rate": 0.0001,
1015
+ "loss": 2.2959,
1016
+ "step": 37500
1017
+ },
1018
+ {
1019
+ "epoch": 0.4843787842092516,
1020
+ "grad_norm": 16.625,
1021
+ "learning_rate": 0.0001,
1022
+ "loss": 2.2607,
1023
+ "step": 38000
1024
+ },
1025
+ {
1026
+ "epoch": 0.4843787842092516,
1027
+ "eval_accuracy": 0.877444719850124,
1028
+ "eval_auc": 0.941037118434906,
1029
+ "eval_f1": 0.8605285488101033,
1030
+ "eval_loss": 2.27842378616333,
1031
+ "eval_precision": 0.8704496429359335,
1032
+ "eval_recall": 0.8528732041380341,
1033
+ "eval_runtime": 236.143,
1034
+ "eval_samples_per_second": 1903.245,
1035
+ "eval_steps_per_second": 3.718,
1036
+ "step": 38000
1037
+ },
1038
+ {
1039
+ "epoch": 0.49075218926463654,
1040
+ "grad_norm": 14.6875,
1041
+ "learning_rate": 0.0001,
1042
+ "loss": 2.2675,
1043
+ "step": 38500
1044
+ },
1045
+ {
1046
+ "epoch": 0.4971255943200214,
1047
+ "grad_norm": 18.75,
1048
+ "learning_rate": 0.0001,
1049
+ "loss": 2.2426,
1050
+ "step": 39000
1051
+ },
1052
+ {
1053
+ "epoch": 0.4971255943200214,
1054
+ "eval_accuracy": 0.8774046698321014,
1055
+ "eval_auc": 0.9419523477554321,
1056
+ "eval_f1": 0.860408547862752,
1057
+ "eval_loss": 2.262415647506714,
1058
+ "eval_precision": 0.8705951091265102,
1059
+ "eval_recall": 0.8525912689551651,
1060
+ "eval_runtime": 236.1607,
1061
+ "eval_samples_per_second": 1903.103,
1062
+ "eval_steps_per_second": 3.718,
1063
+ "step": 39000
1064
+ },
1065
+ {
1066
+ "epoch": 0.5034989993754063,
1067
+ "grad_norm": 16.75,
1068
+ "learning_rate": 0.0001,
1069
+ "loss": 2.2544,
1070
+ "step": 39500
1071
+ }
1072
+ ],
1073
+ "logging_steps": 500,
1074
+ "max_steps": 78451,
1075
+ "num_input_tokens_seen": 0,
1076
+ "num_train_epochs": 1,
1077
+ "save_steps": 500,
1078
+ "stateful_callbacks": {
1079
+ "TrainerControl": {
1080
+ "args": {
1081
+ "should_epoch_stop": false,
1082
+ "should_evaluate": false,
1083
+ "should_log": false,
1084
+ "should_save": true,
1085
+ "should_training_stop": false
1086
+ },
1087
+ "attributes": {}
1088
+ }
1089
+ },
1090
+ "total_flos": 9.105054320734241e+18,
1091
+ "train_batch_size": 64,
1092
+ "trial_name": null,
1093
+ "trial_params": null
1094
+ }
google/gemma-3-270m-it-mmlu_pro/checkpoint-39500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88ef4a3915b8e9506ab39a5ba6141f508f9a71f0225eebc339973f2f4e19a361
3
+ size 5841