Hugging-GK committed on
Commit
2d1ea20
·
verified ·
1 Parent(s): 29f3db7

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +42 -126
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5a50d5521adeff20945a7719949087357d4cd8f287db88632b0da4e7b7caf85
3
  size 664584480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e325566f131c65e9ade91ef4d6b36bf75f4d6d3f099460bf3dccbd2564d075c3
3
  size 664584480
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb68b11a39516e8f06bd559c4d177450f1a4e9dfebf14172a2847449ea3a0597
3
  size 1329377575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb20f5cc62e14285d3b8a72eaca1ae71faf3b293ee954757b7f91023a0fca6d9
3
  size 1329377575
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d85b518b1add175fef95a2cd66cf2e301b338be446b49c058da16fb22164c09d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1599b40990af505c591b8a948a922af153eca9aff68fc7776ee92dc8b2dd2b0
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f36b49862996c84ec71df64aef02c8b667bafc5196ea03b019509144d50963b4
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8db2bbbb37be651c165fdb7fb553d8117ad58dbacc21093c0131c68d882ce6e4
3
  size 1465
trainer_state.json CHANGED
@@ -1,160 +1,76 @@
1
  {
2
- "best_global_step": 140,
3
- "best_metric": 0.21776749193668365,
4
- "best_model_checkpoint": "/content/models/gemma_jigsaw_lmh/checkpoint-140",
5
- "epoch": 2.7450980392156863,
6
  "eval_steps": 20,
7
- "global_step": 140,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 2.5688398241996766,
14
  "epoch": 0.39215686274509803,
15
- "grad_norm": 6.326695442199707,
16
  "learning_rate": 8.758169934640524e-06,
17
- "loss": 0.5574,
18
- "mean_token_accuracy": 0.75078125,
19
- "num_tokens": 67412.0,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.39215686274509803,
24
- "eval_entropy": 2.5053940552931566,
25
- "eval_loss": 0.3418131172657013,
26
- "eval_mean_token_accuracy": 0.793924826842088,
27
- "eval_num_tokens": 67412.0,
28
- "eval_runtime": 2.6834,
29
- "eval_samples_per_second": 75.649,
30
- "eval_steps_per_second": 4.845,
31
  "step": 20
32
  },
33
  {
34
- "entropy": 2.434329295158386,
35
  "epoch": 0.7843137254901961,
36
- "grad_norm": 7.59612512588501,
37
  "learning_rate": 7.450980392156863e-06,
38
- "loss": 0.3467,
39
- "mean_token_accuracy": 0.80703125,
40
- "num_tokens": 134808.0,
41
  "step": 40
42
  },
43
  {
44
  "epoch": 0.7843137254901961,
45
- "eval_entropy": 2.478470985706036,
46
- "eval_loss": 0.28553512692451477,
47
- "eval_mean_token_accuracy": 0.8583916104756869,
48
- "eval_num_tokens": 134808.0,
49
- "eval_runtime": 2.7069,
50
- "eval_samples_per_second": 74.993,
51
- "eval_steps_per_second": 4.803,
52
  "step": 40
53
  },
54
  {
55
- "entropy": 2.4949795722961428,
56
  "epoch": 1.1764705882352942,
57
- "grad_norm": 9.272110939025879,
58
  "learning_rate": 6.143790849673204e-06,
59
- "loss": 0.2735,
60
- "mean_token_accuracy": 0.8612723216414452,
61
- "num_tokens": 201046.0,
62
  "step": 60
63
  },
64
  {
65
  "epoch": 1.1764705882352942,
66
- "eval_entropy": 2.478868062679584,
67
- "eval_loss": 0.23516832292079926,
68
- "eval_mean_token_accuracy": 0.8848339181679946,
69
- "eval_num_tokens": 201046.0,
70
- "eval_runtime": 2.6624,
71
- "eval_samples_per_second": 76.246,
72
- "eval_steps_per_second": 4.883,
73
- "step": 60
74
- },
75
- {
76
- "entropy": 2.3844941794872283,
77
- "epoch": 1.5686274509803921,
78
- "grad_norm": 5.027565956115723,
79
- "learning_rate": 4.836601307189543e-06,
80
- "loss": 0.2427,
81
- "mean_token_accuracy": 0.8828125,
82
- "num_tokens": 269334.0,
83
- "step": 80
84
- },
85
- {
86
- "epoch": 1.5686274509803921,
87
- "eval_entropy": 2.4057154105259824,
88
- "eval_loss": 0.2731766998767853,
89
- "eval_mean_token_accuracy": 0.8730332163664011,
90
- "eval_num_tokens": 269334.0,
91
- "eval_runtime": 2.7181,
92
- "eval_samples_per_second": 74.685,
93
- "eval_steps_per_second": 4.783,
94
- "step": 80
95
- },
96
- {
97
- "entropy": 2.388584631681442,
98
- "epoch": 1.9607843137254903,
99
- "grad_norm": 4.1141252517700195,
100
- "learning_rate": 3.529411764705883e-06,
101
- "loss": 0.2321,
102
- "mean_token_accuracy": 0.88828125,
103
- "num_tokens": 337424.0,
104
- "step": 100
105
- },
106
- {
107
- "epoch": 1.9607843137254903,
108
- "eval_entropy": 2.409378546934861,
109
- "eval_loss": 0.223977193236351,
110
- "eval_mean_token_accuracy": 0.8955419567915109,
111
- "eval_num_tokens": 337424.0,
112
- "eval_runtime": 2.6976,
113
- "eval_samples_per_second": 75.252,
114
- "eval_steps_per_second": 4.819,
115
- "step": 100
116
- },
117
- {
118
- "entropy": 2.3580436170101167,
119
- "epoch": 2.3529411764705883,
120
- "grad_norm": 7.861292362213135,
121
- "learning_rate": 2.222222222222222e-06,
122
- "loss": 0.2025,
123
- "mean_token_accuracy": 0.9146205350756645,
124
- "num_tokens": 404955.0,
125
- "step": 120
126
- },
127
- {
128
- "epoch": 2.3529411764705883,
129
- "eval_entropy": 2.4057958676264835,
130
- "eval_loss": 0.24806056916713715,
131
- "eval_mean_token_accuracy": 0.8933566441902747,
132
- "eval_num_tokens": 404955.0,
133
- "eval_runtime": 2.6948,
134
- "eval_samples_per_second": 75.329,
135
- "eval_steps_per_second": 4.824,
136
- "step": 120
137
- },
138
- {
139
- "entropy": 2.3841957092285155,
140
- "epoch": 2.7450980392156863,
141
- "grad_norm": 10.065418243408203,
142
- "learning_rate": 9.150326797385621e-07,
143
- "loss": 0.1868,
144
- "mean_token_accuracy": 0.92421875,
145
- "num_tokens": 472246.0,
146
- "step": 140
147
- },
148
- {
149
- "epoch": 2.7450980392156863,
150
- "eval_entropy": 2.4176712219531717,
151
- "eval_loss": 0.21776749193668365,
152
  "eval_mean_token_accuracy": 0.8944493027833792,
153
- "eval_num_tokens": 472246.0,
154
- "eval_runtime": 2.7057,
155
- "eval_samples_per_second": 75.027,
156
- "eval_steps_per_second": 4.805,
157
- "step": 140
158
  }
159
  ],
160
  "logging_steps": 20,
@@ -174,7 +90,7 @@
174
  "attributes": {}
175
  }
176
  },
177
- "total_flos": 1.0222031863807488e+16,
178
  "train_batch_size": 16,
179
  "trial_name": null,
180
  "trial_params": null
 
1
  {
2
+ "best_global_step": 60,
3
+ "best_metric": 0.21963942050933838,
4
+ "best_model_checkpoint": "/content/models/gemma_jigsaw_lmh/checkpoint-60",
5
+ "epoch": 1.1764705882352942,
6
  "eval_steps": 20,
7
+ "global_step": 60,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 2.3684401620518076,
14
  "epoch": 0.39215686274509803,
15
+ "grad_norm": 5.825000286102295,
16
  "learning_rate": 8.758169934640524e-06,
17
+ "loss": 0.2165,
18
+ "mean_token_accuracy": 0.9179518394397966,
19
+ "num_tokens": 581990.0,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.39215686274509803,
24
+ "eval_entropy": 2.3978513204134426,
25
+ "eval_loss": 0.22738918662071228,
26
+ "eval_mean_token_accuracy": 0.892045456629533,
27
+ "eval_num_tokens": 581990.0,
28
+ "eval_runtime": 2.7348,
29
+ "eval_samples_per_second": 74.228,
30
+ "eval_steps_per_second": 4.754,
31
  "step": 20
32
  },
33
  {
34
+ "entropy": 2.366195046901703,
35
  "epoch": 0.7843137254901961,
36
+ "grad_norm": 15.021644592285156,
37
  "learning_rate": 7.450980392156863e-06,
38
+ "loss": 0.188,
39
+ "mean_token_accuracy": 0.92265625,
40
+ "num_tokens": 649386.0,
41
  "step": 40
42
  },
43
  {
44
  "epoch": 0.7843137254901961,
45
+ "eval_entropy": 2.3892184037428637,
46
+ "eval_loss": 0.28083646297454834,
47
+ "eval_mean_token_accuracy": 0.8824300720141485,
48
+ "eval_num_tokens": 649386.0,
49
+ "eval_runtime": 2.7229,
50
+ "eval_samples_per_second": 74.553,
51
+ "eval_steps_per_second": 4.774,
52
  "step": 40
53
  },
54
  {
55
+ "entropy": 2.4170044481754305,
56
  "epoch": 1.1764705882352942,
57
+ "grad_norm": 12.809877395629883,
58
  "learning_rate": 6.143790849673204e-06,
59
+ "loss": 0.182,
60
+ "mean_token_accuracy": 0.9083705350756646,
61
+ "num_tokens": 715624.0,
62
  "step": 60
63
  },
64
  {
65
  "epoch": 1.1764705882352942,
66
+ "eval_entropy": 2.436244304363544,
67
+ "eval_loss": 0.21963942050933838,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  "eval_mean_token_accuracy": 0.8944493027833792,
69
+ "eval_num_tokens": 715624.0,
70
+ "eval_runtime": 2.7838,
71
+ "eval_samples_per_second": 72.923,
72
+ "eval_steps_per_second": 4.67,
73
+ "step": 60
74
  }
75
  ],
76
  "logging_steps": 20,
 
90
  "attributes": {}
91
  }
92
  },
93
+ "total_flos": 4282364704969728.0,
94
  "train_batch_size": 16,
95
  "trial_name": null,
96
  "trial_params": null