Hugging-GK commited on
Commit
46513cf
·
verified ·
1 Parent(s): 2d1ea20

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -25,13 +25,13 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "up_proj",
 
29
  "v_proj",
 
30
  "o_proj",
31
- "down_proj",
32
  "gate_proj",
33
- "q_proj",
34
- "k_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
+ "down_proj",
29
+ "k_proj",
30
  "v_proj",
31
+ "q_proj",
32
  "o_proj",
 
33
  "gate_proj",
34
+ "up_proj"
 
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e325566f131c65e9ade91ef4d6b36bf75f4d6d3f099460bf3dccbd2564d075c3
3
  size 664584480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d8d37d3bd75fc01d14a55529984572995053bef53b73a30d53f4b080d82a0f2
3
  size 664584480
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bb20f5cc62e14285d3b8a72eaca1ae71faf3b293ee954757b7f91023a0fca6d9
3
  size 1329377575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:622750c88c69e639da993c5b73cd1aa1da12f5e11cd900058b9e9a499b2fffdf
3
  size 1329377575
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1599b40990af505c591b8a948a922af153eca9aff68fc7776ee92dc8b2dd2b0
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d85b518b1add175fef95a2cd66cf2e301b338be446b49c058da16fb22164c09d
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8db2bbbb37be651c165fdb7fb553d8117ad58dbacc21093c0131c68d882ce6e4
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f36b49862996c84ec71df64aef02c8b667bafc5196ea03b019509144d50963b4
3
  size 1465
trainer_state.json CHANGED
@@ -1,76 +1,160 @@
1
  {
2
- "best_global_step": 60,
3
- "best_metric": 0.21963942050933838,
4
- "best_model_checkpoint": "/content/models/gemma_jigsaw_lmh/checkpoint-60",
5
- "epoch": 1.1764705882352942,
6
  "eval_steps": 20,
7
- "global_step": 60,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 2.3684401620518076,
14
  "epoch": 0.39215686274509803,
15
- "grad_norm": 5.825000286102295,
16
  "learning_rate": 8.758169934640524e-06,
17
- "loss": 0.2165,
18
- "mean_token_accuracy": 0.9179518394397966,
19
- "num_tokens": 581990.0,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.39215686274509803,
24
- "eval_entropy": 2.3978513204134426,
25
- "eval_loss": 0.22738918662071228,
26
- "eval_mean_token_accuracy": 0.892045456629533,
27
- "eval_num_tokens": 581990.0,
28
- "eval_runtime": 2.7348,
29
- "eval_samples_per_second": 74.228,
30
- "eval_steps_per_second": 4.754,
31
  "step": 20
32
  },
33
  {
34
- "entropy": 2.366195046901703,
35
  "epoch": 0.7843137254901961,
36
- "grad_norm": 15.021644592285156,
37
  "learning_rate": 7.450980392156863e-06,
38
- "loss": 0.188,
39
- "mean_token_accuracy": 0.92265625,
40
- "num_tokens": 649386.0,
41
  "step": 40
42
  },
43
  {
44
  "epoch": 0.7843137254901961,
45
- "eval_entropy": 2.3892184037428637,
46
- "eval_loss": 0.28083646297454834,
47
- "eval_mean_token_accuracy": 0.8824300720141485,
48
- "eval_num_tokens": 649386.0,
49
- "eval_runtime": 2.7229,
50
- "eval_samples_per_second": 74.553,
51
- "eval_steps_per_second": 4.774,
52
  "step": 40
53
  },
54
  {
55
- "entropy": 2.4170044481754305,
56
  "epoch": 1.1764705882352942,
57
- "grad_norm": 12.809877395629883,
58
  "learning_rate": 6.143790849673204e-06,
59
- "loss": 0.182,
60
- "mean_token_accuracy": 0.9083705350756646,
61
- "num_tokens": 715624.0,
62
  "step": 60
63
  },
64
  {
65
  "epoch": 1.1764705882352942,
66
- "eval_entropy": 2.436244304363544,
67
- "eval_loss": 0.21963942050933838,
68
- "eval_mean_token_accuracy": 0.8944493027833792,
69
- "eval_num_tokens": 715624.0,
70
- "eval_runtime": 2.7838,
71
- "eval_samples_per_second": 72.923,
72
- "eval_steps_per_second": 4.67,
73
  "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  }
75
  ],
76
  "logging_steps": 20,
@@ -90,7 +174,7 @@
90
  "attributes": {}
91
  }
92
  },
93
- "total_flos": 4282364704969728.0,
94
  "train_batch_size": 16,
95
  "trial_name": null,
96
  "trial_params": null
 
1
  {
2
+ "best_global_step": 140,
3
+ "best_metric": 0.22836367785930634,
4
+ "best_model_checkpoint": "/content/models/gemma_jigsaw_lmh/checkpoint-140",
5
+ "epoch": 2.7450980392156863,
6
  "eval_steps": 20,
7
+ "global_step": 140,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 2.560735213756561,
14
  "epoch": 0.39215686274509803,
15
+ "grad_norm": 6.186038494110107,
16
  "learning_rate": 8.758169934640524e-06,
17
+ "loss": 0.5541,
18
+ "mean_token_accuracy": 0.7453125,
19
+ "num_tokens": 67412.0,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.39215686274509803,
24
+ "eval_entropy": 2.491304580981915,
25
+ "eval_loss": 0.3785918951034546,
26
+ "eval_mean_token_accuracy": 0.7578671345343957,
27
+ "eval_num_tokens": 67412.0,
28
+ "eval_runtime": 2.6932,
29
+ "eval_samples_per_second": 75.376,
30
+ "eval_steps_per_second": 4.827,
31
  "step": 20
32
  },
33
  {
34
+ "entropy": 2.419952464103699,
35
  "epoch": 0.7843137254901961,
36
+ "grad_norm": 2.4226012229919434,
37
  "learning_rate": 7.450980392156863e-06,
38
+ "loss": 0.3293,
39
+ "mean_token_accuracy": 0.81640625,
40
+ "num_tokens": 134808.0,
41
  "step": 40
42
  },
43
  {
44
  "epoch": 0.7843137254901961,
45
+ "eval_entropy": 2.4435020043299747,
46
+ "eval_loss": 0.2946617007255554,
47
+ "eval_mean_token_accuracy": 0.8513986009817857,
48
+ "eval_num_tokens": 134808.0,
49
+ "eval_runtime": 2.6642,
50
+ "eval_samples_per_second": 76.195,
51
+ "eval_steps_per_second": 4.879,
52
  "step": 40
53
  },
54
  {
55
+ "entropy": 2.423317462205887,
56
  "epoch": 1.1764705882352942,
57
+ "grad_norm": 10.875091552734375,
58
  "learning_rate": 6.143790849673204e-06,
59
+ "loss": 0.2871,
60
+ "mean_token_accuracy": 0.8487723216414451,
61
+ "num_tokens": 201046.0,
62
  "step": 60
63
  },
64
  {
65
  "epoch": 1.1764705882352942,
66
+ "eval_entropy": 2.448130937723013,
67
+ "eval_loss": 0.2594100534915924,
68
+ "eval_mean_token_accuracy": 0.8621066441902747,
69
+ "eval_num_tokens": 201046.0,
70
+ "eval_runtime": 2.6633,
71
+ "eval_samples_per_second": 76.221,
72
+ "eval_steps_per_second": 4.881,
73
  "step": 60
74
+ },
75
+ {
76
+ "entropy": 2.3511528968811035,
77
+ "epoch": 1.5686274509803921,
78
+ "grad_norm": 5.017323970794678,
79
+ "learning_rate": 4.836601307189543e-06,
80
+ "loss": 0.2503,
81
+ "mean_token_accuracy": 0.87109375,
82
+ "num_tokens": 269334.0,
83
+ "step": 80
84
+ },
85
+ {
86
+ "epoch": 1.5686274509803921,
87
+ "eval_entropy": 2.3352334682758036,
88
+ "eval_loss": 0.2594275176525116,
89
+ "eval_mean_token_accuracy": 0.8824300720141485,
90
+ "eval_num_tokens": 269334.0,
91
+ "eval_runtime": 2.6613,
92
+ "eval_samples_per_second": 76.278,
93
+ "eval_steps_per_second": 4.885,
94
+ "step": 80
95
+ },
96
+ {
97
+ "entropy": 2.3079891920089723,
98
+ "epoch": 1.9607843137254903,
99
+ "grad_norm": 7.2991743087768555,
100
+ "learning_rate": 3.529411764705883e-06,
101
+ "loss": 0.247,
102
+ "mean_token_accuracy": 0.87890625,
103
+ "num_tokens": 337424.0,
104
+ "step": 100
105
+ },
106
+ {
107
+ "epoch": 1.9607843137254903,
108
+ "eval_entropy": 2.3540270145122824,
109
+ "eval_loss": 0.23124322295188904,
110
+ "eval_mean_token_accuracy": 0.8861451057287363,
111
+ "eval_num_tokens": 337424.0,
112
+ "eval_runtime": 2.7035,
113
+ "eval_samples_per_second": 75.087,
114
+ "eval_steps_per_second": 4.809,
115
+ "step": 100
116
+ },
117
+ {
118
+ "entropy": 2.3495707869529725,
119
+ "epoch": 2.3529411764705883,
120
+ "grad_norm": 5.988176345825195,
121
+ "learning_rate": 2.222222222222222e-06,
122
+ "loss": 0.2009,
123
+ "mean_token_accuracy": 0.9171875,
124
+ "num_tokens": 404955.0,
125
+ "step": 120
126
+ },
127
+ {
128
+ "epoch": 2.3529411764705883,
129
+ "eval_entropy": 2.4085900966937723,
130
+ "eval_loss": 0.23444519937038422,
131
+ "eval_mean_token_accuracy": 0.8957604903441209,
132
+ "eval_num_tokens": 404955.0,
133
+ "eval_runtime": 2.6365,
134
+ "eval_samples_per_second": 76.995,
135
+ "eval_steps_per_second": 4.931,
136
+ "step": 120
137
+ },
138
+ {
139
+ "entropy": 2.3811947822570803,
140
+ "epoch": 2.7450980392156863,
141
+ "grad_norm": 9.767471313476562,
142
+ "learning_rate": 9.150326797385621e-07,
143
+ "loss": 0.1973,
144
+ "mean_token_accuracy": 0.91328125,
145
+ "num_tokens": 472246.0,
146
+ "step": 140
147
+ },
148
+ {
149
+ "epoch": 2.7450980392156863,
150
+ "eval_entropy": 2.4008009983943057,
151
+ "eval_loss": 0.22836367785930634,
152
+ "eval_mean_token_accuracy": 0.9040646873987638,
153
+ "eval_num_tokens": 472246.0,
154
+ "eval_runtime": 2.645,
155
+ "eval_samples_per_second": 76.748,
156
+ "eval_steps_per_second": 4.915,
157
+ "step": 140
158
  }
159
  ],
160
  "logging_steps": 20,
 
174
  "attributes": {}
175
  }
176
  },
177
+ "total_flos": 1.0222031863807488e+16,
178
  "train_batch_size": 16,
179
  "trial_name": null,
180
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6666f3ffe597916194c55d1e8f4a39a253059f0511383c7a71cd741dc4d25435
3
  size 6353
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba3e30758f814af90a14f019df18e1d2888c00c0bbdb4d0137148582f3a9dada
3
  size 6353