maidacundo committed on
Commit
bb9b1e1
·
verified ·
1 Parent(s): 059eecc

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6cc1acb58f6552f17f47c9e79c248b5766ef129b9cc7004d0289960bcfa126ad
3
  size 68602152
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f80e3de78ec7f718a3637a451b022d00d5747c3921319eeca47a758a0e59be95
3
  size 68602152
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2812cf244d8d46428667984a8e6ab4a612874e817adebcbd079c4d0b057e17c
3
  size 137100235
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d57376255dcc20709e63e6fac8eb03cca00ae997ca828a45e21c3ad3d33673a5
3
  size 137100235
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcd679242a831d4484be1b031fb6a525641a1324383532f6e1c8bb5ac52e4ce7
3
  size 14455
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dc250d4299290adba53e444599baac02762c85fc6141cd19d386e734b288b94
3
  size 14455
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f931434598425c3d35e696dc1ef9a0302880efc75ad45f5e1fbe43cfc68f080
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77ced95f1c67747bc54b27b1781254392fe5947e21557e2368922a37a0b7cc5f
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -4,84 +4,161 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 10,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.1,
14
- "grad_norm": 3.904756546020508,
15
  "learning_rate": 0.0,
16
- "loss": 11.039416313171387,
17
  "step": 1
18
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  {
20
  "epoch": 0.2,
21
- "grad_norm": 2.952423572540283,
22
- "learning_rate": 0.00015,
23
- "loss": 11.029916763305664,
24
- "step": 2
 
 
 
 
 
 
 
25
  },
26
  {
27
  "epoch": 0.3,
28
- "grad_norm": 2.548621654510498,
29
- "learning_rate": 0.0003,
30
- "loss": 10.987221717834473,
31
- "step": 3
 
 
 
 
 
 
 
32
  },
33
  {
34
  "epoch": 0.4,
35
- "grad_norm": 2.724034309387207,
36
- "learning_rate": 0.000288581929876693,
37
- "loss": 10.806486129760742,
38
- "step": 4
 
 
 
 
 
 
 
39
  },
40
  {
41
  "epoch": 0.5,
42
- "grad_norm": 2.913846731185913,
43
- "learning_rate": 0.00025606601717798207,
44
- "loss": 10.382536888122559,
45
- "step": 5
 
 
 
 
 
 
 
46
  },
47
  {
48
  "epoch": 0.6,
49
- "grad_norm": 3.6747395992279053,
50
- "learning_rate": 0.00020740251485476345,
51
- "loss": 10.37621784210205,
52
- "step": 6
 
 
 
 
 
 
 
53
  },
54
  {
55
  "epoch": 0.7,
56
- "grad_norm": 2.790466785430908,
57
- "learning_rate": 0.00015,
58
- "loss": 10.06509780883789,
59
- "step": 7
 
 
 
 
 
 
 
60
  },
61
  {
62
  "epoch": 0.8,
63
- "grad_norm": 3.1298794746398926,
64
- "learning_rate": 9.259748514523653e-05,
65
- "loss": 10.32823371887207,
66
- "step": 8
 
 
 
 
 
 
 
67
  },
68
  {
69
  "epoch": 0.9,
70
- "grad_norm": 4.239963054656982,
71
- "learning_rate": 4.3933982822017876e-05,
72
- "loss": 9.877490997314453,
73
- "step": 9
 
 
 
 
 
 
 
74
  },
75
  {
76
  "epoch": 1.0,
77
- "grad_norm": 3.074774742126465,
78
- "learning_rate": 1.1418070123306989e-05,
79
- "loss": 9.85549545288086,
80
- "step": 10
81
  }
82
  ],
83
- "logging_steps": 1,
84
- "max_steps": 10,
85
  "num_input_tokens_seen": 0,
86
  "num_train_epochs": 9223372036854775807,
87
  "save_steps": 999999,
@@ -97,7 +174,7 @@
97
  "attributes": {}
98
  }
99
  },
100
- "total_flos": 131090350080.0,
101
  "train_batch_size": 2,
102
  "trial_name": null,
103
  "trial_params": null
 
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.005,
14
+ "grad_norm": 5.714346885681152,
15
  "learning_rate": 0.0,
16
+ "loss": 22.1131649017334,
17
  "step": 1
18
  },
19
+ {
20
+ "epoch": 0.05,
21
+ "grad_norm": 4.169634819030762,
22
+ "learning_rate": 0.000135,
23
+ "loss": 21.767859564887154,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.1,
28
+ "grad_norm": 5.887928009033203,
29
+ "learning_rate": 0.000285,
30
+ "loss": 19.7130615234375,
31
+ "step": 20
32
+ },
33
+ {
34
+ "epoch": 0.15,
35
+ "grad_norm": 3.950296401977539,
36
+ "learning_rate": 0.00029815325108927063,
37
+ "loss": 16.41785888671875,
38
+ "step": 30
39
+ },
40
  {
41
  "epoch": 0.2,
42
+ "grad_norm": 3.509552240371704,
43
+ "learning_rate": 0.00029182778633989753,
44
+ "loss": 13.775221252441407,
45
+ "step": 40
46
+ },
47
+ {
48
+ "epoch": 0.25,
49
+ "grad_norm": 4.330707550048828,
50
+ "learning_rate": 0.00028119295607090933,
51
+ "loss": 12.137359619140625,
52
+ "step": 50
53
  },
54
  {
55
  "epoch": 0.3,
56
+ "grad_norm": 5.043827533721924,
57
+ "learning_rate": 0.0002665718942185456,
58
+ "loss": 10.122330474853516,
59
+ "step": 60
60
+ },
61
+ {
62
+ "epoch": 0.35,
63
+ "grad_norm": 7.171847820281982,
64
+ "learning_rate": 0.0002484088543485761,
65
+ "loss": 9.294523620605469,
66
+ "step": 70
67
  },
68
  {
69
  "epoch": 0.4,
70
+ "grad_norm": 5.002739906311035,
71
+ "learning_rate": 0.00022725571123650813,
72
+ "loss": 8.580620574951173,
73
+ "step": 80
74
+ },
75
+ {
76
+ "epoch": 0.45,
77
+ "grad_norm": 7.2115864753723145,
78
+ "learning_rate": 0.000203755192431795,
79
+ "loss": 7.536891174316406,
80
+ "step": 90
81
  },
82
  {
83
  "epoch": 0.5,
84
+ "grad_norm": 6.534404277801514,
85
+ "learning_rate": 0.0001786213493064817,
86
+ "loss": 6.871263122558593,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 0.55,
91
+ "grad_norm": 7.4687371253967285,
92
+ "learning_rate": 0.00015261786096559254,
93
+ "loss": 6.103485488891602,
94
+ "step": 110
95
  },
96
  {
97
  "epoch": 0.6,
98
+ "grad_norm": 7.448652744293213,
99
+ "learning_rate": 0.00012653483024396533,
100
+ "loss": 5.706993484497071,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 0.65,
105
+ "grad_norm": 8.251703262329102,
106
+ "learning_rate": 0.00010116477683142652,
107
+ "loss": 5.297743225097657,
108
+ "step": 130
109
  },
110
  {
111
  "epoch": 0.7,
112
+ "grad_norm": 8.646885871887207,
113
+ "learning_rate": 7.727855696304944e-05,
114
+ "loss": 5.091459274291992,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 0.75,
119
+ "grad_norm": 6.968000888824463,
120
+ "learning_rate": 5.56019413425244e-05,
121
+ "loss": 5.0955852508544925,
122
+ "step": 150
123
  },
124
  {
125
  "epoch": 0.8,
126
+ "grad_norm": 7.26023530960083,
127
+ "learning_rate": 3.6793562966584196e-05,
128
+ "loss": 4.98658561706543,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 0.85,
133
+ "grad_norm": 6.96763801574707,
134
+ "learning_rate": 2.1424904894683165e-05,
135
+ "loss": 4.740906524658203,
136
+ "step": 170
137
  },
138
  {
139
  "epoch": 0.9,
140
+ "grad_norm": 5.774592399597168,
141
+ "learning_rate": 9.962936025419754e-06,
142
+ "loss": 5.162647247314453,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 0.95,
147
+ "grad_norm": 5.674193382263184,
148
+ "learning_rate": 2.7559224828504035e-06,
149
+ "loss": 4.910265731811523,
150
+ "step": 190
151
  },
152
  {
153
  "epoch": 1.0,
154
+ "grad_norm": 5.3918843269348145,
155
+ "learning_rate": 2.284572654130956e-08,
156
+ "loss": 4.620555114746094,
157
+ "step": 200
158
  }
159
  ],
160
+ "logging_steps": 10,
161
+ "max_steps": 200,
162
  "num_input_tokens_seen": 0,
163
  "num_train_epochs": 9223372036854775807,
164
  "save_steps": 999999,
 
174
  "attributes": {}
175
  }
176
  },
177
+ "total_flos": 5243614003200.0,
178
  "train_batch_size": 2,
179
  "trial_name": null,
180
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e7be4b80f34efa8c6ffa736724ae3152fa29c25aa950c50b57dfee4c09cee52
3
  size 5329
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:415469a51be6dee5831e245b12681d572265671f676d2c3884a4e71aefabe5d4
3
  size 5329