Azrail committed on
Commit
a3fce1a
·
verified ·
1 Parent(s): 89647d4

Training in progress, step 63000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbbf7b607c85a5d696bff54af0adb9f239d76d76446306b0d75e85fb86338432
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0f52abb8596fb1c55e5609ec97ec3ea8479c701d5763f12612f03207baebfdc
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3561f0a9213e3ac9e43eff9c9d946a42b171ff83db0a3806965305d6e1bbe28a
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc42234c3f4bb7923a06f1e41810d1e801108c51e07feed1ea66a8af7c05bc5a
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b604bf86b8b70beb6e4043604c61f8577f1fbe75a9d1e20249b5622ec5aa2654
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a97095234a7b82e99cd1b23ba4db26c35942b8b4622876b166d0ce65b7c7110
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68dfbb60d9dcf18c45914087cca91dc6c214da7f11269c4a414921902f313d06
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ca0e8dbf69c9810c713183e067be8112924d576870302a9fb3c526f389826e7
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.41704293019743954,
6
  "eval_steps": 500,
7
- "global_step": 62000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11044,11 +11044,189 @@
11044
  "eval_steps_per_second": 23.717,
11045
  "num_input_tokens_seen": 16252928000,
11046
  "step": 62000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11047
  }
11048
  ],
11049
  "logging_steps": 50,
11050
  "max_steps": 70000,
11051
- "num_input_tokens_seen": 16252928000,
11052
  "num_train_epochs": 1,
11053
  "save_steps": 1000,
11054
  "stateful_callbacks": {
@@ -11063,7 +11241,7 @@
11063
  "attributes": {}
11064
  }
11065
  },
11066
- "total_flos": 4.34781686857728e+18,
11067
  "train_batch_size": 64,
11068
  "trial_name": null,
11069
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4237694290715918,
6
  "eval_steps": 500,
7
+ "global_step": 63000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11044
  "eval_steps_per_second": 23.717,
11045
  "num_input_tokens_seen": 16252928000,
11046
  "step": 62000
11047
+ },
11048
+ {
11049
+ "epoch": 0.41737925514114715,
11050
+ "grad_norm": 0.19505389034748077,
11051
+ "learning_rate": 0.0004950913403314252,
11052
+ "loss": 2.9995,
11053
+ "num_input_tokens_seen": 16266035200,
11054
+ "step": 62050
11055
+ },
11056
+ {
11057
+ "epoch": 0.41771558008485477,
11058
+ "grad_norm": 0.18988089263439178,
11059
+ "learning_rate": 0.0004901831537696859,
11060
+ "loss": 3.0041,
11061
+ "num_input_tokens_seen": 16279142400,
11062
+ "step": 62100
11063
+ },
11064
+ {
11065
+ "epoch": 0.4180519050285624,
11066
+ "grad_norm": 0.19544407725334167,
11067
+ "learning_rate": 0.0004852759133760184,
11068
+ "loss": 3.0073,
11069
+ "num_input_tokens_seen": 16292249600,
11070
+ "step": 62150
11071
+ },
11072
+ {
11073
+ "epoch": 0.41838822997227,
11074
+ "grad_norm": 0.1884351521730423,
11075
+ "learning_rate": 0.00048037009212046586,
11076
+ "loss": 3.0035,
11077
+ "num_input_tokens_seen": 16305356800,
11078
+ "step": 62200
11079
+ },
11080
+ {
11081
+ "epoch": 0.4187245549159776,
11082
+ "grad_norm": 0.17927390336990356,
11083
+ "learning_rate": 0.000475466162836291,
11084
+ "loss": 2.9921,
11085
+ "num_input_tokens_seen": 16318464000,
11086
+ "step": 62250
11087
+ },
11088
+ {
11089
+ "epoch": 0.4190608798596852,
11090
+ "grad_norm": 0.18687283992767334,
11091
+ "learning_rate": 0.00047056459817440544,
11092
+ "loss": 3.0042,
11093
+ "num_input_tokens_seen": 16331571200,
11094
+ "step": 62300
11095
+ },
11096
+ {
11097
+ "epoch": 0.4193972048033928,
11098
+ "grad_norm": 0.18783149123191833,
11099
+ "learning_rate": 0.00046566587055781316,
11100
+ "loss": 3.0003,
11101
+ "num_input_tokens_seen": 16344678400,
11102
+ "step": 62350
11103
+ },
11104
+ {
11105
+ "epoch": 0.41973352974710043,
11106
+ "grad_norm": 0.18625770509243011,
11107
+ "learning_rate": 0.0004607704521360776,
11108
+ "loss": 3.0061,
11109
+ "num_input_tokens_seen": 16357785600,
11110
+ "step": 62400
11111
+ },
11112
+ {
11113
+ "epoch": 0.4200698546908081,
11114
+ "grad_norm": 0.20189669728279114,
11115
+ "learning_rate": 0.00045587881473981533,
11116
+ "loss": 2.9976,
11117
+ "num_input_tokens_seen": 16370892800,
11118
+ "step": 62450
11119
+ },
11120
+ {
11121
+ "epoch": 0.4204061796345157,
11122
+ "grad_norm": 0.19049198925495148,
11123
+ "learning_rate": 0.0004509914298352197,
11124
+ "loss": 3.0055,
11125
+ "num_input_tokens_seen": 16384000000,
11126
+ "step": 62500
11127
+ },
11128
+ {
11129
+ "epoch": 0.4204061796345157,
11130
+ "eval_loss": 2.896798849105835,
11131
+ "eval_runtime": 52.8908,
11132
+ "eval_samples_per_second": 94.534,
11133
+ "eval_steps_per_second": 23.634,
11134
+ "num_input_tokens_seen": 16384000000,
11135
+ "step": 62500
11136
+ },
11137
+ {
11138
+ "epoch": 0.4207425045782233,
11139
+ "grad_norm": 0.1667575091123581,
11140
+ "learning_rate": 0.00044610876847862033,
11141
+ "loss": 2.9929,
11142
+ "num_input_tokens_seen": 16397107200,
11143
+ "step": 62550
11144
+ },
11145
+ {
11146
+ "epoch": 0.42107882952193093,
11147
+ "grad_norm": 0.7176526188850403,
11148
+ "learning_rate": 0.00044123130127108126,
11149
+ "loss": 2.9918,
11150
+ "num_input_tokens_seen": 16410214400,
11151
+ "step": 62600
11152
+ },
11153
+ {
11154
+ "epoch": 0.42141515446563854,
11155
+ "grad_norm": 0.20578069984912872,
11156
+ "learning_rate": 0.00043635949831304343,
11157
+ "loss": 3.0037,
11158
+ "num_input_tokens_seen": 16423321600,
11159
+ "step": 62650
11160
+ },
11161
+ {
11162
+ "epoch": 0.42175147940934615,
11163
+ "grad_norm": 0.19712655246257782,
11164
+ "learning_rate": 0.0004314938291590161,
11165
+ "loss": 3.0142,
11166
+ "num_input_tokens_seen": 16436428800,
11167
+ "step": 62700
11168
+ },
11169
+ {
11170
+ "epoch": 0.42208780435305376,
11171
+ "grad_norm": 0.20189446210861206,
11172
+ "learning_rate": 0.00042663476277231917,
11173
+ "loss": 2.9983,
11174
+ "num_input_tokens_seen": 16449536000,
11175
+ "step": 62750
11176
+ },
11177
+ {
11178
+ "epoch": 0.4224241292967614,
11179
+ "grad_norm": 0.18463867902755737,
11180
+ "learning_rate": 0.0004217827674798845,
11181
+ "loss": 2.9971,
11182
+ "num_input_tokens_seen": 16462643200,
11183
+ "step": 62800
11184
+ },
11185
+ {
11186
+ "epoch": 0.422760454240469,
11187
+ "grad_norm": 0.17639389634132385,
11188
+ "learning_rate": 0.0004169383109271174,
11189
+ "loss": 3.0032,
11190
+ "num_input_tokens_seen": 16475750400,
11191
+ "step": 62850
11192
+ },
11193
+ {
11194
+ "epoch": 0.4230967791841766,
11195
+ "grad_norm": 0.1733781099319458,
11196
+ "learning_rate": 0.00041210186003282274,
11197
+ "loss": 2.9932,
11198
+ "num_input_tokens_seen": 16488857600,
11199
+ "step": 62900
11200
+ },
11201
+ {
11202
+ "epoch": 0.4234331041278842,
11203
+ "grad_norm": 0.17753124237060547,
11204
+ "learning_rate": 0.00040727388094420456,
11205
+ "loss": 3.0012,
11206
+ "num_input_tokens_seen": 16501964800,
11207
+ "step": 62950
11208
+ },
11209
+ {
11210
+ "epoch": 0.4237694290715918,
11211
+ "grad_norm": 0.180925652384758,
11212
+ "learning_rate": 0.00040245483899193594,
11213
+ "loss": 2.9823,
11214
+ "num_input_tokens_seen": 16515072000,
11215
+ "step": 63000
11216
+ },
11217
+ {
11218
+ "epoch": 0.4237694290715918,
11219
+ "eval_loss": 2.8929545879364014,
11220
+ "eval_runtime": 53.37,
11221
+ "eval_samples_per_second": 93.686,
11222
+ "eval_steps_per_second": 23.421,
11223
+ "num_input_tokens_seen": 16515072000,
11224
+ "step": 63000
11225
  }
11226
  ],
11227
  "logging_steps": 50,
11228
  "max_steps": 70000,
11229
+ "num_input_tokens_seen": 16515072000,
11230
  "num_train_epochs": 1,
11231
  "save_steps": 1000,
11232
  "stateful_callbacks": {
 
11241
  "attributes": {}
11242
  }
11243
  },
11244
+ "total_flos": 4.41794294710272e+18,
11245
  "train_batch_size": 64,
11246
  "trial_name": null,
11247
  "trial_params": null