Azrail commited on
Commit
e1d38bf
·
verified ·
1 Parent(s): 7622fe0

Training in progress, step 136000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:079464073c9724ceb804666b522429a90a4928e290e5da217f3ad8b9d68b8886
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fe644242ac85364957a221ecb3fda251252bbb21f78dcf32d44ddb45cee4b8c
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6fd285cfac8e5c0f6d1266cf8e23ce20a797130dac2828587dcc5345232fa441
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a0bb2637b2d27c703e80119c30822f6cacfac9cba885cfe1635772ce684b387
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:874cf93e738f75197422ec1e62b162ef1e398b581422e23932b758446980a6af
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c2ffcf5f582912b4a7016b15e29048dddaa402730efcd133059a2e08945301c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e1e7a01b81e1907abf43be3318a5c567fc57f95dbaef634f44d30b341186326
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff50fa4a38896a05eab7dc1bfd456c8019098d112a942a25a411381c6596e51c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2879091787209178,
6
  "eval_steps": 500,
7
- "global_step": 135000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -24038,11 +24038,189 @@
24038
  "eval_steps_per_second": 15.14,
24039
  "num_input_tokens_seen": 70767457344,
24040
  "step": 135000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24041
  }
24042
  ],
24043
  "logging_steps": 50,
24044
  "max_steps": 140000,
24045
- "num_input_tokens_seen": 70767457344,
24046
  "num_train_epochs": 2,
24047
  "save_steps": 1000,
24048
  "stateful_callbacks": {
@@ -24057,7 +24235,7 @@
24057
  "attributes": {}
24058
  }
24059
  },
24060
- "total_flos": 1.2524549151466045e+20,
24061
  "train_batch_size": 32,
24062
  "trial_name": null,
24063
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.297449229044683,
6
  "eval_steps": 500,
7
+ "global_step": 136000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
24038
  "eval_steps_per_second": 15.14,
24039
  "num_input_tokens_seen": 70767457344,
24040
  "step": 135000
24041
+ },
24042
+ {
24043
+ "epoch": 1.288386181237106,
24044
+ "grad_norm": 0.1237749382853508,
24045
+ "learning_rate": 7.515222372735647e-05,
24046
+ "loss": 2.029,
24047
+ "num_input_tokens_seen": 70793671744,
24048
+ "step": 135050
24049
+ },
24050
+ {
24051
+ "epoch": 1.2888631837532942,
24052
+ "grad_norm": 0.11638092249631882,
24053
+ "learning_rate": 7.367991782295391e-05,
24054
+ "loss": 2.0171,
24055
+ "num_input_tokens_seen": 70819879168,
24056
+ "step": 135100
24057
+ },
24058
+ {
24059
+ "epoch": 1.2893401862694827,
24060
+ "grad_norm": 0.11938998103141785,
24061
+ "learning_rate": 7.222102900887101e-05,
24062
+ "loss": 2.0232,
24063
+ "num_input_tokens_seen": 70846079616,
24064
+ "step": 135150
24065
+ },
24066
+ {
24067
+ "epoch": 1.2898171887856709,
24068
+ "grad_norm": 0.11985292285680771,
24069
+ "learning_rate": 7.077560319906695e-05,
24070
+ "loss": 2.0387,
24071
+ "num_input_tokens_seen": 70872294016,
24072
+ "step": 135200
24073
+ },
24074
+ {
24075
+ "epoch": 1.290294191301859,
24076
+ "grad_norm": 0.12651756405830383,
24077
+ "learning_rate": 6.934368588379552e-05,
24078
+ "loss": 2.0345,
24079
+ "num_input_tokens_seen": 70898498624,
24080
+ "step": 135250
24081
+ },
24082
+ {
24083
+ "epoch": 1.2907711938180473,
24084
+ "grad_norm": 0.12012086063623428,
24085
+ "learning_rate": 6.792532212817271e-05,
24086
+ "loss": 2.0362,
24087
+ "num_input_tokens_seen": 70924710048,
24088
+ "step": 135300
24089
+ },
24090
+ {
24091
+ "epoch": 1.2912481963342357,
24092
+ "grad_norm": 0.12295469641685486,
24093
+ "learning_rate": 6.652055657075845e-05,
24094
+ "loss": 2.0338,
24095
+ "num_input_tokens_seen": 70950915200,
24096
+ "step": 135350
24097
+ },
24098
+ {
24099
+ "epoch": 1.291725198850424,
24100
+ "grad_norm": 0.12192966043949127,
24101
+ "learning_rate": 6.512943342215233e-05,
24102
+ "loss": 2.0311,
24103
+ "num_input_tokens_seen": 70977118208,
24104
+ "step": 135400
24105
+ },
24106
+ {
24107
+ "epoch": 1.2922022013666123,
24108
+ "grad_norm": 0.1188386008143425,
24109
+ "learning_rate": 6.375199646360142e-05,
24110
+ "loss": 2.0311,
24111
+ "num_input_tokens_seen": 71003331520,
24112
+ "step": 135450
24113
+ },
24114
+ {
24115
+ "epoch": 1.2926792038828006,
24116
+ "grad_norm": 0.11646123230457306,
24117
+ "learning_rate": 6.238828904562316e-05,
24118
+ "loss": 2.037,
24119
+ "num_input_tokens_seen": 71029545920,
24120
+ "step": 135500
24121
+ },
24122
+ {
24123
+ "epoch": 1.2926792038828006,
24124
+ "eval_loss": 1.9530843496322632,
24125
+ "eval_runtime": 82.2362,
24126
+ "eval_samples_per_second": 60.8,
24127
+ "eval_steps_per_second": 15.2,
24128
+ "num_input_tokens_seen": 71029545920,
24129
+ "step": 135500
24130
+ },
24131
+ {
24132
+ "epoch": 1.2931562063989888,
24133
+ "grad_norm": 0.12359626591205597,
24134
+ "learning_rate": 6.103835408664032e-05,
24135
+ "loss": 2.0441,
24136
+ "num_input_tokens_seen": 71055753312,
24137
+ "step": 135550
24138
+ },
24139
+ {
24140
+ "epoch": 1.293633208915177,
24141
+ "grad_norm": 0.12097882479429245,
24142
+ "learning_rate": 5.9702234071631e-05,
24143
+ "loss": 2.0251,
24144
+ "num_input_tokens_seen": 71081964480,
24145
+ "step": 135600
24146
+ },
24147
+ {
24148
+ "epoch": 1.2941102114313652,
24149
+ "grad_norm": 0.11585067212581635,
24150
+ "learning_rate": 5.83799710507909e-05,
24151
+ "loss": 2.0352,
24152
+ "num_input_tokens_seen": 71108163424,
24153
+ "step": 135650
24154
+ },
24155
+ {
24156
+ "epoch": 1.2945872139475536,
24157
+ "grad_norm": 0.12164249271154404,
24158
+ "learning_rate": 5.7071606638210094e-05,
24159
+ "loss": 2.0314,
24160
+ "num_input_tokens_seen": 71134375424,
24161
+ "step": 135700
24162
+ },
24163
+ {
24164
+ "epoch": 1.2950642164637418,
24165
+ "grad_norm": 0.11601755023002625,
24166
+ "learning_rate": 5.577718201056392e-05,
24167
+ "loss": 2.0313,
24168
+ "num_input_tokens_seen": 71160582688,
24169
+ "step": 135750
24170
+ },
24171
+ {
24172
+ "epoch": 1.2955412189799302,
24173
+ "grad_norm": 0.11863810569047928,
24174
+ "learning_rate": 5.449673790581611e-05,
24175
+ "loss": 2.036,
24176
+ "num_input_tokens_seen": 71186792800,
24177
+ "step": 135800
24178
+ },
24179
+ {
24180
+ "epoch": 1.2960182214961184,
24181
+ "grad_norm": 0.12455905973911285,
24182
+ "learning_rate": 5.3230314621937556e-05,
24183
+ "loss": 2.0316,
24184
+ "num_input_tokens_seen": 71213000416,
24185
+ "step": 135850
24186
+ },
24187
+ {
24188
+ "epoch": 1.2964952240123067,
24189
+ "grad_norm": 0.11861378699541092,
24190
+ "learning_rate": 5.197795201563743e-05,
24191
+ "loss": 2.0334,
24192
+ "num_input_tokens_seen": 71239212224,
24193
+ "step": 135900
24194
+ },
24195
+ {
24196
+ "epoch": 1.2969722265284949,
24197
+ "grad_norm": 0.11894825845956802,
24198
+ "learning_rate": 5.073968950110941e-05,
24199
+ "loss": 2.028,
24200
+ "num_input_tokens_seen": 71265425728,
24201
+ "step": 135950
24202
+ },
24203
+ {
24204
+ "epoch": 1.297449229044683,
24205
+ "grad_norm": 0.11746333539485931,
24206
+ "learning_rate": 4.9515566048790485e-05,
24207
+ "loss": 2.0302,
24208
+ "num_input_tokens_seen": 71291638272,
24209
+ "step": 136000
24210
+ },
24211
+ {
24212
+ "epoch": 1.297449229044683,
24213
+ "eval_loss": 1.9527229070663452,
24214
+ "eval_runtime": 82.9319,
24215
+ "eval_samples_per_second": 60.29,
24216
+ "eval_steps_per_second": 15.073,
24217
+ "num_input_tokens_seen": 71291638272,
24218
+ "step": 136000
24219
  }
24220
  ],
24221
  "logging_steps": 50,
24222
  "max_steps": 140000,
24223
+ "num_input_tokens_seen": 71291638272,
24224
  "num_train_epochs": 2,
24225
  "save_steps": 1000,
24226
  "stateful_callbacks": {
 
24235
  "attributes": {}
24236
  }
24237
  },
24238
+ "total_flos": 1.2617319614661919e+20,
24239
  "train_batch_size": 32,
24240
  "trial_name": null,
24241
  "trial_params": null