Azrail commited on
Commit
bf603f5
·
verified ·
1 Parent(s): afb6ff9

Training in progress, step 63000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c05cde8285dd52085342b46430f4e5412103d775ef2ecb3ff92fe973f05563a
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a92901e6dc98a2f43e5ab06e2e35886c7f4c68e401e8be0d01acd281cd82349c
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13bd3612e3785a0d69245374e9d503a45ff63d121c602ad8a1a69ce58b21ee6f
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34104db96694bb116cf3048bcf68919612a7c6c79ff646c13c6e8d5a81aff8f6
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b604bf86b8b70beb6e4043604c61f8577f1fbe75a9d1e20249b5622ec5aa2654
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a97095234a7b82e99cd1b23ba4db26c35942b8b4622876b166d0ce65b7c7110
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b86dd42ce2bfa419ab9d950fa2e032bc9074c23516cf132dad718a38dfd9a2d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d1de5c681ac3c8b6bb5235a71c5b6efd72fc9171aa2c9c6e093b8695c8a08b8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2957415600367292,
6
  "eval_steps": 500,
7
- "global_step": 62000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11044,11 +11044,189 @@
11044
  "eval_steps_per_second": 23.306,
11045
  "num_input_tokens_seen": 16252923456,
11046
  "step": 62000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11047
  }
11048
  ],
11049
  "logging_steps": 50,
11050
  "max_steps": 70000,
11051
- "num_input_tokens_seen": 16252923456,
11052
  "num_train_epochs": 1,
11053
  "save_steps": 1000,
11054
  "stateful_callbacks": {
@@ -11063,7 +11241,7 @@
11063
  "attributes": {}
11064
  }
11065
  },
11066
- "total_flos": 4.3478156530129306e+18,
11067
  "train_batch_size": 64,
11068
  "trial_name": null,
11069
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.30051158519861193,
6
  "eval_steps": 500,
7
+ "global_step": 63000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11044
  "eval_steps_per_second": 23.306,
11045
  "num_input_tokens_seen": 16252923456,
11046
  "step": 62000
11047
+ },
11048
+ {
11049
+ "epoch": 0.29598006129482335,
11050
+ "grad_norm": 0.1945638656616211,
11051
+ "learning_rate": 0.0006057842458386314,
11052
+ "loss": 2.5582,
11053
+ "num_input_tokens_seen": 16266030656,
11054
+ "step": 62050
11055
+ },
11056
+ {
11057
+ "epoch": 0.29621856255291745,
11058
+ "grad_norm": 0.201882466673851,
11059
+ "learning_rate": 0.0006002947078916364,
11060
+ "loss": 2.5764,
11061
+ "num_input_tokens_seen": 16279137856,
11062
+ "step": 62100
11063
+ },
11064
+ {
11065
+ "epoch": 0.2964570638110116,
11066
+ "grad_norm": 0.2137998789548874,
11067
+ "learning_rate": 0.0005947925441958392,
11068
+ "loss": 2.5689,
11069
+ "num_input_tokens_seen": 16292245056,
11070
+ "step": 62150
11071
+ },
11072
+ {
11073
+ "epoch": 0.2966955650691057,
11074
+ "grad_norm": 0.18265672028064728,
11075
+ "learning_rate": 0.0005892784473993184,
11076
+ "loss": 2.5741,
11077
+ "num_input_tokens_seen": 16305352256,
11078
+ "step": 62200
11079
+ },
11080
+ {
11081
+ "epoch": 0.2969340663271999,
11082
+ "grad_norm": 0.16944251954555511,
11083
+ "learning_rate": 0.0005837531116523682,
11084
+ "loss": 2.5537,
11085
+ "num_input_tokens_seen": 16318459456,
11086
+ "step": 62250
11087
+ },
11088
+ {
11089
+ "epoch": 0.29717256758529403,
11090
+ "grad_norm": 0.20273485779762268,
11091
+ "learning_rate": 0.0005782172325201155,
11092
+ "loss": 2.5512,
11093
+ "num_input_tokens_seen": 16331566656,
11094
+ "step": 62300
11095
+ },
11096
+ {
11097
+ "epoch": 0.29741106884338814,
11098
+ "grad_norm": 0.19320476055145264,
11099
+ "learning_rate": 0.0005726715068949564,
11100
+ "loss": 2.5823,
11101
+ "num_input_tokens_seen": 16344673856,
11102
+ "step": 62350
11103
+ },
11104
+ {
11105
+ "epoch": 0.2976495701014823,
11106
+ "grad_norm": 0.21321871876716614,
11107
+ "learning_rate": 0.0005671166329088278,
11108
+ "loss": 2.5608,
11109
+ "num_input_tokens_seen": 16357781056,
11110
+ "step": 62400
11111
+ },
11112
+ {
11113
+ "epoch": 0.2978880713595764,
11114
+ "grad_norm": 0.2007117122411728,
11115
+ "learning_rate": 0.0005615533098453215,
11116
+ "loss": 2.5685,
11117
+ "num_input_tokens_seen": 16370888256,
11118
+ "step": 62450
11119
+ },
11120
+ {
11121
+ "epoch": 0.29812657261767056,
11122
+ "grad_norm": 0.1896267682313919,
11123
+ "learning_rate": 0.0005559822380516539,
11124
+ "loss": 2.56,
11125
+ "num_input_tokens_seen": 16383995456,
11126
+ "step": 62500
11127
+ },
11128
+ {
11129
+ "epoch": 0.29812657261767056,
11130
+ "eval_loss": 2.448042154312134,
11131
+ "eval_runtime": 54.1994,
11132
+ "eval_samples_per_second": 92.252,
11133
+ "eval_steps_per_second": 23.063,
11134
+ "num_input_tokens_seen": 16383995456,
11135
+ "step": 62500
11136
+ },
11137
+ {
11138
+ "epoch": 0.2983650738757647,
11139
+ "grad_norm": 0.18581034243106842,
11140
+ "learning_rate": 0.0005504041188505022,
11141
+ "loss": 2.5691,
11142
+ "num_input_tokens_seen": 16397102656,
11143
+ "step": 62550
11144
+ },
11145
+ {
11146
+ "epoch": 0.2986035751338588,
11147
+ "grad_norm": 0.19272533059120178,
11148
+ "learning_rate": 0.0005448196544517168,
11149
+ "loss": 2.5635,
11150
+ "num_input_tokens_seen": 16410209856,
11151
+ "step": 62600
11152
+ },
11153
+ {
11154
+ "epoch": 0.298842076391953,
11155
+ "grad_norm": 0.19940300285816193,
11156
+ "learning_rate": 0.0005392295478639225,
11157
+ "loss": 2.5755,
11158
+ "num_input_tokens_seen": 16423317056,
11159
+ "step": 62650
11160
+ },
11161
+ {
11162
+ "epoch": 0.2990805776500471,
11163
+ "grad_norm": 0.18894875049591064,
11164
+ "learning_rate": 0.0005336345028060199,
11165
+ "loss": 2.5718,
11166
+ "num_input_tokens_seen": 16436424256,
11167
+ "step": 62700
11168
+ },
11169
+ {
11170
+ "epoch": 0.29931907890814125,
11171
+ "grad_norm": 0.19226962327957153,
11172
+ "learning_rate": 0.0005280352236185959,
11173
+ "loss": 2.563,
11174
+ "num_input_tokens_seen": 16449531456,
11175
+ "step": 62750
11176
+ },
11177
+ {
11178
+ "epoch": 0.2995575801662354,
11179
+ "grad_norm": 0.20716702938079834,
11180
+ "learning_rate": 0.0005224324151752575,
11181
+ "loss": 2.5532,
11182
+ "num_input_tokens_seen": 16462638656,
11183
+ "step": 62800
11184
+ },
11185
+ {
11186
+ "epoch": 0.2997960814243295,
11187
+ "grad_norm": 0.20232325792312622,
11188
+ "learning_rate": 0.000516826782793897,
11189
+ "loss": 2.5691,
11190
+ "num_input_tokens_seen": 16475745856,
11191
+ "step": 62850
11192
+ },
11193
+ {
11194
+ "epoch": 0.30003458268242367,
11195
+ "grad_norm": 0.19828926026821136,
11196
+ "learning_rate": 0.0005112190321479025,
11197
+ "loss": 2.5602,
11198
+ "num_input_tokens_seen": 16488853056,
11199
+ "step": 62900
11200
+ },
11201
+ {
11202
+ "epoch": 0.30027308394051777,
11203
+ "grad_norm": 0.22366905212402344,
11204
+ "learning_rate": 0.000505609869177323,
11205
+ "loss": 2.5556,
11206
+ "num_input_tokens_seen": 16501960256,
11207
+ "step": 62950
11208
+ },
11209
+ {
11210
+ "epoch": 0.30051158519861193,
11211
+ "grad_norm": 0.1883884221315384,
11212
+ "learning_rate": 0.0005,
11213
+ "loss": 2.5567,
11214
+ "num_input_tokens_seen": 16515067456,
11215
+ "step": 63000
11216
+ },
11217
+ {
11218
+ "epoch": 0.30051158519861193,
11219
+ "eval_loss": 2.4441678524017334,
11220
+ "eval_runtime": 54.2448,
11221
+ "eval_samples_per_second": 92.175,
11222
+ "eval_steps_per_second": 23.044,
11223
+ "num_input_tokens_seen": 16515067456,
11224
+ "step": 63000
11225
  }
11226
  ],
11227
  "logging_steps": 50,
11228
  "max_steps": 70000,
11229
+ "num_input_tokens_seen": 16515067456,
11230
  "num_train_epochs": 1,
11231
  "save_steps": 1000,
11232
  "stateful_callbacks": {
 
11241
  "attributes": {}
11242
  }
11243
  },
11244
+ "total_flos": 4.4179417315383706e+18,
11245
  "train_batch_size": 64,
11246
  "trial_name": null,
11247
  "trial_params": null