kiritan commited on
Commit
bcf3329
·
verified ·
1 Parent(s): 5fe7b73

Training in progress, step 8000, checkpoint

Browse files
last-checkpoint/global_step8000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0aa42ddad4af96c74b65d7385afeb6f0d5fb2d664599cd656cd82fbd70ad62da
3
+ size 5117197489
last-checkpoint/global_step8000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db5c9356593ff2b606f0316fc19dda6e153f3fa9391f9b150d5a17343e4eeeeb
3
+ size 859127933
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step7000
 
1
+ global_step8000
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:856733172381a37b6de12c25512bfa5cc33814241a1986b18ae46a3c6cd69ce1
3
  size 962205216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0823a45cd5f5f262d4113d9c6af3e480a93b1328d895090e11d3841575e98029
3
  size 962205216
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56a8ea7c15005d31ade663058f08a1d5a4619da6c77df5179c75f15bb9cc3f05
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2132517ec1780cf0e43d2d85e0457c9953dabc448540c499dce25e57e2b052
3
  size 14709
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34a83060f11df4fe46a27d45e8744a4c0e7bb60df156e5d496780133906eacd7
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cba89e97c806c2994342d3ee7fc823d23ef358301180bf2dcf6ac57f1ab3869c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 84.13012729844414,
3
  "best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-7000",
4
- "epoch": 7.709251101321586,
5
  "eval_steps": 1000,
6
- "global_step": 7000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2037,6 +2037,296 @@
2037
  "eval_steps_per_second": 1.957,
2038
  "eval_wer": 84.13012729844414,
2039
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2040
  }
2041
  ],
2042
  "logging_steps": 25,
@@ -2056,7 +2346,7 @@
2056
  "attributes": {}
2057
  }
2058
  },
2059
- "total_flos": 1.2027002305430684e+20,
2060
  "train_batch_size": 4,
2061
  "trial_name": null,
2062
  "trial_params": null
 
1
  {
2
  "best_metric": 84.13012729844414,
3
  "best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-7000",
4
+ "epoch": 8.810572687224669,
5
  "eval_steps": 1000,
6
+ "global_step": 8000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2037
  "eval_steps_per_second": 1.957,
2038
  "eval_wer": 84.13012729844414,
2039
  "step": 7000
2040
+ },
2041
+ {
2042
+ "epoch": 7.736784140969163,
2043
+ "grad_norm": 0.5741052627563477,
2044
+ "learning_rate": 1.3307692307692309e-05,
2045
+ "loss": 0.0909,
2046
+ "step": 7025
2047
+ },
2048
+ {
2049
+ "epoch": 7.76431718061674,
2050
+ "grad_norm": 0.7617988586425781,
2051
+ "learning_rate": 1.3282051282051282e-05,
2052
+ "loss": 0.0753,
2053
+ "step": 7050
2054
+ },
2055
+ {
2056
+ "epoch": 7.791850220264317,
2057
+ "grad_norm": 0.5416741967201233,
2058
+ "learning_rate": 1.3256410256410258e-05,
2059
+ "loss": 0.0731,
2060
+ "step": 7075
2061
+ },
2062
+ {
2063
+ "epoch": 7.819383259911894,
2064
+ "grad_norm": 0.6349952816963196,
2065
+ "learning_rate": 1.3230769230769231e-05,
2066
+ "loss": 0.0703,
2067
+ "step": 7100
2068
+ },
2069
+ {
2070
+ "epoch": 7.846916299559472,
2071
+ "grad_norm": 0.33196786046028137,
2072
+ "learning_rate": 1.3205128205128207e-05,
2073
+ "loss": 0.0656,
2074
+ "step": 7125
2075
+ },
2076
+ {
2077
+ "epoch": 7.8744493392070485,
2078
+ "grad_norm": 0.379213809967041,
2079
+ "learning_rate": 1.317948717948718e-05,
2080
+ "loss": 0.0917,
2081
+ "step": 7150
2082
+ },
2083
+ {
2084
+ "epoch": 7.901982378854625,
2085
+ "grad_norm": 0.34401291608810425,
2086
+ "learning_rate": 1.3153846153846156e-05,
2087
+ "loss": 0.0911,
2088
+ "step": 7175
2089
+ },
2090
+ {
2091
+ "epoch": 7.929515418502203,
2092
+ "grad_norm": 0.4732189178466797,
2093
+ "learning_rate": 1.312820512820513e-05,
2094
+ "loss": 0.101,
2095
+ "step": 7200
2096
+ },
2097
+ {
2098
+ "epoch": 7.95704845814978,
2099
+ "grad_norm": 0.5580617785453796,
2100
+ "learning_rate": 1.3102564102564103e-05,
2101
+ "loss": 0.0898,
2102
+ "step": 7225
2103
+ },
2104
+ {
2105
+ "epoch": 7.984581497797357,
2106
+ "grad_norm": 0.42180871963500977,
2107
+ "learning_rate": 1.3076923076923078e-05,
2108
+ "loss": 0.086,
2109
+ "step": 7250
2110
+ },
2111
+ {
2112
+ "epoch": 8.012114537444933,
2113
+ "grad_norm": 0.2615683376789093,
2114
+ "learning_rate": 1.3051282051282052e-05,
2115
+ "loss": 0.0898,
2116
+ "step": 7275
2117
+ },
2118
+ {
2119
+ "epoch": 8.039647577092511,
2120
+ "grad_norm": 0.44722801446914673,
2121
+ "learning_rate": 1.3025641025641027e-05,
2122
+ "loss": 0.0602,
2123
+ "step": 7300
2124
+ },
2125
+ {
2126
+ "epoch": 8.067180616740089,
2127
+ "grad_norm": 0.3499121367931366,
2128
+ "learning_rate": 1.3000000000000001e-05,
2129
+ "loss": 0.0549,
2130
+ "step": 7325
2131
+ },
2132
+ {
2133
+ "epoch": 8.094713656387665,
2134
+ "grad_norm": 0.37767261266708374,
2135
+ "learning_rate": 1.2974358974358976e-05,
2136
+ "loss": 0.0573,
2137
+ "step": 7350
2138
+ },
2139
+ {
2140
+ "epoch": 8.122246696035242,
2141
+ "grad_norm": 0.4645783007144928,
2142
+ "learning_rate": 1.294871794871795e-05,
2143
+ "loss": 0.0566,
2144
+ "step": 7375
2145
+ },
2146
+ {
2147
+ "epoch": 8.14977973568282,
2148
+ "grad_norm": 0.6134966015815735,
2149
+ "learning_rate": 1.2923076923076925e-05,
2150
+ "loss": 0.0507,
2151
+ "step": 7400
2152
+ },
2153
+ {
2154
+ "epoch": 8.177312775330396,
2155
+ "grad_norm": 0.28678062558174133,
2156
+ "learning_rate": 1.2897435897435899e-05,
2157
+ "loss": 0.051,
2158
+ "step": 7425
2159
+ },
2160
+ {
2161
+ "epoch": 8.204845814977974,
2162
+ "grad_norm": 0.299078106880188,
2163
+ "learning_rate": 1.2871794871794874e-05,
2164
+ "loss": 0.045,
2165
+ "step": 7450
2166
+ },
2167
+ {
2168
+ "epoch": 8.232378854625551,
2169
+ "grad_norm": 0.6386272311210632,
2170
+ "learning_rate": 1.2846153846153848e-05,
2171
+ "loss": 0.0665,
2172
+ "step": 7475
2173
+ },
2174
+ {
2175
+ "epoch": 8.259911894273127,
2176
+ "grad_norm": 0.1515616923570633,
2177
+ "learning_rate": 1.2820512820512823e-05,
2178
+ "loss": 0.052,
2179
+ "step": 7500
2180
+ },
2181
+ {
2182
+ "epoch": 8.287444933920705,
2183
+ "grad_norm": 0.61899733543396,
2184
+ "learning_rate": 1.2794871794871795e-05,
2185
+ "loss": 0.0462,
2186
+ "step": 7525
2187
+ },
2188
+ {
2189
+ "epoch": 8.314977973568283,
2190
+ "grad_norm": 0.6535860300064087,
2191
+ "learning_rate": 1.2769230769230769e-05,
2192
+ "loss": 0.0518,
2193
+ "step": 7550
2194
+ },
2195
+ {
2196
+ "epoch": 8.342511013215859,
2197
+ "grad_norm": 0.4084964692592621,
2198
+ "learning_rate": 1.2743589743589744e-05,
2199
+ "loss": 0.0574,
2200
+ "step": 7575
2201
+ },
2202
+ {
2203
+ "epoch": 8.370044052863436,
2204
+ "grad_norm": 0.4185622036457062,
2205
+ "learning_rate": 1.2717948717948718e-05,
2206
+ "loss": 0.0466,
2207
+ "step": 7600
2208
+ },
2209
+ {
2210
+ "epoch": 8.397577092511014,
2211
+ "grad_norm": 0.5417298078536987,
2212
+ "learning_rate": 1.2692307692307693e-05,
2213
+ "loss": 0.0595,
2214
+ "step": 7625
2215
+ },
2216
+ {
2217
+ "epoch": 8.42511013215859,
2218
+ "grad_norm": 0.0882943645119667,
2219
+ "learning_rate": 1.2666666666666667e-05,
2220
+ "loss": 0.0441,
2221
+ "step": 7650
2222
+ },
2223
+ {
2224
+ "epoch": 8.452643171806168,
2225
+ "grad_norm": 0.5028131008148193,
2226
+ "learning_rate": 1.2641025641025642e-05,
2227
+ "loss": 0.0584,
2228
+ "step": 7675
2229
+ },
2230
+ {
2231
+ "epoch": 8.480176211453745,
2232
+ "grad_norm": 0.32492244243621826,
2233
+ "learning_rate": 1.2615384615384616e-05,
2234
+ "loss": 0.0519,
2235
+ "step": 7700
2236
+ },
2237
+ {
2238
+ "epoch": 8.507709251101321,
2239
+ "grad_norm": 0.199100524187088,
2240
+ "learning_rate": 1.2589743589743591e-05,
2241
+ "loss": 0.0519,
2242
+ "step": 7725
2243
+ },
2244
+ {
2245
+ "epoch": 8.535242290748899,
2246
+ "grad_norm": 0.560196578502655,
2247
+ "learning_rate": 1.2564102564102565e-05,
2248
+ "loss": 0.0601,
2249
+ "step": 7750
2250
+ },
2251
+ {
2252
+ "epoch": 8.562775330396477,
2253
+ "grad_norm": 0.3848872780799866,
2254
+ "learning_rate": 1.253846153846154e-05,
2255
+ "loss": 0.0561,
2256
+ "step": 7775
2257
+ },
2258
+ {
2259
+ "epoch": 8.590308370044053,
2260
+ "grad_norm": 0.6430539488792419,
2261
+ "learning_rate": 1.2512820512820514e-05,
2262
+ "loss": 0.0573,
2263
+ "step": 7800
2264
+ },
2265
+ {
2266
+ "epoch": 8.61784140969163,
2267
+ "grad_norm": 0.1402147263288498,
2268
+ "learning_rate": 1.2487179487179487e-05,
2269
+ "loss": 0.0613,
2270
+ "step": 7825
2271
+ },
2272
+ {
2273
+ "epoch": 8.645374449339208,
2274
+ "grad_norm": 0.3411605656147003,
2275
+ "learning_rate": 1.2461538461538463e-05,
2276
+ "loss": 0.0401,
2277
+ "step": 7850
2278
+ },
2279
+ {
2280
+ "epoch": 8.672907488986784,
2281
+ "grad_norm": 0.4999459981918335,
2282
+ "learning_rate": 1.2435897435897436e-05,
2283
+ "loss": 0.0393,
2284
+ "step": 7875
2285
+ },
2286
+ {
2287
+ "epoch": 8.700440528634362,
2288
+ "grad_norm": 0.6794586777687073,
2289
+ "learning_rate": 1.2410256410256412e-05,
2290
+ "loss": 0.061,
2291
+ "step": 7900
2292
+ },
2293
+ {
2294
+ "epoch": 8.72797356828194,
2295
+ "grad_norm": 0.30914783477783203,
2296
+ "learning_rate": 1.2384615384615385e-05,
2297
+ "loss": 0.0552,
2298
+ "step": 7925
2299
+ },
2300
+ {
2301
+ "epoch": 8.755506607929515,
2302
+ "grad_norm": 0.311613529920578,
2303
+ "learning_rate": 1.235897435897436e-05,
2304
+ "loss": 0.0556,
2305
+ "step": 7950
2306
+ },
2307
+ {
2308
+ "epoch": 8.783039647577093,
2309
+ "grad_norm": 0.48470553755760193,
2310
+ "learning_rate": 1.2333333333333334e-05,
2311
+ "loss": 0.0553,
2312
+ "step": 7975
2313
+ },
2314
+ {
2315
+ "epoch": 8.810572687224669,
2316
+ "grad_norm": 0.5605005621910095,
2317
+ "learning_rate": 1.230769230769231e-05,
2318
+ "loss": 0.0673,
2319
+ "step": 8000
2320
+ },
2321
+ {
2322
+ "epoch": 8.810572687224669,
2323
+ "eval_cer": 25.173386218518225,
2324
+ "eval_loss": 0.8125333189964294,
2325
+ "eval_runtime": 1304.9459,
2326
+ "eval_samples_per_second": 8.108,
2327
+ "eval_steps_per_second": 2.028,
2328
+ "eval_wer": 85.53512494106553,
2329
+ "step": 8000
2330
  }
2331
  ],
2332
  "logging_steps": 25,
 
2346
  "attributes": {}
2347
  }
2348
  },
2349
+ "total_flos": 1.3745145491920781e+20,
2350
  "train_batch_size": 4,
2351
  "trial_name": null,
2352
  "trial_params": null