Azrail commited on
Commit
cd2e3d5
·
verified ·
1 Parent(s): 9f9b8f0

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af73c4ec485cd7fa414342b390a5c634c47a31d116a73e322cc418d51a330596
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:593df4add94d8349a8e2c27dd6a4c8e410dc62c59535de38e2c844bae1bf9105
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8057071e9c871132fbac5baaaef5c6aca4c49e2663c7a32995eef4dffca1eb9
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ca220deb73713912b17a381232ea629f59c26aebf972823900e92efe4bee200
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1674983d22ea028f37f625821f1fca77be67adb3636d14701707c06c0fbac379
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5148f4a0429b56039088b4393cfcab680c3af25b037593fe69f3727d64615009
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb36ce4646595e0955071e0d49fcfefa2b2d576fde195d65fd821c4d2bebc721
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d15ebff9b6275f35ed91d179fc6aa0df6144af185e5ca68cd213907d032111d8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.172367487967168,
6
  "eval_steps": 500,
7
- "global_step": 9000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1970,11 +1970,229 @@
1970
  "eval_steps_per_second": 20.582,
1971
  "num_input_tokens_seen": 4347894913,
1972
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1973
  }
1974
  ],
1975
  "logging_steps": 50,
1976
  "max_steps": 16568,
1977
- "num_input_tokens_seen": 4347894913,
1978
  "num_train_epochs": 4,
1979
  "save_steps": 1000,
1980
  "stateful_callbacks": {
@@ -1989,7 +2207,7 @@
1989
  "attributes": {}
1990
  }
1991
  },
1992
- "total_flos": 1.163104324681851e+18,
1993
  "train_batch_size": 16,
1994
  "trial_name": null,
1995
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.413778535540233,
6
  "eval_steps": 500,
7
+ "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1970
  "eval_steps_per_second": 20.582,
1971
  "num_input_tokens_seen": 4347894913,
1972
  "step": 9000
1973
+ },
1974
+ {
1975
+ "epoch": 2.1844380403458215,
1976
+ "grad_norm": 0.2734375,
1977
+ "learning_rate": 2.8361249434133093e-05,
1978
+ "loss": 2.0951,
1979
+ "mean_token_accuracy": 0.5561914920061827,
1980
+ "num_input_tokens_seen": 4372093409,
1981
+ "num_tokens": 1842403609.0,
1982
+ "step": 9050
1983
+ },
1984
+ {
1985
+ "epoch": 2.1965085927244745,
1986
+ "grad_norm": 0.275390625,
1987
+ "learning_rate": 2.817262713143202e-05,
1988
+ "loss": 2.0915,
1989
+ "mean_token_accuracy": 0.5557398213073611,
1990
+ "num_input_tokens_seen": 4396072241,
1991
+ "num_tokens": 1852448709.0,
1992
+ "step": 9100
1993
+ },
1994
+ {
1995
+ "epoch": 2.2085791451031276,
1996
+ "grad_norm": 0.2392578125,
1997
+ "learning_rate": 2.7984004828730953e-05,
1998
+ "loss": 2.0965,
1999
+ "mean_token_accuracy": 0.5542204293608666,
2000
+ "num_input_tokens_seen": 4420308385,
2001
+ "num_tokens": 1862702843.0,
2002
+ "step": 9150
2003
+ },
2004
+ {
2005
+ "epoch": 2.220649697481781,
2006
+ "grad_norm": 0.2578125,
2007
+ "learning_rate": 2.779538252602988e-05,
2008
+ "loss": 2.0873,
2009
+ "mean_token_accuracy": 0.555770318582654,
2010
+ "num_input_tokens_seen": 4444408305,
2011
+ "num_tokens": 1872813360.0,
2012
+ "step": 9200
2013
+ },
2014
+ {
2015
+ "epoch": 2.232720249860434,
2016
+ "grad_norm": 0.248046875,
2017
+ "learning_rate": 2.760676022332881e-05,
2018
+ "loss": 2.0984,
2019
+ "mean_token_accuracy": 0.5543450859189033,
2020
+ "num_input_tokens_seen": 4468586049,
2021
+ "num_tokens": 1883034727.0,
2022
+ "step": 9250
2023
+ },
2024
+ {
2025
+ "epoch": 2.2447908022390877,
2026
+ "grad_norm": 0.26171875,
2027
+ "learning_rate": 2.7418137920627736e-05,
2028
+ "loss": 2.0913,
2029
+ "mean_token_accuracy": 0.5554680547490716,
2030
+ "num_input_tokens_seen": 4492717489,
2031
+ "num_tokens": 1893259660.0,
2032
+ "step": 9300
2033
+ },
2034
+ {
2035
+ "epoch": 2.2568613546177407,
2036
+ "grad_norm": 0.3046875,
2037
+ "learning_rate": 2.7229515617926664e-05,
2038
+ "loss": 2.0976,
2039
+ "mean_token_accuracy": 0.5547211924567819,
2040
+ "num_input_tokens_seen": 4516832449,
2041
+ "num_tokens": 1903351453.0,
2042
+ "step": 9350
2043
+ },
2044
+ {
2045
+ "epoch": 2.268931906996394,
2046
+ "grad_norm": 0.240234375,
2047
+ "learning_rate": 2.7040893315225596e-05,
2048
+ "loss": 2.095,
2049
+ "mean_token_accuracy": 0.5545766900852322,
2050
+ "num_input_tokens_seen": 4540881473,
2051
+ "num_tokens": 1913462038.0,
2052
+ "step": 9400
2053
+ },
2054
+ {
2055
+ "epoch": 2.2810024593750473,
2056
+ "grad_norm": 0.2412109375,
2057
+ "learning_rate": 2.685227101252452e-05,
2058
+ "loss": 2.1047,
2059
+ "mean_token_accuracy": 0.5530835852399468,
2060
+ "num_input_tokens_seen": 4565196353,
2061
+ "num_tokens": 1923836730.0,
2062
+ "step": 9450
2063
+ },
2064
+ {
2065
+ "epoch": 2.2930730117537004,
2066
+ "grad_norm": 0.25390625,
2067
+ "learning_rate": 2.6663648709823454e-05,
2068
+ "loss": 2.1036,
2069
+ "num_input_tokens_seen": 4589393665,
2070
+ "step": 9500
2071
+ },
2072
+ {
2073
+ "epoch": 2.2930730117537004,
2074
+ "eval_loss": 1.9684821367263794,
2075
+ "eval_mean_token_accuracy": 0.5784456487953707,
2076
+ "eval_num_tokens": 1933999749.0,
2077
+ "eval_runtime": 130.3401,
2078
+ "eval_samples_per_second": 82.185,
2079
+ "eval_steps_per_second": 20.546,
2080
+ "num_input_tokens_seen": 4589393665,
2081
+ "step": 9500
2082
+ },
2083
+ {
2084
+ "epoch": 2.3051435641323534,
2085
+ "grad_norm": 0.2373046875,
2086
+ "learning_rate": 2.647502640712238e-05,
2087
+ "loss": 2.1091,
2088
+ "mean_token_accuracy": 0.5529859235696495,
2089
+ "num_input_tokens_seen": 4613609921,
2090
+ "num_tokens": 1944210855.0,
2091
+ "step": 9550
2092
+ },
2093
+ {
2094
+ "epoch": 2.317214116511007,
2095
+ "grad_norm": 0.2490234375,
2096
+ "learning_rate": 2.6286404104421307e-05,
2097
+ "loss": 2.0976,
2098
+ "mean_token_accuracy": 0.554888856895268,
2099
+ "num_input_tokens_seen": 4637474321,
2100
+ "num_tokens": 1954258079.0,
2101
+ "step": 9600
2102
+ },
2103
+ {
2104
+ "epoch": 2.32928466888966,
2105
+ "grad_norm": 0.25,
2106
+ "learning_rate": 2.609778180172024e-05,
2107
+ "loss": 2.1061,
2108
+ "mean_token_accuracy": 0.5531089297309518,
2109
+ "num_input_tokens_seen": 4661687841,
2110
+ "num_tokens": 1964447306.0,
2111
+ "step": 9650
2112
+ },
2113
+ {
2114
+ "epoch": 2.341355221268313,
2115
+ "grad_norm": 0.283203125,
2116
+ "learning_rate": 2.5909159499019165e-05,
2117
+ "loss": 2.0972,
2118
+ "mean_token_accuracy": 0.5547297456115484,
2119
+ "num_input_tokens_seen": 4685886657,
2120
+ "num_tokens": 1974672380.0,
2121
+ "step": 9700
2122
+ },
2123
+ {
2124
+ "epoch": 2.3534257736469666,
2125
+ "grad_norm": 0.275390625,
2126
+ "learning_rate": 2.5720537196318097e-05,
2127
+ "loss": 2.0874,
2128
+ "mean_token_accuracy": 0.556226581223309,
2129
+ "num_input_tokens_seen": 4710004273,
2130
+ "num_tokens": 1984832310.0,
2131
+ "step": 9750
2132
+ },
2133
+ {
2134
+ "epoch": 2.3654963260256197,
2135
+ "grad_norm": 0.2392578125,
2136
+ "learning_rate": 2.5531914893617022e-05,
2137
+ "loss": 2.096,
2138
+ "mean_token_accuracy": 0.5547980547696352,
2139
+ "num_input_tokens_seen": 4734271009,
2140
+ "num_tokens": 1995090784.0,
2141
+ "step": 9800
2142
+ },
2143
+ {
2144
+ "epoch": 2.377566878404273,
2145
+ "grad_norm": 0.28515625,
2146
+ "learning_rate": 2.534329259091595e-05,
2147
+ "loss": 2.0871,
2148
+ "mean_token_accuracy": 0.5552258058264852,
2149
+ "num_input_tokens_seen": 4758291265,
2150
+ "num_tokens": 2005240317.0,
2151
+ "step": 9850
2152
+ },
2153
+ {
2154
+ "epoch": 2.3896374307829262,
2155
+ "grad_norm": 0.2470703125,
2156
+ "learning_rate": 2.5154670288214883e-05,
2157
+ "loss": 2.0865,
2158
+ "mean_token_accuracy": 0.5557247434183955,
2159
+ "num_input_tokens_seen": 4782472097,
2160
+ "num_tokens": 2015507708.0,
2161
+ "step": 9900
2162
+ },
2163
+ {
2164
+ "epoch": 2.4017079831615793,
2165
+ "grad_norm": 0.2421875,
2166
+ "learning_rate": 2.4966047985513808e-05,
2167
+ "loss": 2.1074,
2168
+ "mean_token_accuracy": 0.5527091028168798,
2169
+ "num_input_tokens_seen": 4806608113,
2170
+ "num_tokens": 2025820931.0,
2171
+ "step": 9950
2172
+ },
2173
+ {
2174
+ "epoch": 2.413778535540233,
2175
+ "grad_norm": 0.2421875,
2176
+ "learning_rate": 2.477742568281274e-05,
2177
+ "loss": 2.1001,
2178
+ "num_input_tokens_seen": 4830743425,
2179
+ "step": 10000
2180
+ },
2181
+ {
2182
+ "epoch": 2.413778535540233,
2183
+ "eval_loss": 1.9683291912078857,
2184
+ "eval_mean_token_accuracy": 0.5784874623550952,
2185
+ "eval_num_tokens": 2035904188.0,
2186
+ "eval_runtime": 130.7093,
2187
+ "eval_samples_per_second": 81.953,
2188
+ "eval_steps_per_second": 20.488,
2189
+ "num_input_tokens_seen": 4830743425,
2190
+ "step": 10000
2191
  }
2192
  ],
2193
  "logging_steps": 50,
2194
  "max_steps": 16568,
2195
+ "num_input_tokens_seen": 4830743425,
2196
  "num_train_epochs": 4,
2197
  "save_steps": 1000,
2198
  "stateful_callbacks": {
 
2207
  "attributes": {}
2208
  }
2209
  },
2210
+ "total_flos": 1.292271014243328e+18,
2211
  "train_batch_size": 16,
2212
  "trial_name": null,
2213
  "trial_params": null