Training in progress, step 10000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 150625560
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:593df4add94d8349a8e2c27dd6a4c8e410dc62c59535de38e2c844bae1bf9105
|
| 3 |
size 150625560
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ca220deb73713912b17a381232ea629f59c26aebf972823900e92efe4bee200
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5148f4a0429b56039088b4393cfcab680c3af25b037593fe69f3727d64615009
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d15ebff9b6275f35ed91d179fc6aa0df6144af185e5ca68cd213907d032111d8
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 2.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -1970,11 +1970,229 @@
|
|
| 1970 |
"eval_steps_per_second": 20.582,
|
| 1971 |
"num_input_tokens_seen": 4347894913,
|
| 1972 |
"step": 9000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1973 |
}
|
| 1974 |
],
|
| 1975 |
"logging_steps": 50,
|
| 1976 |
"max_steps": 16568,
|
| 1977 |
-
"num_input_tokens_seen":
|
| 1978 |
"num_train_epochs": 4,
|
| 1979 |
"save_steps": 1000,
|
| 1980 |
"stateful_callbacks": {
|
|
@@ -1989,7 +2207,7 @@
|
|
| 1989 |
"attributes": {}
|
| 1990 |
}
|
| 1991 |
},
|
| 1992 |
-
"total_flos": 1.
|
| 1993 |
"train_batch_size": 16,
|
| 1994 |
"trial_name": null,
|
| 1995 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.413778535540233,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 10000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 1970 |
"eval_steps_per_second": 20.582,
|
| 1971 |
"num_input_tokens_seen": 4347894913,
|
| 1972 |
"step": 9000
|
| 1973 |
+
},
|
| 1974 |
+
{
|
| 1975 |
+
"epoch": 2.1844380403458215,
|
| 1976 |
+
"grad_norm": 0.2734375,
|
| 1977 |
+
"learning_rate": 2.8361249434133093e-05,
|
| 1978 |
+
"loss": 2.0951,
|
| 1979 |
+
"mean_token_accuracy": 0.5561914920061827,
|
| 1980 |
+
"num_input_tokens_seen": 4372093409,
|
| 1981 |
+
"num_tokens": 1842403609.0,
|
| 1982 |
+
"step": 9050
|
| 1983 |
+
},
|
| 1984 |
+
{
|
| 1985 |
+
"epoch": 2.1965085927244745,
|
| 1986 |
+
"grad_norm": 0.275390625,
|
| 1987 |
+
"learning_rate": 2.817262713143202e-05,
|
| 1988 |
+
"loss": 2.0915,
|
| 1989 |
+
"mean_token_accuracy": 0.5557398213073611,
|
| 1990 |
+
"num_input_tokens_seen": 4396072241,
|
| 1991 |
+
"num_tokens": 1852448709.0,
|
| 1992 |
+
"step": 9100
|
| 1993 |
+
},
|
| 1994 |
+
{
|
| 1995 |
+
"epoch": 2.2085791451031276,
|
| 1996 |
+
"grad_norm": 0.2392578125,
|
| 1997 |
+
"learning_rate": 2.7984004828730953e-05,
|
| 1998 |
+
"loss": 2.0965,
|
| 1999 |
+
"mean_token_accuracy": 0.5542204293608666,
|
| 2000 |
+
"num_input_tokens_seen": 4420308385,
|
| 2001 |
+
"num_tokens": 1862702843.0,
|
| 2002 |
+
"step": 9150
|
| 2003 |
+
},
|
| 2004 |
+
{
|
| 2005 |
+
"epoch": 2.220649697481781,
|
| 2006 |
+
"grad_norm": 0.2578125,
|
| 2007 |
+
"learning_rate": 2.779538252602988e-05,
|
| 2008 |
+
"loss": 2.0873,
|
| 2009 |
+
"mean_token_accuracy": 0.555770318582654,
|
| 2010 |
+
"num_input_tokens_seen": 4444408305,
|
| 2011 |
+
"num_tokens": 1872813360.0,
|
| 2012 |
+
"step": 9200
|
| 2013 |
+
},
|
| 2014 |
+
{
|
| 2015 |
+
"epoch": 2.232720249860434,
|
| 2016 |
+
"grad_norm": 0.248046875,
|
| 2017 |
+
"learning_rate": 2.760676022332881e-05,
|
| 2018 |
+
"loss": 2.0984,
|
| 2019 |
+
"mean_token_accuracy": 0.5543450859189033,
|
| 2020 |
+
"num_input_tokens_seen": 4468586049,
|
| 2021 |
+
"num_tokens": 1883034727.0,
|
| 2022 |
+
"step": 9250
|
| 2023 |
+
},
|
| 2024 |
+
{
|
| 2025 |
+
"epoch": 2.2447908022390877,
|
| 2026 |
+
"grad_norm": 0.26171875,
|
| 2027 |
+
"learning_rate": 2.7418137920627736e-05,
|
| 2028 |
+
"loss": 2.0913,
|
| 2029 |
+
"mean_token_accuracy": 0.5554680547490716,
|
| 2030 |
+
"num_input_tokens_seen": 4492717489,
|
| 2031 |
+
"num_tokens": 1893259660.0,
|
| 2032 |
+
"step": 9300
|
| 2033 |
+
},
|
| 2034 |
+
{
|
| 2035 |
+
"epoch": 2.2568613546177407,
|
| 2036 |
+
"grad_norm": 0.3046875,
|
| 2037 |
+
"learning_rate": 2.7229515617926664e-05,
|
| 2038 |
+
"loss": 2.0976,
|
| 2039 |
+
"mean_token_accuracy": 0.5547211924567819,
|
| 2040 |
+
"num_input_tokens_seen": 4516832449,
|
| 2041 |
+
"num_tokens": 1903351453.0,
|
| 2042 |
+
"step": 9350
|
| 2043 |
+
},
|
| 2044 |
+
{
|
| 2045 |
+
"epoch": 2.268931906996394,
|
| 2046 |
+
"grad_norm": 0.240234375,
|
| 2047 |
+
"learning_rate": 2.7040893315225596e-05,
|
| 2048 |
+
"loss": 2.095,
|
| 2049 |
+
"mean_token_accuracy": 0.5545766900852322,
|
| 2050 |
+
"num_input_tokens_seen": 4540881473,
|
| 2051 |
+
"num_tokens": 1913462038.0,
|
| 2052 |
+
"step": 9400
|
| 2053 |
+
},
|
| 2054 |
+
{
|
| 2055 |
+
"epoch": 2.2810024593750473,
|
| 2056 |
+
"grad_norm": 0.2412109375,
|
| 2057 |
+
"learning_rate": 2.685227101252452e-05,
|
| 2058 |
+
"loss": 2.1047,
|
| 2059 |
+
"mean_token_accuracy": 0.5530835852399468,
|
| 2060 |
+
"num_input_tokens_seen": 4565196353,
|
| 2061 |
+
"num_tokens": 1923836730.0,
|
| 2062 |
+
"step": 9450
|
| 2063 |
+
},
|
| 2064 |
+
{
|
| 2065 |
+
"epoch": 2.2930730117537004,
|
| 2066 |
+
"grad_norm": 0.25390625,
|
| 2067 |
+
"learning_rate": 2.6663648709823454e-05,
|
| 2068 |
+
"loss": 2.1036,
|
| 2069 |
+
"num_input_tokens_seen": 4589393665,
|
| 2070 |
+
"step": 9500
|
| 2071 |
+
},
|
| 2072 |
+
{
|
| 2073 |
+
"epoch": 2.2930730117537004,
|
| 2074 |
+
"eval_loss": 1.9684821367263794,
|
| 2075 |
+
"eval_mean_token_accuracy": 0.5784456487953707,
|
| 2076 |
+
"eval_num_tokens": 1933999749.0,
|
| 2077 |
+
"eval_runtime": 130.3401,
|
| 2078 |
+
"eval_samples_per_second": 82.185,
|
| 2079 |
+
"eval_steps_per_second": 20.546,
|
| 2080 |
+
"num_input_tokens_seen": 4589393665,
|
| 2081 |
+
"step": 9500
|
| 2082 |
+
},
|
| 2083 |
+
{
|
| 2084 |
+
"epoch": 2.3051435641323534,
|
| 2085 |
+
"grad_norm": 0.2373046875,
|
| 2086 |
+
"learning_rate": 2.647502640712238e-05,
|
| 2087 |
+
"loss": 2.1091,
|
| 2088 |
+
"mean_token_accuracy": 0.5529859235696495,
|
| 2089 |
+
"num_input_tokens_seen": 4613609921,
|
| 2090 |
+
"num_tokens": 1944210855.0,
|
| 2091 |
+
"step": 9550
|
| 2092 |
+
},
|
| 2093 |
+
{
|
| 2094 |
+
"epoch": 2.317214116511007,
|
| 2095 |
+
"grad_norm": 0.2490234375,
|
| 2096 |
+
"learning_rate": 2.6286404104421307e-05,
|
| 2097 |
+
"loss": 2.0976,
|
| 2098 |
+
"mean_token_accuracy": 0.554888856895268,
|
| 2099 |
+
"num_input_tokens_seen": 4637474321,
|
| 2100 |
+
"num_tokens": 1954258079.0,
|
| 2101 |
+
"step": 9600
|
| 2102 |
+
},
|
| 2103 |
+
{
|
| 2104 |
+
"epoch": 2.32928466888966,
|
| 2105 |
+
"grad_norm": 0.25,
|
| 2106 |
+
"learning_rate": 2.609778180172024e-05,
|
| 2107 |
+
"loss": 2.1061,
|
| 2108 |
+
"mean_token_accuracy": 0.5531089297309518,
|
| 2109 |
+
"num_input_tokens_seen": 4661687841,
|
| 2110 |
+
"num_tokens": 1964447306.0,
|
| 2111 |
+
"step": 9650
|
| 2112 |
+
},
|
| 2113 |
+
{
|
| 2114 |
+
"epoch": 2.341355221268313,
|
| 2115 |
+
"grad_norm": 0.283203125,
|
| 2116 |
+
"learning_rate": 2.5909159499019165e-05,
|
| 2117 |
+
"loss": 2.0972,
|
| 2118 |
+
"mean_token_accuracy": 0.5547297456115484,
|
| 2119 |
+
"num_input_tokens_seen": 4685886657,
|
| 2120 |
+
"num_tokens": 1974672380.0,
|
| 2121 |
+
"step": 9700
|
| 2122 |
+
},
|
| 2123 |
+
{
|
| 2124 |
+
"epoch": 2.3534257736469666,
|
| 2125 |
+
"grad_norm": 0.275390625,
|
| 2126 |
+
"learning_rate": 2.5720537196318097e-05,
|
| 2127 |
+
"loss": 2.0874,
|
| 2128 |
+
"mean_token_accuracy": 0.556226581223309,
|
| 2129 |
+
"num_input_tokens_seen": 4710004273,
|
| 2130 |
+
"num_tokens": 1984832310.0,
|
| 2131 |
+
"step": 9750
|
| 2132 |
+
},
|
| 2133 |
+
{
|
| 2134 |
+
"epoch": 2.3654963260256197,
|
| 2135 |
+
"grad_norm": 0.2392578125,
|
| 2136 |
+
"learning_rate": 2.5531914893617022e-05,
|
| 2137 |
+
"loss": 2.096,
|
| 2138 |
+
"mean_token_accuracy": 0.5547980547696352,
|
| 2139 |
+
"num_input_tokens_seen": 4734271009,
|
| 2140 |
+
"num_tokens": 1995090784.0,
|
| 2141 |
+
"step": 9800
|
| 2142 |
+
},
|
| 2143 |
+
{
|
| 2144 |
+
"epoch": 2.377566878404273,
|
| 2145 |
+
"grad_norm": 0.28515625,
|
| 2146 |
+
"learning_rate": 2.534329259091595e-05,
|
| 2147 |
+
"loss": 2.0871,
|
| 2148 |
+
"mean_token_accuracy": 0.5552258058264852,
|
| 2149 |
+
"num_input_tokens_seen": 4758291265,
|
| 2150 |
+
"num_tokens": 2005240317.0,
|
| 2151 |
+
"step": 9850
|
| 2152 |
+
},
|
| 2153 |
+
{
|
| 2154 |
+
"epoch": 2.3896374307829262,
|
| 2155 |
+
"grad_norm": 0.2470703125,
|
| 2156 |
+
"learning_rate": 2.5154670288214883e-05,
|
| 2157 |
+
"loss": 2.0865,
|
| 2158 |
+
"mean_token_accuracy": 0.5557247434183955,
|
| 2159 |
+
"num_input_tokens_seen": 4782472097,
|
| 2160 |
+
"num_tokens": 2015507708.0,
|
| 2161 |
+
"step": 9900
|
| 2162 |
+
},
|
| 2163 |
+
{
|
| 2164 |
+
"epoch": 2.4017079831615793,
|
| 2165 |
+
"grad_norm": 0.2421875,
|
| 2166 |
+
"learning_rate": 2.4966047985513808e-05,
|
| 2167 |
+
"loss": 2.1074,
|
| 2168 |
+
"mean_token_accuracy": 0.5527091028168798,
|
| 2169 |
+
"num_input_tokens_seen": 4806608113,
|
| 2170 |
+
"num_tokens": 2025820931.0,
|
| 2171 |
+
"step": 9950
|
| 2172 |
+
},
|
| 2173 |
+
{
|
| 2174 |
+
"epoch": 2.413778535540233,
|
| 2175 |
+
"grad_norm": 0.2421875,
|
| 2176 |
+
"learning_rate": 2.477742568281274e-05,
|
| 2177 |
+
"loss": 2.1001,
|
| 2178 |
+
"num_input_tokens_seen": 4830743425,
|
| 2179 |
+
"step": 10000
|
| 2180 |
+
},
|
| 2181 |
+
{
|
| 2182 |
+
"epoch": 2.413778535540233,
|
| 2183 |
+
"eval_loss": 1.9683291912078857,
|
| 2184 |
+
"eval_mean_token_accuracy": 0.5784874623550952,
|
| 2185 |
+
"eval_num_tokens": 2035904188.0,
|
| 2186 |
+
"eval_runtime": 130.7093,
|
| 2187 |
+
"eval_samples_per_second": 81.953,
|
| 2188 |
+
"eval_steps_per_second": 20.488,
|
| 2189 |
+
"num_input_tokens_seen": 4830743425,
|
| 2190 |
+
"step": 10000
|
| 2191 |
}
|
| 2192 |
],
|
| 2193 |
"logging_steps": 50,
|
| 2194 |
"max_steps": 16568,
|
| 2195 |
+
"num_input_tokens_seen": 4830743425,
|
| 2196 |
"num_train_epochs": 4,
|
| 2197 |
"save_steps": 1000,
|
| 2198 |
"stateful_callbacks": {
|
|
|
|
| 2207 |
"attributes": {}
|
| 2208 |
}
|
| 2209 |
},
|
| 2210 |
+
"total_flos": 1.292271014243328e+18,
|
| 2211 |
"train_batch_size": 16,
|
| 2212 |
"trial_name": null,
|
| 2213 |
"trial_params": null
|