Azrail commited on
Commit
254680a
·
verified ·
1 Parent(s): bab5851

Training in progress, step 12000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25851b0b62512a3b653e1b28e1122d3212578c5e77ebf5b79e658ffec9b3b79f
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b83de3716293416e17f57907b1e6034054cf0cb82c7485e524b4d7d1450783b
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df743e5d843f3a7837833bec4e1caf6ae5d4bce7ba980e12a541afbc37b034f0
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a64cb29e942a69a8dc03ff6ac3a4e293f03dde8909732e3b914b2a3bf04f6716
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25b1915a16f6a9d0eca7bf59f4a66ab58a1d3558fffae49f30b6000a597cffb1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e88c68399442716e4a372c4039d5dcf90ac56e28a588e1c0ea57e0e690737de
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:811fd3ba19eb7a55c539858dcaf05c190bd36b9252f7748cbb128712f2400a11
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8af59fb9ac4de4ac193b8a4959e006fc89e2686baafa42f4be575214da0ad2e3
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.24162647567192067,
6
  "eval_steps": 500,
7
- "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1966,11 +1966,189 @@
1966
  "eval_steps_per_second": 19.037,
1967
  "num_input_tokens_seen": 11534336000,
1968
  "step": 11000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1969
  }
1970
  ],
1971
  "logging_steps": 50,
1972
  "max_steps": 200000,
1973
- "num_input_tokens_seen": 11534336000,
1974
  "num_train_epochs": 5,
1975
  "save_steps": 1000,
1976
  "stateful_callbacks": {
@@ -1985,7 +2163,7 @@
1985
  "attributes": {}
1986
  }
1987
  },
1988
- "total_flos": 6.568888875614208e+18,
1989
  "train_batch_size": 64,
1990
  "trial_name": null,
1991
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.26359251891482255,
6
  "eval_steps": 500,
7
+ "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1966
  "eval_steps_per_second": 19.037,
1967
  "num_input_tokens_seen": 11534336000,
1968
  "step": 11000
1969
+ },
1970
+ {
1971
+ "epoch": 0.24272477783406576,
1972
+ "grad_norm": 0.1190350204706192,
1973
+ "learning_rate": 0.001,
1974
+ "loss": 2.7994,
1975
+ "num_input_tokens_seen": 11586764800,
1976
+ "step": 11050
1977
+ },
1978
+ {
1979
+ "epoch": 0.24382307999621086,
1980
+ "grad_norm": 0.12825961410999298,
1981
+ "learning_rate": 0.001,
1982
+ "loss": 2.7992,
1983
+ "num_input_tokens_seen": 11639193600,
1984
+ "step": 11100
1985
+ },
1986
+ {
1987
+ "epoch": 0.24492138215835596,
1988
+ "grad_norm": 0.12561525404453278,
1989
+ "learning_rate": 0.001,
1990
+ "loss": 2.8009,
1991
+ "num_input_tokens_seen": 11691622400,
1992
+ "step": 11150
1993
+ },
1994
+ {
1995
+ "epoch": 0.24601968432050106,
1996
+ "grad_norm": 0.12596049904823303,
1997
+ "learning_rate": 0.001,
1998
+ "loss": 2.8002,
1999
+ "num_input_tokens_seen": 11744051200,
2000
+ "step": 11200
2001
+ },
2002
+ {
2003
+ "epoch": 0.24711798648264613,
2004
+ "grad_norm": 0.1415141373872757,
2005
+ "learning_rate": 0.001,
2006
+ "loss": 2.8004,
2007
+ "num_input_tokens_seen": 11796480000,
2008
+ "step": 11250
2009
+ },
2010
+ {
2011
+ "epoch": 0.24821628864479123,
2012
+ "grad_norm": 0.1359766125679016,
2013
+ "learning_rate": 0.001,
2014
+ "loss": 2.7988,
2015
+ "num_input_tokens_seen": 11848908800,
2016
+ "step": 11300
2017
+ },
2018
+ {
2019
+ "epoch": 0.24931459080693633,
2020
+ "grad_norm": 0.13459013402462006,
2021
+ "learning_rate": 0.001,
2022
+ "loss": 2.7991,
2023
+ "num_input_tokens_seen": 11901337600,
2024
+ "step": 11350
2025
+ },
2026
+ {
2027
+ "epoch": 0.2504128929690814,
2028
+ "grad_norm": 0.1344253420829773,
2029
+ "learning_rate": 0.001,
2030
+ "loss": 2.805,
2031
+ "num_input_tokens_seen": 11953766400,
2032
+ "step": 11400
2033
+ },
2034
+ {
2035
+ "epoch": 0.2515111951312265,
2036
+ "grad_norm": 0.13629016280174255,
2037
+ "learning_rate": 0.001,
2038
+ "loss": 2.7954,
2039
+ "num_input_tokens_seen": 12006195200,
2040
+ "step": 11450
2041
+ },
2042
+ {
2043
+ "epoch": 0.2526094972933716,
2044
+ "grad_norm": 0.12940892577171326,
2045
+ "learning_rate": 0.001,
2046
+ "loss": 2.8009,
2047
+ "num_input_tokens_seen": 12058624000,
2048
+ "step": 11500
2049
+ },
2050
+ {
2051
+ "epoch": 0.2526094972933716,
2052
+ "eval_loss": 2.7012581825256348,
2053
+ "eval_runtime": 65.7039,
2054
+ "eval_samples_per_second": 76.099,
2055
+ "eval_steps_per_second": 19.025,
2056
+ "num_input_tokens_seen": 12058624000,
2057
+ "step": 11500
2058
+ },
2059
+ {
2060
+ "epoch": 0.2537077994555167,
2061
+ "grad_norm": 0.15021966397762299,
2062
+ "learning_rate": 0.001,
2063
+ "loss": 2.7963,
2064
+ "num_input_tokens_seen": 12111052800,
2065
+ "step": 11550
2066
+ },
2067
+ {
2068
+ "epoch": 0.2548061016176618,
2069
+ "grad_norm": 0.12381847202777863,
2070
+ "learning_rate": 0.001,
2071
+ "loss": 2.7954,
2072
+ "num_input_tokens_seen": 12163481600,
2073
+ "step": 11600
2074
+ },
2075
+ {
2076
+ "epoch": 0.2559044037798069,
2077
+ "grad_norm": 0.14849607646465302,
2078
+ "learning_rate": 0.001,
2079
+ "loss": 2.7837,
2080
+ "num_input_tokens_seen": 12215910400,
2081
+ "step": 11650
2082
+ },
2083
+ {
2084
+ "epoch": 0.25700270594195196,
2085
+ "grad_norm": 0.1286240816116333,
2086
+ "learning_rate": 0.001,
2087
+ "loss": 2.7999,
2088
+ "num_input_tokens_seen": 12268339200,
2089
+ "step": 11700
2090
+ },
2091
+ {
2092
+ "epoch": 0.2581010081040971,
2093
+ "grad_norm": 0.11861539632081985,
2094
+ "learning_rate": 0.001,
2095
+ "loss": 2.7979,
2096
+ "num_input_tokens_seen": 12320768000,
2097
+ "step": 11750
2098
+ },
2099
+ {
2100
+ "epoch": 0.25919931026624216,
2101
+ "grad_norm": 0.11512617021799088,
2102
+ "learning_rate": 0.001,
2103
+ "loss": 2.7926,
2104
+ "num_input_tokens_seen": 12373196800,
2105
+ "step": 11800
2106
+ },
2107
+ {
2108
+ "epoch": 0.2602976124283873,
2109
+ "grad_norm": 0.13469178974628448,
2110
+ "learning_rate": 0.001,
2111
+ "loss": 2.7881,
2112
+ "num_input_tokens_seen": 12425625600,
2113
+ "step": 11850
2114
+ },
2115
+ {
2116
+ "epoch": 0.26139591459053235,
2117
+ "grad_norm": 0.15504290163516998,
2118
+ "learning_rate": 0.001,
2119
+ "loss": 2.7917,
2120
+ "num_input_tokens_seen": 12478054400,
2121
+ "step": 11900
2122
+ },
2123
+ {
2124
+ "epoch": 0.2624942167526775,
2125
+ "grad_norm": 0.1363905370235443,
2126
+ "learning_rate": 0.001,
2127
+ "loss": 2.7869,
2128
+ "num_input_tokens_seen": 12530483200,
2129
+ "step": 11950
2130
+ },
2131
+ {
2132
+ "epoch": 0.26359251891482255,
2133
+ "grad_norm": 0.11095720529556274,
2134
+ "learning_rate": 0.001,
2135
+ "loss": 2.7883,
2136
+ "num_input_tokens_seen": 12582912000,
2137
+ "step": 12000
2138
+ },
2139
+ {
2140
+ "epoch": 0.26359251891482255,
2141
+ "eval_loss": 2.6911227703094482,
2142
+ "eval_runtime": 65.4928,
2143
+ "eval_samples_per_second": 76.344,
2144
+ "eval_steps_per_second": 19.086,
2145
+ "num_input_tokens_seen": 12582912000,
2146
+ "step": 12000
2147
  }
2148
  ],
2149
  "logging_steps": 50,
2150
  "max_steps": 200000,
2151
+ "num_input_tokens_seen": 12582912000,
2152
  "num_train_epochs": 5,
2153
  "save_steps": 1000,
2154
  "stateful_callbacks": {
 
2163
  "attributes": {}
2164
  }
2165
  },
2166
+ "total_flos": 7.166060591579136e+18,
2167
  "train_batch_size": 64,
2168
  "trial_name": null,
2169
  "trial_params": null