FormlessAI commited on
Commit
ef3bd29
·
verified ·
1 Parent(s): c7e3dfe

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59c72492ae77112d7498629e7110aa4ead00892b60bfde7a53cdde23e0a2a87e
3
  size 1037269336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:099138994a05956d4ad7412e1c77594651af3a2aa230c7317bfb36b46e773219
3
  size 1037269336
last-checkpoint/global_step1400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:931bf2b3aa0cd0103da716b1bb3371bd68e7c5d66027cb953138845658fa82da
3
+ size 781993445
last-checkpoint/global_step1400/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c67ff6fdf0d60a40dcc05f16952bf7c1a50087f648497be91a731f90289ffbd
3
+ size 781993509
last-checkpoint/global_step1400/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75b9548a8e037a9855db805a9fca7c84c919b39c663e5afd1433444e63ac2492
3
+ size 781993509
last-checkpoint/global_step1400/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62824a35bbc11118f6c0b47721acdc04b73827ae3edb9e1e83c147a91353174e
3
+ size 781993509
last-checkpoint/global_step1400/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:541ccf4a60928b2c17f8cd479db9a00a3ccc251fe7d9ce5b0b2b5585832613f5
3
+ size 2610290277
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step1250
 
1
+ global_step1400
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed87dcb091d4ef7e28c34173b3e5e817c8a65a26c060e643a15f114db3b0387e
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0659f7df65a8d4f022538c1db1324bb83c98939fba11457f135a834e4fc8b08d
3
  size 15429
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f34f165e5ce4e6a030cf3446153db3218902f01675bb6ef508a5d91da25fb4b4
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0774f2829e7ea47ef6e28c2b3b1640314596ceb8b0712423ec369fe44281c840
3
  size 15429
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa19ff1a67f27b22564aa2ddebd6a615ac92d0b0794aa763662b482303827931
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88f44ee23c1ea07ac60d274eaef197906028fd3d21288357504503497316897e
3
  size 15429
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fff90a5aceb3cd4a5999415d57df5f60aeb2a804a347e1c874416d7c196e1499
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a15b31e0a15dc06b8a90dc969f3b213dbad5faacc1b3e26d1dd6b1716d9b3394
3
  size 15429
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d49af4b0761a2e15e6280ed21708d43c8b8fb5531bab12134da87b28369ed4b
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1937275837d80b853c7dc3d5d6eec94618d6af1bb9c3bc4f9035a475fc209b5a
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "best_global_step": null,
3
- "best_metric": 2.1646382808685303,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.1817124582061346,
6
  "eval_steps": 50,
7
- "global_step": 1250,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -1958,6 +1958,240 @@
1958
  "eval_samples_per_second": 175.426,
1959
  "eval_steps_per_second": 11.001,
1960
  "step": 1250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1961
  }
1962
  ],
1963
  "logging_steps": 5,
@@ -1986,7 +2220,7 @@
1986
  "attributes": {}
1987
  }
1988
  },
1989
- "total_flos": 3.259402418923766e+17,
1990
  "train_batch_size": 4,
1991
  "trial_name": null,
1992
  "trial_params": null
 
1
  {
2
  "best_global_step": null,
3
+ "best_metric": 2.156383514404297,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.20351795319087077,
6
  "eval_steps": 50,
7
+ "global_step": 1400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
1958
  "eval_samples_per_second": 175.426,
1959
  "eval_steps_per_second": 11.001,
1960
  "step": 1250
1961
+ },
1962
+ {
1963
+ "epoch": 0.18243930803895916,
1964
+ "grad_norm": 2.6892552375793457,
1965
+ "learning_rate": 9.687491790245934e-05,
1966
+ "loss": 2.1997,
1967
+ "step": 1255
1968
+ },
1969
+ {
1970
+ "epoch": 0.1831661578717837,
1971
+ "grad_norm": 3.251024007797241,
1972
+ "learning_rate": 9.684639619477183e-05,
1973
+ "loss": 2.2394,
1974
+ "step": 1260
1975
+ },
1976
+ {
1977
+ "epoch": 0.18389300770460823,
1978
+ "grad_norm": 2.727262496948242,
1979
+ "learning_rate": 9.6817756798978e-05,
1980
+ "loss": 2.3058,
1981
+ "step": 1265
1982
+ },
1983
+ {
1984
+ "epoch": 0.18461985753743276,
1985
+ "grad_norm": 2.5505638122558594,
1986
+ "learning_rate": 9.678899978717747e-05,
1987
+ "loss": 2.3825,
1988
+ "step": 1270
1989
+ },
1990
+ {
1991
+ "epoch": 0.1853467073702573,
1992
+ "grad_norm": 2.5758070945739746,
1993
+ "learning_rate": 9.676012523176601e-05,
1994
+ "loss": 2.1274,
1995
+ "step": 1275
1996
+ },
1997
+ {
1998
+ "epoch": 0.18607355720308183,
1999
+ "grad_norm": 2.4207687377929688,
2000
+ "learning_rate": 9.67311332054353e-05,
2001
+ "loss": 2.2879,
2002
+ "step": 1280
2003
+ },
2004
+ {
2005
+ "epoch": 0.1868004070359064,
2006
+ "grad_norm": 2.7668607234954834,
2007
+ "learning_rate": 9.670202378117268e-05,
2008
+ "loss": 2.3968,
2009
+ "step": 1285
2010
+ },
2011
+ {
2012
+ "epoch": 0.18752725686873092,
2013
+ "grad_norm": 2.7199885845184326,
2014
+ "learning_rate": 9.667279703226111e-05,
2015
+ "loss": 2.3093,
2016
+ "step": 1290
2017
+ },
2018
+ {
2019
+ "epoch": 0.18825410670155546,
2020
+ "grad_norm": 2.2902231216430664,
2021
+ "learning_rate": 9.664345303227893e-05,
2022
+ "loss": 2.2313,
2023
+ "step": 1295
2024
+ },
2025
+ {
2026
+ "epoch": 0.18898095653438,
2027
+ "grad_norm": 2.6537952423095703,
2028
+ "learning_rate": 9.661399185509959e-05,
2029
+ "loss": 2.2216,
2030
+ "step": 1300
2031
+ },
2032
+ {
2033
+ "epoch": 0.18898095653438,
2034
+ "eval_loss": 2.1652886867523193,
2035
+ "eval_runtime": 22.3841,
2036
+ "eval_samples_per_second": 147.471,
2037
+ "eval_steps_per_second": 9.248,
2038
+ "step": 1300
2039
+ },
2040
+ {
2041
+ "epoch": 0.18970780636720452,
2042
+ "grad_norm": 2.699932336807251,
2043
+ "learning_rate": 9.658441357489157e-05,
2044
+ "loss": 2.2718,
2045
+ "step": 1305
2046
+ },
2047
+ {
2048
+ "epoch": 0.19043465620002908,
2049
+ "grad_norm": 2.2263143062591553,
2050
+ "learning_rate": 9.655471826611817e-05,
2051
+ "loss": 2.17,
2052
+ "step": 1310
2053
+ },
2054
+ {
2055
+ "epoch": 0.19116150603285362,
2056
+ "grad_norm": 2.5049476623535156,
2057
+ "learning_rate": 9.652490600353728e-05,
2058
+ "loss": 2.2691,
2059
+ "step": 1315
2060
+ },
2061
+ {
2062
+ "epoch": 0.19188835586567815,
2063
+ "grad_norm": 2.3425512313842773,
2064
+ "learning_rate": 9.649497686220124e-05,
2065
+ "loss": 2.1081,
2066
+ "step": 1320
2067
+ },
2068
+ {
2069
+ "epoch": 0.19261520569850268,
2070
+ "grad_norm": 2.410156726837158,
2071
+ "learning_rate": 9.646493091745662e-05,
2072
+ "loss": 2.1548,
2073
+ "step": 1325
2074
+ },
2075
+ {
2076
+ "epoch": 0.19334205553132722,
2077
+ "grad_norm": 2.3801674842834473,
2078
+ "learning_rate": 9.643476824494408e-05,
2079
+ "loss": 2.3055,
2080
+ "step": 1330
2081
+ },
2082
+ {
2083
+ "epoch": 0.19406890536415178,
2084
+ "grad_norm": 2.3507750034332275,
2085
+ "learning_rate": 9.640448892059808e-05,
2086
+ "loss": 2.2072,
2087
+ "step": 1335
2088
+ },
2089
+ {
2090
+ "epoch": 0.1947957551969763,
2091
+ "grad_norm": 2.3552613258361816,
2092
+ "learning_rate": 9.63740930206468e-05,
2093
+ "loss": 2.1899,
2094
+ "step": 1340
2095
+ },
2096
+ {
2097
+ "epoch": 0.19552260502980084,
2098
+ "grad_norm": 2.4262423515319824,
2099
+ "learning_rate": 9.634358062161187e-05,
2100
+ "loss": 2.059,
2101
+ "step": 1345
2102
+ },
2103
+ {
2104
+ "epoch": 0.19624945486262538,
2105
+ "grad_norm": 2.285623073577881,
2106
+ "learning_rate": 9.631295180030823e-05,
2107
+ "loss": 2.2887,
2108
+ "step": 1350
2109
+ },
2110
+ {
2111
+ "epoch": 0.19624945486262538,
2112
+ "eval_loss": 2.168858051300049,
2113
+ "eval_runtime": 18.8242,
2114
+ "eval_samples_per_second": 175.359,
2115
+ "eval_steps_per_second": 10.996,
2116
+ "step": 1350
2117
+ },
2118
+ {
2119
+ "epoch": 0.1969763046954499,
2120
+ "grad_norm": 2.717876434326172,
2121
+ "learning_rate": 9.628220663384389e-05,
2122
+ "loss": 2.2158,
2123
+ "step": 1355
2124
+ },
2125
+ {
2126
+ "epoch": 0.19770315452827447,
2127
+ "grad_norm": 2.481752872467041,
2128
+ "learning_rate": 9.62513451996198e-05,
2129
+ "loss": 2.1252,
2130
+ "step": 1360
2131
+ },
2132
+ {
2133
+ "epoch": 0.198430004361099,
2134
+ "grad_norm": 2.680485725402832,
2135
+ "learning_rate": 9.622036757532952e-05,
2136
+ "loss": 2.3132,
2137
+ "step": 1365
2138
+ },
2139
+ {
2140
+ "epoch": 0.19915685419392354,
2141
+ "grad_norm": 2.589087724685669,
2142
+ "learning_rate": 9.618927383895924e-05,
2143
+ "loss": 2.3242,
2144
+ "step": 1370
2145
+ },
2146
+ {
2147
+ "epoch": 0.19988370402674807,
2148
+ "grad_norm": 2.7027056217193604,
2149
+ "learning_rate": 9.615806406878738e-05,
2150
+ "loss": 2.228,
2151
+ "step": 1375
2152
+ },
2153
+ {
2154
+ "epoch": 0.2006105538595726,
2155
+ "grad_norm": 2.358421564102173,
2156
+ "learning_rate": 9.612673834338451e-05,
2157
+ "loss": 2.2923,
2158
+ "step": 1380
2159
+ },
2160
+ {
2161
+ "epoch": 0.20133740369239714,
2162
+ "grad_norm": 2.9651687145233154,
2163
+ "learning_rate": 9.609529674161311e-05,
2164
+ "loss": 2.2235,
2165
+ "step": 1385
2166
+ },
2167
+ {
2168
+ "epoch": 0.2020642535252217,
2169
+ "grad_norm": 2.6667802333831787,
2170
+ "learning_rate": 9.606373934262737e-05,
2171
+ "loss": 2.2205,
2172
+ "step": 1390
2173
+ },
2174
+ {
2175
+ "epoch": 0.20279110335804623,
2176
+ "grad_norm": 2.5625758171081543,
2177
+ "learning_rate": 9.603206622587299e-05,
2178
+ "loss": 2.2178,
2179
+ "step": 1395
2180
+ },
2181
+ {
2182
+ "epoch": 0.20351795319087077,
2183
+ "grad_norm": 2.4167873859405518,
2184
+ "learning_rate": 9.6000277471087e-05,
2185
+ "loss": 2.1721,
2186
+ "step": 1400
2187
+ },
2188
+ {
2189
+ "epoch": 0.20351795319087077,
2190
+ "eval_loss": 2.156383514404297,
2191
+ "eval_runtime": 18.7682,
2192
+ "eval_samples_per_second": 175.883,
2193
+ "eval_steps_per_second": 11.029,
2194
+ "step": 1400
2195
  }
2196
  ],
2197
  "logging_steps": 5,
 
2220
  "attributes": {}
2221
  }
2222
  },
2223
+ "total_flos": 3.654657124471931e+17,
2224
  "train_batch_size": 4,
2225
  "trial_name": null,
2226
  "trial_params": null