aghatage commited on
Commit
5cdd7fa
·
verified ·
1 Parent(s): 8e263df

Training in progress, step 5500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81eac720b158c7f43a3b9b48f3c680e3548bab4820189790d8de2f257ac92036
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e175cfbb3b1e5047d2a07a1f65e6011d48b21f6ac86f4b54bb7a003b3e25ddd9
3
  size 12017472
last-checkpoint/global_step5500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1260828109bb51d4ec084d9edc433ecc1fd867fbe5d156a6c5ea7d7b2fe0528
3
+ size 71982309
last-checkpoint/global_step5500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ec0cdef2fdf8552c3a2888f93dae3f9ed787dcab7d4e98334b4826387d6456a
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step5000
 
1
+ global_step5500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3dd594d08139e0846701d4c186ee22eb3ed05631cdda05ef04a8843616048835
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c75d63279b47e795ad4622a2e3404a0983cd22c3e120a053ebabf9c78e50af21
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 5000,
3
- "best_metric": 0.5900602340698242,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-5000",
5
- "epoch": 3.6340665333575712,
6
  "eval_steps": 250,
7
- "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2017,6 +2017,206 @@
2017
  "eval_samples_per_second": 43.688,
2018
  "eval_steps_per_second": 5.468,
2019
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2020
  }
2021
  ],
2022
  "logging_steps": 25,
@@ -2036,7 +2236,7 @@
2036
  "attributes": {}
2037
  }
2038
  },
2039
- "total_flos": 2.7767572610102067e+17,
2040
  "train_batch_size": 4,
2041
  "trial_name": null,
2042
  "trial_params": null
 
1
  {
2
+ "best_global_step": 5500,
3
+ "best_metric": 0.5829094648361206,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-5500",
5
+ "epoch": 3.997636793310307,
6
  "eval_steps": 250,
7
+ "global_step": 5500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2017
  "eval_samples_per_second": 43.688,
2018
  "eval_steps_per_second": 5.468,
2019
  "step": 5000
2020
+ },
2021
+ {
2022
+ "epoch": 3.6522450463552083,
2023
+ "grad_norm": 0.7423418760299683,
2024
+ "learning_rate": 6.350273695266381e-05,
2025
+ "loss": 0.5796,
2026
+ "mean_token_accuracy": 0.82060361713171,
2027
+ "num_tokens": 110631489.0,
2028
+ "step": 5025
2029
+ },
2030
+ {
2031
+ "epoch": 3.670423559352845,
2032
+ "grad_norm": 0.8051418662071228,
2033
+ "learning_rate": 6.334757586954799e-05,
2034
+ "loss": 0.5858,
2035
+ "mean_token_accuracy": 0.818088726401329,
2036
+ "num_tokens": 111178589.0,
2037
+ "step": 5050
2038
+ },
2039
+ {
2040
+ "epoch": 3.6886020723504815,
2041
+ "grad_norm": 0.8583251237869263,
2042
+ "learning_rate": 6.319188010178665e-05,
2043
+ "loss": 0.5859,
2044
+ "mean_token_accuracy": 0.818018836081028,
2045
+ "num_tokens": 111748170.0,
2046
+ "step": 5075
2047
+ },
2048
+ {
2049
+ "epoch": 3.7067805853481186,
2050
+ "grad_norm": 0.719312846660614,
2051
+ "learning_rate": 6.303565321498068e-05,
2052
+ "loss": 0.585,
2053
+ "mean_token_accuracy": 0.8183588898181915,
2054
+ "num_tokens": 112291726.0,
2055
+ "step": 5100
2056
+ },
2057
+ {
2058
+ "epoch": 3.7249590983457552,
2059
+ "grad_norm": 0.768618106842041,
2060
+ "learning_rate": 6.287889878689412e-05,
2061
+ "loss": 0.5833,
2062
+ "mean_token_accuracy": 0.8190835517644882,
2063
+ "num_tokens": 112838799.0,
2064
+ "step": 5125
2065
+ },
2066
+ {
2067
+ "epoch": 3.743137611343392,
2068
+ "grad_norm": 0.8095366358757019,
2069
+ "learning_rate": 6.272162040737227e-05,
2070
+ "loss": 0.586,
2071
+ "mean_token_accuracy": 0.8185149121284485,
2072
+ "num_tokens": 113391677.0,
2073
+ "step": 5150
2074
+ },
2075
+ {
2076
+ "epoch": 3.761316124341029,
2077
+ "grad_norm": 0.762540340423584,
2078
+ "learning_rate": 6.256382167825952e-05,
2079
+ "loss": 0.5816,
2080
+ "mean_token_accuracy": 0.8203315672278404,
2081
+ "num_tokens": 113936197.0,
2082
+ "step": 5175
2083
+ },
2084
+ {
2085
+ "epoch": 3.779494637338666,
2086
+ "grad_norm": 0.7430989146232605,
2087
+ "learning_rate": 6.240550621331678e-05,
2088
+ "loss": 0.5821,
2089
+ "mean_token_accuracy": 0.8193337711691856,
2090
+ "num_tokens": 114493484.0,
2091
+ "step": 5200
2092
+ },
2093
+ {
2094
+ "epoch": 3.7976731503363026,
2095
+ "grad_norm": 0.7766333222389221,
2096
+ "learning_rate": 6.224667763813883e-05,
2097
+ "loss": 0.5833,
2098
+ "mean_token_accuracy": 0.8199924173951149,
2099
+ "num_tokens": 115036265.0,
2100
+ "step": 5225
2101
+ },
2102
+ {
2103
+ "epoch": 3.815851663333939,
2104
+ "grad_norm": 0.7837154269218445,
2105
+ "learning_rate": 6.208733959007113e-05,
2106
+ "loss": 0.5743,
2107
+ "mean_token_accuracy": 0.8223940685391427,
2108
+ "num_tokens": 115580134.0,
2109
+ "step": 5250
2110
+ },
2111
+ {
2112
+ "epoch": 3.815851663333939,
2113
+ "eval_loss": 0.5864232778549194,
2114
+ "eval_mean_token_accuracy": 0.8171391937078214,
2115
+ "eval_num_tokens": 115580134.0,
2116
+ "eval_runtime": 112.2751,
2117
+ "eval_samples_per_second": 43.554,
2118
+ "eval_steps_per_second": 5.451,
2119
+ "step": 5250
2120
+ },
2121
+ {
2122
+ "epoch": 3.8340301763315763,
2123
+ "grad_norm": 0.7619733810424805,
2124
+ "learning_rate": 6.19274957181267e-05,
2125
+ "loss": 0.5814,
2126
+ "mean_token_accuracy": 0.8204685914516449,
2127
+ "num_tokens": 116138332.0,
2128
+ "step": 5275
2129
+ },
2130
+ {
2131
+ "epoch": 3.852208689329213,
2132
+ "grad_norm": 0.7965566515922546,
2133
+ "learning_rate": 6.176714968290246e-05,
2134
+ "loss": 0.589,
2135
+ "mean_token_accuracy": 0.8183123478293419,
2136
+ "num_tokens": 116694993.0,
2137
+ "step": 5300
2138
+ },
2139
+ {
2140
+ "epoch": 3.8703872023268495,
2141
+ "grad_norm": 0.7417636513710022,
2142
+ "learning_rate": 6.160630515649538e-05,
2143
+ "loss": 0.5843,
2144
+ "mean_token_accuracy": 0.8171842768788338,
2145
+ "num_tokens": 117258648.0,
2146
+ "step": 5325
2147
+ },
2148
+ {
2149
+ "epoch": 3.8885657153244866,
2150
+ "grad_norm": 0.7932421565055847,
2151
+ "learning_rate": 6.144496582241842e-05,
2152
+ "loss": 0.5811,
2153
+ "mean_token_accuracy": 0.819763533771038,
2154
+ "num_tokens": 117811709.0,
2155
+ "step": 5350
2156
+ },
2157
+ {
2158
+ "epoch": 3.906744228322123,
2159
+ "grad_norm": 0.7742260694503784,
2160
+ "learning_rate": 6.128313537551622e-05,
2161
+ "loss": 0.583,
2162
+ "mean_token_accuracy": 0.8193339914083481,
2163
+ "num_tokens": 118368306.0,
2164
+ "step": 5375
2165
+ },
2166
+ {
2167
+ "epoch": 3.92492274131976,
2168
+ "grad_norm": 0.7565304040908813,
2169
+ "learning_rate": 6.112081752188036e-05,
2170
+ "loss": 0.5913,
2171
+ "mean_token_accuracy": 0.816694650053978,
2172
+ "num_tokens": 118923425.0,
2173
+ "step": 5400
2174
+ },
2175
+ {
2176
+ "epoch": 3.943101254317397,
2177
+ "grad_norm": 0.733914315700531,
2178
+ "learning_rate": 6.09580159787646e-05,
2179
+ "loss": 0.5797,
2180
+ "mean_token_accuracy": 0.8201961496472359,
2181
+ "num_tokens": 119479832.0,
2182
+ "step": 5425
2183
+ },
2184
+ {
2185
+ "epoch": 3.9612797673150335,
2186
+ "grad_norm": 0.800318717956543,
2187
+ "learning_rate": 6.07947344744997e-05,
2188
+ "loss": 0.5798,
2189
+ "mean_token_accuracy": 0.8196702027320861,
2190
+ "num_tokens": 120025792.0,
2191
+ "step": 5450
2192
+ },
2193
+ {
2194
+ "epoch": 3.9794582803126706,
2195
+ "grad_norm": 0.7655355334281921,
2196
+ "learning_rate": 6.0630976748408074e-05,
2197
+ "loss": 0.5763,
2198
+ "mean_token_accuracy": 0.8210220813751221,
2199
+ "num_tokens": 120587031.0,
2200
+ "step": 5475
2201
+ },
2202
+ {
2203
+ "epoch": 3.997636793310307,
2204
+ "grad_norm": 0.8047225475311279,
2205
+ "learning_rate": 6.046674655071809e-05,
2206
+ "loss": 0.573,
2207
+ "mean_token_accuracy": 0.8227437067031861,
2208
+ "num_tokens": 121138768.0,
2209
+ "step": 5500
2210
+ },
2211
+ {
2212
+ "epoch": 3.997636793310307,
2213
+ "eval_loss": 0.5829094648361206,
2214
+ "eval_mean_token_accuracy": 0.8182347503557704,
2215
+ "eval_num_tokens": 121138768.0,
2216
+ "eval_runtime": 112.4332,
2217
+ "eval_samples_per_second": 43.492,
2218
+ "eval_steps_per_second": 5.443,
2219
+ "step": 5500
2220
  }
2221
  ],
2222
  "logging_steps": 25,
 
2236
  "attributes": {}
2237
  }
2238
  },
2239
+ "total_flos": 3.0548540768282214e+17,
2240
  "train_batch_size": 4,
2241
  "trial_name": null,
2242
  "trial_params": null