Training in progress, step 6000
Browse files- model.safetensors +1 -1
- training_log.txt +200 -0
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16060556616
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:635e738f919678189191de44fd4b70faf93107154227cb6e9c11c84668e539a1
|
| 3 |
size 16060556616
|
training_log.txt
CHANGED
|
@@ -204,3 +204,203 @@ Training started at: 2026-04-04 09:18:44
|
|
| 204 |
[2026-04-04 14:03:05] Step 3990: loss: 0.8577, grad_norm: 0.2070, learning_rate: 0.0000, epoch: 0.0199
|
| 205 |
[2026-04-04 14:04:30] Step 4000: loss: 0.8141, grad_norm: 0.1709, learning_rate: 0.0000, epoch: 0.0200
|
| 206 |
[2026-04-04 14:07:12] Step 4010: loss: 0.8507, grad_norm: 0.1602, learning_rate: 0.0000, epoch: 0.0200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
[2026-04-04 14:03:05] Step 3990: loss: 0.8577, grad_norm: 0.2070, learning_rate: 0.0000, epoch: 0.0199
|
| 205 |
[2026-04-04 14:04:30] Step 4000: loss: 0.8141, grad_norm: 0.1709, learning_rate: 0.0000, epoch: 0.0200
|
| 206 |
[2026-04-04 14:07:12] Step 4010: loss: 0.8507, grad_norm: 0.1602, learning_rate: 0.0000, epoch: 0.0200
|
| 207 |
+
[2026-04-04 14:08:37] Step 4020: loss: 0.8619, grad_norm: 0.1729, learning_rate: 0.0000, epoch: 0.0201
|
| 208 |
+
[2026-04-04 14:10:01] Step 4030: loss: 0.8497, grad_norm: 0.1514, learning_rate: 0.0000, epoch: 0.0202
|
| 209 |
+
[2026-04-04 14:11:26] Step 4040: loss: 0.8421, grad_norm: 0.1729, learning_rate: 0.0000, epoch: 0.0202
|
| 210 |
+
[2026-04-04 14:12:50] Step 4050: loss: 0.8204, grad_norm: 0.2129, learning_rate: 0.0000, epoch: 0.0203
|
| 211 |
+
[2026-04-04 14:14:14] Step 4060: loss: 0.8450, grad_norm: 0.2236, learning_rate: 0.0000, epoch: 0.0203
|
| 212 |
+
[2026-04-04 14:15:39] Step 4070: loss: 0.7626, grad_norm: 0.1138, learning_rate: 0.0000, epoch: 0.0204
|
| 213 |
+
[2026-04-04 14:17:03] Step 4080: loss: 0.6547, grad_norm: 0.1084, learning_rate: 0.0000, epoch: 0.0204
|
| 214 |
+
[2026-04-04 14:18:28] Step 4090: loss: 0.9119, grad_norm: 0.2070, learning_rate: 0.0000, epoch: 0.0204
|
| 215 |
+
[2026-04-04 14:19:52] Step 4100: loss: 0.7530, grad_norm: 0.2500, learning_rate: 0.0000, epoch: 0.0205
|
| 216 |
+
[2026-04-04 14:21:17] Step 4110: loss: 0.8678, grad_norm: 0.1592, learning_rate: 0.0000, epoch: 0.0205
|
| 217 |
+
[2026-04-04 14:22:41] Step 4120: loss: 0.8312, grad_norm: 0.1982, learning_rate: 0.0000, epoch: 0.0206
|
| 218 |
+
[2026-04-04 14:24:06] Step 4130: loss: 0.8719, grad_norm: 0.4414, learning_rate: 0.0000, epoch: 0.0207
|
| 219 |
+
[2026-04-04 14:25:30] Step 4140: loss: 0.8141, grad_norm: 0.2598, learning_rate: 0.0000, epoch: 0.0207
|
| 220 |
+
[2026-04-04 14:26:55] Step 4150: loss: 0.8780, grad_norm: 0.2324, learning_rate: 0.0000, epoch: 0.0208
|
| 221 |
+
[2026-04-04 14:28:19] Step 4160: loss: 0.8864, grad_norm: 0.1514, learning_rate: 0.0000, epoch: 0.0208
|
| 222 |
+
[2026-04-04 14:29:44] Step 4170: loss: 0.9051, grad_norm: 0.1611, learning_rate: 0.0000, epoch: 0.0209
|
| 223 |
+
[2026-04-04 14:31:08] Step 4180: loss: 0.8238, grad_norm: 0.2676, learning_rate: 0.0000, epoch: 0.0209
|
| 224 |
+
[2026-04-04 14:32:32] Step 4190: loss: 0.9161, grad_norm: 0.1904, learning_rate: 0.0000, epoch: 0.0209
|
| 225 |
+
[2026-04-04 14:33:56] Step 4200: loss: 0.9077, grad_norm: 0.2158, learning_rate: 0.0000, epoch: 0.0210
|
| 226 |
+
[2026-04-04 14:35:21] Step 4210: loss: 0.8494, grad_norm: 0.1465, learning_rate: 0.0000, epoch: 0.0210
|
| 227 |
+
[2026-04-04 14:36:45] Step 4220: loss: 0.9484, grad_norm: 0.1855, learning_rate: 0.0000, epoch: 0.0211
|
| 228 |
+
[2026-04-04 14:38:09] Step 4230: loss: 0.8625, grad_norm: 0.2236, learning_rate: 0.0000, epoch: 0.0211
|
| 229 |
+
[2026-04-04 14:39:33] Step 4240: loss: 0.8828, grad_norm: 0.2031, learning_rate: 0.0000, epoch: 0.0212
|
| 230 |
+
[2026-04-04 14:40:58] Step 4250: loss: 0.8607, grad_norm: 0.1572, learning_rate: 0.0000, epoch: 0.0213
|
| 231 |
+
[2026-04-04 14:42:22] Step 4260: loss: 0.9015, grad_norm: 0.2129, learning_rate: 0.0000, epoch: 0.0213
|
| 232 |
+
[2026-04-04 14:43:46] Step 4270: loss: 0.9145, grad_norm: 0.1748, learning_rate: 0.0000, epoch: 0.0214
|
| 233 |
+
[2026-04-04 14:45:10] Step 4280: loss: 0.8502, grad_norm: 0.1914, learning_rate: 0.0000, epoch: 0.0214
|
| 234 |
+
[2026-04-04 14:46:34] Step 4290: loss: 0.8569, grad_norm: 0.2432, learning_rate: 0.0000, epoch: 0.0215
|
| 235 |
+
[2026-04-04 14:47:58] Step 4300: loss: 0.8992, grad_norm: 0.1865, learning_rate: 0.0000, epoch: 0.0215
|
| 236 |
+
[2026-04-04 14:49:23] Step 4310: loss: 0.9081, grad_norm: 0.1641, learning_rate: 0.0000, epoch: 0.0215
|
| 237 |
+
[2026-04-04 14:50:47] Step 4320: loss: 0.9125, grad_norm: 0.2012, learning_rate: 0.0000, epoch: 0.0216
|
| 238 |
+
[2026-04-04 14:52:11] Step 4330: loss: 0.8991, grad_norm: 0.2295, learning_rate: 0.0000, epoch: 0.0216
|
| 239 |
+
[2026-04-04 14:53:35] Step 4340: loss: 0.9418, grad_norm: 0.1611, learning_rate: 0.0000, epoch: 0.0217
|
| 240 |
+
[2026-04-04 14:55:00] Step 4350: loss: 1.0309, grad_norm: 0.6445, learning_rate: 0.0000, epoch: 0.0217
|
| 241 |
+
[2026-04-04 14:56:24] Step 4360: loss: 0.8411, grad_norm: 0.1279, learning_rate: 0.0000, epoch: 0.0218
|
| 242 |
+
[2026-04-04 14:57:49] Step 4370: loss: 0.9025, grad_norm: 0.2930, learning_rate: 0.0000, epoch: 0.0219
|
| 243 |
+
[2026-04-04 14:59:13] Step 4380: loss: 0.8707, grad_norm: 0.1768, learning_rate: 0.0000, epoch: 0.0219
|
| 244 |
+
[2026-04-04 15:00:37] Step 4390: loss: 0.8823, grad_norm: 0.2197, learning_rate: 0.0000, epoch: 0.0220
|
| 245 |
+
[2026-04-04 15:02:02] Step 4400: loss: 0.9306, grad_norm: 0.1885, learning_rate: 0.0000, epoch: 0.0220
|
| 246 |
+
[2026-04-04 15:03:26] Step 4410: loss: 0.8781, grad_norm: 0.2041, learning_rate: 0.0000, epoch: 0.0221
|
| 247 |
+
[2026-04-04 15:04:50] Step 4420: loss: 0.7951, grad_norm: 0.1895, learning_rate: 0.0000, epoch: 0.0221
|
| 248 |
+
[2026-04-04 15:06:14] Step 4430: loss: 0.8375, grad_norm: 0.2324, learning_rate: 0.0000, epoch: 0.0221
|
| 249 |
+
[2026-04-04 15:07:38] Step 4440: loss: 0.8831, grad_norm: 0.2949, learning_rate: 0.0000, epoch: 0.0222
|
| 250 |
+
[2026-04-04 15:09:02] Step 4450: loss: 0.8990, grad_norm: 0.2275, learning_rate: 0.0000, epoch: 0.0222
|
| 251 |
+
[2026-04-04 15:10:26] Step 4460: loss: 0.8656, grad_norm: 0.1650, learning_rate: 0.0000, epoch: 0.0223
|
| 252 |
+
[2026-04-04 15:11:51] Step 4470: loss: 0.8416, grad_norm: 0.1777, learning_rate: 0.0000, epoch: 0.0223
|
| 253 |
+
[2026-04-04 15:13:15] Step 4480: loss: 0.9821, grad_norm: 0.1807, learning_rate: 0.0000, epoch: 0.0224
|
| 254 |
+
[2026-04-04 15:14:39] Step 4490: loss: 0.8345, grad_norm: 0.2490, learning_rate: 0.0000, epoch: 0.0225
|
| 255 |
+
[2026-04-04 15:16:03] Step 4500: loss: 0.7619, grad_norm: 0.2246, learning_rate: 0.0000, epoch: 0.0225
|
| 256 |
+
[2026-04-04 15:17:27] Step 4510: loss: 0.9103, grad_norm: 0.1846, learning_rate: 0.0000, epoch: 0.0226
|
| 257 |
+
[2026-04-04 15:18:51] Step 4520: loss: 0.8421, grad_norm: 0.2129, learning_rate: 0.0000, epoch: 0.0226
|
| 258 |
+
[2026-04-04 15:20:16] Step 4530: loss: 0.9850, grad_norm: 0.1963, learning_rate: 0.0000, epoch: 0.0226
|
| 259 |
+
[2026-04-04 15:21:40] Step 4540: loss: 0.8908, grad_norm: 0.2139, learning_rate: 0.0000, epoch: 0.0227
|
| 260 |
+
[2026-04-04 15:23:04] Step 4550: loss: 0.8702, grad_norm: 0.2061, learning_rate: 0.0000, epoch: 0.0227
|
| 261 |
+
[2026-04-04 15:24:28] Step 4560: loss: 0.9466, grad_norm: 0.1914, learning_rate: 0.0000, epoch: 0.0228
|
| 262 |
+
[2026-04-04 15:25:53] Step 4570: loss: 0.8640, grad_norm: 0.1709, learning_rate: 0.0000, epoch: 0.0228
|
| 263 |
+
[2026-04-04 15:27:17] Step 4580: loss: 1.0906, grad_norm: 0.2451, learning_rate: 0.0000, epoch: 0.0229
|
| 264 |
+
[2026-04-04 15:28:41] Step 4590: loss: 0.8248, grad_norm: 0.1689, learning_rate: 0.0000, epoch: 0.0230
|
| 265 |
+
[2026-04-04 15:30:05] Step 4600: loss: 0.9053, grad_norm: 0.4121, learning_rate: 0.0000, epoch: 0.0230
|
| 266 |
+
[2026-04-04 15:31:30] Step 4610: loss: 0.9402, grad_norm: 0.2158, learning_rate: 0.0000, epoch: 0.0231
|
| 267 |
+
[2026-04-04 15:32:54] Step 4620: loss: 0.9434, grad_norm: 0.1650, learning_rate: 0.0000, epoch: 0.0231
|
| 268 |
+
[2026-04-04 15:34:18] Step 4630: loss: 0.9944, grad_norm: 0.1953, learning_rate: 0.0000, epoch: 0.0232
|
| 269 |
+
[2026-04-04 15:35:42] Step 4640: loss: 0.9170, grad_norm: 0.2031, learning_rate: 0.0000, epoch: 0.0232
|
| 270 |
+
[2026-04-04 15:37:06] Step 4650: loss: 0.9447, grad_norm: 888.0000, learning_rate: 0.0000, epoch: 0.0232
|
| 271 |
+
[2026-04-04 15:38:30] Step 4660: loss: 0.8551, grad_norm: 0.2402, learning_rate: 0.0000, epoch: 0.0233
|
| 272 |
+
[2026-04-04 15:39:54] Step 4670: loss: 0.8951, grad_norm: 0.2119, learning_rate: 0.0000, epoch: 0.0233
|
| 273 |
+
[2026-04-04 15:41:19] Step 4680: loss: 0.8970, grad_norm: 0.1885, learning_rate: 0.0000, epoch: 0.0234
|
| 274 |
+
[2026-04-04 15:42:43] Step 4690: loss: 1.4014, grad_norm: 0.2158, learning_rate: 0.0000, epoch: 0.0234
|
| 275 |
+
[2026-04-04 15:44:07] Step 4700: loss: 0.8219, grad_norm: 0.1777, learning_rate: 0.0000, epoch: 0.0235
|
| 276 |
+
[2026-04-04 15:45:31] Step 4710: loss: 0.9223, grad_norm: 0.1982, learning_rate: 0.0000, epoch: 0.0236
|
| 277 |
+
[2026-04-04 15:46:55] Step 4720: loss: 0.8339, grad_norm: 0.2715, learning_rate: 0.0000, epoch: 0.0236
|
| 278 |
+
[2026-04-04 15:48:19] Step 4730: loss: 0.8905, grad_norm: 0.2305, learning_rate: 0.0000, epoch: 0.0237
|
| 279 |
+
[2026-04-04 15:49:44] Step 4740: loss: 0.8911, grad_norm: 0.3223, learning_rate: 0.0000, epoch: 0.0237
|
| 280 |
+
[2026-04-04 15:51:07] Step 4750: loss: 0.8771, grad_norm: 0.2910, learning_rate: 0.0000, epoch: 0.0238
|
| 281 |
+
[2026-04-04 15:52:31] Step 4760: loss: 0.9284, grad_norm: 0.1641, learning_rate: 0.0000, epoch: 0.0238
|
| 282 |
+
[2026-04-04 15:53:55] Step 4770: loss: 0.9055, grad_norm: 0.2451, learning_rate: 0.0000, epoch: 0.0238
|
| 283 |
+
[2026-04-04 15:55:20] Step 4780: loss: 0.9106, grad_norm: 0.3027, learning_rate: 0.0000, epoch: 0.0239
|
| 284 |
+
[2026-04-04 15:56:44] Step 4790: loss: 0.8667, grad_norm: 0.1611, learning_rate: 0.0000, epoch: 0.0239
|
| 285 |
+
[2026-04-04 15:58:09] Step 4800: loss: 0.9344, grad_norm: 0.1729, learning_rate: 0.0000, epoch: 0.0240
|
| 286 |
+
[2026-04-04 15:59:33] Step 4810: loss: 0.9068, grad_norm: 0.1748, learning_rate: 0.0000, epoch: 0.0240
|
| 287 |
+
[2026-04-04 16:00:57] Step 4820: loss: 0.9075, grad_norm: 0.1465, learning_rate: 0.0000, epoch: 0.0241
|
| 288 |
+
[2026-04-04 16:02:22] Step 4830: loss: 0.8334, grad_norm: 0.1621, learning_rate: 0.0000, epoch: 0.0242
|
| 289 |
+
[2026-04-04 16:03:46] Step 4840: loss: 0.8115, grad_norm: 0.4375, learning_rate: 0.0000, epoch: 0.0242
|
| 290 |
+
[2026-04-04 16:05:10] Step 4850: loss: 0.8499, grad_norm: 0.2188, learning_rate: 0.0000, epoch: 0.0243
|
| 291 |
+
[2026-04-04 16:06:35] Step 4860: loss: 0.9564, grad_norm: 0.1826, learning_rate: 0.0000, epoch: 0.0243
|
| 292 |
+
[2026-04-04 16:07:59] Step 4870: loss: 0.9589, grad_norm: 0.1670, learning_rate: 0.0000, epoch: 0.0244
|
| 293 |
+
[2026-04-04 16:09:23] Step 4880: loss: 0.9117, grad_norm: 0.2178, learning_rate: 0.0000, epoch: 0.0244
|
| 294 |
+
[2026-04-04 16:10:47] Step 4890: loss: 0.7845, grad_norm: 0.1338, learning_rate: 0.0000, epoch: 0.0244
|
| 295 |
+
[2026-04-04 16:12:11] Step 4900: loss: 0.8921, grad_norm: 0.1738, learning_rate: 0.0000, epoch: 0.0245
|
| 296 |
+
[2026-04-04 16:13:35] Step 4910: loss: 0.8833, grad_norm: 0.1396, learning_rate: 0.0000, epoch: 0.0245
|
| 297 |
+
[2026-04-04 16:14:59] Step 4920: loss: 0.7464, grad_norm: 0.1934, learning_rate: 0.0000, epoch: 0.0246
|
| 298 |
+
[2026-04-04 16:16:23] Step 4930: loss: 0.8945, grad_norm: 0.1680, learning_rate: 0.0000, epoch: 0.0246
|
| 299 |
+
[2026-04-04 16:17:47] Step 4940: loss: 0.8504, grad_norm: 0.1836, learning_rate: 0.0000, epoch: 0.0247
|
| 300 |
+
[2026-04-04 16:19:11] Step 4950: loss: 0.7694, grad_norm: 0.1875, learning_rate: 0.0000, epoch: 0.0248
|
| 301 |
+
[2026-04-04 16:20:35] Step 4960: loss: 0.7889, grad_norm: 0.1270, learning_rate: 0.0000, epoch: 0.0248
|
| 302 |
+
[2026-04-04 16:21:59] Step 4970: loss: 0.9033, grad_norm: 0.1777, learning_rate: 0.0000, epoch: 0.0249
|
| 303 |
+
[2026-04-04 16:23:23] Step 4980: loss: 0.8958, grad_norm: 0.1758, learning_rate: 0.0000, epoch: 0.0249
|
| 304 |
+
[2026-04-04 16:24:48] Step 4990: loss: 0.8875, grad_norm: 0.1924, learning_rate: 0.0000, epoch: 0.0249
|
| 305 |
+
[2026-04-04 16:26:12] Step 5000: loss: 0.8543, grad_norm: 0.2471, learning_rate: 0.0000, epoch: 0.0250
|
| 306 |
+
[2026-04-04 16:27:36] Step 5010: loss: 0.8250, grad_norm: 0.1826, learning_rate: 0.0000, epoch: 0.0250
|
| 307 |
+
[2026-04-04 16:29:01] Step 5020: loss: 0.8954, grad_norm: 0.2100, learning_rate: 0.0000, epoch: 0.0251
|
| 308 |
+
[2026-04-04 16:30:26] Step 5030: loss: 0.7521, grad_norm: 0.1836, learning_rate: 0.0000, epoch: 0.0251
|
| 309 |
+
[2026-04-04 16:31:51] Step 5040: loss: 0.7933, grad_norm: 0.1377, learning_rate: 0.0000, epoch: 0.0252
|
| 310 |
+
[2026-04-04 16:33:15] Step 5050: loss: 0.8361, grad_norm: 0.1348, learning_rate: 0.0000, epoch: 0.0253
|
| 311 |
+
[2026-04-04 16:34:40] Step 5060: loss: 0.8986, grad_norm: 0.1514, learning_rate: 0.0000, epoch: 0.0253
|
| 312 |
+
[2026-04-04 16:36:06] Step 5070: loss: 0.8269, grad_norm: 0.2285, learning_rate: 0.0000, epoch: 0.0254
|
| 313 |
+
[2026-04-04 16:37:30] Step 5080: loss: 0.9033, grad_norm: 0.1572, learning_rate: 0.0000, epoch: 0.0254
|
| 314 |
+
[2026-04-04 16:38:56] Step 5090: loss: 0.8741, grad_norm: 0.1484, learning_rate: 0.0000, epoch: 0.0255
|
| 315 |
+
[2026-04-04 16:40:21] Step 5100: loss: 0.7754, grad_norm: 0.1611, learning_rate: 0.0000, epoch: 0.0255
|
| 316 |
+
[2026-04-04 16:41:45] Step 5110: loss: 0.7053, grad_norm: 0.1445, learning_rate: 0.0000, epoch: 0.0255
|
| 317 |
+
[2026-04-04 16:43:10] Step 5120: loss: 0.8493, grad_norm: 0.1514, learning_rate: 0.0000, epoch: 0.0256
|
| 318 |
+
[2026-04-04 16:44:34] Step 5130: loss: 0.8398, grad_norm: 0.1494, learning_rate: 0.0000, epoch: 0.0256
|
| 319 |
+
[2026-04-04 16:45:59] Step 5140: loss: 0.8019, grad_norm: 0.1650, learning_rate: 0.0000, epoch: 0.0257
|
| 320 |
+
[2026-04-04 16:47:23] Step 5150: loss: 0.7548, grad_norm: 0.2012, learning_rate: 0.0000, epoch: 0.0257
|
| 321 |
+
[2026-04-04 16:48:48] Step 5160: loss: 0.8675, grad_norm: 0.1621, learning_rate: 0.0000, epoch: 0.0258
|
| 322 |
+
[2026-04-04 16:50:13] Step 5170: loss: 0.7951, grad_norm: 0.1934, learning_rate: 0.0000, epoch: 0.0259
|
| 323 |
+
[2026-04-04 16:51:39] Step 5180: loss: 0.9608, grad_norm: 0.2051, learning_rate: 0.0000, epoch: 0.0259
|
| 324 |
+
[2026-04-04 16:53:03] Step 5190: loss: 0.9374, grad_norm: 0.2090, learning_rate: 0.0000, epoch: 0.0260
|
| 325 |
+
[2026-04-04 16:54:28] Step 5200: loss: 0.8681, grad_norm: 0.1270, learning_rate: 0.0000, epoch: 0.0260
|
| 326 |
+
[2026-04-04 16:55:52] Step 5210: loss: 0.8505, grad_norm: 0.1699, learning_rate: 0.0000, epoch: 0.0261
|
| 327 |
+
[2026-04-04 16:57:17] Step 5220: loss: 0.8585, grad_norm: 0.1270, learning_rate: 0.0000, epoch: 0.0261
|
| 328 |
+
[2026-04-04 16:58:42] Step 5230: loss: 0.7865, grad_norm: 0.2500, learning_rate: 0.0000, epoch: 0.0261
|
| 329 |
+
[2026-04-04 17:00:07] Step 5240: loss: 0.8359, grad_norm: 0.1680, learning_rate: 0.0000, epoch: 0.0262
|
| 330 |
+
[2026-04-04 17:01:32] Step 5250: loss: 0.9111, grad_norm: 0.2471, learning_rate: 0.0000, epoch: 0.0262
|
| 331 |
+
[2026-04-04 17:02:56] Step 5260: loss: 0.8239, grad_norm: 0.1641, learning_rate: 0.0000, epoch: 0.0263
|
| 332 |
+
[2026-04-04 17:04:21] Step 5270: loss: 0.9209, grad_norm: 0.1797, learning_rate: 0.0000, epoch: 0.0263
|
| 333 |
+
[2026-04-04 17:05:46] Step 5280: loss: 0.9036, grad_norm: 0.1895, learning_rate: 0.0000, epoch: 0.0264
|
| 334 |
+
[2026-04-04 17:07:10] Step 5290: loss: 0.8678, grad_norm: 0.2139, learning_rate: 0.0000, epoch: 0.0265
|
| 335 |
+
[2026-04-04 17:08:34] Step 5300: loss: 0.8450, grad_norm: 0.3770, learning_rate: 0.0000, epoch: 0.0265
|
| 336 |
+
[2026-04-04 17:09:58] Step 5310: loss: 0.8964, grad_norm: 0.1777, learning_rate: 0.0000, epoch: 0.0266
|
| 337 |
+
[2026-04-04 17:11:23] Step 5320: loss: 0.8130, grad_norm: 0.2119, learning_rate: 0.0000, epoch: 0.0266
|
| 338 |
+
[2026-04-04 17:12:47] Step 5330: loss: 0.8837, grad_norm: 0.2197, learning_rate: 0.0000, epoch: 0.0267
|
| 339 |
+
[2026-04-04 17:14:12] Step 5340: loss: 0.9022, grad_norm: 0.1465, learning_rate: 0.0000, epoch: 0.0267
|
| 340 |
+
[2026-04-04 17:15:36] Step 5350: loss: 0.9066, grad_norm: 0.2070, learning_rate: 0.0000, epoch: 0.0267
|
| 341 |
+
[2026-04-04 17:17:01] Step 5360: loss: 0.8575, grad_norm: 0.1660, learning_rate: 0.0000, epoch: 0.0268
|
| 342 |
+
[2026-04-04 17:18:25] Step 5370: loss: 1.0600, grad_norm: 0.3203, learning_rate: 0.0000, epoch: 0.0268
|
| 343 |
+
[2026-04-04 17:19:50] Step 5380: loss: 0.8405, grad_norm: 0.2393, learning_rate: 0.0000, epoch: 0.0269
|
| 344 |
+
[2026-04-04 17:21:15] Step 5390: loss: 0.8278, grad_norm: 0.1982, learning_rate: 0.0000, epoch: 0.0270
|
| 345 |
+
[2026-04-04 17:22:39] Step 5400: loss: 0.9193, grad_norm: 0.1982, learning_rate: 0.0000, epoch: 0.0270
|
| 346 |
+
[2026-04-04 17:24:03] Step 5410: loss: 0.9441, grad_norm: 0.3066, learning_rate: 0.0000, epoch: 0.0271
|
| 347 |
+
[2026-04-04 17:25:28] Step 5420: loss: 0.8435, grad_norm: 0.1914, learning_rate: 0.0000, epoch: 0.0271
|
| 348 |
+
[2026-04-04 17:26:53] Step 5430: loss: 0.8175, grad_norm: 0.1523, learning_rate: 0.0000, epoch: 0.0272
|
| 349 |
+
[2026-04-04 17:28:17] Step 5440: loss: 0.8007, grad_norm: 0.2695, learning_rate: 0.0000, epoch: 0.0272
|
| 350 |
+
[2026-04-04 17:29:41] Step 5450: loss: 0.9872, grad_norm: 0.2461, learning_rate: 0.0000, epoch: 0.0272
|
| 351 |
+
[2026-04-04 17:31:06] Step 5460: loss: 0.8669, grad_norm: 0.2080, learning_rate: 0.0000, epoch: 0.0273
|
| 352 |
+
[2026-04-04 17:32:30] Step 5470: loss: 0.8589, grad_norm: 0.3223, learning_rate: 0.0000, epoch: 0.0273
|
| 353 |
+
[2026-04-04 17:33:54] Step 5480: loss: 0.9051, grad_norm: 0.1758, learning_rate: 0.0000, epoch: 0.0274
|
| 354 |
+
[2026-04-04 17:35:19] Step 5490: loss: 0.9042, grad_norm: 0.2539, learning_rate: 0.0000, epoch: 0.0274
|
| 355 |
+
[2026-04-04 17:36:44] Step 5500: loss: 0.8508, grad_norm: 0.1367, learning_rate: 0.0000, epoch: 0.0275
|
| 356 |
+
[2026-04-04 17:38:08] Step 5510: loss: 0.9220, grad_norm: 0.1475, learning_rate: 0.0000, epoch: 0.0276
|
| 357 |
+
[2026-04-04 17:39:33] Step 5520: loss: 0.9046, grad_norm: 0.5312, learning_rate: 0.0000, epoch: 0.0276
|
| 358 |
+
[2026-04-04 17:40:58] Step 5530: loss: 0.8969, grad_norm: 0.2002, learning_rate: 0.0000, epoch: 0.0277
|
| 359 |
+
[2026-04-04 17:42:22] Step 5540: loss: 0.9212, grad_norm: 0.2910, learning_rate: 0.0000, epoch: 0.0277
|
| 360 |
+
[2026-04-04 17:43:46] Step 5550: loss: 0.8605, grad_norm: 0.2930, learning_rate: 0.0000, epoch: 0.0278
|
| 361 |
+
[2026-04-04 17:45:11] Step 5560: loss: 0.9055, grad_norm: 0.2676, learning_rate: 0.0000, epoch: 0.0278
|
| 362 |
+
[2026-04-04 17:46:35] Step 5570: loss: 0.9778, grad_norm: 0.1689, learning_rate: 0.0000, epoch: 0.0278
|
| 363 |
+
[2026-04-04 17:48:00] Step 5580: loss: 0.9480, grad_norm: 0.2207, learning_rate: 0.0000, epoch: 0.0279
|
| 364 |
+
[2026-04-04 17:49:24] Step 5590: loss: 0.9285, grad_norm: 0.2354, learning_rate: 0.0000, epoch: 0.0279
|
| 365 |
+
[2026-04-04 17:50:48] Step 5600: loss: 0.9215, grad_norm: 0.2637, learning_rate: 0.0000, epoch: 0.0280
|
| 366 |
+
[2026-04-04 17:52:12] Step 5610: loss: 0.8681, grad_norm: 0.1973, learning_rate: 0.0000, epoch: 0.0280
|
| 367 |
+
[2026-04-04 17:53:37] Step 5620: loss: 0.9485, grad_norm: 0.2393, learning_rate: 0.0000, epoch: 0.0281
|
| 368 |
+
[2026-04-04 17:55:01] Step 5630: loss: 0.9471, grad_norm: 5.0938, learning_rate: 0.0000, epoch: 0.0282
|
| 369 |
+
[2026-04-04 17:56:26] Step 5640: loss: 0.9299, grad_norm: 0.1602, learning_rate: 0.0000, epoch: 0.0282
|
| 370 |
+
[2026-04-04 17:57:50] Step 5650: loss: 0.9612, grad_norm: 0.6289, learning_rate: 0.0000, epoch: 0.0283
|
| 371 |
+
[2026-04-04 17:59:15] Step 5660: loss: 0.8904, grad_norm: 0.3438, learning_rate: 0.0000, epoch: 0.0283
|
| 372 |
+
[2026-04-04 18:00:39] Step 5670: loss: 0.7939, grad_norm: 0.2119, learning_rate: 0.0000, epoch: 0.0284
|
| 373 |
+
[2026-04-04 18:02:03] Step 5680: loss: 0.8262, grad_norm: 0.2168, learning_rate: 0.0000, epoch: 0.0284
|
| 374 |
+
[2026-04-04 18:03:28] Step 5690: loss: 0.8845, grad_norm: 0.2412, learning_rate: 0.0000, epoch: 0.0284
|
| 375 |
+
[2026-04-04 18:04:52] Step 5700: loss: 0.9226, grad_norm: 0.1738, learning_rate: 0.0000, epoch: 0.0285
|
| 376 |
+
[2026-04-04 18:06:17] Step 5710: loss: 0.8747, grad_norm: 0.3926, learning_rate: 0.0000, epoch: 0.0285
|
| 377 |
+
[2026-04-04 18:07:42] Step 5720: loss: 0.8984, grad_norm: 0.1768, learning_rate: 0.0000, epoch: 0.0286
|
| 378 |
+
[2026-04-04 18:09:06] Step 5730: loss: 0.8789, grad_norm: 0.1709, learning_rate: 0.0000, epoch: 0.0286
|
| 379 |
+
[2026-04-04 18:10:29] Step 5740: loss: 1.0040, grad_norm: 0.5234, learning_rate: 0.0000, epoch: 0.0287
|
| 380 |
+
[2026-04-04 18:11:54] Step 5750: loss: 0.8954, grad_norm: 0.2109, learning_rate: 0.0000, epoch: 0.0288
|
| 381 |
+
[2026-04-04 18:13:18] Step 5760: loss: 0.8071, grad_norm: 0.1533, learning_rate: 0.0000, epoch: 0.0288
|
| 382 |
+
[2026-04-04 18:14:42] Step 5770: loss: 0.9445, grad_norm: 0.1543, learning_rate: 0.0000, epoch: 0.0289
|
| 383 |
+
[2026-04-04 18:16:07] Step 5780: loss: 0.8859, grad_norm: 1592.0000, learning_rate: 0.0000, epoch: 0.0289
|
| 384 |
+
[2026-04-04 18:17:31] Step 5790: loss: 0.9482, grad_norm: 0.1836, learning_rate: 0.0000, epoch: 0.0290
|
| 385 |
+
[2026-04-04 18:18:56] Step 5800: loss: 0.9173, grad_norm: 0.2432, learning_rate: 0.0000, epoch: 0.0290
|
| 386 |
+
[2026-04-04 18:20:20] Step 5810: loss: 0.9080, grad_norm: 0.1533, learning_rate: 0.0000, epoch: 0.0290
|
| 387 |
+
[2026-04-04 18:21:45] Step 5820: loss: 0.7803, grad_norm: 0.1875, learning_rate: 0.0000, epoch: 0.0291
|
| 388 |
+
[2026-04-04 18:23:09] Step 5830: loss: 0.8363, grad_norm: 0.1719, learning_rate: 0.0000, epoch: 0.0291
|
| 389 |
+
[2026-04-04 18:24:33] Step 5840: loss: 0.9606, grad_norm: 0.3848, learning_rate: 0.0000, epoch: 0.0292
|
| 390 |
+
[2026-04-04 18:25:57] Step 5850: loss: 0.8659, grad_norm: 0.2227, learning_rate: 0.0000, epoch: 0.0293
|
| 391 |
+
[2026-04-04 18:27:21] Step 5860: loss: 0.7850, grad_norm: 0.2256, learning_rate: 0.0000, epoch: 0.0293
|
| 392 |
+
[2026-04-04 18:28:46] Step 5870: loss: 0.8648, grad_norm: 0.2158, learning_rate: 0.0000, epoch: 0.0294
|
| 393 |
+
[2026-04-04 18:30:10] Step 5880: loss: 0.9564, grad_norm: 0.2461, learning_rate: 0.0000, epoch: 0.0294
|
| 394 |
+
[2026-04-04 18:31:34] Step 5890: loss: 0.9538, grad_norm: 0.1855, learning_rate: 0.0000, epoch: 0.0295
|
| 395 |
+
[2026-04-04 18:32:59] Step 5900: loss: 0.8727, grad_norm: 0.1875, learning_rate: 0.0000, epoch: 0.0295
|
| 396 |
+
[2026-04-04 18:34:23] Step 5910: loss: 0.9268, grad_norm: 3.0781, learning_rate: 0.0000, epoch: 0.0295
|
| 397 |
+
[2026-04-04 18:35:47] Step 5920: loss: 0.8603, grad_norm: 0.1797, learning_rate: 0.0000, epoch: 0.0296
|
| 398 |
+
[2026-04-04 18:37:11] Step 5930: loss: 0.9185, grad_norm: 0.3555, learning_rate: 0.0000, epoch: 0.0296
|
| 399 |
+
[2026-04-04 18:38:36] Step 5940: loss: 0.8986, grad_norm: 0.1562, learning_rate: 0.0000, epoch: 0.0297
|
| 400 |
+
[2026-04-04 18:40:00] Step 5950: loss: 0.9599, grad_norm: 0.1826, learning_rate: 0.0000, epoch: 0.0297
|
| 401 |
+
[2026-04-04 18:41:24] Step 5960: loss: 0.8563, grad_norm: 0.2002, learning_rate: 0.0000, epoch: 0.0298
|
| 402 |
+
[2026-04-04 18:42:49] Step 5970: loss: 0.8872, grad_norm: 0.4316, learning_rate: 0.0000, epoch: 0.0299
|
| 403 |
+
[2026-04-04 18:44:13] Step 5980: loss: 0.7684, grad_norm: 0.1934, learning_rate: 0.0000, epoch: 0.0299
|
| 404 |
+
[2026-04-04 18:45:37] Step 5990: loss: 0.8413, grad_norm: 0.2578, learning_rate: 0.0000, epoch: 0.0300
|
| 405 |
+
[2026-04-04 18:47:01] Step 6000: loss: 0.9206, grad_norm: 0.2090, learning_rate: 0.0000, epoch: 0.0300
|
| 406 |
+
[2026-04-04 18:50:59] Step 6010: loss: 0.8848, grad_norm: 0.1934, learning_rate: 0.0000, epoch: 0.0301
|