Training in progress, step 5000
Browse files- model.safetensors +1 -1
- training_args.bin +1 -1
- training_log.txt +503 -402
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 16060556616
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c24eefb159064476b2be5d5a5a23bd99c7941b07ca3fc1e6e96cfee0ea3bfaa2
|
| 3 |
size 16060556616
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5265
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:566b98d618c43899f8b01538667552d7c8a4f9f3959066cd81b19bd535ee9706
|
| 3 |
size 5265
|
training_log.txt
CHANGED
|
@@ -1,406 +1,507 @@
|
|
| 1 |
|
| 2 |
==================================================
|
| 3 |
-
Training started at: 2026-04-
|
| 4 |
==================================================
|
| 5 |
|
| 6 |
-
[2026-04-
|
| 7 |
-
[2026-04-
|
| 8 |
-
[2026-04-
|
| 9 |
-
[2026-04-
|
| 10 |
-
[2026-04-
|
| 11 |
-
[2026-04-
|
| 12 |
-
[2026-04-
|
| 13 |
-
[2026-04-
|
| 14 |
-
[2026-04-
|
| 15 |
-
[2026-04-
|
| 16 |
-
[2026-04-
|
| 17 |
-
[2026-04-
|
| 18 |
-
[2026-04-
|
| 19 |
-
[2026-04-
|
| 20 |
-
[2026-04-
|
| 21 |
-
[2026-04-
|
| 22 |
-
[2026-04-
|
| 23 |
-
[2026-04-
|
| 24 |
-
[2026-04-
|
| 25 |
-
[2026-04-
|
| 26 |
-
[2026-04-
|
| 27 |
-
[2026-04-
|
| 28 |
-
[2026-04-
|
| 29 |
-
[2026-04-
|
| 30 |
-
[2026-04-
|
| 31 |
-
[2026-04-
|
| 32 |
-
[2026-04-
|
| 33 |
-
[2026-04-
|
| 34 |
-
[2026-04-
|
| 35 |
-
[2026-04-
|
| 36 |
-
[2026-04-
|
| 37 |
-
[2026-04-
|
| 38 |
-
[2026-04-
|
| 39 |
-
[2026-04-
|
| 40 |
-
[2026-04-
|
| 41 |
-
[2026-04-
|
| 42 |
-
[2026-04-
|
| 43 |
-
[2026-04-
|
| 44 |
-
[2026-04-
|
| 45 |
-
[2026-04-
|
| 46 |
-
[2026-04-
|
| 47 |
-
[2026-04-
|
| 48 |
-
[2026-04-
|
| 49 |
-
[2026-04-
|
| 50 |
-
[2026-04-
|
| 51 |
-
[2026-04-
|
| 52 |
-
[2026-04-
|
| 53 |
-
[2026-04-
|
| 54 |
-
[2026-04-
|
| 55 |
-
[2026-04-
|
| 56 |
-
[2026-04-
|
| 57 |
-
[2026-04-
|
| 58 |
-
[2026-04-
|
| 59 |
-
[2026-04-
|
| 60 |
-
[2026-04-
|
| 61 |
-
[2026-04-
|
| 62 |
-
[2026-04-
|
| 63 |
-
[2026-04-
|
| 64 |
-
[2026-04-
|
| 65 |
-
[2026-04-
|
| 66 |
-
[2026-04-
|
| 67 |
-
[2026-04-
|
| 68 |
-
[2026-04-
|
| 69 |
-
[2026-04-
|
| 70 |
-
[2026-04-
|
| 71 |
-
[2026-04-
|
| 72 |
-
[2026-04-
|
| 73 |
-
[2026-04-
|
| 74 |
-
[2026-04-
|
| 75 |
-
[2026-04-
|
| 76 |
-
[2026-04-
|
| 77 |
-
[2026-04-
|
| 78 |
-
[2026-04-
|
| 79 |
-
[2026-04-
|
| 80 |
-
[2026-04-
|
| 81 |
-
[2026-04-
|
| 82 |
-
[2026-04-
|
| 83 |
-
[2026-04-
|
| 84 |
-
[2026-04-
|
| 85 |
-
[2026-04-
|
| 86 |
-
[2026-04-
|
| 87 |
-
[2026-04-
|
| 88 |
-
[2026-04-
|
| 89 |
-
[2026-04-
|
| 90 |
-
[2026-04-
|
| 91 |
-
[2026-04-
|
| 92 |
-
[2026-04-
|
| 93 |
-
[2026-04-
|
| 94 |
-
[2026-04-
|
| 95 |
-
[2026-04-
|
| 96 |
-
[2026-04-
|
| 97 |
-
[2026-04-
|
| 98 |
-
[2026-04-
|
| 99 |
-
[2026-04-
|
| 100 |
-
[2026-04-
|
| 101 |
-
[2026-04-
|
| 102 |
-
[2026-04-
|
| 103 |
-
[2026-04-
|
| 104 |
-
[2026-04-
|
| 105 |
-
[2026-04-
|
| 106 |
-
[2026-04-
|
| 107 |
-
[2026-04-
|
| 108 |
-
[2026-04-
|
| 109 |
-
[2026-04-
|
| 110 |
-
[2026-04-
|
| 111 |
-
[2026-04-
|
| 112 |
-
[2026-04-
|
| 113 |
-
[2026-04-
|
| 114 |
-
[2026-04-
|
| 115 |
-
[2026-04-
|
| 116 |
-
[2026-04-
|
| 117 |
-
[2026-04-
|
| 118 |
-
[2026-04-
|
| 119 |
-
[2026-04-
|
| 120 |
-
[2026-04-
|
| 121 |
-
[2026-04-
|
| 122 |
-
[2026-04-
|
| 123 |
-
[2026-04-
|
| 124 |
-
[2026-04-
|
| 125 |
-
[2026-04-
|
| 126 |
-
[2026-04-
|
| 127 |
-
[2026-04-
|
| 128 |
-
[2026-04-
|
| 129 |
-
[2026-04-
|
| 130 |
-
[2026-04-
|
| 131 |
-
[2026-04-
|
| 132 |
-
[2026-04-
|
| 133 |
-
[2026-04-
|
| 134 |
-
[2026-04-
|
| 135 |
-
[2026-04-
|
| 136 |
-
[2026-04-
|
| 137 |
-
[2026-04-
|
| 138 |
-
[2026-04-
|
| 139 |
-
[2026-04-
|
| 140 |
-
[2026-04-
|
| 141 |
-
[2026-04-
|
| 142 |
-
[2026-04-
|
| 143 |
-
[2026-04-
|
| 144 |
-
[2026-04-
|
| 145 |
-
[2026-04-
|
| 146 |
-
[2026-04-
|
| 147 |
-
[2026-04-
|
| 148 |
-
[2026-04-
|
| 149 |
-
[2026-04-
|
| 150 |
-
[2026-04-
|
| 151 |
-
[2026-04-
|
| 152 |
-
[2026-04-
|
| 153 |
-
[2026-04-
|
| 154 |
-
[2026-04-
|
| 155 |
-
[2026-04-
|
| 156 |
-
[2026-04-
|
| 157 |
-
[2026-04-
|
| 158 |
-
[2026-04-
|
| 159 |
-
[2026-04-
|
| 160 |
-
[2026-04-
|
| 161 |
-
[2026-04-
|
| 162 |
-
[2026-04-
|
| 163 |
-
[2026-04-
|
| 164 |
-
[2026-04-
|
| 165 |
-
[2026-04-
|
| 166 |
-
[2026-04-
|
| 167 |
-
[2026-04-
|
| 168 |
-
[2026-04-
|
| 169 |
-
[2026-04-
|
| 170 |
-
[2026-04-
|
| 171 |
-
[2026-04-
|
| 172 |
-
[2026-04-
|
| 173 |
-
[2026-04-
|
| 174 |
-
[2026-04-
|
| 175 |
-
[2026-04-
|
| 176 |
-
[2026-04-
|
| 177 |
-
[2026-04-
|
| 178 |
-
[2026-04-
|
| 179 |
-
[2026-04-
|
| 180 |
-
[2026-04-
|
| 181 |
-
[2026-04-
|
| 182 |
-
[2026-04-
|
| 183 |
-
[2026-04-
|
| 184 |
-
[2026-04-
|
| 185 |
-
[2026-04-
|
| 186 |
-
[2026-04-
|
| 187 |
-
[2026-04-
|
| 188 |
-
[2026-04-
|
| 189 |
-
[2026-04-
|
| 190 |
-
[2026-04-
|
| 191 |
-
[2026-04-
|
| 192 |
-
[2026-04-
|
| 193 |
-
[2026-04-
|
| 194 |
-
[2026-04-
|
| 195 |
-
[2026-04-
|
| 196 |
-
[2026-04-
|
| 197 |
-
[2026-04-
|
| 198 |
-
[2026-04-
|
| 199 |
-
[2026-04-
|
| 200 |
-
[2026-04-
|
| 201 |
-
[2026-04-
|
| 202 |
-
[2026-04-
|
| 203 |
-
[2026-04-
|
| 204 |
-
[2026-04-
|
| 205 |
-
[2026-04-
|
| 206 |
-
[2026-04-
|
| 207 |
-
[2026-04-
|
| 208 |
-
[2026-04-
|
| 209 |
-
[2026-04-
|
| 210 |
-
[2026-04-
|
| 211 |
-
[2026-04-
|
| 212 |
-
[2026-04-
|
| 213 |
-
[2026-04-
|
| 214 |
-
[2026-04-
|
| 215 |
-
[2026-04-
|
| 216 |
-
[2026-04-
|
| 217 |
-
[2026-04-
|
| 218 |
-
[2026-04-
|
| 219 |
-
[2026-04-
|
| 220 |
-
[2026-04-
|
| 221 |
-
[2026-04-
|
| 222 |
-
[2026-04-
|
| 223 |
-
[2026-04-
|
| 224 |
-
[2026-04-
|
| 225 |
-
[2026-04-
|
| 226 |
-
[2026-04-
|
| 227 |
-
[2026-04-
|
| 228 |
-
[2026-04-
|
| 229 |
-
[2026-04-
|
| 230 |
-
[2026-04-
|
| 231 |
-
[2026-04-
|
| 232 |
-
[2026-04-
|
| 233 |
-
[2026-04-
|
| 234 |
-
[2026-04-
|
| 235 |
-
[2026-04-
|
| 236 |
-
[2026-04-
|
| 237 |
-
[2026-04-
|
| 238 |
-
[2026-04-
|
| 239 |
-
[2026-04-
|
| 240 |
-
[2026-04-
|
| 241 |
-
[2026-04-
|
| 242 |
-
[2026-04-
|
| 243 |
-
[2026-04-
|
| 244 |
-
[2026-04-
|
| 245 |
-
[2026-04-
|
| 246 |
-
[2026-04-
|
| 247 |
-
[2026-04-
|
| 248 |
-
[2026-04-
|
| 249 |
-
[2026-04-
|
| 250 |
-
[2026-04-
|
| 251 |
-
[2026-04-
|
| 252 |
-
[2026-04-
|
| 253 |
-
[2026-04-
|
| 254 |
-
[2026-04-
|
| 255 |
-
[2026-04-
|
| 256 |
-
[2026-04-
|
| 257 |
-
[2026-04-
|
| 258 |
-
[2026-04-
|
| 259 |
-
[2026-04-
|
| 260 |
-
[2026-04-
|
| 261 |
-
[2026-04-
|
| 262 |
-
[2026-04-
|
| 263 |
-
[2026-04-
|
| 264 |
-
[2026-04-
|
| 265 |
-
[2026-04-
|
| 266 |
-
[2026-04-
|
| 267 |
-
[2026-04-
|
| 268 |
-
[2026-04-
|
| 269 |
-
[2026-04-
|
| 270 |
-
[2026-04-
|
| 271 |
-
[2026-04-
|
| 272 |
-
[2026-04-
|
| 273 |
-
[2026-04-
|
| 274 |
-
[2026-04-
|
| 275 |
-
[2026-04-
|
| 276 |
-
[2026-04-
|
| 277 |
-
[2026-04-
|
| 278 |
-
[2026-04-
|
| 279 |
-
[2026-04-
|
| 280 |
-
[2026-04-
|
| 281 |
-
[2026-04-
|
| 282 |
-
[2026-04-
|
| 283 |
-
[2026-04-
|
| 284 |
-
[2026-04-
|
| 285 |
-
[2026-04-
|
| 286 |
-
[2026-04-
|
| 287 |
-
[2026-04-
|
| 288 |
-
[2026-04-
|
| 289 |
-
[2026-04-
|
| 290 |
-
[2026-04-
|
| 291 |
-
[2026-04-
|
| 292 |
-
[2026-04-
|
| 293 |
-
[2026-04-
|
| 294 |
-
[2026-04-
|
| 295 |
-
[2026-04-
|
| 296 |
-
[2026-04-
|
| 297 |
-
[2026-04-
|
| 298 |
-
[2026-04-
|
| 299 |
-
[2026-04-
|
| 300 |
-
[2026-04-
|
| 301 |
-
[2026-04-
|
| 302 |
-
[2026-04-
|
| 303 |
-
[2026-04-
|
| 304 |
-
[2026-04-
|
| 305 |
-
[2026-04-
|
| 306 |
-
[2026-04-
|
| 307 |
-
[2026-04-
|
| 308 |
-
[2026-04-
|
| 309 |
-
[2026-04-
|
| 310 |
-
[2026-04-
|
| 311 |
-
[2026-04-
|
| 312 |
-
[2026-04-
|
| 313 |
-
[2026-04-
|
| 314 |
-
[2026-04-
|
| 315 |
-
[2026-04-
|
| 316 |
-
[2026-04-
|
| 317 |
-
[2026-04-
|
| 318 |
-
[2026-04-
|
| 319 |
-
[2026-04-
|
| 320 |
-
[2026-04-
|
| 321 |
-
[2026-04-
|
| 322 |
-
[2026-04-
|
| 323 |
-
[2026-04-
|
| 324 |
-
[2026-04-
|
| 325 |
-
[2026-04-
|
| 326 |
-
[2026-04-
|
| 327 |
-
[2026-04-
|
| 328 |
-
[2026-04-
|
| 329 |
-
[2026-04-
|
| 330 |
-
[2026-04-
|
| 331 |
-
[2026-04-
|
| 332 |
-
[2026-04-
|
| 333 |
-
[2026-04-
|
| 334 |
-
[2026-04-
|
| 335 |
-
[2026-04-
|
| 336 |
-
[2026-04-
|
| 337 |
-
[2026-04-
|
| 338 |
-
[2026-04-
|
| 339 |
-
[2026-04-
|
| 340 |
-
[2026-04-
|
| 341 |
-
[2026-04-
|
| 342 |
-
[2026-04-
|
| 343 |
-
[2026-04-
|
| 344 |
-
[2026-04-
|
| 345 |
-
[2026-04-
|
| 346 |
-
[2026-04-
|
| 347 |
-
[2026-04-
|
| 348 |
-
[2026-04-
|
| 349 |
-
[2026-04-
|
| 350 |
-
[2026-04-
|
| 351 |
-
[2026-04-
|
| 352 |
-
[2026-04-
|
| 353 |
-
[2026-04-
|
| 354 |
-
[2026-04-
|
| 355 |
-
[2026-04-
|
| 356 |
-
[2026-04-
|
| 357 |
-
[2026-04-
|
| 358 |
-
[2026-04-
|
| 359 |
-
[2026-04-
|
| 360 |
-
[2026-04-
|
| 361 |
-
[2026-04-
|
| 362 |
-
[2026-04-
|
| 363 |
-
[2026-04-
|
| 364 |
-
[2026-04-
|
| 365 |
-
[2026-04-
|
| 366 |
-
[2026-04-
|
| 367 |
-
[2026-04-
|
| 368 |
-
[2026-04-
|
| 369 |
-
[2026-04-
|
| 370 |
-
[2026-04-
|
| 371 |
-
[2026-04-
|
| 372 |
-
[2026-04-
|
| 373 |
-
[2026-04-
|
| 374 |
-
[2026-04-
|
| 375 |
-
[2026-04-
|
| 376 |
-
[2026-04-
|
| 377 |
-
[2026-04-
|
| 378 |
-
[2026-04-
|
| 379 |
-
[2026-04-
|
| 380 |
-
[2026-04-
|
| 381 |
-
[2026-04-
|
| 382 |
-
[2026-04-
|
| 383 |
-
[2026-04-
|
| 384 |
-
[2026-04-
|
| 385 |
-
[2026-04-
|
| 386 |
-
[2026-04-
|
| 387 |
-
[2026-04-
|
| 388 |
-
[2026-04-
|
| 389 |
-
[2026-04-
|
| 390 |
-
[2026-04-
|
| 391 |
-
[2026-04-
|
| 392 |
-
[2026-04-
|
| 393 |
-
[2026-04-
|
| 394 |
-
[2026-04-
|
| 395 |
-
[2026-04-
|
| 396 |
-
[2026-04-
|
| 397 |
-
[2026-04-
|
| 398 |
-
[2026-04-
|
| 399 |
-
[2026-04-
|
| 400 |
-
[2026-04-
|
| 401 |
-
[2026-04-
|
| 402 |
-
[2026-04-
|
| 403 |
-
[2026-04-
|
| 404 |
-
[2026-04-
|
| 405 |
-
[2026-04-
|
| 406 |
-
[2026-04-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
==================================================
|
| 3 |
+
Training started at: 2026-04-05 10:32:25
|
| 4 |
==================================================
|
| 5 |
|
| 6 |
+
[2026-04-05 10:34:27] Step 1: loss: 5.374693, grad_norm: 704.000000, learning_rate: 0.000000, epoch: 0.000005
|
| 7 |
+
[2026-04-05 10:35:42] Step 10: loss: 6.319018, grad_norm: 44.250000, learning_rate: 0.000001, epoch: 0.000051
|
| 8 |
+
[2026-04-05 10:37:07] Step 20: loss: 4.786968, grad_norm: 0.894531, learning_rate: 0.000002, epoch: 0.000101
|
| 9 |
+
[2026-04-05 10:38:33] Step 30: loss: 4.273304, grad_norm: 1.117188, learning_rate: 0.000003, epoch: 0.000152
|
| 10 |
+
[2026-04-05 10:39:58] Step 40: loss: 4.699950, grad_norm: 0.996094, learning_rate: 0.000004, epoch: 0.000202
|
| 11 |
+
[2026-04-05 10:41:24] Step 50: loss: 3.808567, grad_norm: 0.429688, learning_rate: 0.000005, epoch: 0.000253
|
| 12 |
+
[2026-04-05 10:42:49] Step 60: loss: 4.120002, grad_norm: 0.333984, learning_rate: 0.000006, epoch: 0.000303
|
| 13 |
+
[2026-04-05 10:44:15] Step 70: loss: 3.806800, grad_norm: 0.679688, learning_rate: 0.000007, epoch: 0.000354
|
| 14 |
+
[2026-04-05 10:45:40] Step 80: loss: 4.653148, grad_norm: 1.046875, learning_rate: 0.000008, epoch: 0.000404
|
| 15 |
+
[2026-04-05 10:47:05] Step 90: loss: 3.986430, grad_norm: 0.746094, learning_rate: 0.000009, epoch: 0.000455
|
| 16 |
+
[2026-04-05 10:48:31] Step 100: loss: 3.177014, grad_norm: 0.972656, learning_rate: 0.000010, epoch: 0.000505
|
| 17 |
+
[2026-04-05 10:49:58] Step 110: loss: 2.341252, grad_norm: 0.636719, learning_rate: 0.000011, epoch: 0.000556
|
| 18 |
+
[2026-04-05 10:51:23] Step 120: loss: 3.686772, grad_norm: 0.496094, learning_rate: 0.000012, epoch: 0.000606
|
| 19 |
+
[2026-04-05 10:52:48] Step 130: loss: 4.326537, grad_norm: 4.031250, learning_rate: 0.000013, epoch: 0.000657
|
| 20 |
+
[2026-04-05 10:54:14] Step 140: loss: 2.882456, grad_norm: 0.248047, learning_rate: 0.000014, epoch: 0.000707
|
| 21 |
+
[2026-04-05 10:55:39] Step 150: loss: 3.564154, grad_norm: 1.023438, learning_rate: 0.000015, epoch: 0.000758
|
| 22 |
+
[2026-04-05 10:57:05] Step 160: loss: 3.621474, grad_norm: 0.574219, learning_rate: 0.000016, epoch: 0.000808
|
| 23 |
+
[2026-04-05 10:58:31] Step 170: loss: 3.432347, grad_norm: 0.247070, learning_rate: 0.000017, epoch: 0.000859
|
| 24 |
+
[2026-04-05 10:59:55] Step 180: loss: 3.368018, grad_norm: 0.339844, learning_rate: 0.000018, epoch: 0.000909
|
| 25 |
+
[2026-04-05 11:01:21] Step 190: loss: 2.903378, grad_norm: 1.210938, learning_rate: 0.000019, epoch: 0.000960
|
| 26 |
+
[2026-04-05 11:02:47] Step 200: loss: 2.334621, grad_norm: 0.285156, learning_rate: 0.000020, epoch: 0.001010
|
| 27 |
+
[2026-04-05 11:04:11] Step 210: loss: 3.505114, grad_norm: 0.494141, learning_rate: 0.000021, epoch: 0.001061
|
| 28 |
+
[2026-04-05 11:05:36] Step 220: loss: 1.382823, grad_norm: 0.320312, learning_rate: 0.000022, epoch: 0.001111
|
| 29 |
+
[2026-04-05 11:07:02] Step 230: loss: 0.917196, grad_norm: 0.371094, learning_rate: 0.000023, epoch: 0.001162
|
| 30 |
+
[2026-04-05 11:08:29] Step 240: loss: 0.975546, grad_norm: 0.267578, learning_rate: 0.000024, epoch: 0.001212
|
| 31 |
+
[2026-04-05 11:09:55] Step 250: loss: 0.629887, grad_norm: 0.228516, learning_rate: 0.000025, epoch: 0.001263
|
| 32 |
+
[2026-04-05 11:11:22] Step 260: loss: 0.699167, grad_norm: 1.085938, learning_rate: 0.000026, epoch: 0.001313
|
| 33 |
+
[2026-04-05 11:12:49] Step 270: loss: 0.739236, grad_norm: 0.455078, learning_rate: 0.000027, epoch: 0.001364
|
| 34 |
+
[2026-04-05 11:14:15] Step 280: loss: 0.738538, grad_norm: 0.174805, learning_rate: 0.000028, epoch: 0.001414
|
| 35 |
+
[2026-04-05 11:15:40] Step 290: loss: 0.670122, grad_norm: 0.223633, learning_rate: 0.000029, epoch: 0.001465
|
| 36 |
+
[2026-04-05 11:17:06] Step 300: loss: 0.608715, grad_norm: 0.277344, learning_rate: 0.000030, epoch: 0.001515
|
| 37 |
+
[2026-04-05 11:18:31] Step 310: loss: 0.838763, grad_norm: 0.232422, learning_rate: 0.000031, epoch: 0.001566
|
| 38 |
+
[2026-04-05 11:19:57] Step 320: loss: 0.797982, grad_norm: 0.578125, learning_rate: 0.000032, epoch: 0.001616
|
| 39 |
+
[2026-04-05 11:21:24] Step 330: loss: 0.644051, grad_norm: 0.185547, learning_rate: 0.000033, epoch: 0.001667
|
| 40 |
+
[2026-04-05 11:22:49] Step 340: loss: 0.777247, grad_norm: 0.236328, learning_rate: 0.000034, epoch: 0.001717
|
| 41 |
+
[2026-04-05 11:24:16] Step 350: loss: 0.741875, grad_norm: 0.182617, learning_rate: 0.000035, epoch: 0.001768
|
| 42 |
+
[2026-04-05 11:25:42] Step 360: loss: 0.585453, grad_norm: 0.451172, learning_rate: 0.000036, epoch: 0.001818
|
| 43 |
+
[2026-04-05 11:27:08] Step 370: loss: 0.728102, grad_norm: 0.271484, learning_rate: 0.000037, epoch: 0.001869
|
| 44 |
+
[2026-04-05 11:28:35] Step 380: loss: 0.672046, grad_norm: 0.185547, learning_rate: 0.000038, epoch: 0.001919
|
| 45 |
+
[2026-04-05 11:30:01] Step 390: loss: 0.613392, grad_norm: 0.306641, learning_rate: 0.000039, epoch: 0.001970
|
| 46 |
+
[2026-04-05 11:31:26] Step 400: loss: 0.712388, grad_norm: 0.179688, learning_rate: 0.000040, epoch: 0.002020
|
| 47 |
+
[2026-04-05 11:32:53] Step 410: loss: 0.642185, grad_norm: 0.223633, learning_rate: 0.000041, epoch: 0.002071
|
| 48 |
+
[2026-04-05 11:34:18] Step 420: loss: 0.583045, grad_norm: 0.492188, learning_rate: 0.000042, epoch: 0.002121
|
| 49 |
+
[2026-04-05 11:35:44] Step 430: loss: 0.668336, grad_norm: 0.179688, learning_rate: 0.000043, epoch: 0.002172
|
| 50 |
+
[2026-04-05 11:37:10] Step 440: loss: 0.709393, grad_norm: 0.185547, learning_rate: 0.000044, epoch: 0.002222
|
| 51 |
+
[2026-04-05 11:38:36] Step 450: loss: 0.799581, grad_norm: 0.263672, learning_rate: 0.000045, epoch: 0.002273
|
| 52 |
+
[2026-04-05 11:40:02] Step 460: loss: 0.613966, grad_norm: 0.314453, learning_rate: 0.000046, epoch: 0.002323
|
| 53 |
+
[2026-04-05 11:41:28] Step 470: loss: 0.782370, grad_norm: 0.261719, learning_rate: 0.000047, epoch: 0.002374
|
| 54 |
+
[2026-04-05 11:42:54] Step 480: loss: 0.686781, grad_norm: 0.972656, learning_rate: 0.000048, epoch: 0.002424
|
| 55 |
+
[2026-04-05 11:44:20] Step 490: loss: 0.691988, grad_norm: 0.310547, learning_rate: 0.000049, epoch: 0.002475
|
| 56 |
+
[2026-04-05 11:45:45] Step 500: loss: 0.876984, grad_norm: 0.369141, learning_rate: 0.000050, epoch: 0.002525
|
| 57 |
+
[2026-04-05 11:47:11] Step 510: loss: 0.864422, grad_norm: 27.125000, learning_rate: 0.000050, epoch: 0.002576
|
| 58 |
+
[2026-04-05 11:48:37] Step 520: loss: 0.639947, grad_norm: 3.015625, learning_rate: 0.000050, epoch: 0.002626
|
| 59 |
+
[2026-04-05 11:50:03] Step 530: loss: 0.694945, grad_norm: 0.267578, learning_rate: 0.000050, epoch: 0.002677
|
| 60 |
+
[2026-04-05 11:51:29] Step 540: loss: 0.740541, grad_norm: 0.257812, learning_rate: 0.000050, epoch: 0.002727
|
| 61 |
+
[2026-04-05 11:52:55] Step 550: loss: 0.665531, grad_norm: 1.664062, learning_rate: 0.000050, epoch: 0.002778
|
| 62 |
+
[2026-04-05 11:54:22] Step 560: loss: 0.724073, grad_norm: 0.261719, learning_rate: 0.000050, epoch: 0.002828
|
| 63 |
+
[2026-04-05 11:55:48] Step 570: loss: 0.746542, grad_norm: 0.243164, learning_rate: 0.000050, epoch: 0.002879
|
| 64 |
+
[2026-04-05 11:57:15] Step 580: loss: 0.692906, grad_norm: 0.194336, learning_rate: 0.000050, epoch: 0.002929
|
| 65 |
+
[2026-04-05 11:58:41] Step 590: loss: 0.690486, grad_norm: 0.341797, learning_rate: 0.000050, epoch: 0.002980
|
| 66 |
+
[2026-04-05 12:00:07] Step 600: loss: 0.650022, grad_norm: 0.289062, learning_rate: 0.000050, epoch: 0.003030
|
| 67 |
+
[2026-04-05 12:01:34] Step 610: loss: 0.640746, grad_norm: 0.221680, learning_rate: 0.000050, epoch: 0.003081
|
| 68 |
+
[2026-04-05 12:03:00] Step 620: loss: 1.449164, grad_norm: 1.546875, learning_rate: 0.000050, epoch: 0.003131
|
| 69 |
+
[2026-04-05 12:04:26] Step 630: loss: 0.656220, grad_norm: 5.218750, learning_rate: 0.000050, epoch: 0.003182
|
| 70 |
+
[2026-04-05 12:05:53] Step 640: loss: 0.636382, grad_norm: 0.196289, learning_rate: 0.000050, epoch: 0.003232
|
| 71 |
+
[2026-04-05 12:07:19] Step 650: loss: 0.668934, grad_norm: 0.675781, learning_rate: 0.000050, epoch: 0.003283
|
| 72 |
+
[2026-04-05 12:08:45] Step 660: loss: 0.748885, grad_norm: 0.257812, learning_rate: 0.000050, epoch: 0.003333
|
| 73 |
+
[2026-04-05 12:10:11] Step 670: loss: 0.788645, grad_norm: 0.394531, learning_rate: 0.000050, epoch: 0.003384
|
| 74 |
+
[2026-04-05 12:11:36] Step 680: loss: 0.697456, grad_norm: 1.367188, learning_rate: 0.000050, epoch: 0.003434
|
| 75 |
+
[2026-04-05 12:13:03] Step 690: loss: 0.792172, grad_norm: 0.265625, learning_rate: 0.000050, epoch: 0.003485
|
| 76 |
+
[2026-04-05 12:14:29] Step 700: loss: 0.606292, grad_norm: 0.216797, learning_rate: 0.000050, epoch: 0.003535
|
| 77 |
+
[2026-04-05 12:15:55] Step 710: loss: 0.598090, grad_norm: 1.273438, learning_rate: 0.000050, epoch: 0.003586
|
| 78 |
+
[2026-04-05 12:17:21] Step 720: loss: 0.541651, grad_norm: 0.240234, learning_rate: 0.000050, epoch: 0.003636
|
| 79 |
+
[2026-04-05 12:18:47] Step 730: loss: 0.705516, grad_norm: 0.269531, learning_rate: 0.000050, epoch: 0.003687
|
| 80 |
+
[2026-04-05 12:20:13] Step 740: loss: 0.738221, grad_norm: 0.283203, learning_rate: 0.000050, epoch: 0.003737
|
| 81 |
+
[2026-04-05 12:21:39] Step 750: loss: 0.643487, grad_norm: 0.314453, learning_rate: 0.000050, epoch: 0.003788
|
| 82 |
+
[2026-04-05 12:23:06] Step 760: loss: 0.658763, grad_norm: 0.216797, learning_rate: 0.000050, epoch: 0.003838
|
| 83 |
+
[2026-04-05 12:24:31] Step 770: loss: 0.842584, grad_norm: 0.302734, learning_rate: 0.000050, epoch: 0.003889
|
| 84 |
+
[2026-04-05 12:25:56] Step 780: loss: 0.650289, grad_norm: 0.285156, learning_rate: 0.000050, epoch: 0.003939
|
| 85 |
+
[2026-04-05 12:27:22] Step 790: loss: 0.637068, grad_norm: 0.820312, learning_rate: 0.000050, epoch: 0.003990
|
| 86 |
+
[2026-04-05 12:28:48] Step 800: loss: 0.753985, grad_norm: 0.380859, learning_rate: 0.000050, epoch: 0.004040
|
| 87 |
+
[2026-04-05 12:30:14] Step 810: loss: 0.632987, grad_norm: 0.210938, learning_rate: 0.000050, epoch: 0.004091
|
| 88 |
+
[2026-04-05 12:31:41] Step 820: loss: 0.630292, grad_norm: 0.281250, learning_rate: 0.000050, epoch: 0.004141
|
| 89 |
+
[2026-04-05 12:33:05] Step 830: loss: 1.052068, grad_norm: 0.341797, learning_rate: 0.000050, epoch: 0.004192
|
| 90 |
+
[2026-04-05 12:34:31] Step 840: loss: 0.878007, grad_norm: 0.223633, learning_rate: 0.000050, epoch: 0.004242
|
| 91 |
+
[2026-04-05 12:35:57] Step 850: loss: 0.665595, grad_norm: 1.851562, learning_rate: 0.000050, epoch: 0.004293
|
| 92 |
+
[2026-04-05 12:37:24] Step 860: loss: 0.665583, grad_norm: 0.216797, learning_rate: 0.000050, epoch: 0.004343
|
| 93 |
+
[2026-04-05 12:38:51] Step 870: loss: 0.647318, grad_norm: 0.263672, learning_rate: 0.000050, epoch: 0.004394
|
| 94 |
+
[2026-04-05 12:40:17] Step 880: loss: 0.662116, grad_norm: 0.166016, learning_rate: 0.000050, epoch: 0.004444
|
| 95 |
+
[2026-04-05 12:41:44] Step 890: loss: 0.642127, grad_norm: 0.190430, learning_rate: 0.000050, epoch: 0.004495
|
| 96 |
+
[2026-04-05 12:43:08] Step 900: loss: 0.716069, grad_norm: 0.253906, learning_rate: 0.000050, epoch: 0.004545
|
| 97 |
+
[2026-04-05 12:44:34] Step 910: loss: 0.595208, grad_norm: 0.687500, learning_rate: 0.000050, epoch: 0.004596
|
| 98 |
+
[2026-04-05 12:46:00] Step 920: loss: 0.696013, grad_norm: 0.226562, learning_rate: 0.000050, epoch: 0.004646
|
| 99 |
+
[2026-04-05 12:47:27] Step 930: loss: 0.703280, grad_norm: 0.498047, learning_rate: 0.000050, epoch: 0.004697
|
| 100 |
+
[2026-04-05 12:48:52] Step 940: loss: 0.597051, grad_norm: 0.330078, learning_rate: 0.000050, epoch: 0.004747
|
| 101 |
+
[2026-04-05 12:50:18] Step 950: loss: 0.667682, grad_norm: 0.179688, learning_rate: 0.000050, epoch: 0.004798
|
| 102 |
+
[2026-04-05 12:51:45] Step 960: loss: 0.640542, grad_norm: 0.153320, learning_rate: 0.000050, epoch: 0.004848
|
| 103 |
+
[2026-04-05 12:53:11] Step 970: loss: 0.671970, grad_norm: 0.427734, learning_rate: 0.000050, epoch: 0.004899
|
| 104 |
+
[2026-04-05 12:54:37] Step 980: loss: 0.693704, grad_norm: 0.229492, learning_rate: 0.000050, epoch: 0.004949
|
| 105 |
+
[2026-04-05 12:56:04] Step 990: loss: 0.625149, grad_norm: 0.636719, learning_rate: 0.000050, epoch: 0.005000
|
| 106 |
+
[2026-04-05 12:57:30] Step 1000: loss: 0.782581, grad_norm: 0.248047, learning_rate: 0.000050, epoch: 0.005051
|
| 107 |
+
[2026-04-05 12:58:56] Step 1010: loss: 0.668274, grad_norm: 0.181641, learning_rate: 0.000050, epoch: 0.005101
|
| 108 |
+
[2026-04-05 13:00:22] Step 1020: loss: 0.610654, grad_norm: 0.218750, learning_rate: 0.000050, epoch: 0.005152
|
| 109 |
+
[2026-04-05 13:01:49] Step 1030: loss: 0.662170, grad_norm: 0.150391, learning_rate: 0.000050, epoch: 0.005202
|
| 110 |
+
[2026-04-05 13:03:16] Step 1040: loss: 0.666967, grad_norm: 0.298828, learning_rate: 0.000050, epoch: 0.005253
|
| 111 |
+
[2026-04-05 13:04:41] Step 1050: loss: 0.680621, grad_norm: 0.190430, learning_rate: 0.000050, epoch: 0.005303
|
| 112 |
+
[2026-04-05 13:06:08] Step 1060: loss: 0.616938, grad_norm: 0.219727, learning_rate: 0.000050, epoch: 0.005354
|
| 113 |
+
[2026-04-05 13:07:34] Step 1070: loss: 0.619953, grad_norm: 0.142578, learning_rate: 0.000050, epoch: 0.005404
|
| 114 |
+
[2026-04-05 13:09:00] Step 1080: loss: 0.589728, grad_norm: 0.154297, learning_rate: 0.000050, epoch: 0.005455
|
| 115 |
+
[2026-04-05 13:10:27] Step 1090: loss: 0.573597, grad_norm: 0.179688, learning_rate: 0.000050, epoch: 0.005505
|
| 116 |
+
[2026-04-05 13:11:53] Step 1100: loss: 0.613491, grad_norm: 0.206055, learning_rate: 0.000050, epoch: 0.005556
|
| 117 |
+
[2026-04-05 13:13:18] Step 1110: loss: 0.460272, grad_norm: 0.214844, learning_rate: 0.000050, epoch: 0.005606
|
| 118 |
+
[2026-04-05 13:14:45] Step 1120: loss: 0.553796, grad_norm: 0.153320, learning_rate: 0.000050, epoch: 0.005657
|
| 119 |
+
[2026-04-05 13:16:11] Step 1130: loss: 0.724727, grad_norm: 0.198242, learning_rate: 0.000050, epoch: 0.005707
|
| 120 |
+
[2026-04-05 13:17:37] Step 1140: loss: 0.491195, grad_norm: 0.137695, learning_rate: 0.000050, epoch: 0.005758
|
| 121 |
+
[2026-04-05 13:19:01] Step 1150: loss: 0.529206, grad_norm: 0.191406, learning_rate: 0.000050, epoch: 0.005808
|
| 122 |
+
[2026-04-05 13:20:28] Step 1160: loss: 0.525564, grad_norm: 0.140625, learning_rate: 0.000050, epoch: 0.005859
|
| 123 |
+
[2026-04-05 13:21:55] Step 1170: loss: 0.576415, grad_norm: 0.273438, learning_rate: 0.000050, epoch: 0.005909
|
| 124 |
+
[2026-04-05 13:23:21] Step 1180: loss: 0.495870, grad_norm: 0.283203, learning_rate: 0.000050, epoch: 0.005960
|
| 125 |
+
[2026-04-05 13:24:47] Step 1190: loss: 0.691925, grad_norm: 0.213867, learning_rate: 0.000050, epoch: 0.006010
|
| 126 |
+
[2026-04-05 13:26:13] Step 1200: loss: 0.508326, grad_norm: 0.263672, learning_rate: 0.000050, epoch: 0.006061
|
| 127 |
+
[2026-04-05 13:27:39] Step 1210: loss: 0.634969, grad_norm: 0.185547, learning_rate: 0.000050, epoch: 0.006111
|
| 128 |
+
[2026-04-05 13:29:05] Step 1220: loss: 0.454523, grad_norm: 0.191406, learning_rate: 0.000050, epoch: 0.006162
|
| 129 |
+
[2026-04-05 13:30:31] Step 1230: loss: 0.529709, grad_norm: 0.165039, learning_rate: 0.000050, epoch: 0.006212
|
| 130 |
+
[2026-04-05 13:31:57] Step 1240: loss: 0.654458, grad_norm: 0.217773, learning_rate: 0.000050, epoch: 0.006263
|
| 131 |
+
[2026-04-05 13:33:24] Step 1250: loss: 0.644524, grad_norm: 0.145508, learning_rate: 0.000050, epoch: 0.006313
|
| 132 |
+
[2026-04-05 13:34:50] Step 1260: loss: 0.558337, grad_norm: 0.186523, learning_rate: 0.000050, epoch: 0.006364
|
| 133 |
+
[2026-04-05 13:36:17] Step 1270: loss: 0.653074, grad_norm: 0.197266, learning_rate: 0.000050, epoch: 0.006414
|
| 134 |
+
[2026-04-05 13:37:44] Step 1280: loss: 0.553103, grad_norm: 0.476562, learning_rate: 0.000050, epoch: 0.006465
|
| 135 |
+
[2026-04-05 13:39:10] Step 1290: loss: 0.591599, grad_norm: 0.165039, learning_rate: 0.000050, epoch: 0.006515
|
| 136 |
+
[2026-04-05 13:40:37] Step 1300: loss: 0.455996, grad_norm: 0.163086, learning_rate: 0.000050, epoch: 0.006566
|
| 137 |
+
[2026-04-05 13:42:02] Step 1310: loss: 0.569381, grad_norm: 0.230469, learning_rate: 0.000050, epoch: 0.006616
|
| 138 |
+
[2026-04-05 13:43:28] Step 1320: loss: 0.583628, grad_norm: 0.169922, learning_rate: 0.000050, epoch: 0.006667
|
| 139 |
+
[2026-04-05 13:44:55] Step 1330: loss: 0.469083, grad_norm: 0.185547, learning_rate: 0.000050, epoch: 0.006717
|
| 140 |
+
[2026-04-05 13:46:22] Step 1340: loss: 0.524718, grad_norm: 0.180664, learning_rate: 0.000050, epoch: 0.006768
|
| 141 |
+
[2026-04-05 13:47:48] Step 1350: loss: 0.517907, grad_norm: 0.746094, learning_rate: 0.000050, epoch: 0.006818
|
| 142 |
+
[2026-04-05 13:49:14] Step 1360: loss: 0.545603, grad_norm: 0.306641, learning_rate: 0.000050, epoch: 0.006869
|
| 143 |
+
[2026-04-05 13:50:41] Step 1370: loss: 0.430360, grad_norm: 0.169922, learning_rate: 0.000050, epoch: 0.006919
|
| 144 |
+
[2026-04-05 13:52:06] Step 1380: loss: 0.564411, grad_norm: 0.535156, learning_rate: 0.000050, epoch: 0.006970
|
| 145 |
+
[2026-04-05 13:53:33] Step 1390: loss: 0.523255, grad_norm: 0.330078, learning_rate: 0.000050, epoch: 0.007020
|
| 146 |
+
[2026-04-05 13:55:00] Step 1400: loss: 0.616001, grad_norm: 0.470703, learning_rate: 0.000050, epoch: 0.007071
|
| 147 |
+
[2026-04-05 13:56:25] Step 1410: loss: 0.639578, grad_norm: 0.178711, learning_rate: 0.000050, epoch: 0.007121
|
| 148 |
+
[2026-04-05 13:57:50] Step 1420: loss: 0.471984, grad_norm: 0.310547, learning_rate: 0.000050, epoch: 0.007172
|
| 149 |
+
[2026-04-05 13:59:16] Step 1430: loss: 0.549651, grad_norm: 0.263672, learning_rate: 0.000050, epoch: 0.007222
|
| 150 |
+
[2026-04-05 14:00:42] Step 1440: loss: 0.576872, grad_norm: 0.235352, learning_rate: 0.000050, epoch: 0.007273
|
| 151 |
+
[2026-04-05 14:02:08] Step 1450: loss: 0.552616, grad_norm: 0.242188, learning_rate: 0.000050, epoch: 0.007323
|
| 152 |
+
[2026-04-05 14:03:34] Step 1460: loss: 0.531245, grad_norm: 0.177734, learning_rate: 0.000050, epoch: 0.007374
|
| 153 |
+
[2026-04-05 14:05:01] Step 1470: loss: 1.899408, grad_norm: 0.574219, learning_rate: 0.000050, epoch: 0.007424
|
| 154 |
+
[2026-04-05 14:06:26] Step 1480: loss: 1.379459, grad_norm: 0.466797, learning_rate: 0.000050, epoch: 0.007475
|
| 155 |
+
[2026-04-05 14:07:52] Step 1490: loss: 1.148382, grad_norm: 0.281250, learning_rate: 0.000050, epoch: 0.007525
|
| 156 |
+
[2026-04-05 14:09:18] Step 1500: loss: 1.149607, grad_norm: 0.235352, learning_rate: 0.000050, epoch: 0.007576
|
| 157 |
+
[2026-04-05 14:10:45] Step 1510: loss: 1.059986, grad_norm: 0.345703, learning_rate: 0.000050, epoch: 0.007626
|
| 158 |
+
[2026-04-05 14:12:11] Step 1520: loss: 0.955449, grad_norm: 2.234375, learning_rate: 0.000050, epoch: 0.007677
|
| 159 |
+
[2026-04-05 14:13:37] Step 1530: loss: 1.007732, grad_norm: 0.380859, learning_rate: 0.000050, epoch: 0.007727
|
| 160 |
+
[2026-04-05 14:15:02] Step 1540: loss: 1.008875, grad_norm: 0.542969, learning_rate: 0.000050, epoch: 0.007778
|
| 161 |
+
[2026-04-05 14:16:28] Step 1550: loss: 1.136967, grad_norm: 6.312500, learning_rate: 0.000050, epoch: 0.007828
|
| 162 |
+
[2026-04-05 14:17:52] Step 1560: loss: 1.325755, grad_norm: 0.455078, learning_rate: 0.000050, epoch: 0.007879
|
| 163 |
+
[2026-04-05 14:19:18] Step 1570: loss: 1.099739, grad_norm: 0.410156, learning_rate: 0.000050, epoch: 0.007929
|
| 164 |
+
[2026-04-05 14:20:43] Step 1580: loss: 1.002961, grad_norm: 0.554688, learning_rate: 0.000050, epoch: 0.007980
|
| 165 |
+
[2026-04-05 14:22:09] Step 1590: loss: 1.057071, grad_norm: 0.250000, learning_rate: 0.000050, epoch: 0.008030
|
| 166 |
+
[2026-04-05 14:23:35] Step 1600: loss: 0.905516, grad_norm: 0.249023, learning_rate: 0.000050, epoch: 0.008081
|
| 167 |
+
[2026-04-05 14:25:01] Step 1610: loss: 1.050816, grad_norm: 0.337891, learning_rate: 0.000050, epoch: 0.008131
|
| 168 |
+
[2026-04-05 14:26:27] Step 1620: loss: 0.760539, grad_norm: 0.238281, learning_rate: 0.000050, epoch: 0.008182
|
| 169 |
+
[2026-04-05 14:27:54] Step 1630: loss: 0.866014, grad_norm: 0.337891, learning_rate: 0.000050, epoch: 0.008232
|
| 170 |
+
[2026-04-05 14:29:20] Step 1640: loss: 0.878096, grad_norm: 0.878906, learning_rate: 0.000050, epoch: 0.008283
|
| 171 |
+
[2026-04-05 14:30:46] Step 1650: loss: 0.899268, grad_norm: 0.198242, learning_rate: 0.000050, epoch: 0.008333
|
| 172 |
+
[2026-04-05 14:32:12] Step 1660: loss: 0.805451, grad_norm: 0.312500, learning_rate: 0.000050, epoch: 0.008384
|
| 173 |
+
[2026-04-05 14:33:37] Step 1670: loss: 0.875339, grad_norm: 0.226562, learning_rate: 0.000050, epoch: 0.008434
|
| 174 |
+
[2026-04-05 14:35:04] Step 1680: loss: 0.903578, grad_norm: 0.197266, learning_rate: 0.000050, epoch: 0.008485
|
| 175 |
+
[2026-04-05 14:36:30] Step 1690: loss: 0.893119, grad_norm: 0.500000, learning_rate: 0.000050, epoch: 0.008535
|
| 176 |
+
[2026-04-05 14:37:56] Step 1700: loss: 0.808819, grad_norm: 0.170898, learning_rate: 0.000050, epoch: 0.008586
|
| 177 |
+
[2026-04-05 14:39:21] Step 1710: loss: 0.837901, grad_norm: 0.263672, learning_rate: 0.000050, epoch: 0.008636
|
| 178 |
+
[2026-04-05 14:40:48] Step 1720: loss: 0.789509, grad_norm: 0.226562, learning_rate: 0.000050, epoch: 0.008687
|
| 179 |
+
[2026-04-05 14:42:13] Step 1730: loss: 0.984356, grad_norm: 0.265625, learning_rate: 0.000050, epoch: 0.008737
|
| 180 |
+
[2026-04-05 14:43:38] Step 1740: loss: 1.009939, grad_norm: 0.218750, learning_rate: 0.000050, epoch: 0.008788
|
| 181 |
+
[2026-04-05 14:45:04] Step 1750: loss: 0.945137, grad_norm: 1.250000, learning_rate: 0.000050, epoch: 0.008838
|
| 182 |
+
[2026-04-05 14:46:30] Step 1760: loss: 0.892640, grad_norm: 0.200195, learning_rate: 0.000050, epoch: 0.008889
|
| 183 |
+
[2026-04-05 14:47:56] Step 1770: loss: 0.845068, grad_norm: 0.240234, learning_rate: 0.000050, epoch: 0.008939
|
| 184 |
+
[2026-04-05 14:49:20] Step 1780: loss: 1.132531, grad_norm: 0.652344, learning_rate: 0.000050, epoch: 0.008990
|
| 185 |
+
[2026-04-05 14:50:47] Step 1790: loss: 1.225483, grad_norm: 32.750000, learning_rate: 0.000050, epoch: 0.009040
|
| 186 |
+
[2026-04-05 14:52:13] Step 1800: loss: 1.753622, grad_norm: 0.304688, learning_rate: 0.000050, epoch: 0.009091
|
| 187 |
+
[2026-04-05 14:53:39] Step 1810: loss: 1.767740, grad_norm: 3.062500, learning_rate: 0.000050, epoch: 0.009141
|
| 188 |
+
[2026-04-05 14:55:05] Step 1820: loss: 1.830482, grad_norm: 2.359375, learning_rate: 0.000050, epoch: 0.009192
|
| 189 |
+
[2026-04-05 14:56:32] Step 1830: loss: 1.026241, grad_norm: 0.345703, learning_rate: 0.000050, epoch: 0.009242
|
| 190 |
+
[2026-04-05 14:57:58] Step 1840: loss: 1.049114, grad_norm: 0.237305, learning_rate: 0.000050, epoch: 0.009293
|
| 191 |
+
[2026-04-05 14:59:25] Step 1850: loss: 1.213499, grad_norm: 0.227539, learning_rate: 0.000050, epoch: 0.009343
|
| 192 |
+
[2026-04-05 15:00:51] Step 1860: loss: 1.006206, grad_norm: 0.355469, learning_rate: 0.000050, epoch: 0.009394
|
| 193 |
+
[2026-04-05 15:02:17] Step 1870: loss: 0.872990, grad_norm: 0.176758, learning_rate: 0.000050, epoch: 0.009444
|
| 194 |
+
[2026-04-05 15:03:42] Step 1880: loss: 0.959800, grad_norm: 0.357422, learning_rate: 0.000050, epoch: 0.009495
|
| 195 |
+
[2026-04-05 15:05:08] Step 1890: loss: 0.843189, grad_norm: 0.269531, learning_rate: 0.000050, epoch: 0.009545
|
| 196 |
+
[2026-04-05 15:06:33] Step 1900: loss: 0.732752, grad_norm: 0.384766, learning_rate: 0.000050, epoch: 0.009596
|
| 197 |
+
[2026-04-05 15:07:59] Step 1910: loss: 0.886395, grad_norm: 0.166016, learning_rate: 0.000050, epoch: 0.009646
|
| 198 |
+
[2026-04-05 15:09:25] Step 1920: loss: 0.814924, grad_norm: 0.179688, learning_rate: 0.000050, epoch: 0.009697
|
| 199 |
+
[2026-04-05 15:10:51] Step 1930: loss: 0.955270, grad_norm: 0.335938, learning_rate: 0.000050, epoch: 0.009747
|
| 200 |
+
[2026-04-05 15:12:18] Step 1940: loss: 1.084479, grad_norm: 0.250000, learning_rate: 0.000050, epoch: 0.009798
|
| 201 |
+
[2026-04-05 15:13:43] Step 1950: loss: 1.049752, grad_norm: 0.255859, learning_rate: 0.000050, epoch: 0.009848
|
| 202 |
+
[2026-04-05 15:15:10] Step 1960: loss: 0.852993, grad_norm: 0.197266, learning_rate: 0.000050, epoch: 0.009899
|
| 203 |
+
[2026-04-05 15:16:36] Step 1970: loss: 1.096658, grad_norm: 0.166016, learning_rate: 0.000050, epoch: 0.009949
|
| 204 |
+
[2026-04-05 15:18:03] Step 1980: loss: 0.874805, grad_norm: 0.217773, learning_rate: 0.000050, epoch: 0.010000
|
| 205 |
+
[2026-04-05 15:19:28] Step 1990: loss: 0.894336, grad_norm: 0.163086, learning_rate: 0.000050, epoch: 0.010051
|
| 206 |
+
[2026-04-05 15:20:54] Step 2000: loss: 0.869858, grad_norm: 0.263672, learning_rate: 0.000050, epoch: 0.010101
|
| 207 |
+
[2026-04-05 15:22:20] Step 2010: loss: 0.753163, grad_norm: 0.175781, learning_rate: 0.000050, epoch: 0.010152
|
| 208 |
+
[2026-04-05 15:23:44] Step 2020: loss: 0.821059, grad_norm: 0.289062, learning_rate: 0.000050, epoch: 0.010202
|
| 209 |
+
[2026-04-05 15:25:09] Step 2030: loss: 0.929537, grad_norm: 0.326172, learning_rate: 0.000050, epoch: 0.010253
|
| 210 |
+
[2026-04-05 15:26:35] Step 2040: loss: 0.774003, grad_norm: 0.217773, learning_rate: 0.000050, epoch: 0.010303
|
| 211 |
+
[2026-04-05 15:28:00] Step 2050: loss: 0.576957, grad_norm: 0.189453, learning_rate: 0.000050, epoch: 0.010354
|
| 212 |
+
[2026-04-05 15:29:27] Step 2060: loss: 0.834949, grad_norm: 0.163086, learning_rate: 0.000050, epoch: 0.010404
|
| 213 |
+
[2026-04-05 15:30:52] Step 2070: loss: 0.714023, grad_norm: 0.213867, learning_rate: 0.000050, epoch: 0.010455
|
| 214 |
+
[2026-04-05 15:32:18] Step 2080: loss: 0.683500, grad_norm: 0.187500, learning_rate: 0.000050, epoch: 0.010505
|
| 215 |
+
[2026-04-05 15:33:44] Step 2090: loss: 0.759647, grad_norm: 0.220703, learning_rate: 0.000050, epoch: 0.010556
|
| 216 |
+
[2026-04-05 15:35:11] Step 2100: loss: 0.683842, grad_norm: 0.146484, learning_rate: 0.000050, epoch: 0.010606
|
| 217 |
+
[2026-04-05 15:36:37] Step 2110: loss: 0.823089, grad_norm: 0.138672, learning_rate: 0.000050, epoch: 0.010657
|
| 218 |
+
[2026-04-05 15:38:03] Step 2120: loss: 0.762830, grad_norm: 0.210938, learning_rate: 0.000050, epoch: 0.010707
|
| 219 |
+
[2026-04-05 15:39:28] Step 2130: loss: 0.739592, grad_norm: 0.182617, learning_rate: 0.000050, epoch: 0.010758
|
| 220 |
+
[2026-04-05 15:40:55] Step 2140: loss: 0.734296, grad_norm: 0.165039, learning_rate: 0.000050, epoch: 0.010808
|
| 221 |
+
[2026-04-05 15:42:22] Step 2150: loss: 0.685651, grad_norm: 0.217773, learning_rate: 0.000050, epoch: 0.010859
|
| 222 |
+
[2026-04-05 15:43:48] Step 2160: loss: 0.764905, grad_norm: 0.129883, learning_rate: 0.000050, epoch: 0.010909
|
| 223 |
+
[2026-04-05 15:45:14] Step 2170: loss: 0.686083, grad_norm: 0.140625, learning_rate: 0.000050, epoch: 0.010960
|
| 224 |
+
[2026-04-05 15:46:41] Step 2180: loss: 0.763166, grad_norm: 0.255859, learning_rate: 0.000050, epoch: 0.011010
|
| 225 |
+
[2026-04-05 15:48:06] Step 2190: loss: 0.669076, grad_norm: 0.127930, learning_rate: 0.000050, epoch: 0.011061
|
| 226 |
+
[2026-04-05 15:49:32] Step 2200: loss: 0.650153, grad_norm: 0.304688, learning_rate: 0.000050, epoch: 0.011111
|
| 227 |
+
[2026-04-05 15:50:58] Step 2210: loss: 0.832567, grad_norm: 0.210938, learning_rate: 0.000050, epoch: 0.011162
|
| 228 |
+
[2026-04-05 15:52:23] Step 2220: loss: 1.139757, grad_norm: 1.109375, learning_rate: 0.000050, epoch: 0.011212
|
| 229 |
+
[2026-04-05 15:53:47] Step 2230: loss: 1.250969, grad_norm: 0.484375, learning_rate: 0.000050, epoch: 0.011263
|
| 230 |
+
[2026-04-05 15:55:11] Step 2240: loss: 1.130311, grad_norm: 0.263672, learning_rate: 0.000050, epoch: 0.011313
|
| 231 |
+
[2026-04-05 15:56:35] Step 2250: loss: 1.492599, grad_norm: 0.898438, learning_rate: 0.000050, epoch: 0.011364
|
| 232 |
+
[2026-04-05 15:57:59] Step 2260: loss: 1.048409, grad_norm: 0.773438, learning_rate: 0.000050, epoch: 0.011414
|
| 233 |
+
[2026-04-05 15:59:22] Step 2270: loss: 1.264527, grad_norm: 0.558594, learning_rate: 0.000050, epoch: 0.011465
|
| 234 |
+
[2026-04-05 16:00:46] Step 2280: loss: 1.209084, grad_norm: 0.226562, learning_rate: 0.000050, epoch: 0.011515
|
| 235 |
+
[2026-04-05 16:02:10] Step 2290: loss: 1.030489, grad_norm: 1.007812, learning_rate: 0.000050, epoch: 0.011566
|
| 236 |
+
[2026-04-05 16:03:33] Step 2300: loss: 1.380919, grad_norm: 1.054688, learning_rate: 0.000050, epoch: 0.011616
|
| 237 |
+
[2026-04-05 16:04:57] Step 2310: loss: 1.237099, grad_norm: 1.898438, learning_rate: 0.000050, epoch: 0.011667
|
| 238 |
+
[2026-04-05 16:06:21] Step 2320: loss: 1.099260, grad_norm: 1.437500, learning_rate: 0.000050, epoch: 0.011717
|
| 239 |
+
[2026-04-05 16:07:45] Step 2330: loss: 1.179968, grad_norm: 1.156250, learning_rate: 0.000050, epoch: 0.011768
|
| 240 |
+
[2026-04-05 16:09:09] Step 2340: loss: 1.348586, grad_norm: 0.777344, learning_rate: 0.000050, epoch: 0.011818
|
| 241 |
+
[2026-04-05 16:10:33] Step 2350: loss: 1.062583, grad_norm: 0.345703, learning_rate: 0.000050, epoch: 0.011869
|
| 242 |
+
[2026-04-05 16:11:56] Step 2360: loss: 0.927481, grad_norm: 0.582031, learning_rate: 0.000050, epoch: 0.011919
|
| 243 |
+
[2026-04-05 16:13:21] Step 2370: loss: 1.113601, grad_norm: 0.478516, learning_rate: 0.000050, epoch: 0.011970
|
| 244 |
+
[2026-04-05 16:14:45] Step 2380: loss: 0.967371, grad_norm: 0.464844, learning_rate: 0.000050, epoch: 0.012020
|
| 245 |
+
[2026-04-05 16:16:08] Step 2390: loss: 1.120241, grad_norm: 1.148438, learning_rate: 0.000050, epoch: 0.012071
|
| 246 |
+
[2026-04-05 16:17:32] Step 2400: loss: 0.954143, grad_norm: 0.542969, learning_rate: 0.000050, epoch: 0.012121
|
| 247 |
+
[2026-04-05 16:18:58] Step 2410: loss: 2.039123, grad_norm: 0.503906, learning_rate: 0.000050, epoch: 0.012172
|
| 248 |
+
[2026-04-05 16:20:23] Step 2420: loss: 1.816168, grad_norm: 0.742188, learning_rate: 0.000050, epoch: 0.012222
|
| 249 |
+
[2026-04-05 16:21:49] Step 2430: loss: 1.829216, grad_norm: 8.875000, learning_rate: 0.000050, epoch: 0.012273
|
| 250 |
+
[2026-04-05 16:23:13] Step 2440: loss: 2.097993, grad_norm: 0.468750, learning_rate: 0.000050, epoch: 0.012323
|
| 251 |
+
[2026-04-05 16:24:39] Step 2450: loss: 1.479318, grad_norm: 0.824219, learning_rate: 0.000050, epoch: 0.012374
|
| 252 |
+
[2026-04-05 16:26:06] Step 2460: loss: 1.397717, grad_norm: 0.285156, learning_rate: 0.000050, epoch: 0.012424
|
| 253 |
+
[2026-04-05 16:27:32] Step 2470: loss: 1.590504, grad_norm: 0.808594, learning_rate: 0.000050, epoch: 0.012475
|
| 254 |
+
[2026-04-05 16:28:57] Step 2480: loss: 1.398745, grad_norm: 0.480469, learning_rate: 0.000050, epoch: 0.012525
|
| 255 |
+
[2026-04-05 16:30:24] Step 2490: loss: 1.382736, grad_norm: 2.500000, learning_rate: 0.000050, epoch: 0.012576
|
| 256 |
+
[2026-04-05 16:31:50] Step 2500: loss: 1.278419, grad_norm: 0.371094, learning_rate: 0.000050, epoch: 0.012626
|
| 257 |
+
[2026-04-05 16:33:16] Step 2510: loss: 1.223915, grad_norm: 0.233398, learning_rate: 0.000050, epoch: 0.012677
|
| 258 |
+
[2026-04-05 16:34:42] Step 2520: loss: 1.601561, grad_norm: 1.093750, learning_rate: 0.000050, epoch: 0.012727
|
| 259 |
+
[2026-04-05 16:36:08] Step 2530: loss: 1.213593, grad_norm: 4.312500, learning_rate: 0.000050, epoch: 0.012778
|
| 260 |
+
[2026-04-05 16:37:35] Step 2540: loss: 1.188114, grad_norm: 1.085938, learning_rate: 0.000050, epoch: 0.012828
|
| 261 |
+
[2026-04-05 16:39:01] Step 2550: loss: 1.616557, grad_norm: 0.257812, learning_rate: 0.000050, epoch: 0.012879
|
| 262 |
+
[2026-04-05 16:40:27] Step 2560: loss: 1.487203, grad_norm: 0.380859, learning_rate: 0.000050, epoch: 0.012929
|
| 263 |
+
[2026-04-05 16:41:53] Step 2570: loss: 1.499329, grad_norm: 0.640625, learning_rate: 0.000050, epoch: 0.012980
|
| 264 |
+
[2026-04-05 16:43:19] Step 2580: loss: 1.215436, grad_norm: 0.392578, learning_rate: 0.000050, epoch: 0.013030
|
| 265 |
+
[2026-04-05 16:44:45] Step 2590: loss: 1.291605, grad_norm: 2.015625, learning_rate: 0.000050, epoch: 0.013081
|
| 266 |
+
[2026-04-05 16:46:11] Step 2600: loss: 1.156400, grad_norm: 0.550781, learning_rate: 0.000050, epoch: 0.013131
|
| 267 |
+
[2026-04-05 16:47:37] Step 2610: loss: 1.193227, grad_norm: 0.582031, learning_rate: 0.000050, epoch: 0.013182
|
| 268 |
+
[2026-04-05 16:49:04] Step 2620: loss: 1.147099, grad_norm: 0.204102, learning_rate: 0.000050, epoch: 0.013232
|
| 269 |
+
[2026-04-05 16:50:29] Step 2630: loss: 1.581532, grad_norm: 0.294922, learning_rate: 0.000050, epoch: 0.013283
|
| 270 |
+
[2026-04-05 16:51:55] Step 2640: loss: 1.038281, grad_norm: 0.229492, learning_rate: 0.000050, epoch: 0.013333
|
| 271 |
+
[2026-04-05 16:53:21] Step 2650: loss: 1.072556, grad_norm: 0.251953, learning_rate: 0.000050, epoch: 0.013384
|
| 272 |
+
[2026-04-05 16:54:47] Step 2660: loss: 1.094269, grad_norm: 0.196289, learning_rate: 0.000050, epoch: 0.013434
|
| 273 |
+
[2026-04-05 16:56:14] Step 2670: loss: 0.990274, grad_norm: 0.191406, learning_rate: 0.000050, epoch: 0.013485
|
| 274 |
+
[2026-04-05 16:57:40] Step 2680: loss: 1.059046, grad_norm: 0.302734, learning_rate: 0.000050, epoch: 0.013535
|
| 275 |
+
[2026-04-05 16:59:06] Step 2690: loss: 0.863274, grad_norm: 0.367188, learning_rate: 0.000050, epoch: 0.013586
|
| 276 |
+
[2026-04-05 17:00:32] Step 2700: loss: 0.899668, grad_norm: 0.159180, learning_rate: 0.000050, epoch: 0.013636
|
| 277 |
+
[2026-04-05 17:01:57] Step 2710: loss: 1.116811, grad_norm: 1.015625, learning_rate: 0.000050, epoch: 0.013687
|
| 278 |
+
[2026-04-05 17:03:23] Step 2720: loss: 1.039225, grad_norm: 0.200195, learning_rate: 0.000050, epoch: 0.013737
|
| 279 |
+
[2026-04-05 17:04:50] Step 2730: loss: 0.910703, grad_norm: 0.302734, learning_rate: 0.000050, epoch: 0.013788
|
| 280 |
+
[2026-04-05 17:06:15] Step 2740: loss: 0.952942, grad_norm: 0.257812, learning_rate: 0.000050, epoch: 0.013838
|
| 281 |
+
[2026-04-05 17:07:41] Step 2750: loss: 0.836150, grad_norm: 0.218750, learning_rate: 0.000050, epoch: 0.013889
|
| 282 |
+
[2026-04-05 17:09:05] Step 2760: loss: 1.159457, grad_norm: 0.414062, learning_rate: 0.000050, epoch: 0.013939
|
| 283 |
+
[2026-04-05 17:10:30] Step 2770: loss: 1.032300, grad_norm: 0.388672, learning_rate: 0.000050, epoch: 0.013990
|
| 284 |
+
[2026-04-05 17:11:53] Step 2780: loss: 1.089910, grad_norm: 0.427734, learning_rate: 0.000050, epoch: 0.014040
|
| 285 |
+
[2026-04-05 17:13:17] Step 2790: loss: 0.992994, grad_norm: 0.335938, learning_rate: 0.000050, epoch: 0.014091
|
| 286 |
+
[2026-04-05 17:14:41] Step 2800: loss: 0.949600, grad_norm: 0.310547, learning_rate: 0.000050, epoch: 0.014141
|
| 287 |
+
[2026-04-05 17:16:06] Step 2810: loss: 0.910935, grad_norm: 0.273438, learning_rate: 0.000050, epoch: 0.014192
|
| 288 |
+
[2026-04-05 17:17:30] Step 2820: loss: 0.931577, grad_norm: 0.246094, learning_rate: 0.000050, epoch: 0.014242
|
| 289 |
+
[2026-04-05 17:18:54] Step 2830: loss: 1.009028, grad_norm: 0.531250, learning_rate: 0.000050, epoch: 0.014293
|
| 290 |
+
[2026-04-05 17:20:18] Step 2840: loss: 1.071189, grad_norm: 0.337891, learning_rate: 0.000050, epoch: 0.014343
|
| 291 |
+
[2026-04-05 17:21:42] Step 2850: loss: 0.848302, grad_norm: 0.306641, learning_rate: 0.000050, epoch: 0.014394
|
| 292 |
+
[2026-04-05 17:23:06] Step 2860: loss: 0.861690, grad_norm: 0.300781, learning_rate: 0.000050, epoch: 0.014444
|
| 293 |
+
[2026-04-05 17:24:30] Step 2870: loss: 0.811077, grad_norm: 0.371094, learning_rate: 0.000050, epoch: 0.014495
|
| 294 |
+
[2026-04-05 17:25:55] Step 2880: loss: 0.871173, grad_norm: 0.239258, learning_rate: 0.000050, epoch: 0.014545
|
| 295 |
+
[2026-04-05 17:27:19] Step 2890: loss: 0.917228, grad_norm: 0.351562, learning_rate: 0.000050, epoch: 0.014596
|
| 296 |
+
[2026-04-05 17:28:43] Step 2900: loss: 0.786758, grad_norm: 0.261719, learning_rate: 0.000050, epoch: 0.014646
|
| 297 |
+
[2026-04-05 17:30:08] Step 2910: loss: 1.140635, grad_norm: 0.371094, learning_rate: 0.000050, epoch: 0.014697
|
| 298 |
+
[2026-04-05 17:31:33] Step 2920: loss: 1.568506, grad_norm: 1.101562, learning_rate: 0.000050, epoch: 0.014747
|
| 299 |
+
[2026-04-05 17:32:58] Step 2930: loss: 1.817063, grad_norm: 0.357422, learning_rate: 0.000050, epoch: 0.014798
|
| 300 |
+
[2026-04-05 17:34:23] Step 2940: loss: 1.432751, grad_norm: 0.707031, learning_rate: 0.000050, epoch: 0.014848
|
| 301 |
+
[2026-04-05 17:35:48] Step 2950: loss: 1.237652, grad_norm: 0.388672, learning_rate: 0.000050, epoch: 0.014899
|
| 302 |
+
[2026-04-05 17:37:13] Step 2960: loss: 1.362940, grad_norm: 0.503906, learning_rate: 0.000050, epoch: 0.014949
|
| 303 |
+
[2026-04-05 17:38:38] Step 2970: loss: 1.296589, grad_norm: 0.687500, learning_rate: 0.000050, epoch: 0.015000
|
| 304 |
+
[2026-04-05 17:40:03] Step 2980: loss: 1.269760, grad_norm: 0.281250, learning_rate: 0.000050, epoch: 0.015051
|
| 305 |
+
[2026-04-05 17:41:29] Step 2990: loss: 1.306033, grad_norm: 0.664062, learning_rate: 0.000050, epoch: 0.015101
|
| 306 |
+
[2026-04-05 17:42:53] Step 3000: loss: 1.112861, grad_norm: 0.361328, learning_rate: 0.000050, epoch: 0.015152
|
| 307 |
+
[2026-04-05 17:44:17] Step 3010: loss: 1.465074, grad_norm: 0.357422, learning_rate: 0.000050, epoch: 0.015202
|
| 308 |
+
[2026-04-05 17:45:43] Step 3020: loss: 1.325169, grad_norm: 0.945312, learning_rate: 0.000050, epoch: 0.015253
|
| 309 |
+
[2026-04-05 17:47:07] Step 3030: loss: 1.242030, grad_norm: 1.453125, learning_rate: 0.000050, epoch: 0.015303
|
| 310 |
+
[2026-04-05 17:48:31] Step 3040: loss: 1.179085, grad_norm: 0.640625, learning_rate: 0.000050, epoch: 0.015354
|
| 311 |
+
[2026-04-05 17:49:57] Step 3050: loss: 1.195576, grad_norm: 0.296875, learning_rate: 0.000050, epoch: 0.015404
|
| 312 |
+
[2026-04-05 17:51:22] Step 3060: loss: 1.088905, grad_norm: 0.423828, learning_rate: 0.000050, epoch: 0.015455
|
| 313 |
+
[2026-04-05 17:52:46] Step 3070: loss: 1.133894, grad_norm: 2.093750, learning_rate: 0.000050, epoch: 0.015505
|
| 314 |
+
[2026-04-05 17:54:12] Step 3080: loss: 1.182403, grad_norm: 0.226562, learning_rate: 0.000050, epoch: 0.015556
|
| 315 |
+
[2026-04-05 17:55:38] Step 3090: loss: 1.205568, grad_norm: 0.213867, learning_rate: 0.000050, epoch: 0.015606
|
| 316 |
+
[2026-04-05 17:57:04] Step 3100: loss: 0.972407, grad_norm: 0.398438, learning_rate: 0.000050, epoch: 0.015657
|
| 317 |
+
[2026-04-05 17:58:28] Step 3110: loss: 0.908546, grad_norm: 0.174805, learning_rate: 0.000050, epoch: 0.015707
|
| 318 |
+
[2026-04-05 17:59:53] Step 3120: loss: 1.111314, grad_norm: 0.231445, learning_rate: 0.000050, epoch: 0.015758
|
| 319 |
+
[2026-04-05 18:01:18] Step 3130: loss: 0.902070, grad_norm: 0.152344, learning_rate: 0.000050, epoch: 0.015808
|
| 320 |
+
[2026-04-05 18:02:42] Step 3140: loss: 0.900240, grad_norm: 0.263672, learning_rate: 0.000050, epoch: 0.015859
|
| 321 |
+
[2026-04-05 18:04:05] Step 3150: loss: 0.646932, grad_norm: 0.161133, learning_rate: 0.000050, epoch: 0.015909
|
| 322 |
+
[2026-04-05 18:05:30] Step 3160: loss: 1.058152, grad_norm: 0.247070, learning_rate: 0.000050, epoch: 0.015960
|
| 323 |
+
[2026-04-05 18:06:55] Step 3170: loss: 1.120416, grad_norm: 2.734375, learning_rate: 0.000050, epoch: 0.016010
|
| 324 |
+
[2026-04-05 18:08:20] Step 3180: loss: 1.347795, grad_norm: 0.410156, learning_rate: 0.000050, epoch: 0.016061
|
| 325 |
+
[2026-04-05 18:09:45] Step 3190: loss: 1.075504, grad_norm: 0.929688, learning_rate: 0.000050, epoch: 0.016111
|
| 326 |
+
[2026-04-05 18:11:09] Step 3200: loss: 0.600300, grad_norm: 0.151367, learning_rate: 0.000050, epoch: 0.016162
|
| 327 |
+
[2026-04-05 18:12:33] Step 3210: loss: 0.955885, grad_norm: 0.259766, learning_rate: 0.000050, epoch: 0.016212
|
| 328 |
+
[2026-04-05 18:13:57] Step 3220: loss: 0.891304, grad_norm: 2.765625, learning_rate: 0.000050, epoch: 0.016263
|
| 329 |
+
[2026-04-05 18:15:22] Step 3230: loss: 1.085288, grad_norm: 0.169922, learning_rate: 0.000050, epoch: 0.016313
|
| 330 |
+
[2026-04-05 18:16:46] Step 3240: loss: 0.953492, grad_norm: 0.300781, learning_rate: 0.000050, epoch: 0.016364
|
| 331 |
+
[2026-04-05 18:18:11] Step 3250: loss: 1.122248, grad_norm: 0.449219, learning_rate: 0.000050, epoch: 0.016414
|
| 332 |
+
[2026-04-05 18:19:36] Step 3260: loss: 0.880321, grad_norm: 0.291016, learning_rate: 0.000050, epoch: 0.016465
|
| 333 |
+
[2026-04-05 18:21:02] Step 3270: loss: 1.138729, grad_norm: 0.351562, learning_rate: 0.000050, epoch: 0.016515
|
| 334 |
+
[2026-04-05 18:22:27] Step 3280: loss: 0.897787, grad_norm: 0.224609, learning_rate: 0.000050, epoch: 0.016566
|
| 335 |
+
[2026-04-05 18:23:51] Step 3290: loss: 0.702576, grad_norm: 0.132812, learning_rate: 0.000050, epoch: 0.016616
|
| 336 |
+
[2026-04-05 18:25:15] Step 3300: loss: 0.795779, grad_norm: 0.193359, learning_rate: 0.000050, epoch: 0.016667
|
| 337 |
+
[2026-04-05 18:26:40] Step 3310: loss: 1.040128, grad_norm: 0.792969, learning_rate: 0.000050, epoch: 0.016717
|
| 338 |
+
[2026-04-05 18:28:05] Step 3320: loss: 1.010918, grad_norm: 0.209961, learning_rate: 0.000050, epoch: 0.016768
|
| 339 |
+
[2026-04-05 18:29:31] Step 3330: loss: 1.156274, grad_norm: 0.154297, learning_rate: 0.000050, epoch: 0.016818
|
| 340 |
+
[2026-04-05 18:30:56] Step 3340: loss: 0.944842, grad_norm: 0.449219, learning_rate: 0.000050, epoch: 0.016869
|
| 341 |
+
[2026-04-05 18:32:21] Step 3350: loss: 1.268639, grad_norm: 2.531250, learning_rate: 0.000050, epoch: 0.016919
|
| 342 |
+
[2026-04-05 18:33:45] Step 3360: loss: 0.813688, grad_norm: 3.671875, learning_rate: 0.000050, epoch: 0.016970
|
| 343 |
+
[2026-04-05 18:35:11] Step 3370: loss: 1.152398, grad_norm: 0.333984, learning_rate: 0.000050, epoch: 0.017020
|
| 344 |
+
[2026-04-05 18:36:36] Step 3380: loss: 0.974917, grad_norm: 0.769531, learning_rate: 0.000050, epoch: 0.017071
|
| 345 |
+
[2026-04-05 18:38:02] Step 3390: loss: 0.954514, grad_norm: 0.203125, learning_rate: 0.000050, epoch: 0.017121
|
| 346 |
+
[2026-04-05 18:39:27] Step 3400: loss: 1.091822, grad_norm: 0.197266, learning_rate: 0.000050, epoch: 0.017172
|
| 347 |
+
[2026-04-05 18:40:53] Step 3410: loss: 1.011484, grad_norm: 0.176758, learning_rate: 0.000050, epoch: 0.017222
|
| 348 |
+
[2026-04-05 18:42:17] Step 3420: loss: 0.701433, grad_norm: 0.328125, learning_rate: 0.000050, epoch: 0.017273
|
| 349 |
+
[2026-04-05 18:43:42] Step 3430: loss: 0.868405, grad_norm: 0.128906, learning_rate: 0.000050, epoch: 0.017323
|
| 350 |
+
[2026-04-05 18:45:08] Step 3440: loss: 1.028577, grad_norm: 0.231445, learning_rate: 0.000050, epoch: 0.017374
|
| 351 |
+
[2026-04-05 18:46:33] Step 3450: loss: 1.005750, grad_norm: 0.181641, learning_rate: 0.000050, epoch: 0.017424
|
| 352 |
+
[2026-04-05 18:47:59] Step 3460: loss: 1.001823, grad_norm: 0.294922, learning_rate: 0.000050, epoch: 0.017475
|
| 353 |
+
[2026-04-05 18:49:23] Step 3470: loss: 0.667828, grad_norm: 0.357422, learning_rate: 0.000050, epoch: 0.017525
|
| 354 |
+
[2026-04-05 18:50:49] Step 3480: loss: 0.835672, grad_norm: 1.437500, learning_rate: 0.000050, epoch: 0.017576
|
| 355 |
+
[2026-04-05 18:52:14] Step 3490: loss: 1.058652, grad_norm: 0.135742, learning_rate: 0.000050, epoch: 0.017626
|
| 356 |
+
[2026-04-05 18:53:40] Step 3500: loss: 1.062490, grad_norm: 0.194336, learning_rate: 0.000050, epoch: 0.017677
|
| 357 |
+
[2026-04-05 18:55:05] Step 3510: loss: 0.915247, grad_norm: 0.124023, learning_rate: 0.000050, epoch: 0.017727
|
| 358 |
+
[2026-04-05 18:56:29] Step 3520: loss: 0.586168, grad_norm: 0.073242, learning_rate: 0.000050, epoch: 0.017778
|
| 359 |
+
[2026-04-05 18:57:54] Step 3530: loss: 0.816989, grad_norm: 146.000000, learning_rate: 0.000050, epoch: 0.017828
|
| 360 |
+
[2026-04-05 18:59:19] Step 3540: loss: 0.540502, grad_norm: 0.119629, learning_rate: 0.000050, epoch: 0.017879
|
| 361 |
+
[2026-04-05 19:00:42] Step 3550: loss: 0.578981, grad_norm: 0.115234, learning_rate: 0.000050, epoch: 0.017929
|
| 362 |
+
[2026-04-05 19:02:08] Step 3560: loss: 0.675821, grad_norm: 0.154297, learning_rate: 0.000050, epoch: 0.017980
|
| 363 |
+
[2026-04-05 19:03:34] Step 3570: loss: 0.992571, grad_norm: 0.500000, learning_rate: 0.000050, epoch: 0.018030
|
| 364 |
+
[2026-04-05 19:04:59] Step 3580: loss: 0.860287, grad_norm: 0.225586, learning_rate: 0.000050, epoch: 0.018081
|
| 365 |
+
[2026-04-05 19:06:23] Step 3590: loss: 0.701336, grad_norm: 0.605469, learning_rate: 0.000050, epoch: 0.018131
|
| 366 |
+
[2026-04-05 19:07:49] Step 3600: loss: 0.786840, grad_norm: 0.232422, learning_rate: 0.000050, epoch: 0.018182
|
| 367 |
+
[2026-04-05 19:09:14] Step 3610: loss: 0.871714, grad_norm: 0.225586, learning_rate: 0.000050, epoch: 0.018232
|
| 368 |
+
[2026-04-05 19:10:39] Step 3620: loss: 0.835710, grad_norm: 0.217773, learning_rate: 0.000050, epoch: 0.018283
|
| 369 |
+
[2026-04-05 19:12:04] Step 3630: loss: 0.831312, grad_norm: 0.209961, learning_rate: 0.000050, epoch: 0.018333
|
| 370 |
+
[2026-04-05 19:13:29] Step 3640: loss: 1.358666, grad_norm: 0.867188, learning_rate: 0.000050, epoch: 0.018384
|
| 371 |
+
[2026-04-05 19:14:52] Step 3650: loss: 2.279568, grad_norm: 0.437500, learning_rate: 0.000050, epoch: 0.018434
|
| 372 |
+
[2026-04-05 19:16:16] Step 3660: loss: 0.685308, grad_norm: 0.328125, learning_rate: 0.000050, epoch: 0.018485
|
| 373 |
+
[2026-04-05 19:17:40] Step 3670: loss: 0.648882, grad_norm: 0.166016, learning_rate: 0.000050, epoch: 0.018535
|
| 374 |
+
[2026-04-05 19:19:06] Step 3680: loss: 0.948299, grad_norm: 0.435547, learning_rate: 0.000050, epoch: 0.018586
|
| 375 |
+
[2026-04-05 19:20:31] Step 3690: loss: 0.790501, grad_norm: 0.130859, learning_rate: 0.000050, epoch: 0.018636
|
| 376 |
+
[2026-04-05 19:21:55] Step 3700: loss: 0.744720, grad_norm: 0.230469, learning_rate: 0.000050, epoch: 0.018687
|
| 377 |
+
[2026-04-05 19:23:21] Step 3710: loss: 0.935014, grad_norm: 0.165039, learning_rate: 0.000050, epoch: 0.018737
|
| 378 |
+
[2026-04-05 19:24:46] Step 3720: loss: 0.820345, grad_norm: 0.302734, learning_rate: 0.000050, epoch: 0.018788
|
| 379 |
+
[2026-04-05 19:26:11] Step 3730: loss: 0.955610, grad_norm: 0.146484, learning_rate: 0.000050, epoch: 0.018838
|
| 380 |
+
[2026-04-05 19:27:34] Step 3740: loss: 0.571479, grad_norm: 0.375000, learning_rate: 0.000050, epoch: 0.018889
|
| 381 |
+
[2026-04-05 19:29:00] Step 3750: loss: 1.160526, grad_norm: 0.244141, learning_rate: 0.000050, epoch: 0.018939
|
| 382 |
+
[2026-04-05 19:30:24] Step 3760: loss: 0.704879, grad_norm: 0.075684, learning_rate: 0.000050, epoch: 0.018990
|
| 383 |
+
[2026-04-05 19:31:47] Step 3770: loss: 0.281452, grad_norm: 0.090332, learning_rate: 0.000050, epoch: 0.019040
|
| 384 |
+
[2026-04-05 19:33:12] Step 3780: loss: 0.948658, grad_norm: 1.109375, learning_rate: 0.000050, epoch: 0.019091
|
| 385 |
+
[2026-04-05 19:34:37] Step 3790: loss: 0.881313, grad_norm: 0.208984, learning_rate: 0.000050, epoch: 0.019141
|
| 386 |
+
[2026-04-05 19:36:03] Step 3800: loss: 0.818209, grad_norm: 0.257812, learning_rate: 0.000050, epoch: 0.019192
|
| 387 |
+
[2026-04-05 19:37:27] Step 3810: loss: 0.673398, grad_norm: 0.076172, learning_rate: 0.000050, epoch: 0.019242
|
| 388 |
+
[2026-04-05 19:38:50] Step 3820: loss: 0.505238, grad_norm: 0.582031, learning_rate: 0.000050, epoch: 0.019293
|
| 389 |
+
[2026-04-05 19:40:15] Step 3830: loss: 0.891517, grad_norm: 0.238281, learning_rate: 0.000050, epoch: 0.019343
|
| 390 |
+
[2026-04-05 19:41:40] Step 3840: loss: 0.937767, grad_norm: 0.176758, learning_rate: 0.000050, epoch: 0.019394
|
| 391 |
+
[2026-04-05 19:43:05] Step 3850: loss: 0.587611, grad_norm: 0.183594, learning_rate: 0.000050, epoch: 0.019444
|
| 392 |
+
[2026-04-05 19:44:30] Step 3860: loss: 1.020237, grad_norm: 0.235352, learning_rate: 0.000050, epoch: 0.019495
|
| 393 |
+
[2026-04-05 19:45:55] Step 3870: loss: 1.054631, grad_norm: 0.208984, learning_rate: 0.000050, epoch: 0.019545
|
| 394 |
+
[2026-04-05 19:47:21] Step 3880: loss: 0.901759, grad_norm: 0.139648, learning_rate: 0.000050, epoch: 0.019596
|
| 395 |
+
[2026-04-05 19:48:45] Step 3890: loss: 0.718227, grad_norm: 0.148438, learning_rate: 0.000050, epoch: 0.019646
|
| 396 |
+
[2026-04-05 19:50:09] Step 3900: loss: 0.997413, grad_norm: 1.695312, learning_rate: 0.000050, epoch: 0.019697
|
| 397 |
+
[2026-04-05 19:51:35] Step 3910: loss: 0.907274, grad_norm: 0.161133, learning_rate: 0.000050, epoch: 0.019747
|
| 398 |
+
[2026-04-05 19:53:00] Step 3920: loss: 0.738626, grad_norm: 0.090332, learning_rate: 0.000050, epoch: 0.019798
|
| 399 |
+
[2026-04-05 19:54:25] Step 3930: loss: 1.373290, grad_norm: 0.235352, learning_rate: 0.000050, epoch: 0.019848
|
| 400 |
+
[2026-04-05 19:55:50] Step 3940: loss: 0.769842, grad_norm: 0.210938, learning_rate: 0.000050, epoch: 0.019899
|
| 401 |
+
[2026-04-05 19:57:15] Step 3950: loss: 0.937164, grad_norm: 0.292969, learning_rate: 0.000050, epoch: 0.019949
|
| 402 |
+
[2026-04-05 19:58:41] Step 3960: loss: 0.940461, grad_norm: 0.330078, learning_rate: 0.000050, epoch: 0.020000
|
| 403 |
+
[2026-04-05 20:00:04] Step 3970: loss: 0.805276, grad_norm: 0.229492, learning_rate: 0.000050, epoch: 0.020051
|
| 404 |
+
[2026-04-05 20:01:28] Step 3980: loss: 0.602984, grad_norm: 0.306641, learning_rate: 0.000050, epoch: 0.020101
|
| 405 |
+
[2026-04-05 20:02:53] Step 3990: loss: 0.888927, grad_norm: 0.193359, learning_rate: 0.000050, epoch: 0.020152
|
| 406 |
+
[2026-04-05 20:04:19] Step 4000: loss: 1.048596, grad_norm: 0.523438, learning_rate: 0.000050, epoch: 0.020202
|
| 407 |
+
[2026-04-05 20:05:44] Step 4010: loss: 0.881884, grad_norm: 0.457031, learning_rate: 0.000050, epoch: 0.020253
|
| 408 |
+
[2026-04-05 20:07:09] Step 4020: loss: 0.909422, grad_norm: 0.183594, learning_rate: 0.000050, epoch: 0.020303
|
| 409 |
+
[2026-04-05 20:08:33] Step 4030: loss: 0.751946, grad_norm: 0.441406, learning_rate: 0.000050, epoch: 0.020354
|
| 410 |
+
[2026-04-05 20:09:59] Step 4040: loss: 0.816604, grad_norm: 0.182617, learning_rate: 0.000050, epoch: 0.020404
|
| 411 |
+
[2026-04-05 20:11:24] Step 4050: loss: 1.197823, grad_norm: 0.210938, learning_rate: 0.000050, epoch: 0.020455
|
| 412 |
+
[2026-04-05 20:12:50] Step 4060: loss: 1.082392, grad_norm: 0.828125, learning_rate: 0.000050, epoch: 0.020505
|
| 413 |
+
[2026-04-05 20:14:15] Step 4070: loss: 0.737737, grad_norm: 0.259766, learning_rate: 0.000050, epoch: 0.020556
|
| 414 |
+
[2026-04-05 20:15:40] Step 4080: loss: 0.567219, grad_norm: 0.316406, learning_rate: 0.000050, epoch: 0.020606
|
| 415 |
+
[2026-04-05 20:17:06] Step 4090: loss: 0.919196, grad_norm: 0.125977, learning_rate: 0.000050, epoch: 0.020657
|
| 416 |
+
[2026-04-05 20:18:31] Step 4100: loss: 1.070574, grad_norm: 0.175781, learning_rate: 0.000050, epoch: 0.020707
|
| 417 |
+
[2026-04-05 20:19:56] Step 4110: loss: 0.914139, grad_norm: 0.126953, learning_rate: 0.000050, epoch: 0.020758
|
| 418 |
+
[2026-04-05 20:21:20] Step 4120: loss: 0.658527, grad_norm: 0.242188, learning_rate: 0.000050, epoch: 0.020808
|
| 419 |
+
[2026-04-05 20:22:46] Step 4130: loss: 1.043363, grad_norm: 0.613281, learning_rate: 0.000050, epoch: 0.020859
|
| 420 |
+
[2026-04-05 20:24:09] Step 4140: loss: 0.686100, grad_norm: 0.133789, learning_rate: 0.000050, epoch: 0.020909
|
| 421 |
+
[2026-04-05 20:25:33] Step 4150: loss: 0.803859, grad_norm: 0.242188, learning_rate: 0.000050, epoch: 0.020960
|
| 422 |
+
[2026-04-05 20:26:58] Step 4160: loss: 0.858215, grad_norm: 0.102539, learning_rate: 0.000050, epoch: 0.021010
|
| 423 |
+
[2026-04-05 20:28:22] Step 4170: loss: 2.265225, grad_norm: 1.023438, learning_rate: 0.000050, epoch: 0.021061
|
| 424 |
+
[2026-04-05 20:29:46] Step 4180: loss: 1.579555, grad_norm: 0.605469, learning_rate: 0.000050, epoch: 0.021111
|
| 425 |
+
[2026-04-05 20:31:11] Step 4190: loss: 0.705098, grad_norm: 0.312500, learning_rate: 0.000050, epoch: 0.021162
|
| 426 |
+
[2026-04-05 20:32:36] Step 4200: loss: 0.678449, grad_norm: 0.257812, learning_rate: 0.000050, epoch: 0.021212
|
| 427 |
+
[2026-04-05 20:34:00] Step 4210: loss: 1.039099, grad_norm: 0.169922, learning_rate: 0.000050, epoch: 0.021263
|
| 428 |
+
[2026-04-05 20:35:25] Step 4220: loss: 0.897258, grad_norm: 0.392578, learning_rate: 0.000050, epoch: 0.021313
|
| 429 |
+
[2026-04-05 20:36:49] Step 4230: loss: 0.592581, grad_norm: 0.253906, learning_rate: 0.000050, epoch: 0.021364
|
| 430 |
+
[2026-04-05 20:38:14] Step 4240: loss: 0.581826, grad_norm: 0.414062, learning_rate: 0.000050, epoch: 0.021414
|
| 431 |
+
[2026-04-05 20:39:39] Step 4250: loss: 0.773074, grad_norm: 0.109863, learning_rate: 0.000050, epoch: 0.021465
|
| 432 |
+
[2026-04-05 20:41:04] Step 4260: loss: 0.772055, grad_norm: 0.092773, learning_rate: 0.000050, epoch: 0.021515
|
| 433 |
+
[2026-04-05 20:42:29] Step 4270: loss: 0.519814, grad_norm: 0.173828, learning_rate: 0.000050, epoch: 0.021566
|
| 434 |
+
[2026-04-05 20:43:54] Step 4280: loss: 0.743214, grad_norm: 0.206055, learning_rate: 0.000050, epoch: 0.021616
|
| 435 |
+
[2026-04-05 20:45:19] Step 4290: loss: 0.712543, grad_norm: 0.192383, learning_rate: 0.000050, epoch: 0.021667
|
| 436 |
+
[2026-04-05 20:46:45] Step 4300: loss: 0.858545, grad_norm: 0.112793, learning_rate: 0.000050, epoch: 0.021717
|
| 437 |
+
[2026-04-05 20:48:10] Step 4310: loss: 0.812474, grad_norm: 0.244141, learning_rate: 0.000050, epoch: 0.021768
|
| 438 |
+
[2026-04-05 20:49:34] Step 4320: loss: 0.804857, grad_norm: 0.174805, learning_rate: 0.000050, epoch: 0.021818
|
| 439 |
+
[2026-04-05 20:50:59] Step 4330: loss: 0.676323, grad_norm: 0.060547, learning_rate: 0.000050, epoch: 0.021869
|
| 440 |
+
[2026-04-05 20:52:25] Step 4340: loss: 0.810262, grad_norm: 0.476562, learning_rate: 0.000050, epoch: 0.021919
|
| 441 |
+
[2026-04-05 20:53:49] Step 4350: loss: 0.678963, grad_norm: 1.210938, learning_rate: 0.000050, epoch: 0.021970
|
| 442 |
+
[2026-04-05 20:55:15] Step 4360: loss: 0.843389, grad_norm: 0.096680, learning_rate: 0.000050, epoch: 0.022020
|
| 443 |
+
[2026-04-05 20:56:40] Step 4370: loss: 0.557602, grad_norm: 0.070312, learning_rate: 0.000050, epoch: 0.022071
|
| 444 |
+
[2026-04-05 20:58:04] Step 4380: loss: 0.352494, grad_norm: 0.298828, learning_rate: 0.000050, epoch: 0.022121
|
| 445 |
+
[2026-04-05 20:59:28] Step 4390: loss: 0.624163, grad_norm: 0.472656, learning_rate: 0.000050, epoch: 0.022172
|
| 446 |
+
[2026-04-05 21:00:53] Step 4400: loss: 0.594077, grad_norm: 0.093262, learning_rate: 0.000050, epoch: 0.022222
|
| 447 |
+
[2026-04-05 21:02:18] Step 4410: loss: 0.878214, grad_norm: 0.330078, learning_rate: 0.000050, epoch: 0.022273
|
| 448 |
+
[2026-04-05 21:03:44] Step 4420: loss: 0.740005, grad_norm: 0.503906, learning_rate: 0.000050, epoch: 0.022323
|
| 449 |
+
[2026-04-05 21:05:08] Step 4430: loss: 0.971799, grad_norm: 0.562500, learning_rate: 0.000050, epoch: 0.022374
|
| 450 |
+
[2026-04-05 21:06:33] Step 4440: loss: 0.924411, grad_norm: 0.261719, learning_rate: 0.000050, epoch: 0.022424
|
| 451 |
+
[2026-04-05 21:07:59] Step 4450: loss: 1.042776, grad_norm: 0.140625, learning_rate: 0.000050, epoch: 0.022475
|
| 452 |
+
[2026-04-05 21:09:24] Step 4460: loss: 0.742720, grad_norm: 0.126953, learning_rate: 0.000050, epoch: 0.022525
|
| 453 |
+
[2026-04-05 21:10:49] Step 4470: loss: 0.903004, grad_norm: 0.233398, learning_rate: 0.000050, epoch: 0.022576
|
| 454 |
+
[2026-04-05 21:12:13] Step 4480: loss: 0.624388, grad_norm: 0.161133, learning_rate: 0.000050, epoch: 0.022626
|
| 455 |
+
[2026-04-05 21:13:39] Step 4490: loss: 0.894780, grad_norm: 0.178711, learning_rate: 0.000050, epoch: 0.022677
|
| 456 |
+
[2026-04-05 21:15:04] Step 4500: loss: 0.783386, grad_norm: 0.275391, learning_rate: 0.000050, epoch: 0.022727
|
| 457 |
+
[2026-04-05 21:16:30] Step 4510: loss: 0.852587, grad_norm: 0.182617, learning_rate: 0.000050, epoch: 0.022778
|
| 458 |
+
[2026-04-05 21:17:56] Step 4520: loss: 0.751036, grad_norm: 0.124023, learning_rate: 0.000050, epoch: 0.022828
|
| 459 |
+
[2026-04-05 21:19:21] Step 4530: loss: 0.812920, grad_norm: 0.808594, learning_rate: 0.000050, epoch: 0.022879
|
| 460 |
+
[2026-04-05 21:20:47] Step 4540: loss: 0.928886, grad_norm: 0.201172, learning_rate: 0.000050, epoch: 0.022929
|
| 461 |
+
[2026-04-05 21:22:12] Step 4550: loss: 0.699640, grad_norm: 0.117188, learning_rate: 0.000050, epoch: 0.022980
|
| 462 |
+
[2026-04-05 21:23:35] Step 4560: loss: 0.396365, grad_norm: 0.096191, learning_rate: 0.000050, epoch: 0.023030
|
| 463 |
+
[2026-04-05 21:24:59] Step 4570: loss: 0.460258, grad_norm: 0.089844, learning_rate: 0.000050, epoch: 0.023081
|
| 464 |
+
[2026-04-05 21:26:24] Step 4580: loss: 0.803078, grad_norm: 0.455078, learning_rate: 0.000050, epoch: 0.023131
|
| 465 |
+
[2026-04-05 21:27:48] Step 4590: loss: 0.623093, grad_norm: 0.175781, learning_rate: 0.000050, epoch: 0.023182
|
| 466 |
+
[2026-04-05 21:29:14] Step 4600: loss: 1.008995, grad_norm: 0.166016, learning_rate: 0.000050, epoch: 0.023232
|
| 467 |
+
[2026-04-05 21:30:39] Step 4610: loss: 1.163760, grad_norm: 0.181641, learning_rate: 0.000050, epoch: 0.023283
|
| 468 |
+
[2026-04-05 21:32:03] Step 4620: loss: 0.569889, grad_norm: 0.135742, learning_rate: 0.000050, epoch: 0.023333
|
| 469 |
+
[2026-04-05 21:33:29] Step 4630: loss: 1.077146, grad_norm: 0.158203, learning_rate: 0.000050, epoch: 0.023384
|
| 470 |
+
[2026-04-05 21:34:54] Step 4640: loss: 0.789613, grad_norm: 0.316406, learning_rate: 0.000050, epoch: 0.023434
|
| 471 |
+
[2026-04-05 21:36:19] Step 4650: loss: 0.768905, grad_norm: 0.185547, learning_rate: 0.000050, epoch: 0.023485
|
| 472 |
+
[2026-04-05 21:37:42] Step 4660: loss: 0.265431, grad_norm: 0.196289, learning_rate: 0.000050, epoch: 0.023535
|
| 473 |
+
[2026-04-05 21:39:06] Step 4670: loss: 0.509521, grad_norm: 0.174805, learning_rate: 0.000050, epoch: 0.023586
|
| 474 |
+
[2026-04-05 21:40:32] Step 4680: loss: 0.919802, grad_norm: 0.275391, learning_rate: 0.000050, epoch: 0.023636
|
| 475 |
+
[2026-04-05 21:41:57] Step 4690: loss: 0.862871, grad_norm: 0.269531, learning_rate: 0.000050, epoch: 0.023687
|
| 476 |
+
[2026-04-05 21:43:21] Step 4700: loss: 0.451978, grad_norm: 0.371094, learning_rate: 0.000050, epoch: 0.023737
|
| 477 |
+
[2026-04-05 21:44:45] Step 4710: loss: 0.829016, grad_norm: 0.146484, learning_rate: 0.000050, epoch: 0.023788
|
| 478 |
+
[2026-04-05 21:46:11] Step 4720: loss: 0.757640, grad_norm: 0.145508, learning_rate: 0.000050, epoch: 0.023838
|
| 479 |
+
[2026-04-05 21:47:36] Step 4730: loss: 0.821686, grad_norm: 0.173828, learning_rate: 0.000050, epoch: 0.023889
|
| 480 |
+
[2026-04-05 21:49:01] Step 4740: loss: 0.869125, grad_norm: 0.494141, learning_rate: 0.000050, epoch: 0.023939
|
| 481 |
+
[2026-04-05 21:50:26] Step 4750: loss: 0.680020, grad_norm: 0.092285, learning_rate: 0.000050, epoch: 0.023990
|
| 482 |
+
[2026-04-05 21:51:51] Step 4760: loss: 0.594586, grad_norm: 0.221680, learning_rate: 0.000050, epoch: 0.024040
|
| 483 |
+
[2026-04-05 21:53:16] Step 4770: loss: 0.768582, grad_norm: 0.179688, learning_rate: 0.000050, epoch: 0.024091
|
| 484 |
+
[2026-04-05 21:54:40] Step 4780: loss: 0.582672, grad_norm: 0.112793, learning_rate: 0.000050, epoch: 0.024141
|
| 485 |
+
[2026-04-05 21:56:06] Step 4790: loss: 0.791890, grad_norm: 0.120117, learning_rate: 0.000050, epoch: 0.024192
|
| 486 |
+
[2026-04-05 21:57:31] Step 4800: loss: 0.806308, grad_norm: 0.111328, learning_rate: 0.000050, epoch: 0.024242
|
| 487 |
+
[2026-04-05 21:58:56] Step 4810: loss: 0.739746, grad_norm: 0.148438, learning_rate: 0.000050, epoch: 0.024293
|
| 488 |
+
[2026-04-05 22:00:22] Step 4820: loss: 0.762415, grad_norm: 0.106445, learning_rate: 0.000050, epoch: 0.024343
|
| 489 |
+
[2026-04-05 22:01:47] Step 4830: loss: 0.877260, grad_norm: 0.212891, learning_rate: 0.000050, epoch: 0.024394
|
| 490 |
+
[2026-04-05 22:03:12] Step 4840: loss: 0.674090, grad_norm: 0.147461, learning_rate: 0.000050, epoch: 0.024444
|
| 491 |
+
[2026-04-05 22:04:36] Step 4850: loss: 0.761976, grad_norm: 0.124512, learning_rate: 0.000050, epoch: 0.024495
|
| 492 |
+
[2026-04-05 22:06:00] Step 4860: loss: 0.420770, grad_norm: 0.209961, learning_rate: 0.000050, epoch: 0.024545
|
| 493 |
+
[2026-04-05 22:07:25] Step 4870: loss: 0.853558, grad_norm: 0.146484, learning_rate: 0.000050, epoch: 0.024596
|
| 494 |
+
[2026-04-05 22:08:49] Step 4880: loss: 0.519837, grad_norm: 0.161133, learning_rate: 0.000050, epoch: 0.024646
|
| 495 |
+
[2026-04-05 22:10:14] Step 4890: loss: 0.812432, grad_norm: 0.139648, learning_rate: 0.000050, epoch: 0.024697
|
| 496 |
+
[2026-04-05 22:11:39] Step 4900: loss: 0.782813, grad_norm: 0.104004, learning_rate: 0.000050, epoch: 0.024747
|
| 497 |
+
[2026-04-05 22:13:05] Step 4910: loss: 0.806325, grad_norm: 0.156250, learning_rate: 0.000050, epoch: 0.024798
|
| 498 |
+
[2026-04-05 22:14:30] Step 4920: loss: 0.727326, grad_norm: 0.122559, learning_rate: 0.000050, epoch: 0.024848
|
| 499 |
+
[2026-04-05 22:15:54] Step 4930: loss: 0.891241, grad_norm: 0.261719, learning_rate: 0.000050, epoch: 0.024899
|
| 500 |
+
[2026-04-05 22:17:18] Step 4940: loss: 0.880500, grad_norm: 0.139648, learning_rate: 0.000050, epoch: 0.024949
|
| 501 |
+
[2026-04-05 22:18:43] Step 4950: loss: 0.869738, grad_norm: 0.621094, learning_rate: 0.000050, epoch: 0.025000
|
| 502 |
+
[2026-04-05 22:20:07] Step 4960: loss: 0.483802, grad_norm: 0.279297, learning_rate: 0.000050, epoch: 0.025051
|
| 503 |
+
[2026-04-05 22:21:32] Step 4970: loss: 0.424717, grad_norm: 0.068359, learning_rate: 0.000050, epoch: 0.025101
|
| 504 |
+
[2026-04-05 22:22:57] Step 4980: loss: 0.770722, grad_norm: 0.153320, learning_rate: 0.000050, epoch: 0.025152
|
| 505 |
+
[2026-04-05 22:24:21] Step 4990: loss: 0.476020, grad_norm: 0.094727, learning_rate: 0.000050, epoch: 0.025202
|
| 506 |
+
[2026-04-05 22:25:46] Step 5000: loss: 0.657169, grad_norm: 0.111816, learning_rate: 0.000050, epoch: 0.025253
|
| 507 |
+
[2026-04-05 22:28:18] Step 5010: loss: 0.819577, grad_norm: 0.121094, learning_rate: 0.000050, epoch: 0.025303
|