Ahmed commited on
Upload variant_a/train.log with huggingface_hub
Browse files- variant_a/train.log +858 -0
variant_a/train.log
ADDED
|
@@ -0,0 +1,858 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
step=5 tok=0.0M loss=15.7623 lr=1.20e-05 grad=22.25 tok/s=1353 elapsed=0.5m
|
| 2 |
+
step=57 tok=0.5M loss=8.8461 lr=1.68e-04 grad=1.07 tok/s=7705 elapsed=1.0m
|
| 3 |
+
step=109 tok=0.9M loss=7.2599 lr=3.00e-04 grad=0.87 tok/s=9795 elapsed=1.5m
|
| 4 |
+
step=160 tok=1.3M loss=7.0859 lr=3.00e-04 grad=1.13 tok/s=10810 elapsed=2.0m
|
| 5 |
+
step=211 tok=1.7M loss=6.9771 lr=3.00e-04 grad=1.44 tok/s=11415 elapsed=2.5m
|
| 6 |
+
step=262 tok=2.1M loss=6.8826 lr=3.00e-04 grad=1.39 tok/s=11815 elapsed=3.0m
|
| 7 |
+
step=313 tok=2.6M loss=6.6874 lr=3.00e-04 grad=1.05 tok/s=12099 elapsed=3.5m
|
| 8 |
+
step=364 tok=3.0M loss=6.6031 lr=3.00e-04 grad=1.28 tok/s=12311 elapsed=4.0m
|
| 9 |
+
step=415 tok=3.4M loss=6.3702 lr=3.00e-04 grad=1.17 tok/s=12475 elapsed=4.5m
|
| 10 |
+
step=466 tok=3.8M loss=6.2258 lr=3.00e-04 grad=1.32 tok/s=12605 elapsed=5.0m
|
| 11 |
+
step=517 tok=4.2M loss=6.1473 lr=3.00e-04 grad=1.37 tok/s=12711 elapsed=5.6m
|
| 12 |
+
step=568 tok=4.7M loss=6.0331 lr=3.00e-04 grad=1.05 tok/s=12800 elapsed=6.1m
|
| 13 |
+
step=619 tok=5.1M loss=6.0195 lr=3.00e-04 grad=0.93 tok/s=12875 elapsed=6.6m
|
| 14 |
+
step=670 tok=5.5M loss=5.8778 lr=3.00e-04 grad=0.96 tok/s=12940 elapsed=7.1m
|
| 15 |
+
step=721 tok=5.9M loss=5.7434 lr=3.00e-04 grad=0.98 tok/s=12996 elapsed=7.6m
|
| 16 |
+
step=772 tok=6.3M loss=5.6767 lr=3.00e-04 grad=1.28 tok/s=13045 elapsed=8.1m
|
| 17 |
+
step=823 tok=6.7M loss=5.6730 lr=3.00e-04 grad=0.89 tok/s=13088 elapsed=8.6m
|
| 18 |
+
step=874 tok=7.2M loss=5.5995 lr=3.00e-04 grad=1.00 tok/s=13126 elapsed=9.1m
|
| 19 |
+
step=924 tok=7.6M loss=5.5695 lr=3.00e-04 grad=1.09 tok/s=13153 elapsed=9.6m
|
| 20 |
+
step=975 tok=8.0M loss=5.4738 lr=3.00e-04 grad=0.80 tok/s=13184 elapsed=10.1m
|
| 21 |
+
step=1026 tok=8.4M loss=5.4125 lr=3.00e-04 grad=1.02 tok/s=13213 elapsed=10.6m
|
| 22 |
+
step=1077 tok=8.8M loss=5.3079 lr=3.00e-04 grad=0.81 tok/s=13238 elapsed=11.1m
|
| 23 |
+
step=1128 tok=9.2M loss=5.3070 lr=3.00e-04 grad=0.84 tok/s=13262 elapsed=11.6m
|
| 24 |
+
step=1179 tok=9.7M loss=5.3292 lr=3.00e-04 grad=1.02 tok/s=13283 elapsed=12.1m
|
| 25 |
+
step=1230 tok=10.1M loss=5.1671 lr=2.99e-04 grad=1.09 tok/s=13302 elapsed=12.6m
|
| 26 |
+
step=1281 tok=10.5M loss=5.1852 lr=2.99e-04 grad=0.84 tok/s=13320 elapsed=13.1m
|
| 27 |
+
step=1332 tok=10.9M loss=5.1795 lr=2.99e-04 grad=0.77 tok/s=13337 elapsed=13.6m
|
| 28 |
+
step=1383 tok=11.3M loss=5.0886 lr=2.99e-04 grad=0.79 tok/s=13352 elapsed=14.1m
|
| 29 |
+
step=1434 tok=11.7M loss=5.0369 lr=2.99e-04 grad=0.77 tok/s=13367 elapsed=14.6m
|
| 30 |
+
step=1485 tok=12.2M loss=4.9599 lr=2.99e-04 grad=0.77 tok/s=13380 elapsed=15.2m
|
| 31 |
+
step=1536 tok=12.6M loss=5.0733 lr=2.99e-04 grad=0.73 tok/s=13392 elapsed=15.7m
|
| 32 |
+
step=1587 tok=13.0M loss=5.0350 lr=2.99e-04 grad=0.82 tok/s=13404 elapsed=16.2m
|
| 33 |
+
step=1638 tok=13.4M loss=4.9191 lr=2.99e-04 grad=0.77 tok/s=13415 elapsed=16.7m
|
| 34 |
+
step=1689 tok=13.8M loss=4.8622 lr=2.99e-04 grad=0.93 tok/s=13425 elapsed=17.2m
|
| 35 |
+
step=1740 tok=14.3M loss=4.8014 lr=2.99e-04 grad=0.73 tok/s=13434 elapsed=17.7m
|
| 36 |
+
step=1791 tok=14.7M loss=4.8270 lr=2.99e-04 grad=1.34 tok/s=13444 elapsed=18.2m
|
| 37 |
+
step=1842 tok=15.1M loss=4.7932 lr=2.99e-04 grad=0.84 tok/s=13452 elapsed=18.7m
|
| 38 |
+
step=1893 tok=15.5M loss=4.8468 lr=2.99e-04 grad=0.81 tok/s=13460 elapsed=19.2m
|
| 39 |
+
step=1944 tok=15.9M loss=4.7588 lr=2.99e-04 grad=0.93 tok/s=13468 elapsed=19.7m
|
| 40 |
+
step=1995 tok=16.3M loss=4.7685 lr=2.98e-04 grad=0.94 tok/s=13475 elapsed=20.2m
|
| 41 |
+
step=2046 tok=16.8M loss=4.6532 lr=2.98e-04 grad=0.71 tok/s=13482 elapsed=20.7m
|
| 42 |
+
step=2097 tok=17.2M loss=4.6944 lr=2.98e-04 grad=0.81 tok/s=13489 elapsed=21.2m
|
| 43 |
+
step=2148 tok=17.6M loss=4.6505 lr=2.98e-04 grad=0.75 tok/s=13495 elapsed=21.7m
|
| 44 |
+
step=2199 tok=18.0M loss=4.5919 lr=2.98e-04 grad=0.73 tok/s=13501 elapsed=22.2m
|
| 45 |
+
step=2250 tok=18.4M loss=4.5345 lr=2.98e-04 grad=0.70 tok/s=13507 elapsed=22.7m
|
| 46 |
+
step=2301 tok=18.8M loss=4.5352 lr=2.98e-04 grad=0.73 tok/s=13512 elapsed=23.3m
|
| 47 |
+
step=2352 tok=19.3M loss=4.5733 lr=2.98e-04 grad=1.86 tok/s=13516 elapsed=23.8m
|
| 48 |
+
step=2403 tok=19.7M loss=4.5898 lr=2.98e-04 grad=1.02 tok/s=13521 elapsed=24.3m
|
| 49 |
+
step=2454 tok=20.1M loss=4.4653 lr=2.98e-04 grad=1.05 tok/s=13526 elapsed=24.8m
|
| 50 |
+
step=2505 tok=20.5M loss=4.5273 lr=2.98e-04 grad=0.79 tok/s=13530 elapsed=25.3m
|
| 51 |
+
step=2556 tok=20.9M loss=4.4814 lr=2.97e-04 grad=0.76 tok/s=13535 elapsed=25.8m
|
| 52 |
+
step=2607 tok=21.4M loss=4.5560 lr=2.97e-04 grad=0.71 tok/s=13539 elapsed=26.3m
|
| 53 |
+
step=2658 tok=21.8M loss=4.5001 lr=2.97e-04 grad=0.79 tok/s=13544 elapsed=26.8m
|
| 54 |
+
step=2709 tok=22.2M loss=4.4093 lr=2.97e-04 grad=0.68 tok/s=13548 elapsed=27.3m
|
| 55 |
+
step=2760 tok=22.6M loss=4.4751 lr=2.97e-04 grad=0.62 tok/s=13551 elapsed=27.8m
|
| 56 |
+
step=2811 tok=23.0M loss=4.4777 lr=2.97e-04 grad=1.07 tok/s=13555 elapsed=28.3m
|
| 57 |
+
step=2862 tok=23.4M loss=4.4159 lr=2.97e-04 grad=0.64 tok/s=13559 elapsed=28.8m
|
| 58 |
+
step=2913 tok=23.9M loss=4.3121 lr=2.97e-04 grad=0.84 tok/s=13562 elapsed=29.3m
|
| 59 |
+
step=2964 tok=24.3M loss=4.3306 lr=2.97e-04 grad=0.78 tok/s=13566 elapsed=29.8m
|
| 60 |
+
|
| 61 |
+
=== Eval 1/12 at step 2982 ===
|
| 62 |
+
val_loss=4.4172
|
| 63 |
+
ar: 4.0273
|
| 64 |
+
en: 4.6760
|
| 65 |
+
fr: 5.3304
|
| 66 |
+
es: 5.7219
|
| 67 |
+
ru: 6.5312
|
| 68 |
+
zh: 6.9000
|
| 69 |
+
tr: 5.6908
|
| 70 |
+
code: 3.7483
|
| 71 |
+
math: 4.5381
|
| 72 |
+
classical: 4.0714
|
| 73 |
+
Generation on 140 prompts done in 165s
|
| 74 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_01.pt
|
| 75 |
+
|
| 76 |
+
step=2983 tok=24.4M loss=4.3266 lr=2.96e-04 grad=0.73 tok/s=12426 elapsed=32.8m
|
| 77 |
+
step=3034 tok=24.9M loss=4.3711 lr=2.96e-04 grad=0.72 tok/s=12447 elapsed=33.3m
|
| 78 |
+
step=3085 tok=25.3M loss=4.2613 lr=2.96e-04 grad=0.70 tok/s=12467 elapsed=33.8m
|
| 79 |
+
step=3136 tok=25.7M loss=4.3097 lr=2.96e-04 grad=0.82 tok/s=12486 elapsed=34.3m
|
| 80 |
+
step=3187 tok=26.1M loss=4.3473 lr=2.96e-04 grad=0.68 tok/s=12505 elapsed=34.8m
|
| 81 |
+
step=3238 tok=26.5M loss=4.2285 lr=2.96e-04 grad=0.74 tok/s=12522 elapsed=35.3m
|
| 82 |
+
step=3289 tok=26.9M loss=4.2871 lr=2.96e-04 grad=0.80 tok/s=12540 elapsed=35.8m
|
| 83 |
+
step=3340 tok=27.4M loss=4.2115 lr=2.96e-04 grad=0.66 tok/s=12557 elapsed=36.3m
|
| 84 |
+
step=3391 tok=27.8M loss=4.1897 lr=2.95e-04 grad=0.69 tok/s=12573 elapsed=36.8m
|
| 85 |
+
step=3442 tok=28.2M loss=4.2372 lr=2.95e-04 grad=0.72 tok/s=12589 elapsed=37.3m
|
| 86 |
+
step=3493 tok=28.6M loss=4.2190 lr=2.95e-04 grad=0.62 tok/s=12604 elapsed=37.8m
|
| 87 |
+
step=3544 tok=29.0M loss=4.2524 lr=2.95e-04 grad=0.65 tok/s=12619 elapsed=38.3m
|
| 88 |
+
step=3595 tok=29.5M loss=4.1465 lr=2.95e-04 grad=0.73 tok/s=12634 elapsed=38.9m
|
| 89 |
+
step=3646 tok=29.9M loss=4.1225 lr=2.95e-04 grad=0.88 tok/s=12648 elapsed=39.4m
|
| 90 |
+
step=3697 tok=30.3M loss=4.1217 lr=2.95e-04 grad=0.70 tok/s=12662 elapsed=39.9m
|
| 91 |
+
step=3748 tok=30.7M loss=4.1437 lr=2.94e-04 grad=0.73 tok/s=12676 elapsed=40.4m
|
| 92 |
+
step=3799 tok=31.1M loss=4.0971 lr=2.94e-04 grad=0.86 tok/s=12689 elapsed=40.9m
|
| 93 |
+
step=3850 tok=31.5M loss=4.0627 lr=2.94e-04 grad=0.64 tok/s=12702 elapsed=41.4m
|
| 94 |
+
step=3901 tok=32.0M loss=4.0113 lr=2.94e-04 grad=0.66 tok/s=12715 elapsed=41.9m
|
| 95 |
+
step=3952 tok=32.4M loss=3.9812 lr=2.94e-04 grad=0.70 tok/s=12727 elapsed=42.4m
|
| 96 |
+
step=4003 tok=32.8M loss=4.0215 lr=2.94e-04 grad=0.72 tok/s=12739 elapsed=42.9m
|
| 97 |
+
step=4054 tok=33.2M loss=3.9537 lr=2.93e-04 grad=0.61 tok/s=12751 elapsed=43.4m
|
| 98 |
+
step=4105 tok=33.6M loss=4.0546 lr=2.93e-04 grad=0.62 tok/s=12763 elapsed=43.9m
|
| 99 |
+
step=4156 tok=34.0M loss=3.9001 lr=2.93e-04 grad=0.68 tok/s=12774 elapsed=44.4m
|
| 100 |
+
step=4207 tok=34.5M loss=3.8612 lr=2.93e-04 grad=0.63 tok/s=12785 elapsed=44.9m
|
| 101 |
+
step=4258 tok=34.9M loss=3.8377 lr=2.93e-04 grad=0.69 tok/s=12796 elapsed=45.4m
|
| 102 |
+
step=4309 tok=35.3M loss=3.9083 lr=2.92e-04 grad=0.59 tok/s=12806 elapsed=45.9m
|
| 103 |
+
step=4360 tok=35.7M loss=3.8686 lr=2.92e-04 grad=0.59 tok/s=12817 elapsed=46.4m
|
| 104 |
+
step=4411 tok=36.1M loss=3.8375 lr=2.92e-04 grad=0.57 tok/s=12827 elapsed=47.0m
|
| 105 |
+
step=4462 tok=36.6M loss=3.7954 lr=2.92e-04 grad=0.57 tok/s=12837 elapsed=47.5m
|
| 106 |
+
step=4513 tok=37.0M loss=3.7617 lr=2.92e-04 grad=0.67 tok/s=12846 elapsed=48.0m
|
| 107 |
+
step=4564 tok=37.4M loss=3.7820 lr=2.92e-04 grad=0.75 tok/s=12856 elapsed=48.5m
|
| 108 |
+
step=4615 tok=37.8M loss=3.7435 lr=2.91e-04 grad=0.66 tok/s=12865 elapsed=49.0m
|
| 109 |
+
step=4666 tok=38.2M loss=3.7354 lr=2.91e-04 grad=0.68 tok/s=12874 elapsed=49.5m
|
| 110 |
+
step=4717 tok=38.6M loss=3.7746 lr=2.91e-04 grad=0.55 tok/s=12883 elapsed=50.0m
|
| 111 |
+
step=4768 tok=39.1M loss=3.7609 lr=2.91e-04 grad=0.68 tok/s=12892 elapsed=50.5m
|
| 112 |
+
step=4819 tok=39.5M loss=3.7680 lr=2.91e-04 grad=0.77 tok/s=12900 elapsed=51.0m
|
| 113 |
+
step=4870 tok=39.9M loss=3.7048 lr=2.90e-04 grad=0.64 tok/s=12909 elapsed=51.5m
|
| 114 |
+
step=4921 tok=40.3M loss=3.7020 lr=2.90e-04 grad=0.75 tok/s=12917 elapsed=52.0m
|
| 115 |
+
step=4972 tok=40.7M loss=3.7438 lr=2.90e-04 grad=0.73 tok/s=12925 elapsed=52.5m
|
| 116 |
+
step=5023 tok=41.1M loss=3.7149 lr=2.90e-04 grad=0.72 tok/s=12933 elapsed=53.0m
|
| 117 |
+
step=5074 tok=41.6M loss=3.6520 lr=2.90e-04 grad=0.63 tok/s=12940 elapsed=53.5m
|
| 118 |
+
step=5125 tok=42.0M loss=3.7048 lr=2.89e-04 grad=0.62 tok/s=12948 elapsed=54.0m
|
| 119 |
+
step=5176 tok=42.4M loss=3.6335 lr=2.89e-04 grad=0.56 tok/s=12956 elapsed=54.5m
|
| 120 |
+
step=5227 tok=42.8M loss=3.5784 lr=2.89e-04 grad=0.52 tok/s=12963 elapsed=55.1m
|
| 121 |
+
step=5278 tok=43.2M loss=3.6354 lr=2.89e-04 grad=0.62 tok/s=12970 elapsed=55.6m
|
| 122 |
+
step=5329 tok=43.7M loss=3.6386 lr=2.88e-04 grad=0.55 tok/s=12977 elapsed=56.1m
|
| 123 |
+
step=5380 tok=44.1M loss=3.5839 lr=2.88e-04 grad=0.62 tok/s=12984 elapsed=56.6m
|
| 124 |
+
step=5431 tok=44.5M loss=3.4910 lr=2.88e-04 grad=0.67 tok/s=12991 elapsed=57.1m
|
| 125 |
+
step=5482 tok=44.9M loss=3.5029 lr=2.88e-04 grad=0.59 tok/s=12998 elapsed=57.6m
|
| 126 |
+
step=5533 tok=45.3M loss=3.5649 lr=2.88e-04 grad=0.56 tok/s=13004 elapsed=58.1m
|
| 127 |
+
step=5584 tok=45.7M loss=3.5418 lr=2.87e-04 grad=0.61 tok/s=13011 elapsed=58.6m
|
| 128 |
+
step=5635 tok=46.2M loss=3.5353 lr=2.87e-04 grad=0.57 tok/s=13017 elapsed=59.1m
|
| 129 |
+
step=5686 tok=46.6M loss=3.4530 lr=2.87e-04 grad=0.56 tok/s=13023 elapsed=59.6m
|
| 130 |
+
step=5734 tok=47.0M loss=3.5402 lr=2.87e-04 grad=0.64 tok/s=13024 elapsed=60.1m
|
| 131 |
+
step=5785 tok=47.4M loss=3.5076 lr=2.86e-04 grad=0.63 tok/s=13030 elapsed=60.6m
|
| 132 |
+
step=5836 tok=47.8M loss=3.5178 lr=2.86e-04 grad=0.59 tok/s=13036 elapsed=61.1m
|
| 133 |
+
step=5887 tok=48.2M loss=3.5052 lr=2.86e-04 grad=0.59 tok/s=13042 elapsed=61.6m
|
| 134 |
+
step=5938 tok=48.6M loss=3.4877 lr=2.86e-04 grad=0.60 tok/s=13047 elapsed=62.1m
|
| 135 |
+
step=5989 tok=49.1M loss=3.4650 lr=2.85e-04 grad=0.55 tok/s=13053 elapsed=62.6m
|
| 136 |
+
|
| 137 |
+
=== Eval 2/12 at step 6002 ===
|
| 138 |
+
val_loss=3.4666
|
| 139 |
+
ar: 3.2744
|
| 140 |
+
en: 3.8507
|
| 141 |
+
fr: 4.2812
|
| 142 |
+
es: 4.7639
|
| 143 |
+
ru: 5.5767
|
| 144 |
+
zh: 6.8516
|
| 145 |
+
tr: 4.6016
|
| 146 |
+
code: 2.1758
|
| 147 |
+
math: 3.5958
|
| 148 |
+
classical: 3.2098
|
| 149 |
+
Generation on 140 prompts done in 112s
|
| 150 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_02.pt
|
| 151 |
+
|
| 152 |
+
step=6003 tok=49.2M loss=3.5423 lr=2.85e-04 grad=0.62 tok/s=12677 elapsed=64.7m
|
| 153 |
+
step=6054 tok=49.6M loss=3.5449 lr=2.85e-04 grad=0.55 tok/s=12686 elapsed=65.2m
|
| 154 |
+
step=6105 tok=50.0M loss=3.4854 lr=2.85e-04 grad=0.58 tok/s=12694 elapsed=65.7m
|
| 155 |
+
step=6156 tok=50.4M loss=3.5083 lr=2.85e-04 grad=0.80 tok/s=12702 elapsed=66.2m
|
| 156 |
+
step=6207 tok=50.8M loss=3.4688 lr=2.84e-04 grad=0.57 tok/s=12710 elapsed=66.7m
|
| 157 |
+
step=6258 tok=51.3M loss=3.4829 lr=2.84e-04 grad=0.66 tok/s=12718 elapsed=67.2m
|
| 158 |
+
step=6309 tok=51.7M loss=3.4439 lr=2.84e-04 grad=0.65 tok/s=12726 elapsed=67.7m
|
| 159 |
+
step=6360 tok=52.1M loss=3.4117 lr=2.84e-04 grad=0.54 tok/s=12733 elapsed=68.2m
|
| 160 |
+
step=6411 tok=52.5M loss=3.4902 lr=2.83e-04 grad=0.72 tok/s=12741 elapsed=68.7m
|
| 161 |
+
step=6462 tok=52.9M loss=3.4165 lr=2.83e-04 grad=0.59 tok/s=12748 elapsed=69.2m
|
| 162 |
+
step=6513 tok=53.4M loss=3.4532 lr=2.83e-04 grad=0.55 tok/s=12755 elapsed=69.7m
|
| 163 |
+
step=6564 tok=53.8M loss=3.3856 lr=2.83e-04 grad=0.58 tok/s=12762 elapsed=70.2m
|
| 164 |
+
step=6615 tok=54.2M loss=3.5311 lr=2.82e-04 grad=0.53 tok/s=12769 elapsed=70.7m
|
| 165 |
+
step=6666 tok=54.6M loss=3.4703 lr=2.82e-04 grad=0.76 tok/s=12776 elapsed=71.2m
|
| 166 |
+
step=6717 tok=55.0M loss=3.3981 lr=2.82e-04 grad=0.59 tok/s=12783 elapsed=71.7m
|
| 167 |
+
step=6768 tok=55.4M loss=3.4332 lr=2.81e-04 grad=0.58 tok/s=12790 elapsed=72.3m
|
| 168 |
+
step=6819 tok=55.9M loss=3.4070 lr=2.81e-04 grad=0.55 tok/s=12796 elapsed=72.8m
|
| 169 |
+
step=6870 tok=56.3M loss=3.4571 lr=2.81e-04 grad=0.56 tok/s=12803 elapsed=73.3m
|
| 170 |
+
step=6921 tok=56.7M loss=3.3946 lr=2.81e-04 grad=0.56 tok/s=12809 elapsed=73.8m
|
| 171 |
+
step=6972 tok=57.1M loss=3.3908 lr=2.80e-04 grad=0.59 tok/s=12816 elapsed=74.3m
|
| 172 |
+
step=7023 tok=57.5M loss=3.4124 lr=2.80e-04 grad=0.54 tok/s=12822 elapsed=74.8m
|
| 173 |
+
step=7074 tok=58.0M loss=3.4027 lr=2.80e-04 grad=0.64 tok/s=12828 elapsed=75.3m
|
| 174 |
+
step=7125 tok=58.4M loss=3.3865 lr=2.79e-04 grad=0.62 tok/s=12835 elapsed=75.8m
|
| 175 |
+
step=7176 tok=58.8M loss=3.3744 lr=2.79e-04 grad=0.57 tok/s=12841 elapsed=76.3m
|
| 176 |
+
step=7227 tok=59.2M loss=3.4069 lr=2.79e-04 grad=0.79 tok/s=12847 elapsed=76.8m
|
| 177 |
+
step=7278 tok=59.6M loss=3.3607 lr=2.79e-04 grad=0.54 tok/s=12853 elapsed=77.3m
|
| 178 |
+
step=7329 tok=60.0M loss=3.4398 lr=2.78e-04 grad=0.61 tok/s=12859 elapsed=77.8m
|
| 179 |
+
step=7380 tok=60.5M loss=3.3293 lr=2.78e-04 grad=0.57 tok/s=12864 elapsed=78.3m
|
| 180 |
+
step=7431 tok=60.9M loss=3.3572 lr=2.78e-04 grad=0.59 tok/s=12870 elapsed=78.8m
|
| 181 |
+
step=7482 tok=61.3M loss=3.3706 lr=2.77e-04 grad=0.53 tok/s=12876 elapsed=79.3m
|
| 182 |
+
step=7533 tok=61.7M loss=3.3490 lr=2.77e-04 grad=0.57 tok/s=12881 elapsed=79.8m
|
| 183 |
+
step=7584 tok=62.1M loss=3.3133 lr=2.77e-04 grad=0.55 tok/s=12887 elapsed=80.4m
|
| 184 |
+
step=7635 tok=62.5M loss=3.2547 lr=2.76e-04 grad=0.53 tok/s=12892 elapsed=80.9m
|
| 185 |
+
step=7686 tok=63.0M loss=3.2272 lr=2.76e-04 grad=0.68 tok/s=12897 elapsed=81.4m
|
| 186 |
+
step=7737 tok=63.4M loss=3.3011 lr=2.76e-04 grad=0.51 tok/s=12903 elapsed=81.9m
|
| 187 |
+
step=7788 tok=63.8M loss=3.2931 lr=2.75e-04 grad=0.61 tok/s=12908 elapsed=82.4m
|
| 188 |
+
step=7839 tok=64.2M loss=3.4444 lr=2.75e-04 grad=0.53 tok/s=12913 elapsed=82.9m
|
| 189 |
+
step=7890 tok=64.6M loss=3.3015 lr=2.75e-04 grad=0.55 tok/s=12918 elapsed=83.4m
|
| 190 |
+
step=7941 tok=65.1M loss=3.2876 lr=2.75e-04 grad=0.56 tok/s=12923 elapsed=83.9m
|
| 191 |
+
step=7992 tok=65.5M loss=3.3275 lr=2.74e-04 grad=0.53 tok/s=12928 elapsed=84.4m
|
| 192 |
+
step=8043 tok=65.9M loss=3.3631 lr=2.74e-04 grad=0.58 tok/s=12933 elapsed=84.9m
|
| 193 |
+
step=8094 tok=66.3M loss=3.2274 lr=2.74e-04 grad=0.59 tok/s=12938 elapsed=85.4m
|
| 194 |
+
step=8145 tok=66.7M loss=3.2839 lr=2.73e-04 grad=0.55 tok/s=12943 elapsed=85.9m
|
| 195 |
+
step=8195 tok=67.1M loss=3.2374 lr=2.73e-04 grad=0.57 tok/s=12945 elapsed=86.4m
|
| 196 |
+
step=8246 tok=67.6M loss=3.2967 lr=2.73e-04 grad=0.58 tok/s=12950 elapsed=86.9m
|
| 197 |
+
step=8297 tok=68.0M loss=3.2454 lr=2.72e-04 grad=0.66 tok/s=12955 elapsed=87.4m
|
| 198 |
+
step=8348 tok=68.4M loss=3.1527 lr=2.72e-04 grad=0.52 tok/s=12960 elapsed=87.9m
|
| 199 |
+
step=8399 tok=68.8M loss=3.2289 lr=2.72e-04 grad=0.54 tok/s=12964 elapsed=88.5m
|
| 200 |
+
step=8450 tok=69.2M loss=3.2268 lr=2.71e-04 grad=0.54 tok/s=12969 elapsed=89.0m
|
| 201 |
+
step=8501 tok=69.6M loss=3.2281 lr=2.71e-04 grad=0.48 tok/s=12974 elapsed=89.5m
|
| 202 |
+
step=8552 tok=70.1M loss=3.2235 lr=2.71e-04 grad=0.52 tok/s=12978 elapsed=90.0m
|
| 203 |
+
step=8603 tok=70.5M loss=3.3162 lr=2.70e-04 grad=0.61 tok/s=12983 elapsed=90.5m
|
| 204 |
+
step=8654 tok=70.9M loss=3.2331 lr=2.70e-04 grad=0.56 tok/s=12987 elapsed=91.0m
|
| 205 |
+
step=8705 tok=71.3M loss=3.2882 lr=2.70e-04 grad=0.51 tok/s=12992 elapsed=91.5m
|
| 206 |
+
step=8756 tok=71.7M loss=3.2372 lr=2.69e-04 grad=0.57 tok/s=12996 elapsed=92.0m
|
| 207 |
+
step=8807 tok=72.1M loss=3.2424 lr=2.69e-04 grad=0.54 tok/s=13001 elapsed=92.5m
|
| 208 |
+
step=8858 tok=72.6M loss=3.2978 lr=2.68e-04 grad=0.52 tok/s=13005 elapsed=93.0m
|
| 209 |
+
step=8909 tok=73.0M loss=3.2083 lr=2.68e-04 grad=0.54 tok/s=13009 elapsed=93.5m
|
| 210 |
+
step=8960 tok=73.4M loss=3.1728 lr=2.68e-04 grad=0.54 tok/s=13014 elapsed=94.0m
|
| 211 |
+
step=9011 tok=73.8M loss=3.2893 lr=2.67e-04 grad=0.77 tok/s=13018 elapsed=94.5m
|
| 212 |
+
|
| 213 |
+
=== Eval 3/12 at step 9025 ===
|
| 214 |
+
val_loss=3.2378
|
| 215 |
+
ar: 3.0161
|
| 216 |
+
en: 3.4698
|
| 217 |
+
es: 4.4609
|
| 218 |
+
ru: 4.8542
|
| 219 |
+
zh: 5.9875
|
| 220 |
+
tr: 4.3686
|
| 221 |
+
code: 2.0102
|
| 222 |
+
math: 3.3086
|
| 223 |
+
classical: 3.1250
|
| 224 |
+
Generation on 140 prompts done in 111s
|
| 225 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_03.pt
|
| 226 |
+
|
| 227 |
+
step=9026 tok=73.9M loss=3.3441 lr=2.67e-04 grad=0.62 tok/s=12769 elapsed=96.5m
|
| 228 |
+
step=9077 tok=74.4M loss=3.3477 lr=2.67e-04 grad=0.53 tok/s=12775 elapsed=97.0m
|
| 229 |
+
step=9128 tok=74.8M loss=3.2929 lr=2.67e-04 grad=0.53 tok/s=12780 elapsed=97.5m
|
| 230 |
+
step=9179 tok=75.2M loss=3.2306 lr=2.66e-04 grad=0.54 tok/s=12786 elapsed=98.0m
|
| 231 |
+
step=9230 tok=75.6M loss=3.1406 lr=2.66e-04 grad=0.57 tok/s=12791 elapsed=98.5m
|
| 232 |
+
step=9281 tok=76.0M loss=3.2188 lr=2.66e-04 grad=0.52 tok/s=12796 elapsed=99.0m
|
| 233 |
+
step=9332 tok=76.4M loss=3.2714 lr=2.65e-04 grad=0.56 tok/s=12801 elapsed=99.5m
|
| 234 |
+
step=9383 tok=76.9M loss=3.2834 lr=2.65e-04 grad=0.71 tok/s=12806 elapsed=100.0m
|
| 235 |
+
step=9434 tok=77.3M loss=3.1716 lr=2.64e-04 grad=0.52 tok/s=12811 elapsed=100.5m
|
| 236 |
+
step=9485 tok=77.7M loss=3.1805 lr=2.64e-04 grad=0.69 tok/s=12816 elapsed=101.0m
|
| 237 |
+
step=9536 tok=78.1M loss=3.2059 lr=2.64e-04 grad=0.55 tok/s=12821 elapsed=101.6m
|
| 238 |
+
step=9587 tok=78.5M loss=3.1923 lr=2.63e-04 grad=0.54 tok/s=12826 elapsed=102.1m
|
| 239 |
+
step=9638 tok=79.0M loss=3.1978 lr=2.63e-04 grad=0.56 tok/s=12830 elapsed=102.6m
|
| 240 |
+
step=9689 tok=79.4M loss=3.1394 lr=2.63e-04 grad=0.58 tok/s=12835 elapsed=103.1m
|
| 241 |
+
step=9740 tok=79.8M loss=3.1864 lr=2.62e-04 grad=0.57 tok/s=12840 elapsed=103.6m
|
| 242 |
+
step=9791 tok=80.2M loss=3.1897 lr=2.62e-04 grad=0.52 tok/s=12845 elapsed=104.1m
|
| 243 |
+
step=9842 tok=80.6M loss=3.1713 lr=2.61e-04 grad=0.61 tok/s=12849 elapsed=104.6m
|
| 244 |
+
step=9893 tok=81.0M loss=3.2056 lr=2.61e-04 grad=0.53 tok/s=12854 elapsed=105.1m
|
| 245 |
+
step=9944 tok=81.5M loss=3.1305 lr=2.61e-04 grad=0.60 tok/s=12859 elapsed=105.6m
|
| 246 |
+
step=9995 tok=81.9M loss=3.1271 lr=2.60e-04 grad=0.56 tok/s=12863 elapsed=106.1m
|
| 247 |
+
step=10046 tok=82.3M loss=3.1927 lr=2.60e-04 grad=0.55 tok/s=12868 elapsed=106.6m
|
| 248 |
+
step=10097 tok=82.7M loss=3.0936 lr=2.59e-04 grad=0.66 tok/s=12872 elapsed=107.1m
|
| 249 |
+
step=10148 tok=83.1M loss=3.1267 lr=2.59e-04 grad=0.53 tok/s=12877 elapsed=107.6m
|
| 250 |
+
step=10199 tok=83.6M loss=3.2119 lr=2.59e-04 grad=0.52 tok/s=12881 elapsed=108.1m
|
| 251 |
+
step=10250 tok=84.0M loss=3.1325 lr=2.58e-04 grad=0.50 tok/s=12885 elapsed=108.6m
|
| 252 |
+
step=10301 tok=84.4M loss=3.1953 lr=2.58e-04 grad=0.64 tok/s=12890 elapsed=109.1m
|
| 253 |
+
step=10352 tok=84.8M loss=3.1705 lr=2.57e-04 grad=0.54 tok/s=12894 elapsed=109.6m
|
| 254 |
+
step=10403 tok=85.2M loss=3.2171 lr=2.57e-04 grad=0.57 tok/s=12898 elapsed=110.1m
|
| 255 |
+
step=10454 tok=85.6M loss=3.1114 lr=2.57e-04 grad=0.55 tok/s=12902 elapsed=110.6m
|
| 256 |
+
step=10505 tok=86.1M loss=3.1661 lr=2.56e-04 grad=0.58 tok/s=12907 elapsed=111.1m
|
| 257 |
+
step=10556 tok=86.5M loss=3.1555 lr=2.56e-04 grad=0.53 tok/s=12911 elapsed=111.6m
|
| 258 |
+
step=10607 tok=86.9M loss=3.1130 lr=2.55e-04 grad=0.60 tok/s=12915 elapsed=112.1m
|
| 259 |
+
step=10658 tok=87.3M loss=3.1434 lr=2.55e-04 grad=0.54 tok/s=12919 elapsed=112.6m
|
| 260 |
+
step=10709 tok=87.7M loss=3.0946 lr=2.55e-04 grad=0.58 tok/s=12923 elapsed=113.1m
|
| 261 |
+
step=10760 tok=88.1M loss=3.0947 lr=2.54e-04 grad=0.51 tok/s=12927 elapsed=113.6m
|
| 262 |
+
step=10810 tok=88.6M loss=3.1688 lr=2.54e-04 grad=0.56 tok/s=12929 elapsed=114.2m
|
| 263 |
+
step=10861 tok=89.0M loss=3.0846 lr=2.53e-04 grad=0.73 tok/s=12933 elapsed=114.7m
|
| 264 |
+
step=10912 tok=89.4M loss=3.1265 lr=2.53e-04 grad=0.54 tok/s=12937 elapsed=115.2m
|
| 265 |
+
step=10963 tok=89.8M loss=3.0453 lr=2.53e-04 grad=0.52 tok/s=12941 elapsed=115.7m
|
| 266 |
+
step=11014 tok=90.2M loss=3.0763 lr=2.52e-04 grad=0.59 tok/s=12945 elapsed=116.2m
|
| 267 |
+
step=11065 tok=90.6M loss=3.1788 lr=2.52e-04 grad=0.55 tok/s=12948 elapsed=116.7m
|
| 268 |
+
step=11116 tok=91.1M loss=3.0771 lr=2.51e-04 grad=0.80 tok/s=12952 elapsed=117.2m
|
| 269 |
+
step=11167 tok=91.5M loss=3.1344 lr=2.51e-04 grad=0.51 tok/s=12956 elapsed=117.7m
|
| 270 |
+
step=11218 tok=91.9M loss=3.0758 lr=2.50e-04 grad=0.56 tok/s=12960 elapsed=118.2m
|
| 271 |
+
step=11269 tok=92.3M loss=3.1473 lr=2.50e-04 grad=0.57 tok/s=12963 elapsed=118.7m
|
| 272 |
+
step=11320 tok=92.7M loss=3.0561 lr=2.50e-04 grad=0.53 tok/s=12967 elapsed=119.2m
|
| 273 |
+
step=11371 tok=93.2M loss=3.1232 lr=2.49e-04 grad=0.50 tok/s=12971 elapsed=119.7m
|
| 274 |
+
step=11422 tok=93.6M loss=3.1387 lr=2.49e-04 grad=0.58 tok/s=12974 elapsed=120.2m
|
| 275 |
+
step=11473 tok=94.0M loss=3.1672 lr=2.48e-04 grad=0.54 tok/s=12978 elapsed=120.7m
|
| 276 |
+
step=11524 tok=94.4M loss=3.1309 lr=2.48e-04 grad=0.51 tok/s=12981 elapsed=121.2m
|
| 277 |
+
step=11575 tok=94.8M loss=3.1022 lr=2.47e-04 grad=0.75 tok/s=12985 elapsed=121.7m
|
| 278 |
+
step=11626 tok=95.2M loss=3.0854 lr=2.47e-04 grad=0.53 tok/s=12988 elapsed=122.2m
|
| 279 |
+
step=11677 tok=95.7M loss=2.9982 lr=2.47e-04 grad=0.54 tok/s=12992 elapsed=122.7m
|
| 280 |
+
step=11728 tok=96.1M loss=3.1676 lr=2.46e-04 grad=0.53 tok/s=12995 elapsed=123.2m
|
| 281 |
+
step=11779 tok=96.5M loss=3.1191 lr=2.46e-04 grad=0.54 tok/s=12999 elapsed=123.7m
|
| 282 |
+
step=11830 tok=96.9M loss=3.1729 lr=2.45e-04 grad=0.53 tok/s=13002 elapsed=124.2m
|
| 283 |
+
step=11881 tok=97.3M loss=3.1855 lr=2.45e-04 grad=0.57 tok/s=13005 elapsed=124.7m
|
| 284 |
+
step=11932 tok=97.7M loss=3.0123 lr=2.44e-04 grad=0.54 tok/s=13009 elapsed=125.2m
|
| 285 |
+
step=11983 tok=98.2M loss=2.9989 lr=2.44e-04 grad=0.57 tok/s=13012 elapsed=125.7m
|
| 286 |
+
step=12034 tok=98.6M loss=3.0833 lr=2.43e-04 grad=0.55 tok/s=13015 elapsed=126.2m
|
| 287 |
+
|
| 288 |
+
=== Eval 4/12 at step 12061 ===
|
| 289 |
+
val_loss=3.0922
|
| 290 |
+
ar: 2.8425
|
| 291 |
+
en: 3.4273
|
| 292 |
+
fr: 3.6063
|
| 293 |
+
es: 4.0531
|
| 294 |
+
ru: 4.2617
|
| 295 |
+
zh: 5.7396
|
| 296 |
+
tr: 4.4112
|
| 297 |
+
code: 2.0174
|
| 298 |
+
math: 3.2752
|
| 299 |
+
classical: 2.9448
|
| 300 |
+
Generation on 140 prompts done in 112s
|
| 301 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_04.pt
|
| 302 |
+
|
| 303 |
+
step=12062 tok=98.8M loss=3.1384 lr=2.43e-04 grad=0.57 tok/s=12828 elapsed=128.4m
|
| 304 |
+
step=12113 tok=99.2M loss=3.1511 lr=2.43e-04 grad=0.51 tok/s=12832 elapsed=128.9m
|
| 305 |
+
step=12164 tok=99.6M loss=3.1365 lr=2.42e-04 grad=0.55 tok/s=12836 elapsed=129.4m
|
| 306 |
+
step=12215 tok=100.1M loss=3.0913 lr=2.42e-04 grad=0.53 tok/s=12840 elapsed=129.9m
|
| 307 |
+
step=12266 tok=100.5M loss=3.0597 lr=2.41e-04 grad=0.62 tok/s=12844 elapsed=130.4m
|
| 308 |
+
step=12317 tok=100.9M loss=2.9818 lr=2.41e-04 grad=0.47 tok/s=12848 elapsed=130.9m
|
| 309 |
+
step=12368 tok=101.3M loss=3.0344 lr=2.41e-04 grad=0.57 tok/s=12851 elapsed=131.4m
|
| 310 |
+
step=12419 tok=101.7M loss=3.0762 lr=2.40e-04 grad=0.54 tok/s=12855 elapsed=131.9m
|
| 311 |
+
step=12470 tok=102.2M loss=3.0318 lr=2.40e-04 grad=0.54 tok/s=12859 elapsed=132.4m
|
| 312 |
+
step=12521 tok=102.6M loss=3.0814 lr=2.39e-04 grad=0.62 tok/s=12863 elapsed=132.9m
|
| 313 |
+
step=12572 tok=103.0M loss=2.9994 lr=2.39e-04 grad=0.53 tok/s=12866 elapsed=133.4m
|
| 314 |
+
step=12623 tok=103.4M loss=3.0640 lr=2.38e-04 grad=0.50 tok/s=12870 elapsed=133.9m
|
| 315 |
+
step=12674 tok=103.8M loss=3.0772 lr=2.38e-04 grad=0.52 tok/s=12874 elapsed=134.4m
|
| 316 |
+
step=12725 tok=104.2M loss=3.0392 lr=2.37e-04 grad=0.58 tok/s=12877 elapsed=134.9m
|
| 317 |
+
step=12776 tok=104.7M loss=3.0437 lr=2.37e-04 grad=0.51 tok/s=12881 elapsed=135.4m
|
| 318 |
+
step=12827 tok=105.1M loss=3.0470 lr=2.36e-04 grad=0.49 tok/s=12884 elapsed=135.9m
|
| 319 |
+
step=12878 tok=105.5M loss=2.9949 lr=2.36e-04 grad=0.53 tok/s=12888 elapsed=136.4m
|
| 320 |
+
step=12929 tok=105.9M loss=3.0292 lr=2.35e-04 grad=0.54 tok/s=12891 elapsed=136.9m
|
| 321 |
+
step=12980 tok=106.3M loss=3.1301 lr=2.35e-04 grad=0.52 tok/s=12895 elapsed=137.4m
|
| 322 |
+
step=13031 tok=106.7M loss=3.0437 lr=2.35e-04 grad=0.53 tok/s=12898 elapsed=137.9m
|
| 323 |
+
step=13082 tok=107.2M loss=3.0746 lr=2.34e-04 grad=0.62 tok/s=12902 elapsed=138.4m
|
| 324 |
+
step=13133 tok=107.6M loss=3.0901 lr=2.34e-04 grad=0.56 tok/s=12905 elapsed=138.9m
|
| 325 |
+
step=13184 tok=108.0M loss=2.9805 lr=2.33e-04 grad=0.55 tok/s=12909 elapsed=139.4m
|
| 326 |
+
step=13235 tok=108.4M loss=3.0733 lr=2.33e-04 grad=0.66 tok/s=12912 elapsed=140.0m
|
| 327 |
+
step=13286 tok=108.8M loss=3.0180 lr=2.32e-04 grad=0.61 tok/s=12915 elapsed=140.5m
|
| 328 |
+
step=13337 tok=109.3M loss=3.0154 lr=2.32e-04 grad=0.56 tok/s=12919 elapsed=141.0m
|
| 329 |
+
step=13388 tok=109.7M loss=3.0439 lr=2.31e-04 grad=0.60 tok/s=12922 elapsed=141.5m
|
| 330 |
+
step=13439 tok=110.1M loss=2.9735 lr=2.31e-04 grad=0.52 tok/s=12925 elapsed=142.0m
|
| 331 |
+
step=13490 tok=110.5M loss=3.0065 lr=2.30e-04 grad=0.50 tok/s=12928 elapsed=142.5m
|
| 332 |
+
step=13541 tok=110.9M loss=3.0683 lr=2.30e-04 grad=0.50 tok/s=12932 elapsed=143.0m
|
| 333 |
+
step=13592 tok=111.3M loss=2.9514 lr=2.29e-04 grad=0.53 tok/s=12935 elapsed=143.5m
|
| 334 |
+
step=13643 tok=111.8M loss=3.0176 lr=2.29e-04 grad=0.52 tok/s=12938 elapsed=144.0m
|
| 335 |
+
step=13693 tok=112.2M loss=3.0521 lr=2.28e-04 grad=0.56 tok/s=12940 elapsed=144.5m
|
| 336 |
+
step=13744 tok=112.6M loss=3.0335 lr=2.28e-04 grad=0.53 tok/s=12943 elapsed=145.0m
|
| 337 |
+
step=13795 tok=113.0M loss=2.9735 lr=2.27e-04 grad=0.62 tok/s=12946 elapsed=145.5m
|
| 338 |
+
step=13846 tok=113.4M loss=2.9868 lr=2.27e-04 grad=0.46 tok/s=12949 elapsed=146.0m
|
| 339 |
+
step=13897 tok=113.8M loss=3.0010 lr=2.26e-04 grad=0.57 tok/s=12953 elapsed=146.5m
|
| 340 |
+
step=13948 tok=114.3M loss=3.0562 lr=2.26e-04 grad=0.59 tok/s=12956 elapsed=147.0m
|
| 341 |
+
step=13999 tok=114.7M loss=2.9834 lr=2.25e-04 grad=0.56 tok/s=12959 elapsed=147.5m
|
| 342 |
+
step=14050 tok=115.1M loss=3.0068 lr=2.25e-04 grad=0.54 tok/s=12962 elapsed=148.0m
|
| 343 |
+
step=14101 tok=115.5M loss=3.0620 lr=2.24e-04 grad=0.52 tok/s=12965 elapsed=148.5m
|
| 344 |
+
step=14152 tok=115.9M loss=3.0307 lr=2.24e-04 grad=0.57 tok/s=12968 elapsed=149.0m
|
| 345 |
+
step=14203 tok=116.4M loss=2.9859 lr=2.23e-04 grad=0.52 tok/s=12971 elapsed=149.5m
|
| 346 |
+
step=14254 tok=116.8M loss=2.9676 lr=2.23e-04 grad=0.51 tok/s=12973 elapsed=150.0m
|
| 347 |
+
step=14305 tok=117.2M loss=2.9742 lr=2.22e-04 grad=0.53 tok/s=12976 elapsed=150.5m
|
| 348 |
+
step=14356 tok=117.6M loss=3.0184 lr=2.22e-04 grad=0.52 tok/s=12979 elapsed=151.0m
|
| 349 |
+
step=14407 tok=118.0M loss=3.0655 lr=2.21e-04 grad=0.75 tok/s=12982 elapsed=151.5m
|
| 350 |
+
step=14458 tok=118.4M loss=2.9029 lr=2.21e-04 grad=0.57 tok/s=12985 elapsed=152.0m
|
| 351 |
+
step=14509 tok=118.9M loss=2.9556 lr=2.20e-04 grad=0.56 tok/s=12988 elapsed=152.5m
|
| 352 |
+
step=14560 tok=119.3M loss=3.0209 lr=2.20e-04 grad=0.53 tok/s=12991 elapsed=153.0m
|
| 353 |
+
step=14611 tok=119.7M loss=3.0433 lr=2.19e-04 grad=0.59 tok/s=12994 elapsed=153.5m
|
| 354 |
+
step=14662 tok=120.1M loss=2.9296 lr=2.19e-04 grad=0.64 tok/s=12996 elapsed=154.0m
|
| 355 |
+
step=14713 tok=120.5M loss=2.8776 lr=2.18e-04 grad=0.50 tok/s=12999 elapsed=154.5m
|
| 356 |
+
step=14764 tok=120.9M loss=2.9298 lr=2.18e-04 grad=0.51 tok/s=13002 elapsed=155.0m
|
| 357 |
+
step=14815 tok=121.4M loss=2.9967 lr=2.17e-04 grad=0.63 tok/s=13005 elapsed=155.5m
|
| 358 |
+
step=14866 tok=121.8M loss=2.8732 lr=2.17e-04 grad=0.55 tok/s=13007 elapsed=156.0m
|
| 359 |
+
step=14917 tok=122.2M loss=2.9845 lr=2.16e-04 grad=0.62 tok/s=13010 elapsed=156.5m
|
| 360 |
+
step=14968 tok=122.6M loss=2.9743 lr=2.16e-04 grad=0.53 tok/s=13013 elapsed=157.0m
|
| 361 |
+
step=15019 tok=123.0M loss=3.0283 lr=2.15e-04 grad=0.65 tok/s=13015 elapsed=157.6m
|
| 362 |
+
step=15070 tok=123.5M loss=3.0133 lr=2.15e-04 grad=0.58 tok/s=13018 elapsed=158.1m
|
| 363 |
+
|
| 364 |
+
=== Eval 5/12 at step 15103 ===
|
| 365 |
+
val_loss=3.0250
|
| 366 |
+
ar: 2.8748
|
| 367 |
+
en: 3.2946
|
| 368 |
+
fr: 3.7188
|
| 369 |
+
es: 4.1580
|
| 370 |
+
ru: 4.5312
|
| 371 |
+
zh: 5.2109
|
| 372 |
+
tr: 4.2627
|
| 373 |
+
code: 1.8239
|
| 374 |
+
math: 3.0963
|
| 375 |
+
classical: 2.6982
|
| 376 |
+
Generation on 140 prompts done in 111s
|
| 377 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_05.pt
|
| 378 |
+
|
| 379 |
+
step=15104 tok=123.7M loss=2.9327 lr=2.15e-04 grad=0.59 tok/s=12869 elapsed=160.3m
|
| 380 |
+
step=15156 tok=124.2M loss=2.9755 lr=2.14e-04 grad=0.52 tok/s=12872 elapsed=160.8m
|
| 381 |
+
step=15207 tok=124.6M loss=2.9663 lr=2.14e-04 grad=0.53 tok/s=12875 elapsed=161.3m
|
| 382 |
+
step=15258 tok=125.0M loss=2.9702 lr=2.13e-04 grad=0.60 tok/s=12878 elapsed=161.8m
|
| 383 |
+
step=15309 tok=125.4M loss=2.8839 lr=2.13e-04 grad=0.50 tok/s=12881 elapsed=162.3m
|
| 384 |
+
step=15360 tok=125.8M loss=2.9323 lr=2.12e-04 grad=0.55 tok/s=12884 elapsed=162.8m
|
| 385 |
+
step=15411 tok=126.2M loss=2.9767 lr=2.11e-04 grad=0.57 tok/s=12887 elapsed=163.3m
|
| 386 |
+
step=15462 tok=126.7M loss=3.0394 lr=2.11e-04 grad=0.56 tok/s=12890 elapsed=163.8m
|
| 387 |
+
step=15513 tok=127.1M loss=2.8930 lr=2.10e-04 grad=0.56 tok/s=12893 elapsed=164.3m
|
| 388 |
+
step=15564 tok=127.5M loss=2.9442 lr=2.10e-04 grad=0.48 tok/s=12896 elapsed=164.8m
|
| 389 |
+
step=15615 tok=127.9M loss=2.9326 lr=2.09e-04 grad=0.66 tok/s=12899 elapsed=165.3m
|
| 390 |
+
step=15666 tok=128.3M loss=2.9343 lr=2.09e-04 grad=0.54 tok/s=12901 elapsed=165.8m
|
| 391 |
+
step=15717 tok=128.8M loss=2.9276 lr=2.08e-04 grad=0.61 tok/s=12904 elapsed=166.3m
|
| 392 |
+
step=15768 tok=129.2M loss=2.8812 lr=2.08e-04 grad=0.53 tok/s=12907 elapsed=166.8m
|
| 393 |
+
step=15819 tok=129.6M loss=2.8640 lr=2.07e-04 grad=0.55 tok/s=12910 elapsed=167.3m
|
| 394 |
+
step=15870 tok=130.0M loss=2.9768 lr=2.07e-04 grad=0.64 tok/s=12913 elapsed=167.8m
|
| 395 |
+
step=15921 tok=130.4M loss=2.9967 lr=2.06e-04 grad=0.58 tok/s=12915 elapsed=168.3m
|
| 396 |
+
step=15972 tok=130.8M loss=2.9106 lr=2.06e-04 grad=0.51 tok/s=12918 elapsed=168.8m
|
| 397 |
+
step=16023 tok=131.3M loss=2.9829 lr=2.05e-04 grad=0.55 tok/s=12921 elapsed=169.3m
|
| 398 |
+
step=16074 tok=131.7M loss=2.8367 lr=2.05e-04 grad=0.63 tok/s=12924 elapsed=169.8m
|
| 399 |
+
step=16125 tok=132.1M loss=2.8917 lr=2.04e-04 grad=0.56 tok/s=12927 elapsed=170.3m
|
| 400 |
+
step=16176 tok=132.5M loss=2.9571 lr=2.04e-04 grad=0.59 tok/s=12929 elapsed=170.8m
|
| 401 |
+
step=16227 tok=132.9M loss=2.9155 lr=2.03e-04 grad=0.61 tok/s=12932 elapsed=171.3m
|
| 402 |
+
step=16278 tok=133.3M loss=2.8900 lr=2.03e-04 grad=1.47 tok/s=12935 elapsed=171.8m
|
| 403 |
+
step=16329 tok=133.8M loss=2.9398 lr=2.02e-04 grad=0.55 tok/s=12937 elapsed=172.3m
|
| 404 |
+
step=16380 tok=134.2M loss=2.9244 lr=2.02e-04 grad=0.58 tok/s=12940 elapsed=172.8m
|
| 405 |
+
step=16431 tok=134.6M loss=2.9556 lr=2.01e-04 grad=0.60 tok/s=12943 elapsed=173.3m
|
| 406 |
+
step=16482 tok=135.0M loss=2.8921 lr=2.01e-04 grad=0.61 tok/s=12945 elapsed=173.8m
|
| 407 |
+
step=16533 tok=135.4M loss=2.9338 lr=2.00e-04 grad=0.53 tok/s=12948 elapsed=174.3m
|
| 408 |
+
step=16584 tok=135.9M loss=2.9382 lr=1.99e-04 grad=0.53 tok/s=12951 elapsed=174.8m
|
| 409 |
+
step=16634 tok=136.3M loss=2.9033 lr=1.99e-04 grad=0.52 tok/s=12952 elapsed=175.3m
|
| 410 |
+
step=16685 tok=136.7M loss=2.8940 lr=1.98e-04 grad=0.56 tok/s=12955 elapsed=175.8m
|
| 411 |
+
step=16736 tok=137.1M loss=2.8735 lr=1.98e-04 grad=1.93 tok/s=12957 elapsed=176.3m
|
| 412 |
+
step=16787 tok=137.5M loss=2.9334 lr=1.97e-04 grad=0.56 tok/s=12960 elapsed=176.9m
|
| 413 |
+
step=16838 tok=137.9M loss=2.9607 lr=1.97e-04 grad=0.54 tok/s=12963 elapsed=177.4m
|
| 414 |
+
step=16889 tok=138.4M loss=2.8247 lr=1.96e-04 grad=0.52 tok/s=12965 elapsed=177.9m
|
| 415 |
+
step=16940 tok=138.8M loss=2.9567 lr=1.96e-04 grad=0.52 tok/s=12968 elapsed=178.4m
|
| 416 |
+
step=16991 tok=139.2M loss=2.9430 lr=1.95e-04 grad=0.61 tok/s=12970 elapsed=178.9m
|
| 417 |
+
step=17042 tok=139.6M loss=2.8536 lr=1.95e-04 grad=0.62 tok/s=12973 elapsed=179.4m
|
| 418 |
+
step=17093 tok=140.0M loss=2.9271 lr=1.94e-04 grad=0.49 tok/s=12975 elapsed=179.9m
|
| 419 |
+
step=17144 tok=140.4M loss=2.8519 lr=1.94e-04 grad=0.55 tok/s=12978 elapsed=180.4m
|
| 420 |
+
step=17195 tok=140.9M loss=2.8666 lr=1.93e-04 grad=0.57 tok/s=12980 elapsed=180.9m
|
| 421 |
+
step=17246 tok=141.3M loss=2.9764 lr=1.93e-04 grad=0.52 tok/s=12983 elapsed=181.4m
|
| 422 |
+
step=17297 tok=141.7M loss=2.9025 lr=1.92e-04 grad=0.54 tok/s=12985 elapsed=181.9m
|
| 423 |
+
step=17348 tok=142.1M loss=2.9094 lr=1.91e-04 grad=0.52 tok/s=12987 elapsed=182.4m
|
| 424 |
+
step=17399 tok=142.5M loss=2.8937 lr=1.91e-04 grad=0.52 tok/s=12990 elapsed=182.9m
|
| 425 |
+
step=17450 tok=143.0M loss=2.8823 lr=1.90e-04 grad=0.53 tok/s=12992 elapsed=183.4m
|
| 426 |
+
step=17501 tok=143.4M loss=2.8767 lr=1.90e-04 grad=0.62 tok/s=12995 elapsed=183.9m
|
| 427 |
+
step=17552 tok=143.8M loss=2.8779 lr=1.89e-04 grad=0.61 tok/s=12997 elapsed=184.4m
|
| 428 |
+
step=17603 tok=144.2M loss=2.8209 lr=1.89e-04 grad=0.57 tok/s=12999 elapsed=184.9m
|
| 429 |
+
step=17654 tok=144.6M loss=2.8998 lr=1.88e-04 grad=0.55 tok/s=13002 elapsed=185.4m
|
| 430 |
+
step=17705 tok=145.0M loss=2.9550 lr=1.88e-04 grad=0.53 tok/s=13004 elapsed=185.9m
|
| 431 |
+
step=17756 tok=145.5M loss=2.8924 lr=1.87e-04 grad=0.52 tok/s=13006 elapsed=186.4m
|
| 432 |
+
step=17807 tok=145.9M loss=2.9410 lr=1.87e-04 grad=0.52 tok/s=13009 elapsed=186.9m
|
| 433 |
+
step=17858 tok=146.3M loss=2.8784 lr=1.86e-04 grad=0.69 tok/s=13011 elapsed=187.4m
|
| 434 |
+
step=17909 tok=146.7M loss=2.8394 lr=1.86e-04 grad=0.60 tok/s=13013 elapsed=187.9m
|
| 435 |
+
step=17960 tok=147.1M loss=2.9271 lr=1.85e-04 grad=0.54 tok/s=13015 elapsed=188.4m
|
| 436 |
+
step=18011 tok=147.5M loss=2.8977 lr=1.84e-04 grad=0.54 tok/s=13018 elapsed=188.9m
|
| 437 |
+
step=18062 tok=148.0M loss=2.9217 lr=1.84e-04 grad=0.57 tok/s=13020 elapsed=189.4m
|
| 438 |
+
step=18113 tok=148.4M loss=2.9233 lr=1.83e-04 grad=0.52 tok/s=13022 elapsed=189.9m
|
| 439 |
+
|
| 440 |
+
=== Eval 6/12 at step 18147 ===
|
| 441 |
+
val_loss=2.9397
|
| 442 |
+
ar: 2.7501
|
| 443 |
+
en: 3.1113
|
| 444 |
+
es: 4.0865
|
| 445 |
+
ru: 5.0234
|
| 446 |
+
zh: 5.6786
|
| 447 |
+
tr: 3.7370
|
| 448 |
+
code: 1.7600
|
| 449 |
+
math: 3.1434
|
| 450 |
+
classical: 2.7190
|
| 451 |
+
Generation on 140 prompts done in 112s
|
| 452 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_06.pt
|
| 453 |
+
|
| 454 |
+
step=18148 tok=148.7M loss=2.8416 lr=1.83e-04 grad=0.52 tok/s=12897 elapsed=192.1m
|
| 455 |
+
step=18200 tok=149.1M loss=2.8874 lr=1.82e-04 grad=0.66 tok/s=12900 elapsed=192.6m
|
| 456 |
+
step=18251 tok=149.5M loss=2.9504 lr=1.82e-04 grad=0.55 tok/s=12902 elapsed=193.1m
|
| 457 |
+
step=18302 tok=149.9M loss=2.8830 lr=1.81e-04 grad=0.59 tok/s=12905 elapsed=193.6m
|
| 458 |
+
step=18353 tok=150.3M loss=2.8687 lr=1.81e-04 grad=0.63 tok/s=12907 elapsed=194.1m
|
| 459 |
+
step=18404 tok=150.8M loss=2.9377 lr=1.80e-04 grad=0.61 tok/s=12910 elapsed=194.6m
|
| 460 |
+
step=18455 tok=151.2M loss=2.8028 lr=1.80e-04 grad=0.52 tok/s=12912 elapsed=195.1m
|
| 461 |
+
step=18506 tok=151.6M loss=2.8408 lr=1.79e-04 grad=0.51 tok/s=12915 elapsed=195.6m
|
| 462 |
+
step=18557 tok=152.0M loss=2.9252 lr=1.79e-04 grad=0.62 tok/s=12917 elapsed=196.1m
|
| 463 |
+
step=18608 tok=152.4M loss=2.9007 lr=1.78e-04 grad=0.52 tok/s=12920 elapsed=196.6m
|
| 464 |
+
step=18659 tok=152.9M loss=2.8233 lr=1.78e-04 grad=0.53 tok/s=12922 elapsed=197.2m
|
| 465 |
+
step=18710 tok=153.3M loss=2.7880 lr=1.77e-04 grad=0.53 tok/s=12924 elapsed=197.7m
|
| 466 |
+
step=18761 tok=153.7M loss=2.8649 lr=1.76e-04 grad=0.57 tok/s=12927 elapsed=198.2m
|
| 467 |
+
step=18812 tok=154.1M loss=2.9031 lr=1.76e-04 grad=0.54 tok/s=12929 elapsed=198.7m
|
| 468 |
+
step=18863 tok=154.5M loss=2.8032 lr=1.75e-04 grad=0.60 tok/s=12932 elapsed=199.2m
|
| 469 |
+
step=18914 tok=154.9M loss=2.7820 lr=1.75e-04 grad=0.56 tok/s=12934 elapsed=199.7m
|
| 470 |
+
step=18965 tok=155.4M loss=2.8419 lr=1.74e-04 grad=0.57 tok/s=12936 elapsed=200.2m
|
| 471 |
+
step=19016 tok=155.8M loss=2.9224 lr=1.74e-04 grad=0.54 tok/s=12939 elapsed=200.7m
|
| 472 |
+
step=19067 tok=156.2M loss=2.8691 lr=1.73e-04 grad=0.58 tok/s=12941 elapsed=201.2m
|
| 473 |
+
step=19118 tok=156.6M loss=2.8666 lr=1.73e-04 grad=0.52 tok/s=12943 elapsed=201.7m
|
| 474 |
+
step=19169 tok=157.0M loss=2.8298 lr=1.72e-04 grad=0.55 tok/s=12945 elapsed=202.2m
|
| 475 |
+
step=19220 tok=157.5M loss=2.8788 lr=1.72e-04 grad=0.57 tok/s=12948 elapsed=202.7m
|
| 476 |
+
step=19271 tok=157.9M loss=2.8701 lr=1.71e-04 grad=0.61 tok/s=12950 elapsed=203.2m
|
| 477 |
+
step=19322 tok=158.3M loss=2.8799 lr=1.70e-04 grad=0.55 tok/s=12952 elapsed=203.7m
|
| 478 |
+
step=19373 tok=158.7M loss=2.8907 lr=1.70e-04 grad=0.60 tok/s=12954 elapsed=204.2m
|
| 479 |
+
step=19424 tok=159.1M loss=2.8718 lr=1.69e-04 grad=0.66 tok/s=12957 elapsed=204.7m
|
| 480 |
+
step=19475 tok=159.5M loss=2.8347 lr=1.69e-04 grad=0.59 tok/s=12959 elapsed=205.2m
|
| 481 |
+
step=19526 tok=160.0M loss=2.7802 lr=1.68e-04 grad=0.59 tok/s=12961 elapsed=205.7m
|
| 482 |
+
step=19576 tok=160.4M loss=2.9076 lr=1.68e-04 grad=0.57 tok/s=12962 elapsed=206.2m
|
| 483 |
+
step=19627 tok=160.8M loss=2.9208 lr=1.67e-04 grad=0.57 tok/s=12965 elapsed=206.7m
|
| 484 |
+
step=19678 tok=161.2M loss=2.8137 lr=1.67e-04 grad=0.53 tok/s=12967 elapsed=207.2m
|
| 485 |
+
step=19729 tok=161.6M loss=2.8250 lr=1.66e-04 grad=0.54 tok/s=12969 elapsed=207.7m
|
| 486 |
+
step=19780 tok=162.0M loss=2.8012 lr=1.65e-04 grad=0.53 tok/s=12971 elapsed=208.2m
|
| 487 |
+
step=19831 tok=162.5M loss=2.7923 lr=1.65e-04 grad=0.55 tok/s=12973 elapsed=208.7m
|
| 488 |
+
step=19882 tok=162.9M loss=2.8951 lr=1.64e-04 grad=0.66 tok/s=12976 elapsed=209.2m
|
| 489 |
+
step=19933 tok=163.3M loss=2.8896 lr=1.64e-04 grad=0.67 tok/s=12978 elapsed=209.7m
|
| 490 |
+
step=19984 tok=163.7M loss=2.7634 lr=1.63e-04 grad=0.58 tok/s=12980 elapsed=210.2m
|
| 491 |
+
step=20035 tok=164.1M loss=2.8623 lr=1.63e-04 grad=0.54 tok/s=12982 elapsed=210.7m
|
| 492 |
+
step=20086 tok=164.5M loss=2.8470 lr=1.62e-04 grad=0.53 tok/s=12984 elapsed=211.2m
|
| 493 |
+
step=20137 tok=165.0M loss=2.7955 lr=1.62e-04 grad=0.56 tok/s=12986 elapsed=211.7m
|
| 494 |
+
step=20188 tok=165.4M loss=2.8326 lr=1.61e-04 grad=0.47 tok/s=12988 elapsed=212.2m
|
| 495 |
+
step=20239 tok=165.8M loss=2.7090 lr=1.61e-04 grad=0.52 tok/s=12990 elapsed=212.7m
|
| 496 |
+
step=20290 tok=166.2M loss=2.8182 lr=1.60e-04 grad=0.54 tok/s=12992 elapsed=213.2m
|
| 497 |
+
step=20341 tok=166.6M loss=2.8232 lr=1.59e-04 grad=0.54 tok/s=12994 elapsed=213.7m
|
| 498 |
+
step=20392 tok=167.1M loss=2.8172 lr=1.59e-04 grad=0.53 tok/s=12996 elapsed=214.2m
|
| 499 |
+
step=20443 tok=167.5M loss=2.8223 lr=1.58e-04 grad=0.73 tok/s=12998 elapsed=214.7m
|
| 500 |
+
step=20494 tok=167.9M loss=2.8321 lr=1.58e-04 grad=0.54 tok/s=13001 elapsed=215.2m
|
| 501 |
+
step=20545 tok=168.3M loss=2.8312 lr=1.57e-04 grad=0.56 tok/s=13003 elapsed=215.7m
|
| 502 |
+
step=20596 tok=168.7M loss=2.8118 lr=1.57e-04 grad=0.58 tok/s=13005 elapsed=216.2m
|
| 503 |
+
step=20647 tok=169.1M loss=2.8048 lr=1.56e-04 grad=0.56 tok/s=13007 elapsed=216.7m
|
| 504 |
+
step=20698 tok=169.6M loss=2.8269 lr=1.56e-04 grad=0.52 tok/s=13009 elapsed=217.2m
|
| 505 |
+
step=20749 tok=170.0M loss=2.8395 lr=1.55e-04 grad=0.62 tok/s=13011 elapsed=217.7m
|
| 506 |
+
step=20800 tok=170.4M loss=2.7757 lr=1.55e-04 grad=0.54 tok/s=13013 elapsed=218.2m
|
| 507 |
+
step=20851 tok=170.8M loss=2.7076 lr=1.54e-04 grad=0.52 tok/s=13015 elapsed=218.7m
|
| 508 |
+
step=20902 tok=171.2M loss=2.7882 lr=1.53e-04 grad=0.58 tok/s=13017 elapsed=219.2m
|
| 509 |
+
step=20953 tok=171.6M loss=2.8328 lr=1.53e-04 grad=0.55 tok/s=13019 elapsed=219.7m
|
| 510 |
+
step=21004 tok=172.1M loss=2.8453 lr=1.52e-04 grad=0.61 tok/s=13020 elapsed=220.2m
|
| 511 |
+
step=21055 tok=172.5M loss=2.7964 lr=1.52e-04 grad=0.54 tok/s=13022 elapsed=220.8m
|
| 512 |
+
step=21106 tok=172.9M loss=2.8101 lr=1.51e-04 grad=0.51 tok/s=13024 elapsed=221.3m
|
| 513 |
+
step=21157 tok=173.3M loss=2.8085 lr=1.51e-04 grad=0.57 tok/s=13026 elapsed=221.8m
|
| 514 |
+
|
| 515 |
+
=== Eval 7/12 at step 21194 ===
|
| 516 |
+
val_loss=2.7981
|
| 517 |
+
ar: 2.6185
|
| 518 |
+
en: 3.1091
|
| 519 |
+
fr: 3.9531
|
| 520 |
+
es: 3.9635
|
| 521 |
+
ru: 4.1750
|
| 522 |
+
zh: 5.4688
|
| 523 |
+
tr: 3.8877
|
| 524 |
+
code: 1.6938
|
| 525 |
+
math: 3.1078
|
| 526 |
+
classical: 2.5073
|
| 527 |
+
Generation on 140 prompts done in 112s
|
| 528 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_07.pt
|
| 529 |
+
|
| 530 |
+
step=21195 tok=173.6M loss=2.7731 lr=1.50e-04 grad=0.55 tok/s=12919 elapsed=224.0m
|
| 531 |
+
step=21247 tok=174.1M loss=2.8818 lr=1.50e-04 grad=0.53 tok/s=12921 elapsed=224.5m
|
| 532 |
+
step=21298 tok=174.5M loss=2.7322 lr=1.49e-04 grad=0.52 tok/s=12924 elapsed=225.0m
|
| 533 |
+
step=21349 tok=174.9M loss=2.8215 lr=1.49e-04 grad=0.58 tok/s=12926 elapsed=225.5m
|
| 534 |
+
step=21400 tok=175.3M loss=2.7818 lr=1.48e-04 grad=0.59 tok/s=12928 elapsed=226.0m
|
| 535 |
+
step=21451 tok=175.7M loss=2.8680 lr=1.48e-04 grad=0.55 tok/s=12930 elapsed=226.5m
|
| 536 |
+
step=21502 tok=176.1M loss=2.8080 lr=1.47e-04 grad=0.70 tok/s=12932 elapsed=227.0m
|
| 537 |
+
step=21553 tok=176.6M loss=2.7379 lr=1.46e-04 grad=0.55 tok/s=12934 elapsed=227.5m
|
| 538 |
+
step=21604 tok=177.0M loss=2.7677 lr=1.46e-04 grad=0.61 tok/s=12936 elapsed=228.0m
|
| 539 |
+
step=21655 tok=177.4M loss=2.7922 lr=1.45e-04 grad=0.52 tok/s=12938 elapsed=228.5m
|
| 540 |
+
step=21706 tok=177.8M loss=2.7912 lr=1.45e-04 grad=0.49 tok/s=12940 elapsed=229.0m
|
| 541 |
+
step=21757 tok=178.2M loss=2.7891 lr=1.44e-04 grad=0.54 tok/s=12942 elapsed=229.5m
|
| 542 |
+
step=21808 tok=178.7M loss=2.7091 lr=1.44e-04 grad=0.55 tok/s=12944 elapsed=230.0m
|
| 543 |
+
step=21859 tok=179.1M loss=2.8348 lr=1.43e-04 grad=0.87 tok/s=12946 elapsed=230.5m
|
| 544 |
+
step=21910 tok=179.5M loss=2.7627 lr=1.43e-04 grad=0.51 tok/s=12948 elapsed=231.0m
|
| 545 |
+
step=21961 tok=179.9M loss=2.8436 lr=1.42e-04 grad=0.51 tok/s=12950 elapsed=231.5m
|
| 546 |
+
step=22012 tok=180.3M loss=2.7206 lr=1.42e-04 grad=0.52 tok/s=12952 elapsed=232.0m
|
| 547 |
+
step=22063 tok=180.7M loss=2.7977 lr=1.41e-04 grad=0.54 tok/s=12954 elapsed=232.5m
|
| 548 |
+
step=22114 tok=181.2M loss=2.8109 lr=1.41e-04 grad=0.57 tok/s=12956 elapsed=233.0m
|
| 549 |
+
step=22165 tok=181.6M loss=2.8081 lr=1.40e-04 grad=0.59 tok/s=12958 elapsed=233.5m
|
| 550 |
+
step=22216 tok=182.0M loss=2.7944 lr=1.39e-04 grad=0.53 tok/s=12960 elapsed=234.0m
|
| 551 |
+
step=22267 tok=182.4M loss=2.6751 lr=1.39e-04 grad=0.55 tok/s=12962 elapsed=234.5m
|
| 552 |
+
step=22318 tok=182.8M loss=2.8545 lr=1.38e-04 grad=0.60 tok/s=12964 elapsed=235.0m
|
| 553 |
+
step=22369 tok=183.2M loss=2.7928 lr=1.38e-04 grad=0.76 tok/s=12966 elapsed=235.6m
|
| 554 |
+
step=22420 tok=183.7M loss=2.7787 lr=1.37e-04 grad=0.55 tok/s=12968 elapsed=236.1m
|
| 555 |
+
step=22471 tok=184.1M loss=2.7591 lr=1.37e-04 grad=0.55 tok/s=12970 elapsed=236.6m
|
| 556 |
+
step=22522 tok=184.5M loss=2.8114 lr=1.36e-04 grad=0.57 tok/s=12971 elapsed=237.1m
|
| 557 |
+
step=22573 tok=184.9M loss=2.7778 lr=1.36e-04 grad=0.55 tok/s=12973 elapsed=237.6m
|
| 558 |
+
step=22623 tok=185.3M loss=2.8189 lr=1.35e-04 grad=0.74 tok/s=12974 elapsed=238.1m
|
| 559 |
+
step=22674 tok=185.7M loss=2.8103 lr=1.35e-04 grad=0.77 tok/s=12976 elapsed=238.6m
|
| 560 |
+
step=22725 tok=186.2M loss=2.8056 lr=1.34e-04 grad=0.55 tok/s=12978 elapsed=239.1m
|
| 561 |
+
step=22776 tok=186.6M loss=2.7690 lr=1.34e-04 grad=0.53 tok/s=12980 elapsed=239.6m
|
| 562 |
+
step=22827 tok=187.0M loss=2.7387 lr=1.33e-04 grad=0.54 tok/s=12982 elapsed=240.1m
|
| 563 |
+
step=22878 tok=187.4M loss=2.8227 lr=1.33e-04 grad=0.65 tok/s=12984 elapsed=240.6m
|
| 564 |
+
step=22929 tok=187.8M loss=2.7507 lr=1.32e-04 grad=0.56 tok/s=12986 elapsed=241.1m
|
| 565 |
+
step=22980 tok=188.3M loss=2.7623 lr=1.31e-04 grad=0.52 tok/s=12987 elapsed=241.6m
|
| 566 |
+
step=23031 tok=188.7M loss=2.7472 lr=1.31e-04 grad=0.57 tok/s=12989 elapsed=242.1m
|
| 567 |
+
step=23082 tok=189.1M loss=2.7353 lr=1.30e-04 grad=0.54 tok/s=12991 elapsed=242.6m
|
| 568 |
+
step=23133 tok=189.5M loss=2.7509 lr=1.30e-04 grad=0.54 tok/s=12993 elapsed=243.1m
|
| 569 |
+
step=23184 tok=189.9M loss=2.6869 lr=1.29e-04 grad=0.53 tok/s=12995 elapsed=243.6m
|
| 570 |
+
step=23235 tok=190.3M loss=2.7978 lr=1.29e-04 grad=0.54 tok/s=12997 elapsed=244.1m
|
| 571 |
+
step=23286 tok=190.8M loss=2.7560 lr=1.28e-04 grad=0.56 tok/s=12998 elapsed=244.6m
|
| 572 |
+
step=23337 tok=191.2M loss=2.8022 lr=1.28e-04 grad=0.55 tok/s=13000 elapsed=245.1m
|
| 573 |
+
step=23388 tok=191.6M loss=2.7787 lr=1.27e-04 grad=0.59 tok/s=13002 elapsed=245.6m
|
| 574 |
+
step=23439 tok=192.0M loss=2.7585 lr=1.27e-04 grad=0.59 tok/s=13004 elapsed=246.1m
|
| 575 |
+
step=23490 tok=192.4M loss=2.7478 lr=1.26e-04 grad=0.61 tok/s=13005 elapsed=246.6m
|
| 576 |
+
step=23541 tok=192.8M loss=2.7702 lr=1.26e-04 grad=0.57 tok/s=13007 elapsed=247.1m
|
| 577 |
+
step=23592 tok=193.3M loss=2.8115 lr=1.25e-04 grad=0.57 tok/s=13009 elapsed=247.6m
|
| 578 |
+
step=23643 tok=193.7M loss=2.7725 lr=1.25e-04 grad=0.70 tok/s=13011 elapsed=248.1m
|
| 579 |
+
step=23694 tok=194.1M loss=2.7042 lr=1.24e-04 grad=0.58 tok/s=13012 elapsed=248.6m
|
| 580 |
+
step=23745 tok=194.5M loss=2.7547 lr=1.24e-04 grad=0.59 tok/s=13014 elapsed=249.1m
|
| 581 |
+
step=23796 tok=194.9M loss=2.8160 lr=1.23e-04 grad=0.64 tok/s=13016 elapsed=249.6m
|
| 582 |
+
step=23847 tok=195.4M loss=2.7757 lr=1.23e-04 grad=0.66 tok/s=13018 elapsed=250.1m
|
| 583 |
+
step=23898 tok=195.8M loss=2.7737 lr=1.22e-04 grad=0.55 tok/s=13019 elapsed=250.6m
|
| 584 |
+
step=23949 tok=196.2M loss=2.7803 lr=1.21e-04 grad=0.54 tok/s=13021 elapsed=251.1m
|
| 585 |
+
step=24000 tok=196.6M loss=2.7175 lr=1.21e-04 grad=0.62 tok/s=13023 elapsed=251.6m
|
| 586 |
+
step=24051 tok=197.0M loss=2.6972 lr=1.20e-04 grad=0.70 tok/s=13024 elapsed=252.1m
|
| 587 |
+
step=24102 tok=197.4M loss=2.7568 lr=1.20e-04 grad=0.57 tok/s=13026 elapsed=252.6m
|
| 588 |
+
step=24153 tok=197.9M loss=2.7059 lr=1.19e-04 grad=0.63 tok/s=13028 elapsed=253.1m
|
| 589 |
+
step=24204 tok=198.3M loss=2.7956 lr=1.19e-04 grad=0.58 tok/s=13029 elapsed=253.6m
|
| 590 |
+
|
| 591 |
+
=== Eval 8/12 at step 24240 ===
|
| 592 |
+
val_loss=2.8359
|
| 593 |
+
ar: 2.6462
|
| 594 |
+
en: 3.0823
|
| 595 |
+
fr: 3.3326
|
| 596 |
+
es: 3.8056
|
| 597 |
+
ru: 4.8438
|
| 598 |
+
zh: 5.3937
|
| 599 |
+
tr: 3.7914
|
| 600 |
+
code: 1.6448
|
| 601 |
+
math: 3.0799
|
| 602 |
+
classical: 2.6897
|
| 603 |
+
Generation on 140 prompts done in 111s
|
| 604 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_08.pt
|
| 605 |
+
|
| 606 |
+
step=24241 tok=198.6M loss=2.7982 lr=1.19e-04 grad=0.55 tok/s=12936 elapsed=255.9m
|
| 607 |
+
step=24293 tok=199.0M loss=2.7978 lr=1.18e-04 grad=0.55 tok/s=12938 elapsed=256.4m
|
| 608 |
+
step=24344 tok=199.4M loss=2.7555 lr=1.17e-04 grad=0.66 tok/s=12940 elapsed=256.9m
|
| 609 |
+
step=24395 tok=199.8M loss=2.7029 lr=1.17e-04 grad=0.55 tok/s=12941 elapsed=257.4m
|
| 610 |
+
step=24446 tok=200.3M loss=2.7518 lr=1.16e-04 grad=0.57 tok/s=12943 elapsed=257.9m
|
| 611 |
+
step=24497 tok=200.7M loss=2.7268 lr=1.16e-04 grad=0.54 tok/s=12945 elapsed=258.4m
|
| 612 |
+
step=24548 tok=201.1M loss=2.7139 lr=1.15e-04 grad=0.64 tok/s=12947 elapsed=258.9m
|
| 613 |
+
step=24599 tok=201.5M loss=2.7136 lr=1.15e-04 grad=0.52 tok/s=12948 elapsed=259.4m
|
| 614 |
+
step=24650 tok=201.9M loss=2.7652 lr=1.14e-04 grad=0.53 tok/s=12950 elapsed=259.9m
|
| 615 |
+
step=24701 tok=202.4M loss=2.7839 lr=1.14e-04 grad=0.61 tok/s=12952 elapsed=260.4m
|
| 616 |
+
step=24752 tok=202.8M loss=2.7729 lr=1.13e-04 grad=0.71 tok/s=12954 elapsed=260.9m
|
| 617 |
+
step=24803 tok=203.2M loss=2.7753 lr=1.13e-04 grad=0.59 tok/s=12955 elapsed=261.4m
|
| 618 |
+
step=24854 tok=203.6M loss=2.7207 lr=1.12e-04 grad=0.61 tok/s=12957 elapsed=261.9m
|
| 619 |
+
step=24905 tok=204.0M loss=2.7338 lr=1.12e-04 grad=0.61 tok/s=12959 elapsed=262.4m
|
| 620 |
+
step=24956 tok=204.4M loss=2.7666 lr=1.11e-04 grad=0.63 tok/s=12960 elapsed=262.9m
|
| 621 |
+
step=25007 tok=204.9M loss=2.7328 lr=1.11e-04 grad=0.64 tok/s=12962 elapsed=263.4m
|
| 622 |
+
step=25058 tok=205.3M loss=2.7625 lr=1.10e-04 grad=0.55 tok/s=12964 elapsed=263.9m
|
| 623 |
+
step=25109 tok=205.7M loss=2.7149 lr=1.10e-04 grad=0.63 tok/s=12965 elapsed=264.4m
|
| 624 |
+
step=25160 tok=206.1M loss=2.7617 lr=1.09e-04 grad=0.57 tok/s=12967 elapsed=264.9m
|
| 625 |
+
step=25211 tok=206.5M loss=2.7318 lr=1.09e-04 grad=0.65 tok/s=12969 elapsed=265.4m
|
| 626 |
+
step=25262 tok=206.9M loss=2.7195 lr=1.08e-04 grad=0.56 tok/s=12970 elapsed=265.9m
|
| 627 |
+
step=25313 tok=207.4M loss=2.7374 lr=1.08e-04 grad=0.65 tok/s=12972 elapsed=266.4m
|
| 628 |
+
step=25364 tok=207.8M loss=2.7423 lr=1.07e-04 grad=0.56 tok/s=12973 elapsed=266.9m
|
| 629 |
+
step=25415 tok=208.2M loss=2.7176 lr=1.07e-04 grad=0.57 tok/s=12975 elapsed=267.4m
|
| 630 |
+
step=25466 tok=208.6M loss=2.6527 lr=1.06e-04 grad=0.63 tok/s=12977 elapsed=267.9m
|
| 631 |
+
step=25517 tok=209.0M loss=2.7190 lr=1.06e-04 grad=0.57 tok/s=12978 elapsed=268.4m
|
| 632 |
+
step=25568 tok=209.5M loss=2.6860 lr=1.05e-04 grad=0.65 tok/s=12980 elapsed=268.9m
|
| 633 |
+
step=25619 tok=209.9M loss=2.7723 lr=1.05e-04 grad=0.65 tok/s=12981 elapsed=269.4m
|
| 634 |
+
step=25669 tok=210.3M loss=2.7335 lr=1.04e-04 grad=0.70 tok/s=12982 elapsed=270.0m
|
| 635 |
+
step=25720 tok=210.7M loss=2.7262 lr=1.04e-04 grad=0.56 tok/s=12984 elapsed=270.5m
|
| 636 |
+
step=25771 tok=211.1M loss=2.6754 lr=1.03e-04 grad=0.63 tok/s=12986 elapsed=271.0m
|
| 637 |
+
step=25822 tok=211.5M loss=2.6575 lr=1.03e-04 grad=0.57 tok/s=12987 elapsed=271.5m
|
| 638 |
+
step=25873 tok=212.0M loss=2.6798 lr=1.02e-04 grad=0.60 tok/s=12989 elapsed=272.0m
|
| 639 |
+
step=25924 tok=212.4M loss=2.7749 lr=1.02e-04 grad=0.71 tok/s=12990 elapsed=272.5m
|
| 640 |
+
step=25975 tok=212.8M loss=2.7589 lr=1.02e-04 grad=0.58 tok/s=12992 elapsed=273.0m
|
| 641 |
+
step=26026 tok=213.2M loss=2.6749 lr=1.01e-04 grad=0.59 tok/s=12994 elapsed=273.5m
|
| 642 |
+
step=26077 tok=213.6M loss=2.7371 lr=1.01e-04 grad=0.59 tok/s=12995 elapsed=274.0m
|
| 643 |
+
step=26128 tok=214.0M loss=2.7649 lr=1.00e-04 grad=0.68 tok/s=12997 elapsed=274.5m
|
| 644 |
+
step=26179 tok=214.5M loss=2.7782 lr=9.96e-05 grad=0.79 tok/s=12998 elapsed=275.0m
|
| 645 |
+
step=26230 tok=214.9M loss=2.6483 lr=9.91e-05 grad=0.64 tok/s=13000 elapsed=275.5m
|
| 646 |
+
step=26281 tok=215.3M loss=2.8054 lr=9.86e-05 grad=0.54 tok/s=13001 elapsed=276.0m
|
| 647 |
+
step=26332 tok=215.7M loss=2.7233 lr=9.81e-05 grad=0.61 tok/s=13003 elapsed=276.5m
|
| 648 |
+
step=26383 tok=216.1M loss=2.6663 lr=9.77e-05 grad=0.57 tok/s=13004 elapsed=277.0m
|
| 649 |
+
step=26434 tok=216.5M loss=2.7439 lr=9.72e-05 grad=0.60 tok/s=13006 elapsed=277.5m
|
| 650 |
+
step=26485 tok=217.0M loss=2.6851 lr=9.67e-05 grad=0.59 tok/s=13008 elapsed=278.0m
|
| 651 |
+
step=26536 tok=217.4M loss=2.7681 lr=9.62e-05 grad=0.64 tok/s=13009 elapsed=278.5m
|
| 652 |
+
step=26587 tok=217.8M loss=2.6882 lr=9.58e-05 grad=0.55 tok/s=13011 elapsed=279.0m
|
| 653 |
+
step=26638 tok=218.2M loss=2.7066 lr=9.53e-05 grad=0.59 tok/s=13012 elapsed=279.5m
|
| 654 |
+
step=26689 tok=218.6M loss=2.6896 lr=9.48e-05 grad=0.54 tok/s=13014 elapsed=280.0m
|
| 655 |
+
step=26740 tok=219.1M loss=2.6944 lr=9.44e-05 grad=0.54 tok/s=13015 elapsed=280.5m
|
| 656 |
+
step=26791 tok=219.5M loss=2.7102 lr=9.39e-05 grad=0.70 tok/s=13016 elapsed=281.0m
|
| 657 |
+
step=26842 tok=219.9M loss=2.7165 lr=9.34e-05 grad=0.77 tok/s=13018 elapsed=281.5m
|
| 658 |
+
step=26893 tok=220.3M loss=2.6502 lr=9.30e-05 grad=0.55 tok/s=13019 elapsed=282.0m
|
| 659 |
+
step=26944 tok=220.7M loss=2.6546 lr=9.25e-05 grad=0.57 tok/s=13021 elapsed=282.5m
|
| 660 |
+
step=26995 tok=221.1M loss=2.7595 lr=9.20e-05 grad=0.61 tok/s=13022 elapsed=283.0m
|
| 661 |
+
step=27046 tok=221.6M loss=2.6999 lr=9.16e-05 grad=0.59 tok/s=13024 elapsed=283.5m
|
| 662 |
+
step=27097 tok=222.0M loss=2.7419 lr=9.11e-05 grad=0.65 tok/s=13025 elapsed=284.0m
|
| 663 |
+
step=27148 tok=222.4M loss=2.7230 lr=9.07e-05 grad=0.55 tok/s=13027 elapsed=284.5m
|
| 664 |
+
step=27199 tok=222.8M loss=2.7301 lr=9.02e-05 grad=0.55 tok/s=13028 elapsed=285.0m
|
| 665 |
+
step=27250 tok=223.2M loss=2.7566 lr=8.98e-05 grad=0.58 tok/s=13029 elapsed=285.5m
|
| 666 |
+
|
| 667 |
+
=== Eval 9/12 at step 27281 ===
|
| 668 |
+
val_loss=2.8306
|
| 669 |
+
ar: 2.5561
|
| 670 |
+
en: 3.0455
|
| 671 |
+
fr: 3.0990
|
| 672 |
+
es: 3.8887
|
| 673 |
+
ru: 4.3438
|
| 674 |
+
zh: 5.3153
|
| 675 |
+
tr: 4.0199
|
| 676 |
+
code: 1.5828
|
| 677 |
+
math: 3.1989
|
| 678 |
+
classical: 2.4880
|
| 679 |
+
Generation on 140 prompts done in 111s
|
| 680 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_09.pt
|
| 681 |
+
|
| 682 |
+
step=27282 tok=223.5M loss=2.6982 lr=8.95e-05 grad=0.59 tok/s=12947 elapsed=287.7m
|
| 683 |
+
step=27333 tok=223.9M loss=2.7164 lr=8.90e-05 grad=0.61 tok/s=12948 elapsed=288.2m
|
| 684 |
+
step=27384 tok=224.3M loss=2.7466 lr=8.86e-05 grad=0.57 tok/s=12950 elapsed=288.7m
|
| 685 |
+
step=27435 tok=224.7M loss=2.7170 lr=8.81e-05 grad=0.61 tok/s=12951 elapsed=289.2m
|
| 686 |
+
step=27486 tok=225.2M loss=2.7079 lr=8.77e-05 grad=0.60 tok/s=12953 elapsed=289.7m
|
| 687 |
+
step=27537 tok=225.6M loss=2.7889 lr=8.72e-05 grad=0.57 tok/s=12954 elapsed=290.2m
|
| 688 |
+
step=27588 tok=226.0M loss=2.6543 lr=8.68e-05 grad=0.59 tok/s=12956 elapsed=290.7m
|
| 689 |
+
step=27639 tok=226.4M loss=2.6345 lr=8.63e-05 grad=0.72 tok/s=12957 elapsed=291.2m
|
| 690 |
+
step=27690 tok=226.8M loss=2.6559 lr=8.59e-05 grad=0.65 tok/s=12959 elapsed=291.7m
|
| 691 |
+
step=27741 tok=227.3M loss=2.6746 lr=8.54e-05 grad=0.61 tok/s=12960 elapsed=292.2m
|
| 692 |
+
step=27792 tok=227.7M loss=2.6498 lr=8.50e-05 grad=0.56 tok/s=12962 elapsed=292.7m
|
| 693 |
+
step=27843 tok=228.1M loss=2.6391 lr=8.45e-05 grad=0.61 tok/s=12963 elapsed=293.3m
|
| 694 |
+
step=27894 tok=228.5M loss=2.7152 lr=8.41e-05 grad=0.62 tok/s=12965 elapsed=293.8m
|
| 695 |
+
step=27945 tok=228.9M loss=2.6506 lr=8.37e-05 grad=0.57 tok/s=12966 elapsed=294.3m
|
| 696 |
+
step=27996 tok=229.3M loss=2.7354 lr=8.32e-05 grad=0.52 tok/s=12968 elapsed=294.8m
|
| 697 |
+
step=28047 tok=229.8M loss=2.6645 lr=8.28e-05 grad=0.60 tok/s=12969 elapsed=295.3m
|
| 698 |
+
step=28098 tok=230.2M loss=2.6803 lr=8.24e-05 grad=0.53 tok/s=12971 elapsed=295.8m
|
| 699 |
+
step=28149 tok=230.6M loss=2.7080 lr=8.19e-05 grad=0.61 tok/s=12972 elapsed=296.3m
|
| 700 |
+
step=28200 tok=231.0M loss=2.7184 lr=8.15e-05 grad=0.56 tok/s=12974 elapsed=296.8m
|
| 701 |
+
step=28251 tok=231.4M loss=2.7128 lr=8.11e-05 grad=0.55 tok/s=12975 elapsed=297.3m
|
| 702 |
+
step=28302 tok=231.8M loss=2.7700 lr=8.06e-05 grad=0.59 tok/s=12976 elapsed=297.8m
|
| 703 |
+
step=28353 tok=232.3M loss=2.6943 lr=8.02e-05 grad=0.56 tok/s=12978 elapsed=298.3m
|
| 704 |
+
step=28404 tok=232.7M loss=2.7509 lr=7.98e-05 grad=0.59 tok/s=12979 elapsed=298.8m
|
| 705 |
+
step=28455 tok=233.1M loss=2.6688 lr=7.94e-05 grad=0.61 tok/s=12981 elapsed=299.3m
|
| 706 |
+
step=28506 tok=233.5M loss=2.6594 lr=7.89e-05 grad=0.52 tok/s=12982 elapsed=299.8m
|
| 707 |
+
step=28557 tok=233.9M loss=2.6505 lr=7.85e-05 grad=0.56 tok/s=12984 elapsed=300.3m
|
| 708 |
+
step=28608 tok=234.4M loss=2.7274 lr=7.81e-05 grad=0.59 tok/s=12985 elapsed=300.8m
|
| 709 |
+
step=28658 tok=234.8M loss=2.7440 lr=7.77e-05 grad=0.57 tok/s=12986 elapsed=301.3m
|
| 710 |
+
step=28709 tok=235.2M loss=2.6913 lr=7.73e-05 grad=0.61 tok/s=12987 elapsed=301.8m
|
| 711 |
+
step=28760 tok=235.6M loss=2.6750 lr=7.69e-05 grad=0.61 tok/s=12988 elapsed=302.3m
|
| 712 |
+
step=28811 tok=236.0M loss=2.6915 lr=7.64e-05 grad=0.68 tok/s=12990 elapsed=302.8m
|
| 713 |
+
step=28862 tok=236.4M loss=2.6500 lr=7.60e-05 grad=0.61 tok/s=12991 elapsed=303.3m
|
| 714 |
+
step=28913 tok=236.9M loss=2.6869 lr=7.56e-05 grad=0.66 tok/s=12993 elapsed=303.8m
|
| 715 |
+
step=28964 tok=237.3M loss=2.7037 lr=7.52e-05 grad=0.57 tok/s=12994 elapsed=304.3m
|
| 716 |
+
step=29015 tok=237.7M loss=2.6236 lr=7.48e-05 grad=0.61 tok/s=12995 elapsed=304.8m
|
| 717 |
+
step=29066 tok=238.1M loss=2.6719 lr=7.44e-05 grad=0.55 tok/s=12997 elapsed=305.3m
|
| 718 |
+
step=29117 tok=238.5M loss=2.7101 lr=7.40e-05 grad=0.57 tok/s=12998 elapsed=305.8m
|
| 719 |
+
step=29168 tok=238.9M loss=2.6453 lr=7.36e-05 grad=0.58 tok/s=12999 elapsed=306.4m
|
| 720 |
+
step=29219 tok=239.4M loss=2.5733 lr=7.32e-05 grad=0.52 tok/s=13001 elapsed=306.9m
|
| 721 |
+
step=29270 tok=239.8M loss=2.6823 lr=7.28e-05 grad=0.63 tok/s=13002 elapsed=307.4m
|
| 722 |
+
step=29321 tok=240.2M loss=2.6960 lr=7.24e-05 grad=0.58 tok/s=13003 elapsed=307.9m
|
| 723 |
+
step=29372 tok=240.6M loss=2.6964 lr=7.20e-05 grad=0.57 tok/s=13005 elapsed=308.4m
|
| 724 |
+
step=29423 tok=241.0M loss=2.6901 lr=7.16e-05 grad=0.71 tok/s=13006 elapsed=308.9m
|
| 725 |
+
step=29474 tok=241.5M loss=2.6620 lr=7.12e-05 grad=0.60 tok/s=13007 elapsed=309.4m
|
| 726 |
+
step=29525 tok=241.9M loss=2.6733 lr=7.08e-05 grad=0.59 tok/s=13009 elapsed=309.9m
|
| 727 |
+
step=29576 tok=242.3M loss=2.7231 lr=7.04e-05 grad=0.64 tok/s=13010 elapsed=310.4m
|
| 728 |
+
step=29627 tok=242.7M loss=2.6935 lr=7.00e-05 grad=0.62 tok/s=13011 elapsed=310.9m
|
| 729 |
+
step=29678 tok=243.1M loss=2.6719 lr=6.96e-05 grad=0.62 tok/s=13013 elapsed=311.4m
|
| 730 |
+
step=29729 tok=243.5M loss=2.6744 lr=6.92e-05 grad=0.68 tok/s=13014 elapsed=311.9m
|
| 731 |
+
step=29780 tok=244.0M loss=2.6197 lr=6.88e-05 grad=0.58 tok/s=13015 elapsed=312.4m
|
| 732 |
+
step=29831 tok=244.4M loss=2.6728 lr=6.85e-05 grad=0.65 tok/s=13017 elapsed=312.9m
|
| 733 |
+
step=29882 tok=244.8M loss=2.7175 lr=6.81e-05 grad=0.62 tok/s=13018 elapsed=313.4m
|
| 734 |
+
step=29933 tok=245.2M loss=2.7120 lr=6.77e-05 grad=0.53 tok/s=13019 elapsed=313.9m
|
| 735 |
+
step=29984 tok=245.6M loss=2.7124 lr=6.73e-05 grad=0.59 tok/s=13020 elapsed=314.4m
|
| 736 |
+
step=30035 tok=246.0M loss=2.7305 lr=6.69e-05 grad=0.70 tok/s=13022 elapsed=314.9m
|
| 737 |
+
step=30086 tok=246.5M loss=2.6999 lr=6.66e-05 grad=0.60 tok/s=13023 elapsed=315.4m
|
| 738 |
+
step=30137 tok=246.9M loss=2.6659 lr=6.62e-05 grad=0.64 tok/s=13024 elapsed=315.9m
|
| 739 |
+
step=30188 tok=247.3M loss=2.6508 lr=6.58e-05 grad=0.68 tok/s=13025 elapsed=316.4m
|
| 740 |
+
step=30239 tok=247.7M loss=2.7241 lr=6.54e-05 grad=0.57 tok/s=13027 elapsed=316.9m
|
| 741 |
+
step=30290 tok=248.1M loss=2.7479 lr=6.51e-05 grad=0.60 tok/s=13028 elapsed=317.4m
|
| 742 |
+
|
| 743 |
+
=== Eval 10/12 at step 30317 ===
|
| 744 |
+
val_loss=2.7550
|
| 745 |
+
ar: 2.5587
|
| 746 |
+
en: 2.9619
|
| 747 |
+
fr: 3.3698
|
| 748 |
+
es: 3.6222
|
| 749 |
+
ru: 3.8594
|
| 750 |
+
zh: 5.2719
|
| 751 |
+
tr: 3.7670
|
| 752 |
+
code: 1.6761
|
| 753 |
+
math: 2.9169
|
| 754 |
+
classical: 2.5482
|
| 755 |
+
Generation on 140 prompts done in 112s
|
| 756 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_10.pt
|
| 757 |
+
|
| 758 |
+
step=30318 tok=248.4M loss=2.6091 lr=6.49e-05 grad=0.58 tok/s=12953 elapsed=319.6m
|
| 759 |
+
step=30369 tok=248.8M loss=2.7115 lr=6.45e-05 grad=0.61 tok/s=12954 elapsed=320.1m
|
| 760 |
+
step=30420 tok=249.2M loss=2.7287 lr=6.41e-05 grad=0.57 tok/s=12955 elapsed=320.6m
|
| 761 |
+
step=30471 tok=249.6M loss=2.6802 lr=6.38e-05 grad=0.81 tok/s=12957 elapsed=321.1m
|
| 762 |
+
step=30522 tok=250.0M loss=2.7123 lr=6.34e-05 grad=0.57 tok/s=12958 elapsed=321.6m
|
| 763 |
+
step=30573 tok=250.5M loss=2.7323 lr=6.31e-05 grad=0.59 tok/s=12959 elapsed=322.1m
|
| 764 |
+
step=30624 tok=250.9M loss=2.6867 lr=6.27e-05 grad=0.62 tok/s=12961 elapsed=322.6m
|
| 765 |
+
step=30675 tok=251.3M loss=2.7179 lr=6.23e-05 grad=0.61 tok/s=12962 elapsed=323.1m
|
| 766 |
+
step=30726 tok=251.7M loss=2.6739 lr=6.20e-05 grad=0.63 tok/s=12963 elapsed=323.6m
|
| 767 |
+
step=30777 tok=252.1M loss=2.6725 lr=6.16e-05 grad=0.59 tok/s=12965 elapsed=324.1m
|
| 768 |
+
step=30828 tok=252.5M loss=2.6956 lr=6.13e-05 grad=0.58 tok/s=12966 elapsed=324.6m
|
| 769 |
+
step=30879 tok=253.0M loss=2.5900 lr=6.09e-05 grad=0.64 tok/s=12967 elapsed=325.1m
|
| 770 |
+
step=30930 tok=253.4M loss=2.6241 lr=6.06e-05 grad=0.56 tok/s=12969 elapsed=325.6m
|
| 771 |
+
step=30981 tok=253.8M loss=2.6526 lr=6.02e-05 grad=0.59 tok/s=12970 elapsed=326.1m
|
| 772 |
+
step=31032 tok=254.2M loss=2.6243 lr=5.99e-05 grad=0.58 tok/s=12971 elapsed=326.6m
|
| 773 |
+
step=31083 tok=254.6M loss=2.6883 lr=5.95e-05 grad=0.64 tok/s=12973 elapsed=327.1m
|
| 774 |
+
step=31134 tok=255.0M loss=2.6844 lr=5.92e-05 grad=0.65 tok/s=12974 elapsed=327.6m
|
| 775 |
+
step=31185 tok=255.5M loss=2.6749 lr=5.89e-05 grad=0.56 tok/s=12975 elapsed=328.1m
|
| 776 |
+
step=31236 tok=255.9M loss=2.6861 lr=5.85e-05 grad=0.57 tok/s=12976 elapsed=328.7m
|
| 777 |
+
step=31287 tok=256.3M loss=2.7235 lr=5.82e-05 grad=0.63 tok/s=12978 elapsed=329.2m
|
| 778 |
+
step=31338 tok=256.7M loss=2.6554 lr=5.79e-05 grad=0.61 tok/s=12979 elapsed=329.7m
|
| 779 |
+
step=31389 tok=257.1M loss=2.6678 lr=5.75e-05 grad=0.63 tok/s=12980 elapsed=330.2m
|
| 780 |
+
step=31440 tok=257.6M loss=2.7250 lr=5.72e-05 grad=0.58 tok/s=12982 elapsed=330.7m
|
| 781 |
+
step=31491 tok=258.0M loss=2.6583 lr=5.69e-05 grad=0.60 tok/s=12983 elapsed=331.2m
|
| 782 |
+
step=31542 tok=258.4M loss=2.6066 lr=5.65e-05 grad=0.59 tok/s=12984 elapsed=331.7m
|
| 783 |
+
step=31593 tok=258.8M loss=2.6150 lr=5.62e-05 grad=0.63 tok/s=12985 elapsed=332.2m
|
| 784 |
+
step=31644 tok=259.2M loss=2.6674 lr=5.59e-05 grad=0.58 tok/s=12987 elapsed=332.7m
|
| 785 |
+
step=31694 tok=259.6M loss=2.6464 lr=5.56e-05 grad=0.63 tok/s=12987 elapsed=333.2m
|
| 786 |
+
step=31745 tok=260.1M loss=2.6840 lr=5.53e-05 grad=0.74 tok/s=12989 elapsed=333.7m
|
| 787 |
+
step=31796 tok=260.5M loss=2.6549 lr=5.49e-05 grad=0.60 tok/s=12990 elapsed=334.2m
|
| 788 |
+
step=31847 tok=260.9M loss=2.5835 lr=5.46e-05 grad=0.66 tok/s=12991 elapsed=334.7m
|
| 789 |
+
step=31898 tok=261.3M loss=2.6483 lr=5.43e-05 grad=0.62 tok/s=12992 elapsed=335.2m
|
| 790 |
+
step=31949 tok=261.7M loss=2.6030 lr=5.40e-05 grad=0.58 tok/s=12994 elapsed=335.7m
|
| 791 |
+
step=32000 tok=262.1M loss=2.6956 lr=5.37e-05 grad=0.64 tok/s=12995 elapsed=336.2m
|
| 792 |
+
step=32051 tok=262.6M loss=2.6125 lr=5.34e-05 grad=0.57 tok/s=12996 elapsed=336.7m
|
| 793 |
+
step=32102 tok=263.0M loss=2.6941 lr=5.31e-05 grad=0.57 tok/s=12997 elapsed=337.2m
|
| 794 |
+
step=32153 tok=263.4M loss=2.6824 lr=5.28e-05 grad=0.66 tok/s=12998 elapsed=337.7m
|
| 795 |
+
step=32204 tok=263.8M loss=2.6792 lr=5.25e-05 grad=0.61 tok/s=13000 elapsed=338.2m
|
| 796 |
+
step=32255 tok=264.2M loss=2.6123 lr=5.22e-05 grad=0.60 tok/s=13001 elapsed=338.7m
|
| 797 |
+
step=32306 tok=264.7M loss=2.6626 lr=5.19e-05 grad=0.59 tok/s=13002 elapsed=339.2m
|
| 798 |
+
step=32357 tok=265.1M loss=2.6681 lr=5.16e-05 grad=0.59 tok/s=13003 elapsed=339.7m
|
| 799 |
+
step=32408 tok=265.5M loss=2.6381 lr=5.13e-05 grad=0.57 tok/s=13004 elapsed=340.2m
|
| 800 |
+
step=32459 tok=265.9M loss=2.7616 lr=5.10e-05 grad=0.61 tok/s=13006 elapsed=340.8m
|
| 801 |
+
step=32510 tok=266.3M loss=2.6928 lr=5.07e-05 grad=0.58 tok/s=13007 elapsed=341.3m
|
| 802 |
+
step=32561 tok=266.7M loss=2.6519 lr=5.04e-05 grad=0.62 tok/s=13008 elapsed=341.8m
|
| 803 |
+
step=32612 tok=267.2M loss=2.6476 lr=5.01e-05 grad=0.59 tok/s=13009 elapsed=342.3m
|
| 804 |
+
step=32663 tok=267.6M loss=2.6675 lr=4.98e-05 grad=0.66 tok/s=13010 elapsed=342.8m
|
| 805 |
+
step=32714 tok=268.0M loss=2.6406 lr=4.95e-05 grad=0.57 tok/s=13012 elapsed=343.3m
|
| 806 |
+
step=32765 tok=268.4M loss=2.6230 lr=4.92e-05 grad=0.61 tok/s=13013 elapsed=343.8m
|
| 807 |
+
step=32816 tok=268.8M loss=2.6184 lr=4.90e-05 grad=0.66 tok/s=13014 elapsed=344.3m
|
| 808 |
+
step=32867 tok=269.2M loss=2.6721 lr=4.87e-05 grad=0.54 tok/s=13015 elapsed=344.8m
|
| 809 |
+
step=32918 tok=269.7M loss=2.6217 lr=4.84e-05 grad=0.64 tok/s=13016 elapsed=345.3m
|
| 810 |
+
step=32969 tok=270.1M loss=2.7452 lr=4.81e-05 grad=0.62 tok/s=13018 elapsed=345.8m
|
| 811 |
+
step=33020 tok=270.5M loss=2.6887 lr=4.79e-05 grad=0.65 tok/s=13019 elapsed=346.3m
|
| 812 |
+
step=33071 tok=270.9M loss=2.6410 lr=4.76e-05 grad=0.57 tok/s=13020 elapsed=346.8m
|
| 813 |
+
step=33122 tok=271.3M loss=2.6373 lr=4.73e-05 grad=0.71 tok/s=13021 elapsed=347.3m
|
| 814 |
+
step=33173 tok=271.8M loss=2.5999 lr=4.70e-05 grad=0.54 tok/s=13022 elapsed=347.8m
|
| 815 |
+
step=33224 tok=272.2M loss=2.6905 lr=4.68e-05 grad=0.65 tok/s=13023 elapsed=348.3m
|
| 816 |
+
step=33275 tok=272.6M loss=2.6646 lr=4.65e-05 grad=0.55 tok/s=13024 elapsed=348.8m
|
| 817 |
+
step=33326 tok=273.0M loss=2.6588 lr=4.63e-05 grad=0.59 tok/s=13026 elapsed=349.3m
|
| 818 |
+
|
| 819 |
+
=== Eval 11/12 at step 33352 ===
|
| 820 |
+
val_loss=2.7341
|
| 821 |
+
ar: 2.5185
|
| 822 |
+
en: 2.9525
|
| 823 |
+
fr: 3.4330
|
| 824 |
+
es: 3.9414
|
| 825 |
+
ru: 4.1979
|
| 826 |
+
zh: 5.0580
|
| 827 |
+
tr: 3.3487
|
| 828 |
+
code: 1.7310
|
| 829 |
+
math: 2.8827
|
| 830 |
+
classical: 2.4728
|
| 831 |
+
Generation on 140 prompts done in 112s
|
| 832 |
+
Saved /home/aessam/arkadiko/output/checkpoints/llm/v4_a/ckpt_11.pt
|
| 833 |
+
|
| 834 |
+
step=33353 tok=273.2M loss=2.6957 lr=4.61e-05 grad=0.59 tok/s=12957 elapsed=351.5m
|
| 835 |
+
step=33404 tok=273.6M loss=2.6843 lr=4.59e-05 grad=0.61 tok/s=12958 elapsed=352.0m
|
| 836 |
+
step=33455 tok=274.1M loss=2.6993 lr=4.56e-05 grad=0.59 tok/s=12959 elapsed=352.5m
|
| 837 |
+
step=33506 tok=274.5M loss=2.6224 lr=4.53e-05 grad=0.62 tok/s=12961 elapsed=353.0m
|
| 838 |
+
step=33557 tok=274.9M loss=2.6439 lr=4.51e-05 grad=0.63 tok/s=12962 elapsed=353.5m
|
| 839 |
+
step=33608 tok=275.3M loss=2.7276 lr=4.48e-05 grad=0.72 tok/s=12963 elapsed=354.0m
|
| 840 |
+
step=33659 tok=275.7M loss=2.6838 lr=4.46e-05 grad=0.63 tok/s=12964 elapsed=354.5m
|
| 841 |
+
step=33710 tok=276.2M loss=2.6683 lr=4.43e-05 grad=0.59 tok/s=12965 elapsed=355.0m
|
| 842 |
+
step=33761 tok=276.6M loss=2.6793 lr=4.41e-05 grad=0.62 tok/s=12967 elapsed=355.5m
|
| 843 |
+
step=33812 tok=277.0M loss=2.6300 lr=4.39e-05 grad=0.66 tok/s=12968 elapsed=356.0m
|
| 844 |
+
step=33863 tok=277.4M loss=2.6601 lr=4.36e-05 grad=0.76 tok/s=12969 elapsed=356.5m
|
| 845 |
+
step=33914 tok=277.8M loss=2.6790 lr=4.34e-05 grad=0.64 tok/s=12970 elapsed=357.0m
|
| 846 |
+
step=33965 tok=278.2M loss=2.6010 lr=4.31e-05 grad=0.61 tok/s=12971 elapsed=357.5m
|
| 847 |
+
step=34016 tok=278.7M loss=2.6422 lr=4.29e-05 grad=0.62 tok/s=12973 elapsed=358.0m
|
| 848 |
+
step=34067 tok=279.1M loss=2.7317 lr=4.27e-05 grad=0.59 tok/s=12974 elapsed=358.5m
|
| 849 |
+
step=34118 tok=279.5M loss=2.7067 lr=4.24e-05 grad=0.63 tok/s=12975 elapsed=359.0m
|
| 850 |
+
step=34169 tok=279.9M loss=2.6949 lr=4.22e-05 grad=0.65 tok/s=12976 elapsed=359.5m
|
| 851 |
+
|
| 852 |
+
============================================================
|
| 853 |
+
TRAINING COMPLETE (6.00h)
|
| 854 |
+
Total steps: 34218
|
| 855 |
+
Total tokens: 280.3M
|
| 856 |
+
Avg tok/s: 12977
|
| 857 |
+
Variant: a
|
| 858 |
+
============================================================
|