", line 198 in _run_module_as_main
+
+Extension modules: numpy._core._multiarray_umath, numpy.linalg._umath_linalg, torch._C, torch._C._dynamo.autograd_compiler, torch._C._dynamo.eval_frame, torch._C._dynamo.guards, torch._C._dynamo.utils, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, zstandard.backend_c, pyarrow.lib, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._pcg64, numpy.random._mt19937, numpy.random._generator, numpy.random._philox, numpy.random._sfc64, numpy.random.mtrand, pandas._libs.tslibs.ccalendar, pandas._libs.tslibs.np_datetime, pandas._libs.tslibs.dtypes, pandas._libs.tslibs.base, pandas._libs.tslibs.nattype, pandas._libs.tslibs.timezones, pandas._libs.tslibs.fields, pandas._libs.tslibs.timedeltas, pandas._libs.tslibs.tzconversion, pandas._libs.tslibs.timestamps, pandas._libs.properties, pandas._libs.tslibs.offsets, pandas._libs.tslibs.strptime, pandas._libs.tslibs.parsing, pandas._libs.tslibs.conversion, pandas._libs.tslibs.period, pandas._libs.tslibs.vectorized, pandas._libs.ops_dispatch, pandas._libs.missing, pandas._libs.hashtable, pandas._libs.algos, pandas._libs.interval, pandas._libs.lib, pyarrow._compute, pandas._libs.ops, numexpr.interpreter, pandas._libs.hashing, pandas._libs.arrays, pandas._libs.tslib, pandas._libs.sparse, pandas._libs.internals, pandas._libs.indexing, pandas._libs.index, pandas._libs.writers, pandas._libs.join, pandas._libs.window.aggregations, pandas._libs.window.indexers, pandas._libs.reshape, pandas._libs.groupby, pandas._libs.json, pandas._libs.parsers, pandas._libs.testing, charset_normalizer.md, yaml._yaml, pyarrow._parquet, pyarrow._fs, pyarrow._azurefs, pyarrow._hdfs, pyarrow._gcsfs, pyarrow._s3fs, multidict._multidict, yarl._quoting_c, propcache._helpers_c, aiohttp._http_writer, aiohttp._http_parser, aiohttp._websocket.mask, aiohttp._websocket.reader_c, frozenlist._frozenlist, xxhash._xxhash, pyarrow._acero, pyarrow._csv, pyarrow._json, 
pyarrow._substrait, pyarrow._dataset, pyarrow._dataset_orc, pyarrow._parquet_encryption, pyarrow._dataset_parquet_encryption, pyarrow._dataset_parquet, markupsafe._speedups, PIL._imaging, sklearn.__check_build._check_build, scipy._lib._ccallback_c, scipy.sparse._sparsetools, _csparsetools, _cyutility, scipy._cyutility, scipy.sparse._csparsetools, psutil._psutil_linux, psutil._psutil_posix, scipy.special._ufuncs_cxx, scipy.special._ellip_harm_2, scipy.special._special_ufuncs, scipy.special._gufuncs, scipy.special._ufuncs, scipy.special._specfun, scipy.special._comb, scipy.linalg._fblas, scipy.linalg._flapack, scipy.linalg.cython_lapack, scipy.linalg._cythonized_array_utils, scipy.linalg._solve_toeplitz, scipy.linalg._decomp_lu_cython, scipy.linalg._matfuncs_schur_sqrtm, scipy.linalg._matfuncs_expm, scipy.linalg._linalg_pythran, scipy.linalg.cython_blas, scipy.linalg._decomp_update, scipy.sparse.linalg._dsolve._superlu, scipy.sparse.linalg._eigen.arpack._arpack, scipy.sparse.linalg._propack._spropack, scipy.sparse.linalg._propack._dpropack, scipy.sparse.linalg._propack._cpropack, scipy.sparse.linalg._propack._zpropack, scipy.spatial._ckdtree, scipy._lib.messagestream, scipy.spatial._qhull, scipy.spatial._voronoi, scipy.spatial._hausdorff, scipy.spatial._distance_wrap, scipy.spatial.transform._rotation, scipy.spatial.transform._rigid_transform, scipy.optimize._group_columns, scipy.optimize._trlib._trlib, scipy.optimize._lbfgsb, _moduleTNC, scipy.optimize._moduleTNC, scipy.optimize._slsqplib, scipy.optimize._minpack, scipy.optimize._lsq.givens_elimination, scipy.optimize._zeros, scipy._lib._uarray._uarray, scipy.linalg._decomp_interpolative, scipy.optimize._bglu_dense, scipy.optimize._lsap, scipy.optimize._direct, scipy.integrate._odepack, scipy.integrate._quadpack, scipy.integrate._vode, scipy.integrate._dop, scipy.integrate._lsoda, scipy.interpolate._fitpack, scipy.interpolate._dfitpack, scipy.interpolate._dierckx, scipy.interpolate._ppoly, 
scipy.interpolate._interpnd, scipy.interpolate._rbfinterp_pythran, scipy.interpolate._rgi_cython, scipy.special.cython_special, scipy.stats._stats, scipy.stats._biasedurn, scipy.stats._stats_pythran, scipy.stats._levy_stable.levyst, scipy.stats._ansari_swilk_statistics, scipy.sparse.csgraph._tools, scipy.sparse.csgraph._shortest_path, scipy.sparse.csgraph._traversal, scipy.sparse.csgraph._min_spanning_tree, scipy.sparse.csgraph._flow, scipy.sparse.csgraph._matching, scipy.sparse.csgraph._reordering, scipy.stats._sobol, scipy.stats._qmc_cy, scipy.stats._rcont.rcont, scipy.stats._qmvnt_cy, scipy.ndimage._nd_image, scipy.ndimage._rank_filter_1d, _ni_label, scipy.ndimage._ni_label, sklearn._cyutility, sklearn.utils._isfinite, sklearn.utils.sparsefuncs_fast, sklearn.utils.murmurhash, sklearn.utils._openmp_helpers, sklearn.metrics.cluster._expected_mutual_info_fast, sklearn.preprocessing._csr_polynomial_expansion, sklearn.preprocessing._target_encoder_fast, sklearn.metrics._dist_metrics, sklearn.metrics._pairwise_distances_reduction._datasets_pair, sklearn.utils._cython_blas, sklearn.metrics._pairwise_distances_reduction._base, sklearn.metrics._pairwise_distances_reduction._middle_term_computer, sklearn.utils._heap, sklearn.utils._sorting, sklearn.metrics._pairwise_distances_reduction._argkmin, sklearn.metrics._pairwise_distances_reduction._argkmin_classmode, sklearn.utils._vector_sentinel, sklearn.metrics._pairwise_distances_reduction._radius_neighbors, sklearn.metrics._pairwise_distances_reduction._radius_neighbors_classmode, sklearn.metrics._pairwise_fast, cuda_utils, google._upb._message, h5py._errors, h5py.defs, h5py._objects, h5py.h5, h5py.utils, h5py.h5t, h5py.h5s, h5py.h5ac, h5py.h5p, h5py.h5r, h5py._npystrings, h5py._proxy, h5py._conv, h5py.h5z, h5py.h5a, h5py.h5d, h5py.h5ds, h5py.h5g, h5py.h5i, h5py.h5o, h5py.h5f, h5py.h5fd, h5py.h5pl, h5py.h5l, h5py._selector, kiwisolver._cext, regex._regex, sentencepiece._sentencepiece, scipy.io.matlab._mio_utils, 
scipy.io.matlab._streams, scipy.io.matlab._mio5_utils, __triton_launcher (total: 237)
diff --git a/logs/none_lyv0rec_/attempt_0/7/stdout.log b/logs/none_lyv0rec_/attempt_0/7/stdout.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tb/20250909-0619/wandb/debug.log b/tb/20250909-0619/wandb/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..b1fe6180502b5e0c9a480097879c081c907d671c
--- /dev/null
+++ b/tb/20250909-0619/wandb/debug.log
@@ -0,0 +1,21 @@
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Configure stats pid to 795439
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/.config/wandb/settings
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/flame/wandb/settings
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from environment variables
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():703] Logging user logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():830] calling init triggers
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():871] starting backend
+2025-09-09 06:19:20,025 INFO MainThread:795439 [wandb_init.py:init():874] sending inform_init request
+2025-09-09 06:19:20,027 INFO MainThread:795439 [wandb_init.py:init():882] backend started and connected
+2025-09-09 06:19:20,033 INFO MainThread:795439 [wandb_init.py:init():953] updated telemetry
+2025-09-09 06:19:20,039 INFO MainThread:795439 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
+2025-09-09 06:19:20,682 INFO MainThread:795439 [wandb_init.py:init():1029] starting run threads in backend
+2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_console_start():2458] atexit reg
+2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2306] redirect: wrap_raw
+2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2375] Wrapping output streams.
+2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2398] Redirects installed.
+2025-09-09 06:19:20,817 INFO MainThread:795439 [wandb_init.py:init():1075] run started, returning control to user process
diff --git a/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/output.log b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..f4926fcbc9c41d7b06f6280f438620828a91c0ff
--- /dev/null
+++ b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/output.log
@@ -0,0 +1,4285 @@
+[titan] 2025-09-09 06:19:20,817 - root - INFO - WandB logging enabled
+[titan] 2025-09-09 06:19:20,890 - root - INFO - CUDA capacity: NVIDIA H200 with 139.36GiB memory
+[titan] 2025-09-09 06:19:28,408 - root - INFO - [31m***** Running training *****[39m
+[titan] 2025-09-09 06:19:28,442 - root - INFO - [32m Training starts at step 20001
+[titan] 2025-09-09 06:19:28,442 - root - INFO - [32m Number of tokens per sequence = 4,096
+[titan] 2025-09-09 06:19:28,442 - root - INFO - [32m Gradient Accumulation steps = 2
+[titan] 2025-09-09 06:19:28,442 - root - INFO - [32m Instantaneous batch size (per device) = 8
+[titan] 2025-09-09 06:19:28,442 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 128 (524,288 tokens)
+[titan] 2025-09-09 06:19:28,443 - root - INFO - [32m Total optimization steps = 40,000 (20,971,520,000 tokens)
+[titan] 2025-09-09 06:19:28,443 - root - INFO - [32m Warmup steps = 400 (209,715,200 tokens)
+[titan] 2025-09-09 06:19:28,443 - root - INFO - [32m Number of parameters = 6,936,580,096 [39m
+[titan] 2025-09-09 06:19:28,443 - root - INFO - Profiling active. Traces will be saved at exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/profile_trace
+[titan] 2025-09-09 06:20:20,235 - root - INFO - [31mstep: 20005 [32mloss: 2.7469 [33mmemory: 122.03GiB(87.57%) [34mtps: 5,524 [36mtflops: 263.28 [35mmfu: 26.62%[39m [37mglobal_avg_ntp_loss: 0.7818 [37mglobal_avg_top_loss: 1.9651
+[titan] 2025-09-09 06:20:20,236 - root - INFO - [34mlr: 1.1139e-05 gnorm: 0.33 [35m[1 day, 12:44:53<1 day, 12:43:46][39m
+[titan] 2025-09-09 06:20:50,050 - root - INFO - [31mstep: 20010 [32mloss: 2.9600 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,991 [36mtflops: 523.83 [35mmfu: 52.97%[39m [37mglobal_avg_ntp_loss: 0.8794 [37mglobal_avg_top_loss: 2.0806
+[titan] 2025-09-09 06:20:50,050 - root - INFO - [34mlr: 1.1135e-05 gnorm: 0.44 [35m[1 day, 12:45:22<1 day, 12:43:10][39m
+[titan] 2025-09-09 06:21:19,936 - root - INFO - [31mstep: 20015 [32mloss: 2.7626 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,965 [36mtflops: 522.56 [35mmfu: 52.84%[39m [37mglobal_avg_ntp_loss: 0.7871 [37mglobal_avg_top_loss: 1.9755
+[titan] 2025-09-09 06:21:19,937 - root - INFO - [34mlr: 1.1132e-05 gnorm: 0.34 [35m[1 day, 12:45:52<1 day, 12:42:34][39m
+[titan] 2025-09-09 06:21:49,995 - root - INFO - [31mstep: 20020 [32mloss: 2.7556 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,902 [36mtflops: 519.56 [35mmfu: 52.53%[39m [37mglobal_avg_ntp_loss: 0.7861 [37mglobal_avg_top_loss: 1.9694
+[titan] 2025-09-09 06:21:49,996 - root - INFO - [34mlr: 1.1128e-05 gnorm: 0.34 [35m[1 day, 12:46:22<1 day, 12:41:58][39m
+[titan] 2025-09-09 06:22:20,199 - root - INFO - [31mstep: 20025 [32mloss: 2.8442 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,849 [36mtflops: 517.07 [35mmfu: 52.28%[39m [37mglobal_avg_ntp_loss: 0.8268 [37mglobal_avg_top_loss: 2.0174
+[titan] 2025-09-09 06:22:20,200 - root - INFO - [34mlr: 1.1125e-05 gnorm: 0.34 [35m[1 day, 12:46:53<1 day, 12:41:22][39m
+[titan] 2025-09-09 06:22:50,511 - root - INFO - [31mstep: 20030 [32mloss: 2.7813 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,811 [36mtflops: 515.23 [35mmfu: 52.10%[39m [37mglobal_avg_ntp_loss: 0.7966 [37mglobal_avg_top_loss: 1.9848
+[titan] 2025-09-09 06:22:50,511 - root - INFO - [34mlr: 1.1121e-05 gnorm: 0.34 [35m[1 day, 12:47:23<1 day, 12:40:46][39m
+[titan] 2025-09-09 06:23:21,005 - root - INFO - [31mstep: 20035 [32mloss: 3.3298 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,746 [36mtflops: 512.14 [35mmfu: 51.78%[39m [37mglobal_avg_ntp_loss: 1.1042 [37mglobal_avg_top_loss: 2.2255
+[titan] 2025-09-09 06:23:21,006 - root - INFO - [34mlr: 1.1117e-05 gnorm: 0.34 [35m[1 day, 12:47:53<1 day, 12:40:10][39m
+[titan] 2025-09-09 06:23:51,586 - root - INFO - [31mstep: 20040 [32mloss: 2.8476 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,715 [36mtflops: 510.69 [35mmfu: 51.64%[39m [37mglobal_avg_ntp_loss: 0.8270 [37mglobal_avg_top_loss: 2.0206
+[titan] 2025-09-09 06:23:51,587 - root - INFO - [34mlr: 1.1114e-05 gnorm: 0.32 [35m[1 day, 12:48:24<1 day, 12:39:35][39m
+[titan] 2025-09-09 06:24:22,337 - root - INFO - [31mstep: 20045 [32mloss: 2.7900 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,656 [36mtflops: 507.87 [35mmfu: 51.35%[39m [37mglobal_avg_ntp_loss: 0.8005 [37mglobal_avg_top_loss: 1.9896
+[titan] 2025-09-09 06:24:22,338 - root - INFO - [34mlr: 1.1110e-05 gnorm: 0.34 [35m[1 day, 12:48:55<1 day, 12:39:00][39m
+[titan] 2025-09-09 06:24:47,252 - root - INFO - [GC] Peforming periodical GC collection. 0.13 seconds.
+[titan] 2025-09-09 06:24:53,455 - root - INFO - [31mstep: 20050 [32mloss: 2.6039 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,531 [36mtflops: 501.89 [35mmfu: 50.75%[39m [37mglobal_avg_ntp_loss: 0.7205 [37mglobal_avg_top_loss: 1.8833
+[titan] 2025-09-09 06:24:53,456 - root - INFO - [34mlr: 1.1107e-05 gnorm: 0.33 [35m[1 day, 12:49:26<1 day, 12:38:25][39m
+[titan] 2025-09-09 06:25:24,419 - root - INFO - [31mstep: 20055 [32mloss: 2.7943 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,583 [36mtflops: 504.38 [35mmfu: 51.00%[39m [37mglobal_avg_ntp_loss: 0.8078 [37mglobal_avg_top_loss: 1.9865
+[titan] 2025-09-09 06:25:24,419 - root - INFO - [34mlr: 1.1103e-05 gnorm: 0.33 [35m[1 day, 12:49:57<1 day, 12:37:49][39m
+[titan] 2025-09-09 06:25:55,659 - root - INFO - [31mstep: 20060 [32mloss: 2.8892 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,489 [36mtflops: 499.92 [35mmfu: 50.55%[39m [37mglobal_avg_ntp_loss: 0.8490 [37mglobal_avg_top_loss: 2.0401
+[titan] 2025-09-09 06:25:55,659 - root - INFO - [34mlr: 1.1100e-05 gnorm: 0.33 [35m[1 day, 12:50:28<1 day, 12:37:15][39m
+[titan] 2025-09-09 06:26:27,033 - root - INFO - [31mstep: 20065 [32mloss: 2.8216 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,445 [36mtflops: 497.78 [35mmfu: 50.33%[39m [37mglobal_avg_ntp_loss: 0.8154 [37mglobal_avg_top_loss: 2.0062
+[titan] 2025-09-09 06:26:27,033 - root - INFO - [34mlr: 1.1096e-05 gnorm: 0.35 [35m[1 day, 12:50:59<1 day, 12:36:40][39m
+[titan] 2025-09-09 06:26:58,429 - root - INFO - [31mstep: 20070 [32mloss: 2.7793 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,437 [36mtflops: 497.43 [35mmfu: 50.30%[39m [37mglobal_avg_ntp_loss: 0.7949 [37mglobal_avg_top_loss: 1.9844
+[titan] 2025-09-09 06:26:58,429 - root - INFO - [34mlr: 1.1092e-05 gnorm: 0.32 [35m[1 day, 12:51:31<1 day, 12:36:05][39m
+[titan] 2025-09-09 06:27:29,891 - root - INFO - [31mstep: 20075 [32mloss: 2.7812 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,416 [36mtflops: 496.40 [35mmfu: 50.19%[39m [37mglobal_avg_ntp_loss: 0.7926 [37mglobal_avg_top_loss: 1.9885
+[titan] 2025-09-09 06:27:29,891 - root - INFO - [34mlr: 1.1089e-05 gnorm: 0.36 [35m[1 day, 12:52:02<1 day, 12:35:30][39m
+[titan] 2025-09-09 06:28:01,444 - root - INFO - [31mstep: 20080 [32mloss: 2.7163 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,385 [36mtflops: 494.96 [35mmfu: 50.05%[39m [37mglobal_avg_ntp_loss: 0.7680 [37mglobal_avg_top_loss: 1.9483
+[titan] 2025-09-09 06:28:01,444 - root - INFO - [34mlr: 1.1085e-05 gnorm: 0.35 [35m[1 day, 12:52:34<1 day, 12:34:56][39m
+[titan] 2025-09-09 06:28:32,958 - root - INFO - [31mstep: 20085 [32mloss: 2.8179 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,398 [36mtflops: 495.57 [35mmfu: 50.11%[39m [37mglobal_avg_ntp_loss: 0.8203 [37mglobal_avg_top_loss: 1.9976
+[titan] 2025-09-09 06:28:32,959 - root - INFO - [34mlr: 1.1082e-05 gnorm: 0.34 [35m[1 day, 12:53:05<1 day, 12:34:21][39m
+[titan] 2025-09-09 06:29:04,816 - root - INFO - [31mstep: 20090 [32mloss: 2.7528 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.22 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7862 [37mglobal_avg_top_loss: 1.9666
+[titan] 2025-09-09 06:29:04,817 - root - INFO - [34mlr: 1.1078e-05 gnorm: 0.33 [35m[1 day, 12:53:37<1 day, 12:33:47][39m
+[titan] 2025-09-09 06:29:36,424 - root - INFO - [31mstep: 20095 [32mloss: 2.9881 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,367 [36mtflops: 494.10 [35mmfu: 49.96%[39m [37mglobal_avg_ntp_loss: 0.9035 [37mglobal_avg_top_loss: 2.0846
+[titan] 2025-09-09 06:29:36,425 - root - INFO - [34mlr: 1.1075e-05 gnorm: 0.36 [35m[1 day, 12:54:09<1 day, 12:33:13][39m
+[titan] 2025-09-09 06:30:01,842 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 06:30:08,210 - root - INFO - [31mstep: 20100 [32mloss: 2.6944 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,309 [36mtflops: 491.33 [35mmfu: 49.68%[39m [37mglobal_avg_ntp_loss: 0.7563 [37mglobal_avg_top_loss: 1.9381
+[titan] 2025-09-09 06:30:08,211 - root - INFO - [34mlr: 1.1071e-05 gnorm: 0.35 [35m[1 day, 12:54:41<1 day, 12:32:38][39m
+[titan] 2025-09-09 06:30:39,837 - root - INFO - [31mstep: 20105 [32mloss: 2.8289 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,361 [36mtflops: 493.81 [35mmfu: 49.93%[39m [37mglobal_avg_ntp_loss: 0.8182 [37mglobal_avg_top_loss: 2.0108
+[titan] 2025-09-09 06:30:39,837 - root - INFO - [34mlr: 1.1067e-05 gnorm: 0.33 [35m[1 day, 12:55:12<1 day, 12:32:04][39m
+[titan] 2025-09-09 06:31:11,551 - root - INFO - [31mstep: 20110 [32mloss: 2.7379 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,333 [36mtflops: 492.45 [35mmfu: 49.79%[39m [37mglobal_avg_ntp_loss: 0.7768 [37mglobal_avg_top_loss: 1.9611
+[titan] 2025-09-09 06:31:11,551 - root - INFO - [34mlr: 1.1064e-05 gnorm: 0.34 [35m[1 day, 12:55:44<1 day, 12:31:29][39m
+[titan] 2025-09-09 06:31:43,421 - root - INFO - [31mstep: 20115 [32mloss: 3.2744 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.02 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 1.0718 [37mglobal_avg_top_loss: 2.2025
+[titan] 2025-09-09 06:31:43,422 - root - INFO - [34mlr: 1.1060e-05 gnorm: 0.34 [35m[1 day, 12:56:16<1 day, 12:30:55][39m
+[titan] 2025-09-09 06:32:15,207 - root - INFO - [31mstep: 20120 [32mloss: 2.8005 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,309 [36mtflops: 491.34 [35mmfu: 49.68%[39m [37mglobal_avg_ntp_loss: 0.8055 [37mglobal_avg_top_loss: 1.9950
+[titan] 2025-09-09 06:32:15,207 - root - INFO - [34mlr: 1.1057e-05 gnorm: 0.32 [35m[1 day, 12:56:47<1 day, 12:30:21][39m
+[titan] 2025-09-09 06:32:47,105 - root - INFO - [31mstep: 20125 [32mloss: 2.7452 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.61 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7798 [37mglobal_avg_top_loss: 1.9654
+[titan] 2025-09-09 06:32:47,105 - root - INFO - [34mlr: 1.1053e-05 gnorm: 0.33 [35m[1 day, 12:57:19<1 day, 12:29:47][39m
+[titan] 2025-09-09 06:33:18,816 - root - INFO - [31mstep: 20130 [32mloss: 2.6240 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,334 [36mtflops: 492.49 [35mmfu: 49.80%[39m [37mglobal_avg_ntp_loss: 0.7311 [37mglobal_avg_top_loss: 1.8929
+[titan] 2025-09-09 06:33:18,816 - root - INFO - [34mlr: 1.1050e-05 gnorm: 0.33 [35m[1 day, 12:57:51<1 day, 12:29:12][39m
+[titan] 2025-09-09 06:33:50,801 - root - INFO - [31mstep: 20135 [32mloss: 2.8719 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,245 [36mtflops: 488.26 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.8376 [37mglobal_avg_top_loss: 2.0343
+[titan] 2025-09-09 06:33:50,802 - root - INFO - [34mlr: 1.1046e-05 gnorm: 0.37 [35m[1 day, 12:58:23<1 day, 12:28:38][39m
+[titan] 2025-09-09 06:34:22,522 - root - INFO - [31mstep: 20140 [32mloss: 2.7833 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,331 [36mtflops: 492.35 [35mmfu: 49.78%[39m [37mglobal_avg_ntp_loss: 0.7952 [37mglobal_avg_top_loss: 1.9881
+[titan] 2025-09-09 06:34:22,522 - root - INFO - [34mlr: 1.1042e-05 gnorm: 0.35 [35m[1 day, 12:58:55<1 day, 12:28:04][39m
+[titan] 2025-09-09 06:34:54,407 - root - INFO - [31mstep: 20145 [32mloss: 2.8903 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.80 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8482 [37mglobal_avg_top_loss: 2.0421
+[titan] 2025-09-09 06:34:54,408 - root - INFO - [34mlr: 1.1039e-05 gnorm: 0.37 [35m[1 day, 12:59:27<1 day, 12:27:30][39m
+[titan] 2025-09-09 06:35:19,986 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 06:35:26,373 - root - INFO - [31mstep: 20150 [32mloss: 2.9203 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.57 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.8794 [37mglobal_avg_top_loss: 2.0409
+[titan] 2025-09-09 06:35:26,373 - root - INFO - [34mlr: 1.1035e-05 gnorm: 0.37 [35m[1 day, 12:59:59<1 day, 12:26:56][39m
+[titan] 2025-09-09 06:35:58,179 - root - INFO - [31mstep: 20155 [32mloss: 2.8514 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.01 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.8292 [37mglobal_avg_top_loss: 2.0222
+[titan] 2025-09-09 06:35:58,180 - root - INFO - [34mlr: 1.1032e-05 gnorm: 0.33 [35m[1 day, 13:00:30<1 day, 12:26:21][39m
+[titan] 2025-09-09 06:36:30,067 - root - INFO - [31mstep: 20160 [32mloss: 2.5929 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.77 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7032 [37mglobal_avg_top_loss: 1.8897
+[titan] 2025-09-09 06:36:30,067 - root - INFO - [34mlr: 1.1028e-05 gnorm: 0.76 [35m[1 day, 13:01:02<1 day, 12:25:47][39m
+[titan] 2025-09-09 06:37:01,877 - root - INFO - [31mstep: 20165 [32mloss: 2.7879 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,301 [36mtflops: 490.96 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.8091 [37mglobal_avg_top_loss: 1.9788
+[titan] 2025-09-09 06:37:01,878 - root - INFO - [34mlr: 1.1025e-05 gnorm: 0.41 [35m[1 day, 13:01:34<1 day, 12:25:13][39m
+[titan] 2025-09-09 06:37:33,811 - root - INFO - [31mstep: 20170 [32mloss: 2.7937 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.06 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8013 [37mglobal_avg_top_loss: 1.9924
+[titan] 2025-09-09 06:37:33,811 - root - INFO - [34mlr: 1.1021e-05 gnorm: 0.52 [35m[1 day, 13:02:06<1 day, 12:24:39][39m
+[titan] 2025-09-09 06:38:05,708 - root - INFO - [31mstep: 20175 [32mloss: 2.8139 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.62 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.8099 [37mglobal_avg_top_loss: 2.0040
+[titan] 2025-09-09 06:38:05,709 - root - INFO - [34mlr: 1.1017e-05 gnorm: 0.34 [35m[1 day, 13:02:38<1 day, 12:24:04][39m
+[titan] 2025-09-09 06:38:37,468 - root - INFO - [31mstep: 20180 [32mloss: 2.8027 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,318 [36mtflops: 491.74 [35mmfu: 49.72%[39m [37mglobal_avg_ntp_loss: 0.8096 [37mglobal_avg_top_loss: 1.9930
+[titan] 2025-09-09 06:38:37,468 - root - INFO - [34mlr: 1.1014e-05 gnorm: 0.37 [35m[1 day, 13:03:10<1 day, 12:23:30][39m
+[titan] 2025-09-09 06:39:09,190 - root - INFO - [31mstep: 20185 [32mloss: 2.7779 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,330 [36mtflops: 492.32 [35mmfu: 49.78%[39m [37mglobal_avg_ntp_loss: 0.7954 [37mglobal_avg_top_loss: 1.9826
+[titan] 2025-09-09 06:39:09,190 - root - INFO - [34mlr: 1.1010e-05 gnorm: 0.36 [35m[1 day, 13:03:41<1 day, 12:22:56][39m
+[titan] 2025-09-09 06:39:41,087 - root - INFO - [31mstep: 20190 [32mloss: 2.8220 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.63 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.8130 [37mglobal_avg_top_loss: 2.0090
+[titan] 2025-09-09 06:39:41,087 - root - INFO - [34mlr: 1.1007e-05 gnorm: 1.26 [35m[1 day, 13:04:13<1 day, 12:22:22][39m
+[titan] 2025-09-09 06:40:13,085 - root - INFO - [31mstep: 20195 [32mloss: 2.7428 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.07 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7792 [37mglobal_avg_top_loss: 1.9636
+[titan] 2025-09-09 06:40:13,085 - root - INFO - [34mlr: 1.1003e-05 gnorm: 0.37 [35m[1 day, 13:04:45<1 day, 12:21:48][39m
+[titan] 2025-09-09 06:40:38,715 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 06:40:45,093 - root - INFO - [31mstep: 20200 [32mloss: 2.7984 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.92 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.8073 [37mglobal_avg_top_loss: 1.9910
+[titan] 2025-09-09 06:40:45,094 - root - INFO - [34mlr: 1.1000e-05 gnorm: 0.37 [35m[1 day, 13:05:17<1 day, 12:21:13][39m
+[titan] 2025-09-09 06:41:16,990 - root - INFO - [31mstep: 20205 [32mloss: 2.8747 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.63 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.8392 [37mglobal_avg_top_loss: 2.0355
+[titan] 2025-09-09 06:41:16,990 - root - INFO - [34mlr: 1.0996e-05 gnorm: 0.35 [35m[1 day, 13:05:49<1 day, 12:20:39][39m
+[titan] 2025-09-09 06:41:48,710 - root - INFO - [31mstep: 20210 [32mloss: 2.8652 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,331 [36mtflops: 492.35 [35mmfu: 49.78%[39m [37mglobal_avg_ntp_loss: 0.8291 [37mglobal_avg_top_loss: 2.0361
+[titan] 2025-09-09 06:41:48,711 - root - INFO - [34mlr: 1.0993e-05 gnorm: 0.36 [35m[1 day, 13:06:21<1 day, 12:20:05][39m
+[titan] 2025-09-09 06:42:20,571 - root - INFO - [31mstep: 20215 [32mloss: 2.9655 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.17 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.9175 [37mglobal_avg_top_loss: 2.0480
+[titan] 2025-09-09 06:42:20,572 - root - INFO - [34mlr: 1.0989e-05 gnorm: 0.35 [35m[1 day, 13:06:53<1 day, 12:19:31][39m
+[titan] 2025-09-09 06:42:52,277 - root - INFO - [31mstep: 20220 [32mloss: 2.7629 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,336 [36mtflops: 492.58 [35mmfu: 49.81%[39m [37mglobal_avg_ntp_loss: 0.7864 [37mglobal_avg_top_loss: 1.9764
+[titan] 2025-09-09 06:42:52,277 - root - INFO - [34mlr: 1.0985e-05 gnorm: 0.34 [35m[1 day, 13:07:25<1 day, 12:18:56][39m
+[titan] 2025-09-09 06:43:24,073 - root - INFO - [31mstep: 20225 [32mloss: 2.7892 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,306 [36mtflops: 491.18 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.8035 [37mglobal_avg_top_loss: 1.9857
+[titan] 2025-09-09 06:43:24,073 - root - INFO - [34mlr: 1.0982e-05 gnorm: 0.36 [35m[1 day, 13:07:56<1 day, 12:18:22][39m
+[titan] 2025-09-09 06:43:56,140 - root - INFO - [31mstep: 20230 [32mloss: 2.8211 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.02 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.8123 [37mglobal_avg_top_loss: 2.0087
+[titan] 2025-09-09 06:43:56,141 - root - INFO - [34mlr: 1.0978e-05 gnorm: 0.34 [35m[1 day, 13:08:28<1 day, 12:17:48][39m
+[titan] 2025-09-09 06:44:27,809 - root - INFO - [31mstep: 20235 [32mloss: 2.8625 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,347 [36mtflops: 493.15 [35mmfu: 49.86%[39m [37mglobal_avg_ntp_loss: 0.8326 [37mglobal_avg_top_loss: 2.0299
+[titan] 2025-09-09 06:44:27,809 - root - INFO - [34mlr: 1.0975e-05 gnorm: 0.33 [35m[1 day, 13:09:00<1 day, 12:17:14][39m
+[titan] 2025-09-09 06:44:59,720 - root - INFO - [31mstep: 20240 [32mloss: 2.7057 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.40 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7563 [37mglobal_avg_top_loss: 1.9494
+[titan] 2025-09-09 06:44:59,721 - root - INFO - [34mlr: 1.0971e-05 gnorm: 0.49 [35m[1 day, 13:09:32<1 day, 12:16:40][39m
+[titan] 2025-09-09 06:45:31,632 - root - INFO - [31mstep: 20245 [32mloss: 2.6970 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.39 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7653 [37mglobal_avg_top_loss: 1.9316
+[titan] 2025-09-09 06:45:31,633 - root - INFO - [34mlr: 1.0968e-05 gnorm: 0.37 [35m[1 day, 13:10:04<1 day, 12:16:05][39m
+[titan] 2025-09-09 06:45:57,235 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 06:46:03,715 - root - INFO - [31mstep: 20250 [32mloss: 2.7878 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,214 [36mtflops: 486.78 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.8011 [37mglobal_avg_top_loss: 1.9867
+[titan] 2025-09-09 06:46:03,716 - root - INFO - [34mlr: 1.0964e-05 gnorm: 0.36 [35m[1 day, 13:10:36<1 day, 12:15:31][39m
+[titan] 2025-09-09 06:46:35,720 - root - INFO - [31mstep: 20255 [32mloss: 2.8541 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.98 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.8296 [37mglobal_avg_top_loss: 2.0246
+[titan] 2025-09-09 06:46:35,720 - root - INFO - [34mlr: 1.0960e-05 gnorm: 0.37 [35m[1 day, 13:11:08<1 day, 12:14:57][39m
+[titan] 2025-09-09 06:47:07,630 - root - INFO - [31mstep: 20260 [32mloss: 2.7089 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.42 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7656 [37mglobal_avg_top_loss: 1.9432
+[titan] 2025-09-09 06:47:07,630 - root - INFO - [34mlr: 1.0957e-05 gnorm: 0.35 [35m[1 day, 13:11:40<1 day, 12:14:23][39m
+[titan] 2025-09-09 06:47:39,813 - root - INFO - [31mstep: 20265 [32mloss: 2.6907 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,182 [36mtflops: 485.27 [35mmfu: 49.07%[39m [37mglobal_avg_ntp_loss: 0.7592 [37mglobal_avg_top_loss: 1.9315
+[titan] 2025-09-09 06:47:39,813 - root - INFO - [34mlr: 1.0953e-05 gnorm: 0.34 [35m[1 day, 13:12:12<1 day, 12:13:49][39m
+[titan] 2025-09-09 06:48:11,816 - root - INFO - [31mstep: 20270 [32mloss: 2.8668 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,239 [36mtflops: 488.00 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.8402 [37mglobal_avg_top_loss: 2.0267
+[titan] 2025-09-09 06:48:11,816 - root - INFO - [34mlr: 1.0950e-05 gnorm: 0.33 [35m[1 day, 13:12:44<1 day, 12:13:15][39m
+[titan] 2025-09-09 06:48:44,044 - root - INFO - [31mstep: 20275 [32mloss: 2.7622 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,168 [36mtflops: 484.60 [35mmfu: 49.00%[39m [37mglobal_avg_ntp_loss: 0.8053 [37mglobal_avg_top_loss: 1.9569
+[titan] 2025-09-09 06:48:44,044 - root - INFO - [34mlr: 1.0946e-05 gnorm: 0.41 [35m[1 day, 13:13:16<1 day, 12:12:41][39m
+[titan] 2025-09-09 06:49:16,114 - root - INFO - [31mstep: 20280 [32mloss: 2.8485 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.98 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.8292 [37mglobal_avg_top_loss: 2.0193
+[titan] 2025-09-09 06:49:16,114 - root - INFO - [34mlr: 1.0943e-05 gnorm: 0.39 [35m[1 day, 13:13:48<1 day, 12:12:07][39m
+[titan] 2025-09-09 06:49:48,067 - root - INFO - [31mstep: 20285 [32mloss: 2.8332 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.75 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.8222 [37mglobal_avg_top_loss: 2.0110
+[titan] 2025-09-09 06:49:48,068 - root - INFO - [34mlr: 1.0939e-05 gnorm: 0.40 [35m[1 day, 13:14:20<1 day, 12:11:33][39m
+[titan] 2025-09-09 06:50:19,957 - root - INFO - [31mstep: 20290 [32mloss: 2.6742 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.74 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7495 [37mglobal_avg_top_loss: 1.9247
+[titan] 2025-09-09 06:50:19,957 - root - INFO - [34mlr: 1.0935e-05 gnorm: 0.37 [35m[1 day, 13:14:52<1 day, 12:10:59][39m
+[titan] 2025-09-09 06:50:51,969 - root - INFO - [31mstep: 20295 [32mloss: 2.6844 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.85 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7538 [37mglobal_avg_top_loss: 1.9306
+[titan] 2025-09-09 06:50:51,970 - root - INFO - [34mlr: 1.0932e-05 gnorm: 0.37 [35m[1 day, 13:15:24<1 day, 12:10:25][39m
+[titan] 2025-09-09 06:51:17,413 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 06:51:23,905 - root - INFO - [31mstep: 20300 [32mloss: 2.7561 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.03 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7872 [37mglobal_avg_top_loss: 1.9689
+[titan] 2025-09-09 06:51:23,905 - root - INFO - [34mlr: 1.0928e-05 gnorm: 0.37 [35m[1 day, 13:15:56<1 day, 12:09:51][39m
+[titan] 2025-09-09 06:51:55,716 - root - INFO - [31mstep: 20305 [32mloss: 2.7211 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,301 [36mtflops: 490.94 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7694 [37mglobal_avg_top_loss: 1.9517
+[titan] 2025-09-09 06:51:55,717 - root - INFO - [34mlr: 1.0925e-05 gnorm: 0.35 [35m[1 day, 13:16:28<1 day, 12:09:17][39m
+[titan] 2025-09-09 06:52:27,727 - root - INFO - [31mstep: 20310 [32mloss: 2.7744 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.89 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7937 [37mglobal_avg_top_loss: 1.9807
+[titan] 2025-09-09 06:52:27,727 - root - INFO - [34mlr: 1.0921e-05 gnorm: 0.34 [35m[1 day, 13:17:00<1 day, 12:08:43][39m
+[titan] 2025-09-09 06:52:59,560 - root - INFO - [31mstep: 20315 [32mloss: 3.2540 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.60 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 1.0623 [37mglobal_avg_top_loss: 2.1917
+[titan] 2025-09-09 06:52:59,561 - root - INFO - [34mlr: 1.0918e-05 gnorm: 0.38 [35m[1 day, 13:17:32<1 day, 12:08:08][39m
+[titan] 2025-09-09 06:53:31,551 - root - INFO - [31mstep: 20320 [32mloss: 2.7771 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.19 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7907 [37mglobal_avg_top_loss: 1.9865
+[titan] 2025-09-09 06:53:31,551 - root - INFO - [34mlr: 1.0914e-05 gnorm: 0.42 [35m[1 day, 13:18:04<1 day, 12:07:34][39m
+[titan] 2025-09-09 06:54:03,315 - root - INFO - [31mstep: 20325 [32mloss: 2.8222 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,316 [36mtflops: 491.66 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 0.8125 [37mglobal_avg_top_loss: 2.0098
+[titan] 2025-09-09 06:54:03,316 - root - INFO - [34mlr: 1.0910e-05 gnorm: 0.44 [35m[1 day, 13:18:36<1 day, 12:07:00][39m
+[titan] 2025-09-09 06:54:35,253 - root - INFO - [31mstep: 20330 [32mloss: 3.1607 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,260 [36mtflops: 488.99 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 1.0078 [37mglobal_avg_top_loss: 2.1529
+[titan] 2025-09-09 06:54:35,254 - root - INFO - [34mlr: 1.0907e-05 gnorm: 0.37 [35m[1 day, 13:19:07<1 day, 12:06:26][39m
+[titan] 2025-09-09 06:55:07,024 - root - INFO - [31mstep: 20335 [32mloss: 2.7996 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,314 [36mtflops: 491.58 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.8026 [37mglobal_avg_top_loss: 1.9970
+[titan] 2025-09-09 06:55:07,024 - root - INFO - [34mlr: 1.0903e-05 gnorm: 0.34 [35m[1 day, 13:19:39<1 day, 12:05:52][39m
+[titan] 2025-09-09 06:55:38,864 - root - INFO - [31mstep: 20340 [32mloss: 2.7147 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.49 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7647 [37mglobal_avg_top_loss: 1.9500
+[titan] 2025-09-09 06:55:38,864 - root - INFO - [34mlr: 1.0900e-05 gnorm: 0.34 [35m[1 day, 13:20:11<1 day, 12:05:18][39m
+[titan] 2025-09-09 06:56:10,802 - root - INFO - [31mstep: 20345 [32mloss: 2.8070 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,260 [36mtflops: 488.99 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8076 [37mglobal_avg_top_loss: 1.9994
+[titan] 2025-09-09 06:56:10,802 - root - INFO - [34mlr: 1.0896e-05 gnorm: 0.33 [35m[1 day, 13:20:43<1 day, 12:04:43][39m
+[titan] 2025-09-09 06:56:36,147 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 06:56:42,482 - root - INFO - [31mstep: 20350 [32mloss: 2.8160 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,344 [36mtflops: 492.98 [35mmfu: 49.85%[39m [37mglobal_avg_ntp_loss: 0.8141 [37mglobal_avg_top_loss: 2.0020
+[titan] 2025-09-09 06:56:42,482 - root - INFO - [34mlr: 1.0893e-05 gnorm: 0.34 [35m[1 day, 13:21:15<1 day, 12:04:09][39m
+[titan] 2025-09-09 06:57:14,496 - root - INFO - [31mstep: 20355 [32mloss: 2.7763 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.83 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7943 [37mglobal_avg_top_loss: 1.9820
+[titan] 2025-09-09 06:57:14,496 - root - INFO - [34mlr: 1.0889e-05 gnorm: 0.33 [35m[1 day, 13:21:47<1 day, 12:03:35][39m
+[titan] 2025-09-09 06:57:46,528 - root - INFO - [31mstep: 20360 [32mloss: 2.7609 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.55 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7851 [37mglobal_avg_top_loss: 1.9758
+[titan] 2025-09-09 06:57:46,529 - root - INFO - [34mlr: 1.0885e-05 gnorm: 0.34 [35m[1 day, 13:22:19<1 day, 12:03:01][39m
+[titan] 2025-09-09 06:58:18,434 - root - INFO - [31mstep: 20365 [32mloss: 2.8526 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.49 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.8304 [37mglobal_avg_top_loss: 2.0222
+[titan] 2025-09-09 06:58:18,435 - root - INFO - [34mlr: 1.0882e-05 gnorm: 0.35 [35m[1 day, 13:22:51<1 day, 12:02:27][39m
+[titan] 2025-09-09 06:58:50,498 - root - INFO - [31mstep: 20370 [32mloss: 2.7777 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,220 [36mtflops: 487.07 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.8031 [37mglobal_avg_top_loss: 1.9746
+[titan] 2025-09-09 06:58:50,499 - root - INFO - [34mlr: 1.0878e-05 gnorm: 0.33 [35m[1 day, 13:23:23<1 day, 12:01:53][39m
+[titan] 2025-09-09 06:59:22,427 - root - INFO - [31mstep: 20375 [32mloss: 2.6894 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.14 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7562 [37mglobal_avg_top_loss: 1.9333
+[titan] 2025-09-09 06:59:22,427 - root - INFO - [34mlr: 1.0875e-05 gnorm: 0.35 [35m[1 day, 13:23:55<1 day, 12:01:19][39m
+[titan] 2025-09-09 06:59:54,616 - root - INFO - [31mstep: 20380 [32mloss: 2.7796 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,180 [36mtflops: 485.17 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7999 [37mglobal_avg_top_loss: 1.9797
+[titan] 2025-09-09 06:59:54,616 - root - INFO - [34mlr: 1.0871e-05 gnorm: 0.33 [35m[1 day, 13:24:27<1 day, 12:00:45][39m
+[titan] 2025-09-09 07:00:26,523 - root - INFO - [31mstep: 20385 [32mloss: 2.7060 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.47 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7625 [37mglobal_avg_top_loss: 1.9435
+[titan] 2025-09-09 07:00:26,523 - root - INFO - [34mlr: 1.0868e-05 gnorm: 0.34 [35m[1 day, 13:24:59<1 day, 12:00:11][39m
+[titan] 2025-09-09 07:00:58,531 - root - INFO - [31mstep: 20390 [32mloss: 3.0371 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.92 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.9383 [37mglobal_avg_top_loss: 2.0988
+[titan] 2025-09-09 07:00:58,531 - root - INFO - [34mlr: 1.0864e-05 gnorm: 0.35 [35m[1 day, 13:25:31<1 day, 11:59:37][39m
+[titan] 2025-09-09 07:01:30,383 - root - INFO - [31mstep: 20395 [32mloss: 3.2952 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.30 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 1.0824 [37mglobal_avg_top_loss: 2.2127
+[titan] 2025-09-09 07:01:30,384 - root - INFO - [34mlr: 1.0860e-05 gnorm: 0.39 [35m[1 day, 13:26:03<1 day, 11:59:03][39m
+[titan] 2025-09-09 07:01:55,825 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:02:02,212 - root - INFO - [31mstep: 20400 [32mloss: 2.7289 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.67 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7767 [37mglobal_avg_top_loss: 1.9522
+[titan] 2025-09-09 07:02:02,212 - root - INFO - [34mlr: 1.0857e-05 gnorm: 0.36 [35m[1 day, 13:26:34<1 day, 11:58:28][39m
+[titan] 2025-09-09 07:02:34,203 - root - INFO - [31mstep: 20405 [32mloss: 2.8002 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.18 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8053 [37mglobal_avg_top_loss: 1.9949
+[titan] 2025-09-09 07:02:34,204 - root - INFO - [34mlr: 1.0853e-05 gnorm: 0.34 [35m[1 day, 13:27:06<1 day, 11:57:54][39m
+[titan] 2025-09-09 07:03:06,198 - root - INFO - [31mstep: 20410 [32mloss: 2.8048 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.12 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8092 [37mglobal_avg_top_loss: 1.9956
+[titan] 2025-09-09 07:03:06,198 - root - INFO - [34mlr: 1.0850e-05 gnorm: 0.36 [35m[1 day, 13:27:38<1 day, 11:57:20][39m
+[titan] 2025-09-09 07:03:38,243 - root - INFO - [31mstep: 20415 [32mloss: 2.7784 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.35 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7953 [37mglobal_avg_top_loss: 1.9830
+[titan] 2025-09-09 07:03:38,244 - root - INFO - [34mlr: 1.0846e-05 gnorm: 0.42 [35m[1 day, 13:28:10<1 day, 11:56:46][39m
+[titan] 2025-09-09 07:04:10,130 - root - INFO - [31mstep: 20420 [32mloss: 2.7288 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.79 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7734 [37mglobal_avg_top_loss: 1.9554
+[titan] 2025-09-09 07:04:10,130 - root - INFO - [34mlr: 1.0843e-05 gnorm: 0.34 [35m[1 day, 13:28:42<1 day, 11:56:12][39m
+[titan] 2025-09-09 07:04:42,052 - root - INFO - [31mstep: 20425 [32mloss: 2.8111 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.23 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.8155 [37mglobal_avg_top_loss: 1.9956
+[titan] 2025-09-09 07:04:42,053 - root - INFO - [34mlr: 1.0839e-05 gnorm: 0.34 [35m[1 day, 13:29:14<1 day, 11:55:38][39m
+[titan] 2025-09-09 07:05:13,849 - root - INFO - [31mstep: 20430 [32mloss: 2.7851 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,306 [36mtflops: 491.16 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7994 [37mglobal_avg_top_loss: 1.9857
+[titan] 2025-09-09 07:05:13,850 - root - INFO - [34mlr: 1.0835e-05 gnorm: 0.33 [35m[1 day, 13:29:46<1 day, 11:55:04][39m
+[titan] 2025-09-09 07:05:46,094 - root - INFO - [31mstep: 20435 [32mloss: 3.1016 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,163 [36mtflops: 484.34 [35mmfu: 48.97%[39m [37mglobal_avg_ntp_loss: 0.9714 [37mglobal_avg_top_loss: 2.1302
+[titan] 2025-09-09 07:05:46,094 - root - INFO - [34mlr: 1.0832e-05 gnorm: 0.35 [35m[1 day, 13:30:18<1 day, 11:54:30][39m
+[titan] 2025-09-09 07:06:17,966 - root - INFO - [31mstep: 20440 [32mloss: 2.8745 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.99 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.8395 [37mglobal_avg_top_loss: 2.0351
+[titan] 2025-09-09 07:06:17,967 - root - INFO - [34mlr: 1.0828e-05 gnorm: 0.34 [35m[1 day, 13:30:50<1 day, 11:53:56][39m
+[titan] 2025-09-09 07:06:50,158 - root - INFO - [31mstep: 20445 [32mloss: 2.7692 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,179 [36mtflops: 485.14 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7921 [37mglobal_avg_top_loss: 1.9771
+[titan] 2025-09-09 07:06:50,158 - root - INFO - [34mlr: 1.0825e-05 gnorm: 0.34 [35m[1 day, 13:31:22<1 day, 11:53:22][39m
+[titan] 2025-09-09 07:07:15,784 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:07:22,090 - root - INFO - [31mstep: 20450 [32mloss: 2.7641 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.08 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7916 [37mglobal_avg_top_loss: 1.9725
+[titan] 2025-09-09 07:07:22,091 - root - INFO - [34mlr: 1.0821e-05 gnorm: 0.33 [35m[1 day, 13:31:54<1 day, 11:52:48][39m
+[titan] 2025-09-09 07:07:54,405 - root - INFO - [31mstep: 20455 [32mloss: 2.6634 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,141 [36mtflops: 483.29 [35mmfu: 48.87%[39m [37mglobal_avg_ntp_loss: 0.7446 [37mglobal_avg_top_loss: 1.9188
+[titan] 2025-09-09 07:07:54,405 - root - INFO - [34mlr: 1.0818e-05 gnorm: 0.35 [35m[1 day, 13:32:27<1 day, 11:52:14][39m
+[titan] 2025-09-09 07:08:26,513 - root - INFO - [31mstep: 20460 [32mloss: 2.8403 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.41 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.8255 [37mglobal_avg_top_loss: 2.0148
+[titan] 2025-09-09 07:08:26,513 - root - INFO - [34mlr: 1.0814e-05 gnorm: 0.34 [35m[1 day, 13:32:59<1 day, 11:51:40][39m
+[titan] 2025-09-09 07:08:58,482 - root - INFO - [31mstep: 20465 [32mloss: 2.7725 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.51 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7930 [37mglobal_avg_top_loss: 1.9796
+[titan] 2025-09-09 07:08:58,483 - root - INFO - [34mlr: 1.0810e-05 gnorm: 0.33 [35m[1 day, 13:33:31<1 day, 11:51:06][39m
+[titan] 2025-09-09 07:09:30,461 - root - INFO - [31mstep: 20470 [32mloss: 2.7779 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.36 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7936 [37mglobal_avg_top_loss: 1.9842
+[titan] 2025-09-09 07:09:30,462 - root - INFO - [34mlr: 1.0807e-05 gnorm: 0.34 [35m[1 day, 13:34:03<1 day, 11:50:32][39m
+[titan] 2025-09-09 07:10:02,599 - root - INFO - [31mstep: 20475 [32mloss: 3.2187 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,197 [36mtflops: 485.96 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 1.0490 [37mglobal_avg_top_loss: 2.1697
+[titan] 2025-09-09 07:10:02,599 - root - INFO - [34mlr: 1.0803e-05 gnorm: 0.36 [35m[1 day, 13:34:35<1 day, 11:49:58][39m
+[titan] 2025-09-09 07:10:35,023 - root - INFO - [31mstep: 20480 [32mloss: 2.7769 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,106 [36mtflops: 481.65 [35mmfu: 48.70%[39m [37mglobal_avg_ntp_loss: 0.7952 [37mglobal_avg_top_loss: 1.9817
+[titan] 2025-09-09 07:10:35,024 - root - INFO - [34mlr: 1.0800e-05 gnorm: 0.33 [35m[1 day, 13:35:07<1 day, 11:49:25][39m
+[titan] 2025-09-09 07:10:35,360 - root - INFO - Dumping profiler traces at step 20480
+[titan] 2025-09-09 07:10:35,414 - root - INFO - Finished dumping profiler traces in 0.05 seconds
+[titan] 2025-09-09 07:11:07,257 - root - INFO - [31mstep: 20485 [32mloss: 3.0455 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,166 [36mtflops: 484.50 [35mmfu: 48.99%[39m [37mglobal_avg_ntp_loss: 0.9270 [37mglobal_avg_top_loss: 2.1186
+[titan] 2025-09-09 07:11:07,258 - root - INFO - [34mlr: 1.0796e-05 gnorm: 1.11 [35m[1 day, 13:35:39<1 day, 11:48:51][39m
+[titan] 2025-09-09 07:11:39,391 - root - INFO - [31mstep: 20490 [32mloss: 3.0723 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,198 [36mtflops: 486.02 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.9626 [37mglobal_avg_top_loss: 2.1097
+[titan] 2025-09-09 07:11:39,391 - root - INFO - [34mlr: 1.0793e-05 gnorm: 0.36 [35m[1 day, 13:36:12<1 day, 11:48:17][39m
+[titan] 2025-09-09 07:12:11,223 - root - INFO - [31mstep: 20495 [32mloss: 2.7660 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.63 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7899 [37mglobal_avg_top_loss: 1.9761
+[titan] 2025-09-09 07:12:11,223 - root - INFO - [34mlr: 1.0789e-05 gnorm: 0.35 [35m[1 day, 13:36:43<1 day, 11:47:43][39m
+[titan] 2025-09-09 07:12:36,467 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:12:42,946 - root - INFO - [31mstep: 20500 [32mloss: 2.8090 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,330 [36mtflops: 492.31 [35mmfu: 49.78%[39m [37mglobal_avg_ntp_loss: 0.8068 [37mglobal_avg_top_loss: 2.0023
+[titan] 2025-09-09 07:12:42,946 - root - INFO - [34mlr: 1.0785e-05 gnorm: 0.36 [35m[1 day, 13:37:15<1 day, 11:47:09][39m
+[titan] 2025-09-09 07:13:14,870 - root - INFO - [31mstep: 20505 [32mloss: 2.7942 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.19 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.8034 [37mglobal_avg_top_loss: 1.9908
+[titan] 2025-09-09 07:13:14,871 - root - INFO - [34mlr: 1.0782e-05 gnorm: 0.34 [35m[1 day, 13:37:47<1 day, 11:46:34][39m
+[titan] 2025-09-09 07:13:46,506 - root - INFO - [31mstep: 20510 [32mloss: 3.1287 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,358 [36mtflops: 493.66 [35mmfu: 49.92%[39m [37mglobal_avg_ntp_loss: 0.9606 [37mglobal_avg_top_loss: 2.1681
+[titan] 2025-09-09 07:13:46,507 - root - INFO - [34mlr: 1.0778e-05 gnorm: 0.66 [35m[1 day, 13:38:19<1 day, 11:46:00][39m
+[titan] 2025-09-09 07:14:18,461 - root - INFO - [31mstep: 20515 [32mloss: 2.7824 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.74 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7989 [37mglobal_avg_top_loss: 1.9835
+[titan] 2025-09-09 07:14:18,461 - root - INFO - [34mlr: 1.0775e-05 gnorm: 0.34 [35m[1 day, 13:38:51<1 day, 11:45:26][39m
+[titan] 2025-09-09 07:14:50,164 - root - INFO - [31mstep: 20520 [32mloss: 2.7884 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,336 [36mtflops: 492.61 [35mmfu: 49.81%[39m [37mglobal_avg_ntp_loss: 0.7993 [37mglobal_avg_top_loss: 1.9891
+[titan] 2025-09-09 07:14:50,164 - root - INFO - [34mlr: 1.0771e-05 gnorm: 0.38 [35m[1 day, 13:39:22<1 day, 11:44:52][39m
+[titan] 2025-09-09 07:15:22,156 - root - INFO - [31mstep: 20525 [32mloss: 2.7545 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.16 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7855 [37mglobal_avg_top_loss: 1.9689
+[titan] 2025-09-09 07:15:22,157 - root - INFO - [34mlr: 1.0768e-05 gnorm: 0.36 [35m[1 day, 13:39:54<1 day, 11:44:18][39m
+[titan] 2025-09-09 07:15:53,886 - root - INFO - [31mstep: 20530 [32mloss: 2.8623 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,328 [36mtflops: 492.21 [35mmfu: 49.77%[39m [37mglobal_avg_ntp_loss: 0.8337 [37mglobal_avg_top_loss: 2.0287
+[titan] 2025-09-09 07:15:53,886 - root - INFO - [34mlr: 1.0764e-05 gnorm: 0.37 [35m[1 day, 13:40:26<1 day, 11:43:43][39m
+[titan] 2025-09-09 07:16:25,766 - root - INFO - [31mstep: 20535 [32mloss: 2.6397 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.87 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7316 [37mglobal_avg_top_loss: 1.9082
+[titan] 2025-09-09 07:16:25,767 - root - INFO - [34mlr: 1.0760e-05 gnorm: 0.40 [35m[1 day, 13:40:58<1 day, 11:43:09][39m
+[titan] 2025-09-09 07:16:57,719 - root - INFO - [31mstep: 20540 [32mloss: 2.7389 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.76 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7752 [37mglobal_avg_top_loss: 1.9637
+[titan] 2025-09-09 07:16:57,720 - root - INFO - [34mlr: 1.0757e-05 gnorm: 0.34 [35m[1 day, 13:41:30<1 day, 11:42:35][39m
+[titan] 2025-09-09 07:17:29,790 - root - INFO - [31mstep: 20545 [32mloss: 2.7142 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.98 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7712 [37mglobal_avg_top_loss: 1.9429
+[titan] 2025-09-09 07:17:29,790 - root - INFO - [34mlr: 1.0753e-05 gnorm: 0.34 [35m[1 day, 13:42:02<1 day, 11:42:01][39m
+[titan] 2025-09-09 07:17:55,293 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:18:01,633 - root - INFO - [31mstep: 20550 [32mloss: 2.7682 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.45 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7911 [37mglobal_avg_top_loss: 1.9771
+[titan] 2025-09-09 07:18:01,633 - root - INFO - [34mlr: 1.0750e-05 gnorm: 0.34 [35m[1 day, 13:42:34<1 day, 11:41:27][39m
+[titan] 2025-09-09 07:18:33,495 - root - INFO - [31mstep: 20555 [32mloss: 3.2850 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.16 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 1.0778 [37mglobal_avg_top_loss: 2.2072
+[titan] 2025-09-09 07:18:33,495 - root - INFO - [34mlr: 1.0746e-05 gnorm: 0.39 [35m[1 day, 13:43:06<1 day, 11:40:53][39m
+[titan] 2025-09-09 07:19:05,267 - root - INFO - [31mstep: 20560 [32mloss: 2.8272 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,314 [36mtflops: 491.54 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.8203 [37mglobal_avg_top_loss: 2.0069
+[titan] 2025-09-09 07:19:05,267 - root - INFO - [34mlr: 1.0743e-05 gnorm: 0.37 [35m[1 day, 13:43:37<1 day, 11:40:19][39m
+[titan] 2025-09-09 07:19:37,135 - root - INFO - [31mstep: 20565 [32mloss: 2.7941 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.07 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.8025 [37mglobal_avg_top_loss: 1.9916
+[titan] 2025-09-09 07:19:37,135 - root - INFO - [34mlr: 1.0739e-05 gnorm: 0.35 [35m[1 day, 13:44:09<1 day, 11:39:45][39m
+[titan] 2025-09-09 07:20:09,312 - root - INFO - [31mstep: 20570 [32mloss: 2.9126 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,184 [36mtflops: 485.36 [35mmfu: 49.08%[39m [37mglobal_avg_ntp_loss: 0.8736 [37mglobal_avg_top_loss: 2.0389
+[titan] 2025-09-09 07:20:09,312 - root - INFO - [34mlr: 1.0736e-05 gnorm: 0.38 [35m[1 day, 13:44:42<1 day, 11:39:11][39m
+[titan] 2025-09-09 07:20:41,308 - root - INFO - [31mstep: 20575 [32mloss: 2.7576 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.10 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7856 [37mglobal_avg_top_loss: 1.9721
+[titan] 2025-09-09 07:20:41,309 - root - INFO - [34mlr: 1.0732e-05 gnorm: 0.36 [35m[1 day, 13:45:14<1 day, 11:38:37][39m
+[titan] 2025-09-09 07:21:13,132 - root - INFO - [31mstep: 20580 [32mloss: 2.8376 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,297 [36mtflops: 490.75 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.8256 [37mglobal_avg_top_loss: 2.0120
+[titan] 2025-09-09 07:21:13,133 - root - INFO - [34mlr: 1.0728e-05 gnorm: 0.34 [35m[1 day, 13:45:45<1 day, 11:38:03][39m
+[titan] 2025-09-09 07:21:45,124 - root - INFO - [31mstep: 20585 [32mloss: 2.8509 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.17 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8310 [37mglobal_avg_top_loss: 2.0199
+[titan] 2025-09-09 07:21:45,124 - root - INFO - [34mlr: 1.0725e-05 gnorm: 0.34 [35m[1 day, 13:46:17<1 day, 11:37:29][39m
+[titan] 2025-09-09 07:22:16,997 - root - INFO - [31mstep: 20590 [32mloss: 3.1311 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.98 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.9648 [37mglobal_avg_top_loss: 2.1662
+[titan] 2025-09-09 07:22:16,998 - root - INFO - [34mlr: 1.0721e-05 gnorm: 0.69 [35m[1 day, 13:46:49<1 day, 11:36:55][39m
+[titan] 2025-09-09 07:22:49,017 - root - INFO - [31mstep: 20595 [32mloss: 2.7975 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.74 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.8041 [37mglobal_avg_top_loss: 1.9933
+[titan] 2025-09-09 07:22:49,017 - root - INFO - [34mlr: 1.0718e-05 gnorm: 0.36 [35m[1 day, 13:47:21<1 day, 11:36:21][39m
+[titan] 2025-09-09 07:23:14,280 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:23:20,736 - root - INFO - [31mstep: 20600 [32mloss: 2.7198 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,331 [36mtflops: 492.37 [35mmfu: 49.78%[39m [37mglobal_avg_ntp_loss: 0.7719 [37mglobal_avg_top_loss: 1.9479
+[titan] 2025-09-09 07:23:20,737 - root - INFO - [34mlr: 1.0714e-05 gnorm: 0.35 [35m[1 day, 13:47:53<1 day, 11:35:46][39m
+[titan] 2025-09-09 07:23:52,653 - root - INFO - [31mstep: 20605 [32mloss: 2.6982 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.31 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7595 [37mglobal_avg_top_loss: 1.9387
+[titan] 2025-09-09 07:23:52,654 - root - INFO - [34mlr: 1.0711e-05 gnorm: 0.36 [35m[1 day, 13:48:25<1 day, 11:35:12][39m
+[titan] 2025-09-09 07:24:24,493 - root - INFO - [31mstep: 20610 [32mloss: 2.6344 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.50 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7300 [37mglobal_avg_top_loss: 1.9045
+[titan] 2025-09-09 07:24:24,493 - root - INFO - [34mlr: 1.0707e-05 gnorm: 0.35 [35m[1 day, 13:48:57<1 day, 11:34:38][39m
+[titan] 2025-09-09 07:24:56,361 - root - INFO - [31mstep: 20615 [32mloss: 2.8545 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.07 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.8316 [37mglobal_avg_top_loss: 2.0229
+[titan] 2025-09-09 07:24:56,361 - root - INFO - [34mlr: 1.0703e-05 gnorm: 0.36 [35m[1 day, 13:49:29<1 day, 11:34:04][39m
+[titan] 2025-09-09 07:25:28,166 - root - INFO - [31mstep: 20620 [32mloss: 2.7952 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.03 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.8033 [37mglobal_avg_top_loss: 1.9919
+[titan] 2025-09-09 07:25:28,167 - root - INFO - [34mlr: 1.0700e-05 gnorm: 0.35 [35m[1 day, 13:50:00<1 day, 11:33:30][39m
+[titan] 2025-09-09 07:26:00,181 - root - INFO - [31mstep: 20625 [32mloss: 2.7567 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.82 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7954 [37mglobal_avg_top_loss: 1.9613
+[titan] 2025-09-09 07:26:00,182 - root - INFO - [34mlr: 1.0696e-05 gnorm: 0.36 [35m[1 day, 13:50:32<1 day, 11:32:56][39m
+[titan] 2025-09-09 07:26:32,202 - root - INFO - [31mstep: 20630 [32mloss: 3.6945 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.74 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 1.3172 [37mglobal_avg_top_loss: 2.3773
+[titan] 2025-09-09 07:26:32,202 - root - INFO - [34mlr: 1.0693e-05 gnorm: 0.34 [35m[1 day, 13:51:04<1 day, 11:32:22][39m
+[titan] 2025-09-09 07:27:04,024 - root - INFO - [31mstep: 20635 [32mloss: 3.1787 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,297 [36mtflops: 490.77 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 1.0293 [37mglobal_avg_top_loss: 2.1493
+[titan] 2025-09-09 07:27:04,024 - root - INFO - [34mlr: 1.0689e-05 gnorm: 0.49 [35m[1 day, 13:51:36<1 day, 11:31:48][39m
+[titan] 2025-09-09 07:27:35,971 - root - INFO - [31mstep: 20640 [32mloss: 2.8221 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.85 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.8122 [37mglobal_avg_top_loss: 2.0099
+[titan] 2025-09-09 07:27:35,972 - root - INFO - [34mlr: 1.0686e-05 gnorm: 0.33 [35m[1 day, 13:52:08<1 day, 11:31:14][39m
+[titan] 2025-09-09 07:28:08,044 - root - INFO - [31mstep: 20645 [32mloss: 2.7387 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.94 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7786 [37mglobal_avg_top_loss: 1.9601
+[titan] 2025-09-09 07:28:08,044 - root - INFO - [34mlr: 1.0682e-05 gnorm: 0.33 [35m[1 day, 13:52:40<1 day, 11:30:40][39m
+[titan] 2025-09-09 07:28:33,580 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:28:39,993 - root - INFO - [31mstep: 20650 [32mloss: 2.8459 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.82 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.8255 [37mglobal_avg_top_loss: 2.0204
+[titan] 2025-09-09 07:28:39,993 - root - INFO - [34mlr: 1.0678e-05 gnorm: 0.38 [35m[1 day, 13:53:12<1 day, 11:30:06][39m
+[titan] 2025-09-09 07:29:11,711 - root - INFO - [31mstep: 20655 [32mloss: 2.9157 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,331 [36mtflops: 492.37 [35mmfu: 49.79%[39m [37mglobal_avg_ntp_loss: 0.8784 [37mglobal_avg_top_loss: 2.0373
+[titan] 2025-09-09 07:29:11,712 - root - INFO - [34mlr: 1.0675e-05 gnorm: 0.38 [35m[1 day, 13:53:44<1 day, 11:29:31][39m
+[titan] 2025-09-09 07:29:43,746 - root - INFO - [31mstep: 20660 [32mloss: 2.7643 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.52 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7887 [37mglobal_avg_top_loss: 1.9755
+[titan] 2025-09-09 07:29:43,746 - root - INFO - [34mlr: 1.0671e-05 gnorm: 0.38 [35m[1 day, 13:54:16<1 day, 11:28:58][39m
+[titan] 2025-09-09 07:30:15,581 - root - INFO - [31mstep: 20665 [32mloss: 2.8318 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.58 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.8232 [37mglobal_avg_top_loss: 2.0086
+[titan] 2025-09-09 07:30:15,581 - root - INFO - [34mlr: 1.0668e-05 gnorm: 0.35 [35m[1 day, 13:54:48<1 day, 11:28:23][39m
+[titan] 2025-09-09 07:30:47,813 - root - INFO - [31mstep: 20670 [32mloss: 2.7860 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,167 [36mtflops: 484.53 [35mmfu: 48.99%[39m [37mglobal_avg_ntp_loss: 0.7990 [37mglobal_avg_top_loss: 1.9870
+[titan] 2025-09-09 07:30:47,813 - root - INFO - [34mlr: 1.0664e-05 gnorm: 0.40 [35m[1 day, 13:55:20<1 day, 11:27:50][39m
+[titan] 2025-09-09 07:31:20,089 - root - INFO - [31mstep: 20675 [32mloss: 3.0578 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,153 [36mtflops: 483.87 [35mmfu: 48.93%[39m [37mglobal_avg_ntp_loss: 0.9446 [37mglobal_avg_top_loss: 2.1133
+[titan] 2025-09-09 07:31:20,089 - root - INFO - [34mlr: 1.0661e-05 gnorm: 0.38 [35m[1 day, 13:55:52<1 day, 11:27:16][39m
+[titan] 2025-09-09 07:31:52,030 - root - INFO - [31mstep: 20680 [32mloss: 2.8441 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.95 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8248 [37mglobal_avg_top_loss: 2.0194
+[titan] 2025-09-09 07:31:52,030 - root - INFO - [34mlr: 1.0657e-05 gnorm: 0.36 [35m[1 day, 13:56:24<1 day, 11:26:42][39m
+[titan] 2025-09-09 07:32:23,838 - root - INFO - [31mstep: 20685 [32mloss: 2.8750 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,302 [36mtflops: 490.98 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.8328 [37mglobal_avg_top_loss: 2.0422
+[titan] 2025-09-09 07:32:23,839 - root - INFO - [34mlr: 1.0653e-05 gnorm: 1.13 [35m[1 day, 13:56:56<1 day, 11:26:08][39m
+[titan] 2025-09-09 07:32:55,829 - root - INFO - [31mstep: 20690 [32mloss: 2.7702 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.19 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7990 [37mglobal_avg_top_loss: 1.9712
+[titan] 2025-09-09 07:32:55,829 - root - INFO - [34mlr: 1.0650e-05 gnorm: 0.39 [35m[1 day, 13:57:28<1 day, 11:25:34][39m
+[titan] 2025-09-09 07:33:27,566 - root - INFO - [31mstep: 20695 [32mloss: 2.7366 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,325 [36mtflops: 492.09 [35mmfu: 49.76%[39m [37mglobal_avg_ntp_loss: 0.7745 [37mglobal_avg_top_loss: 1.9621
+[titan] 2025-09-09 07:33:27,566 - root - INFO - [34mlr: 1.0646e-05 gnorm: 0.36 [35m[1 day, 13:58:00<1 day, 11:24:59][39m
+[titan] 2025-09-09 07:33:53,012 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:33:59,395 - root - INFO - [31mstep: 20700 [32mloss: 2.9222 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.67 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.8573 [37mglobal_avg_top_loss: 2.0649
+[titan] 2025-09-09 07:33:59,396 - root - INFO - [34mlr: 1.0643e-05 gnorm: 0.33 [35m[1 day, 13:58:32<1 day, 11:24:25][39m
+[titan] 2025-09-09 07:34:31,149 - root - INFO - [31mstep: 20705 [32mloss: 2.7591 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,320 [36mtflops: 491.84 [35mmfu: 49.73%[39m [37mglobal_avg_ntp_loss: 0.7978 [37mglobal_avg_top_loss: 1.9613
+[titan] 2025-09-09 07:34:31,149 - root - INFO - [34mlr: 1.0639e-05 gnorm: 0.47 [35m[1 day, 13:59:03<1 day, 11:23:51][39m
+[titan] 2025-09-09 07:35:03,021 - root - INFO - [31mstep: 20710 [32mloss: 3.3606 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 490.01 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 1.1187 [37mglobal_avg_top_loss: 2.2419
+[titan] 2025-09-09 07:35:03,021 - root - INFO - [34mlr: 1.0636e-05 gnorm: 0.34 [35m[1 day, 13:59:35<1 day, 11:23:17][39m
+[titan] 2025-09-09 07:35:35,006 - root - INFO - [31mstep: 20715 [32mloss: 2.7429 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,245 [36mtflops: 488.26 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7828 [37mglobal_avg_top_loss: 1.9601
+[titan] 2025-09-09 07:35:35,007 - root - INFO - [34mlr: 1.0632e-05 gnorm: 0.33 [35m[1 day, 14:00:07<1 day, 11:22:43][39m
+[titan] 2025-09-09 07:36:06,716 - root - INFO - [31mstep: 20720 [32mloss: 2.8326 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,334 [36mtflops: 492.52 [35mmfu: 49.80%[39m [37mglobal_avg_ntp_loss: 0.8175 [37mglobal_avg_top_loss: 2.0150
+[titan] 2025-09-09 07:36:06,716 - root - INFO - [34mlr: 1.0628e-05 gnorm: 0.34 [35m[1 day, 14:00:39<1 day, 11:22:09][39m
+[titan] 2025-09-09 07:36:38,798 - root - INFO - [31mstep: 20725 [32mloss: 2.8353 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,214 [36mtflops: 486.80 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.8213 [37mglobal_avg_top_loss: 2.0140
+[titan] 2025-09-09 07:36:38,798 - root - INFO - [34mlr: 1.0625e-05 gnorm: 0.33 [35m[1 day, 14:01:11<1 day, 11:21:35][39m
+[titan] 2025-09-09 07:37:10,652 - root - INFO - [31mstep: 20730 [32mloss: 2.8775 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.28 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.8411 [37mglobal_avg_top_loss: 2.0364
+[titan] 2025-09-09 07:37:10,652 - root - INFO - [34mlr: 1.0621e-05 gnorm: 0.36 [35m[1 day, 14:01:43<1 day, 11:21:01][39m
+[titan] 2025-09-09 07:37:42,842 - root - INFO - [31mstep: 20735 [32mloss: 2.9359 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,180 [36mtflops: 485.17 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.8688 [37mglobal_avg_top_loss: 2.0671
+[titan] 2025-09-09 07:37:42,842 - root - INFO - [34mlr: 1.0618e-05 gnorm: 0.44 [35m[1 day, 14:02:15<1 day, 11:20:27][39m
+[titan] 2025-09-09 07:38:14,599 - root - INFO - [31mstep: 20740 [32mloss: 2.7190 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,319 [36mtflops: 491.77 [35mmfu: 49.72%[39m [37mglobal_avg_ntp_loss: 0.7677 [37mglobal_avg_top_loss: 1.9513
+[titan] 2025-09-09 07:38:14,599 - root - INFO - [34mlr: 1.0614e-05 gnorm: 0.45 [35m[1 day, 14:02:47<1 day, 11:19:53][39m
+[titan] 2025-09-09 07:38:46,380 - root - INFO - [31mstep: 20745 [32mloss: 2.8439 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,311 [36mtflops: 491.41 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.8215 [37mglobal_avg_top_loss: 2.0224
+[titan] 2025-09-09 07:38:46,381 - root - INFO - [34mlr: 1.0611e-05 gnorm: 0.35 [35m[1 day, 14:03:19<1 day, 11:19:19][39m
+[titan] 2025-09-09 07:39:11,961 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:39:18,414 - root - INFO - [31mstep: 20750 [32mloss: 2.7640 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.53 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7947 [37mglobal_avg_top_loss: 1.9693
+[titan] 2025-09-09 07:39:18,415 - root - INFO - [34mlr: 1.0607e-05 gnorm: 0.35 [35m[1 day, 14:03:51<1 day, 11:18:45][39m
+[titan] 2025-09-09 07:39:50,347 - root - INFO - [31mstep: 20755 [32mloss: 2.7815 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.07 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7944 [37mglobal_avg_top_loss: 1.9871
+[titan] 2025-09-09 07:39:50,348 - root - INFO - [34mlr: 1.0604e-05 gnorm: 0.44 [35m[1 day, 14:04:23<1 day, 11:18:11][39m
+[titan] 2025-09-09 07:40:22,425 - root - INFO - [31mstep: 20760 [32mloss: 2.8046 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,215 [36mtflops: 486.86 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.8071 [37mglobal_avg_top_loss: 1.9975
+[titan] 2025-09-09 07:40:22,426 - root - INFO - [34mlr: 1.0600e-05 gnorm: 0.34 [35m[1 day, 14:04:55<1 day, 11:17:37][39m
+[titan] 2025-09-09 07:40:54,274 - root - INFO - [31mstep: 20765 [32mloss: 2.8028 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,289 [36mtflops: 490.37 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.8079 [37mglobal_avg_top_loss: 1.9949
+[titan] 2025-09-09 07:40:54,274 - root - INFO - [34mlr: 1.0596e-05 gnorm: 0.34 [35m[1 day, 14:05:26<1 day, 11:17:03][39m
+[titan] 2025-09-09 07:41:26,612 - root - INFO - [31mstep: 20770 [32mloss: 2.8227 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,133 [36mtflops: 482.94 [35mmfu: 48.83%[39m [37mglobal_avg_ntp_loss: 0.8154 [37mglobal_avg_top_loss: 2.0073
+[titan] 2025-09-09 07:41:26,612 - root - INFO - [34mlr: 1.0593e-05 gnorm: 0.37 [35m[1 day, 14:05:59<1 day, 11:16:29][39m
+[titan] 2025-09-09 07:41:58,615 - root - INFO - [31mstep: 20775 [32mloss: 2.8022 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,239 [36mtflops: 488.00 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.8065 [37mglobal_avg_top_loss: 1.9957
+[titan] 2025-09-09 07:41:58,615 - root - INFO - [34mlr: 1.0589e-05 gnorm: 0.35 [35m[1 day, 14:06:31<1 day, 11:15:55][39m
+[titan] 2025-09-09 07:42:30,425 - root - INFO - [31mstep: 20780 [32mloss: 2.9815 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,301 [36mtflops: 490.96 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.9007 [37mglobal_avg_top_loss: 2.0808
+[titan] 2025-09-09 07:42:30,426 - root - INFO - [34mlr: 1.0586e-05 gnorm: 0.34 [35m[1 day, 14:07:03<1 day, 11:15:21][39m
+[titan] 2025-09-09 07:43:02,471 - root - INFO - [31mstep: 20785 [32mloss: 2.6697 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.35 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7556 [37mglobal_avg_top_loss: 1.9141
+[titan] 2025-09-09 07:43:02,471 - root - INFO - [34mlr: 1.0582e-05 gnorm: 0.39 [35m[1 day, 14:07:35<1 day, 11:14:47][39m
+[titan] 2025-09-09 07:43:34,584 - root - INFO - [31mstep: 20790 [32mloss: 3.1640 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,204 [36mtflops: 486.33 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 1.0273 [37mglobal_avg_top_loss: 2.1367
+[titan] 2025-09-09 07:43:34,584 - root - INFO - [34mlr: 1.0579e-05 gnorm: 0.37 [35m[1 day, 14:08:07<1 day, 11:14:13][39m
+[titan] 2025-09-09 07:44:06,509 - root - INFO - [31mstep: 20795 [32mloss: 3.0651 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.20 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.9439 [37mglobal_avg_top_loss: 2.1211
+[titan] 2025-09-09 07:44:06,509 - root - INFO - [34mlr: 1.0575e-05 gnorm: 0.34 [35m[1 day, 14:08:39<1 day, 11:13:39][39m
+[titan] 2025-09-09 07:44:32,015 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:44:38,368 - root - INFO - [31mstep: 20800 [32mloss: 2.6540 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.20 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7441 [37mglobal_avg_top_loss: 1.9100
+[titan] 2025-09-09 07:44:38,368 - root - INFO - [34mlr: 1.0571e-05 gnorm: 0.34 [35m[1 day, 14:09:11<1 day, 11:13:05][39m
+[titan] 2025-09-09 07:45:10,207 - root - INFO - [31mstep: 20805 [32mloss: 2.7972 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.51 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.8050 [37mglobal_avg_top_loss: 1.9922
+[titan] 2025-09-09 07:45:10,207 - root - INFO - [34mlr: 1.0568e-05 gnorm: 0.35 [35m[1 day, 14:09:42<1 day, 11:12:31][39m
+[titan] 2025-09-09 07:45:41,946 - root - INFO - [31mstep: 20810 [32mloss: 2.7258 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,325 [36mtflops: 492.06 [35mmfu: 49.75%[39m [37mglobal_avg_ntp_loss: 0.7687 [37mglobal_avg_top_loss: 1.9571
+[titan] 2025-09-09 07:45:41,946 - root - INFO - [34mlr: 1.0564e-05 gnorm: 0.36 [35m[1 day, 14:10:14<1 day, 11:11:57][39m
+[titan] 2025-09-09 07:46:13,891 - root - INFO - [31mstep: 20815 [32mloss: 2.7597 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.89 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7872 [37mglobal_avg_top_loss: 1.9725
+[titan] 2025-09-09 07:46:13,891 - root - INFO - [34mlr: 1.0561e-05 gnorm: 0.40 [35m[1 day, 14:10:46<1 day, 11:11:23][39m
+[titan] 2025-09-09 07:46:46,093 - root - INFO - [31mstep: 20820 [32mloss: 2.7721 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,176 [36mtflops: 484.98 [35mmfu: 49.04%[39m [37mglobal_avg_ntp_loss: 0.7911 [37mglobal_avg_top_loss: 1.9811
+[titan] 2025-09-09 07:46:46,093 - root - INFO - [34mlr: 1.0557e-05 gnorm: 0.36 [35m[1 day, 14:11:18<1 day, 11:10:49][39m
+[titan] 2025-09-09 07:47:18,042 - root - INFO - [31mstep: 20825 [32mloss: 2.7175 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.82 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7694 [37mglobal_avg_top_loss: 1.9481
+[titan] 2025-09-09 07:47:18,043 - root - INFO - [34mlr: 1.0554e-05 gnorm: 0.35 [35m[1 day, 14:11:50<1 day, 11:10:15][39m
+[titan] 2025-09-09 07:47:50,006 - root - INFO - [31mstep: 20830 [32mloss: 2.8856 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.59 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.8411 [37mglobal_avg_top_loss: 2.0445
+[titan] 2025-09-09 07:47:50,007 - root - INFO - [34mlr: 1.0550e-05 gnorm: 0.35 [35m[1 day, 14:12:22<1 day, 11:09:41][39m
+[titan] 2025-09-09 07:48:21,884 - root - INFO - [31mstep: 20835 [32mloss: 2.7359 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.92 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7772 [37mglobal_avg_top_loss: 1.9587
+[titan] 2025-09-09 07:48:21,884 - root - INFO - [34mlr: 1.0546e-05 gnorm: 0.35 [35m[1 day, 14:12:54<1 day, 11:09:07][39m
+[titan] 2025-09-09 07:48:53,834 - root - INFO - [31mstep: 20840 [32mloss: 2.7794 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.80 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7969 [37mglobal_avg_top_loss: 1.9825
+[titan] 2025-09-09 07:48:53,835 - root - INFO - [34mlr: 1.0543e-05 gnorm: 0.33 [35m[1 day, 14:13:26<1 day, 11:08:33][39m
+[titan] 2025-09-09 07:49:25,664 - root - INFO - [31mstep: 20845 [32mloss: 2.7689 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.65 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7916 [37mglobal_avg_top_loss: 1.9773
+[titan] 2025-09-09 07:49:25,665 - root - INFO - [34mlr: 1.0539e-05 gnorm: 0.33 [35m[1 day, 14:13:58<1 day, 11:07:59][39m
+[titan] 2025-09-09 07:49:51,116 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:49:57,539 - root - INFO - [31mstep: 20850 [32mloss: 2.9739 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.97 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.8858 [37mglobal_avg_top_loss: 2.0881
+[titan] 2025-09-09 07:49:57,539 - root - INFO - [34mlr: 1.0536e-05 gnorm: 1.04 [35m[1 day, 14:14:30<1 day, 11:07:25][39m
+[titan] 2025-09-09 07:50:29,472 - root - INFO - [31mstep: 20855 [32mloss: 2.9238 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.06 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8636 [37mglobal_avg_top_loss: 2.0602
+[titan] 2025-09-09 07:50:29,473 - root - INFO - [34mlr: 1.0532e-05 gnorm: 0.77 [35m[1 day, 14:15:02<1 day, 11:06:51][39m
+[titan] 2025-09-09 07:51:01,615 - root - INFO - [31mstep: 20860 [32mloss: 2.7706 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,195 [36mtflops: 485.87 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7953 [37mglobal_avg_top_loss: 1.9753
+[titan] 2025-09-09 07:51:01,616 - root - INFO - [34mlr: 1.0529e-05 gnorm: 0.37 [35m[1 day, 14:15:34<1 day, 11:06:17][39m
+[titan] 2025-09-09 07:51:33,435 - root - INFO - [31mstep: 20865 [32mloss: 2.7579 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.81 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7845 [37mglobal_avg_top_loss: 1.9734
+[titan] 2025-09-09 07:51:33,435 - root - INFO - [34mlr: 1.0525e-05 gnorm: 0.34 [35m[1 day, 14:16:06<1 day, 11:05:43][39m
+[titan] 2025-09-09 07:52:05,439 - root - INFO - [31mstep: 20870 [32mloss: 2.9183 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.98 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.8903 [37mglobal_avg_top_loss: 2.0279
+[titan] 2025-09-09 07:52:05,440 - root - INFO - [34mlr: 1.0522e-05 gnorm: 0.41 [35m[1 day, 14:16:38<1 day, 11:05:09][39m
+[titan] 2025-09-09 07:52:37,550 - root - INFO - [31mstep: 20875 [32mloss: 2.6673 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.36 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7462 [37mglobal_avg_top_loss: 1.9211
+[titan] 2025-09-09 07:52:37,550 - root - INFO - [34mlr: 1.0518e-05 gnorm: 0.49 [35m[1 day, 14:17:10<1 day, 11:04:35][39m
+[titan] 2025-09-09 07:53:09,605 - root - INFO - [31mstep: 20880 [32mloss: 2.6185 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.21 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7242 [37mglobal_avg_top_loss: 1.8943
+[titan] 2025-09-09 07:53:09,605 - root - INFO - [34mlr: 1.0514e-05 gnorm: 0.34 [35m[1 day, 14:17:42<1 day, 11:04:01][39m
+[titan] 2025-09-09 07:53:41,434 - root - INFO - [31mstep: 20885 [32mloss: 2.7658 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.66 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7907 [37mglobal_avg_top_loss: 1.9751
+[titan] 2025-09-09 07:53:41,435 - root - INFO - [34mlr: 1.0511e-05 gnorm: 0.33 [35m[1 day, 14:18:14<1 day, 11:03:27][39m
+[titan] 2025-09-09 07:54:13,278 - root - INFO - [31mstep: 20890 [32mloss: 2.6896 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.43 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7557 [37mglobal_avg_top_loss: 1.9340
+[titan] 2025-09-09 07:54:13,279 - root - INFO - [34mlr: 1.0507e-05 gnorm: 0.40 [35m[1 day, 14:18:45<1 day, 11:02:53][39m
+[titan] 2025-09-09 07:54:45,332 - root - INFO - [31mstep: 20895 [32mloss: 2.7992 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.23 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.8024 [37mglobal_avg_top_loss: 1.9968
+[titan] 2025-09-09 07:54:45,332 - root - INFO - [34mlr: 1.0504e-05 gnorm: 0.48 [35m[1 day, 14:19:17<1 day, 11:02:19][39m
+[titan] 2025-09-09 07:55:10,807 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 07:55:17,281 - root - INFO - [31mstep: 20900 [32mloss: 2.7383 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.83 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7758 [37mglobal_avg_top_loss: 1.9626
+[titan] 2025-09-09 07:55:17,281 - root - INFO - [34mlr: 1.0500e-05 gnorm: 0.34 [35m[1 day, 14:19:49<1 day, 11:01:45][39m
+[titan] 2025-09-09 07:55:49,358 - root - INFO - [31mstep: 20905 [32mloss: 2.8631 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.87 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.8373 [37mglobal_avg_top_loss: 2.0258
+[titan] 2025-09-09 07:55:49,359 - root - INFO - [34mlr: 1.0497e-05 gnorm: 0.37 [35m[1 day, 14:20:21<1 day, 11:01:11][39m
+[titan] 2025-09-09 07:56:21,199 - root - INFO - [31mstep: 20910 [32mloss: 2.8960 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.49 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.8515 [37mglobal_avg_top_loss: 2.0445
+[titan] 2025-09-09 07:56:21,199 - root - INFO - [34mlr: 1.0493e-05 gnorm: 0.36 [35m[1 day, 14:20:53<1 day, 11:00:37][39m
+[titan] 2025-09-09 07:56:53,363 - root - INFO - [31mstep: 20915 [32mloss: 2.8280 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,188 [36mtflops: 485.55 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.8177 [37mglobal_avg_top_loss: 2.0103
+[titan] 2025-09-09 07:56:53,364 - root - INFO - [34mlr: 1.0489e-05 gnorm: 0.35 [35m[1 day, 14:21:25<1 day, 11:00:03][39m
+[titan] 2025-09-09 07:57:25,296 - root - INFO - [31mstep: 20920 [32mloss: 2.7663 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.07 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7890 [37mglobal_avg_top_loss: 1.9773
+[titan] 2025-09-09 07:57:25,297 - root - INFO - [34mlr: 1.0486e-05 gnorm: 0.34 [35m[1 day, 14:21:57<1 day, 10:59:29][39m
+[titan] 2025-09-09 07:57:57,248 - root - INFO - [31mstep: 20925 [32mloss: 2.8015 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.78 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.8027 [37mglobal_avg_top_loss: 1.9988
+[titan] 2025-09-09 07:57:57,248 - root - INFO - [34mlr: 1.0482e-05 gnorm: 0.35 [35m[1 day, 14:22:29<1 day, 10:58:55][39m
+[titan] 2025-09-09 07:58:29,155 - root - INFO - [31mstep: 20930 [32mloss: 2.9265 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.47 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.8656 [37mglobal_avg_top_loss: 2.0609
+[titan] 2025-09-09 07:58:29,156 - root - INFO - [34mlr: 1.0479e-05 gnorm: 0.76 [35m[1 day, 14:23:01<1 day, 10:58:21][39m
+[titan] 2025-09-09 07:59:01,038 - root - INFO - [31mstep: 20935 [32mloss: 3.2675 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,278 [36mtflops: 489.83 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 1.0719 [37mglobal_avg_top_loss: 2.1956
+[titan] 2025-09-09 07:59:01,039 - root - INFO - [34mlr: 1.0475e-05 gnorm: 0.36 [35m[1 day, 14:23:33<1 day, 10:57:47][39m
+[titan] 2025-09-09 07:59:32,951 - root - INFO - [31mstep: 20940 [32mloss: 2.8089 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,268 [36mtflops: 489.39 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.8071 [37mglobal_avg_top_loss: 2.0018
+[titan] 2025-09-09 07:59:32,951 - root - INFO - [34mlr: 1.0472e-05 gnorm: 0.33 [35m[1 day, 14:24:05<1 day, 10:57:13][39m
+[titan] 2025-09-09 08:00:04,740 - root - INFO - [31mstep: 20945 [32mloss: 2.7405 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,308 [36mtflops: 491.28 [35mmfu: 49.67%[39m [37mglobal_avg_ntp_loss: 0.7780 [37mglobal_avg_top_loss: 1.9625
+[titan] 2025-09-09 08:00:04,740 - root - INFO - [34mlr: 1.0468e-05 gnorm: 0.35 [35m[1 day, 14:24:37<1 day, 10:56:39][39m
+[titan] 2025-09-09 08:00:30,312 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:00:36,668 - root - INFO - [31mstep: 20950 [32mloss: 2.6918 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.15 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7578 [37mglobal_avg_top_loss: 1.9341
+[titan] 2025-09-09 08:00:36,668 - root - INFO - [34mlr: 1.0464e-05 gnorm: 0.41 [35m[1 day, 14:25:09<1 day, 10:56:05][39m
+[titan] 2025-09-09 08:01:08,427 - root - INFO - [31mstep: 20955 [32mloss: 2.6915 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,318 [36mtflops: 491.74 [35mmfu: 49.72%[39m [37mglobal_avg_ntp_loss: 0.7568 [37mglobal_avg_top_loss: 1.9348
+[titan] 2025-09-09 08:01:08,428 - root - INFO - [34mlr: 1.0461e-05 gnorm: 0.41 [35m[1 day, 14:25:41<1 day, 10:55:31][39m
+[titan] 2025-09-09 08:01:40,171 - root - INFO - [31mstep: 20960 [32mloss: 2.8383 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,323 [36mtflops: 491.99 [35mmfu: 49.75%[39m [37mglobal_avg_ntp_loss: 0.8246 [37mglobal_avg_top_loss: 2.0137
+[titan] 2025-09-09 08:01:40,172 - root - INFO - [34mlr: 1.0457e-05 gnorm: 0.42 [35m[1 day, 14:26:12<1 day, 10:54:57][39m
+[titan] 2025-09-09 08:02:11,879 - root - INFO - [31mstep: 20965 [32mloss: 2.7883 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,335 [36mtflops: 492.54 [35mmfu: 49.80%[39m [37mglobal_avg_ntp_loss: 0.8007 [37mglobal_avg_top_loss: 1.9876
+[titan] 2025-09-09 08:02:11,880 - root - INFO - [34mlr: 1.0454e-05 gnorm: 0.35 [35m[1 day, 14:26:44<1 day, 10:54:23][39m
+[titan] 2025-09-09 08:02:43,657 - root - INFO - [31mstep: 20970 [32mloss: 2.7243 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,312 [36mtflops: 491.46 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.7716 [37mglobal_avg_top_loss: 1.9527
+[titan] 2025-09-09 08:02:43,658 - root - INFO - [34mlr: 1.0450e-05 gnorm: 0.35 [35m[1 day, 14:27:16<1 day, 10:53:49][39m
+[titan] 2025-09-09 08:03:15,701 - root - INFO - [31mstep: 20975 [32mloss: 2.8051 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.37 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.8077 [37mglobal_avg_top_loss: 1.9973
+[titan] 2025-09-09 08:03:15,702 - root - INFO - [34mlr: 1.0447e-05 gnorm: 0.61 [35m[1 day, 14:27:48<1 day, 10:53:15][39m
+[titan] 2025-09-09 08:03:47,824 - root - INFO - [31mstep: 20980 [32mloss: 2.8081 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,201 [36mtflops: 486.19 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.8093 [37mglobal_avg_top_loss: 1.9988
+[titan] 2025-09-09 08:03:47,824 - root - INFO - [34mlr: 1.0443e-05 gnorm: 0.34 [35m[1 day, 14:28:20<1 day, 10:52:41][39m
+[titan] 2025-09-09 08:04:19,780 - root - INFO - [31mstep: 20985 [32mloss: 2.7822 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.71 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.8019 [37mglobal_avg_top_loss: 1.9803
+[titan] 2025-09-09 08:04:19,780 - root - INFO - [34mlr: 1.0440e-05 gnorm: 0.33 [35m[1 day, 14:28:52<1 day, 10:52:07][39m
+[titan] 2025-09-09 08:04:51,586 - root - INFO - [31mstep: 20990 [32mloss: 2.7827 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.03 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7972 [37mglobal_avg_top_loss: 1.9855
+[titan] 2025-09-09 08:04:51,586 - root - INFO - [34mlr: 1.0436e-05 gnorm: 0.34 [35m[1 day, 14:29:24<1 day, 10:51:33][39m
+[titan] 2025-09-09 08:05:04,595 - root - INFO - Dumping profiler traces at step 20992
+[titan] 2025-09-09 08:05:04,661 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 08:05:23,732 - root - INFO - [31mstep: 20995 [32mloss: 2.7803 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,194 [36mtflops: 485.83 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7956 [37mglobal_avg_top_loss: 1.9847
+[titan] 2025-09-09 08:05:23,732 - root - INFO - [34mlr: 1.0432e-05 gnorm: 0.39 [35m[1 day, 14:29:56<1 day, 10:50:59][39m
+[titan] 2025-09-09 08:05:49,142 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:05:55,532 - root - INFO - [31mstep: 21000 [32mloss: 2.7204 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,305 [36mtflops: 491.11 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7681 [37mglobal_avg_top_loss: 1.9524
+[titan] 2025-09-09 08:05:55,533 - root - INFO - [34mlr: 1.0429e-05 gnorm: 0.33 [35m[1 day, 14:30:28<1 day, 10:50:25][39m
+[titan] 2025-09-09 08:06:27,528 - root - INFO - [31mstep: 21005 [32mloss: 2.9063 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.11 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.8637 [37mglobal_avg_top_loss: 2.0426
+[titan] 2025-09-09 08:06:27,529 - root - INFO - [34mlr: 1.0425e-05 gnorm: 0.39 [35m[1 day, 14:31:00<1 day, 10:49:51][39m
+[titan] 2025-09-09 08:06:59,334 - root - INFO - [31mstep: 21010 [32mloss: 2.8169 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.02 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.8137 [37mglobal_avg_top_loss: 2.0032
+[titan] 2025-09-09 08:06:59,335 - root - INFO - [34mlr: 1.0422e-05 gnorm: 0.37 [35m[1 day, 14:31:31<1 day, 10:49:17][39m
+[titan] 2025-09-09 08:07:31,218 - root - INFO - [31mstep: 21015 [32mloss: 3.2978 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.82 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 1.0833 [37mglobal_avg_top_loss: 2.2144
+[titan] 2025-09-09 08:07:31,219 - root - INFO - [34mlr: 1.0418e-05 gnorm: 0.34 [35m[1 day, 14:32:03<1 day, 10:48:43][39m
+[titan] 2025-09-09 08:08:03,278 - root - INFO - [31mstep: 21020 [32mloss: 2.8163 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.14 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.8144 [37mglobal_avg_top_loss: 2.0019
+[titan] 2025-09-09 08:08:03,278 - root - INFO - [34mlr: 1.0415e-05 gnorm: 0.34 [35m[1 day, 14:32:35<1 day, 10:48:09][39m
+[titan] 2025-09-09 08:08:35,227 - root - INFO - [31mstep: 21025 [32mloss: 2.8080 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.83 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.8089 [37mglobal_avg_top_loss: 1.9991
+[titan] 2025-09-09 08:08:35,227 - root - INFO - [34mlr: 1.0411e-05 gnorm: 0.34 [35m[1 day, 14:33:07<1 day, 10:47:35][39m
+[titan] 2025-09-09 08:09:07,117 - root - INFO - [31mstep: 21030 [32mloss: 2.7775 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.72 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8056 [37mglobal_avg_top_loss: 1.9720
+[titan] 2025-09-09 08:09:07,117 - root - INFO - [34mlr: 1.0407e-05 gnorm: 0.34 [35m[1 day, 14:33:39<1 day, 10:47:01][39m
+[titan] 2025-09-09 08:09:39,062 - root - INFO - [31mstep: 21035 [32mloss: 2.7663 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.89 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7906 [37mglobal_avg_top_loss: 1.9757
+[titan] 2025-09-09 08:09:39,062 - root - INFO - [34mlr: 1.0404e-05 gnorm: 0.34 [35m[1 day, 14:34:11<1 day, 10:46:27][39m
+[titan] 2025-09-09 08:10:11,008 - root - INFO - [31mstep: 21040 [32mloss: 2.7821 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.86 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7995 [37mglobal_avg_top_loss: 1.9826
+[titan] 2025-09-09 08:10:11,009 - root - INFO - [34mlr: 1.0400e-05 gnorm: 0.35 [35m[1 day, 14:34:43<1 day, 10:45:53][39m
+[titan] 2025-09-09 08:10:42,829 - root - INFO - [31mstep: 21045 [32mloss: 2.7440 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.80 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7797 [37mglobal_avg_top_loss: 1.9643
+[titan] 2025-09-09 08:10:42,829 - root - INFO - [34mlr: 1.0397e-05 gnorm: 0.33 [35m[1 day, 14:35:15<1 day, 10:45:19][39m
+[titan] 2025-09-09 08:11:08,409 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:11:14,825 - root - INFO - [31mstep: 21050 [32mloss: 2.7525 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.10 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7832 [37mglobal_avg_top_loss: 1.9693
+[titan] 2025-09-09 08:11:14,826 - root - INFO - [34mlr: 1.0393e-05 gnorm: 0.33 [35m[1 day, 14:35:47<1 day, 10:44:45][39m
+[titan] 2025-09-09 08:11:46,977 - root - INFO - [31mstep: 21055 [32mloss: 2.6952 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,192 [36mtflops: 485.75 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7572 [37mglobal_avg_top_loss: 1.9380
+[titan] 2025-09-09 08:11:46,977 - root - INFO - [34mlr: 1.0390e-05 gnorm: 0.34 [35m[1 day, 14:36:19<1 day, 10:44:11][39m
+[titan] 2025-09-09 08:12:18,872 - root - INFO - [31mstep: 21060 [32mloss: 2.6235 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.64 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7295 [37mglobal_avg_top_loss: 1.8941
+[titan] 2025-09-09 08:12:18,873 - root - INFO - [34mlr: 1.0386e-05 gnorm: 0.45 [35m[1 day, 14:36:51<1 day, 10:43:37][39m
+[titan] 2025-09-09 08:12:50,792 - root - INFO - [31mstep: 21065 [32mloss: 2.7581 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.28 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7873 [37mglobal_avg_top_loss: 1.9709
+[titan] 2025-09-09 08:12:50,792 - root - INFO - [34mlr: 1.0383e-05 gnorm: 0.34 [35m[1 day, 14:37:23<1 day, 10:43:03][39m
+[titan] 2025-09-09 08:13:22,704 - root - INFO - [31mstep: 21070 [32mloss: 2.6939 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,268 [36mtflops: 489.38 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7548 [37mglobal_avg_top_loss: 1.9391
+[titan] 2025-09-09 08:13:22,705 - root - INFO - [34mlr: 1.0379e-05 gnorm: 0.34 [35m[1 day, 14:37:55<1 day, 10:42:29][39m
+[titan] 2025-09-09 08:13:54,590 - root - INFO - [31mstep: 21075 [32mloss: 2.7878 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.80 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8016 [37mglobal_avg_top_loss: 1.9862
+[titan] 2025-09-09 08:13:54,590 - root - INFO - [34mlr: 1.0375e-05 gnorm: 0.34 [35m[1 day, 14:38:27<1 day, 10:41:55][39m
+[titan] 2025-09-09 08:14:26,459 - root - INFO - [31mstep: 21080 [32mloss: 2.7931 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.05 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.8034 [37mglobal_avg_top_loss: 1.9896
+[titan] 2025-09-09 08:14:26,460 - root - INFO - [34mlr: 1.0372e-05 gnorm: 0.34 [35m[1 day, 14:38:59<1 day, 10:41:21][39m
+[titan] 2025-09-09 08:14:58,295 - root - INFO - [31mstep: 21085 [32mloss: 3.1784 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.56 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.9944 [37mglobal_avg_top_loss: 2.1841
+[titan] 2025-09-09 08:14:58,296 - root - INFO - [34mlr: 1.0368e-05 gnorm: 0.37 [35m[1 day, 14:39:30<1 day, 10:40:47][39m
+[titan] 2025-09-09 08:15:30,318 - root - INFO - [31mstep: 21090 [32mloss: 2.8480 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.70 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.8296 [37mglobal_avg_top_loss: 2.0184
+[titan] 2025-09-09 08:15:30,318 - root - INFO - [34mlr: 1.0365e-05 gnorm: 0.32 [35m[1 day, 14:40:02<1 day, 10:40:13][39m
+[titan] 2025-09-09 08:16:02,066 - root - INFO - [31mstep: 21095 [32mloss: 3.1556 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,321 [36mtflops: 491.91 [35mmfu: 49.74%[39m [37mglobal_avg_ntp_loss: 1.0224 [37mglobal_avg_top_loss: 2.1332
+[titan] 2025-09-09 08:16:02,066 - root - INFO - [34mlr: 1.0361e-05 gnorm: 0.33 [35m[1 day, 14:40:34<1 day, 10:39:39][39m
+[titan] 2025-09-09 08:16:27,563 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:16:33,905 - root - INFO - [31mstep: 21100 [32mloss: 2.8049 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.51 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.8074 [37mglobal_avg_top_loss: 1.9976
+[titan] 2025-09-09 08:16:33,906 - root - INFO - [34mlr: 1.0358e-05 gnorm: 0.36 [35m[1 day, 14:41:06<1 day, 10:39:05][39m
+[titan] 2025-09-09 08:17:05,759 - root - INFO - [31mstep: 21105 [32mloss: 2.6375 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.28 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7327 [37mglobal_avg_top_loss: 1.9048
+[titan] 2025-09-09 08:17:05,759 - root - INFO - [34mlr: 1.0354e-05 gnorm: 0.33 [35m[1 day, 14:41:38<1 day, 10:38:31][39m
+[titan] 2025-09-09 08:17:37,475 - root - INFO - [31mstep: 21110 [32mloss: 2.7934 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,332 [36mtflops: 492.42 [35mmfu: 49.79%[39m [37mglobal_avg_ntp_loss: 0.7993 [37mglobal_avg_top_loss: 1.9942
+[titan] 2025-09-09 08:17:37,475 - root - INFO - [34mlr: 1.0350e-05 gnorm: 0.34 [35m[1 day, 14:42:10<1 day, 10:37:57][39m
+[titan] 2025-09-09 08:18:09,514 - root - INFO - [31mstep: 21115 [32mloss: 2.7802 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.44 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7952 [37mglobal_avg_top_loss: 1.9850
+[titan] 2025-09-09 08:18:09,515 - root - INFO - [34mlr: 1.0347e-05 gnorm: 0.33 [35m[1 day, 14:42:42<1 day, 10:37:23][39m
+[titan] 2025-09-09 08:18:41,343 - root - INFO - [31mstep: 21120 [32mloss: 2.7539 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.67 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7800 [37mglobal_avg_top_loss: 1.9739
+[titan] 2025-09-09 08:18:41,344 - root - INFO - [34mlr: 1.0343e-05 gnorm: 0.35 [35m[1 day, 14:43:13<1 day, 10:36:49][39m
+[titan] 2025-09-09 08:19:13,081 - root - INFO - [31mstep: 21125 [32mloss: 3.1028 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,325 [36mtflops: 492.08 [35mmfu: 49.76%[39m [37mglobal_avg_ntp_loss: 0.9473 [37mglobal_avg_top_loss: 2.1555
+[titan] 2025-09-09 08:19:13,081 - root - INFO - [34mlr: 1.0340e-05 gnorm: 0.38 [35m[1 day, 14:43:45<1 day, 10:36:15][39m
+[titan] 2025-09-09 08:19:45,334 - root - INFO - [31mstep: 21130 [32mloss: 2.7767 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,160 [36mtflops: 484.21 [35mmfu: 48.96%[39m [37mglobal_avg_ntp_loss: 0.7932 [37mglobal_avg_top_loss: 1.9835
+[titan] 2025-09-09 08:19:45,335 - root - INFO - [34mlr: 1.0336e-05 gnorm: 0.34 [35m[1 day, 14:44:17<1 day, 10:35:41][39m
+[titan] 2025-09-09 08:20:17,484 - root - INFO - [31mstep: 21135 [32mloss: 2.7999 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,193 [36mtflops: 485.77 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.8034 [37mglobal_avg_top_loss: 1.9965
+[titan] 2025-09-09 08:20:17,484 - root - INFO - [34mlr: 1.0333e-05 gnorm: 0.33 [35m[1 day, 14:44:50<1 day, 10:35:08][39m
+[titan] 2025-09-09 08:20:49,386 - root - INFO - [31mstep: 21140 [32mloss: 2.5905 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.55 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7072 [37mglobal_avg_top_loss: 1.8832
+[titan] 2025-09-09 08:20:49,386 - root - INFO - [34mlr: 1.0329e-05 gnorm: 0.43 [35m[1 day, 14:45:21<1 day, 10:34:34][39m
+[titan] 2025-09-09 08:21:21,367 - root - INFO - [31mstep: 21145 [32mloss: 3.1811 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.33 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 1.0347 [37mglobal_avg_top_loss: 2.1463
+[titan] 2025-09-09 08:21:21,368 - root - INFO - [34mlr: 1.0326e-05 gnorm: 0.34 [35m[1 day, 14:45:53<1 day, 10:34:00][39m
+[titan] 2025-09-09 08:21:46,894 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:21:53,350 - root - INFO - [31mstep: 21150 [32mloss: 2.8332 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.30 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.8192 [37mglobal_avg_top_loss: 2.0140
+[titan] 2025-09-09 08:21:53,351 - root - INFO - [34mlr: 1.0322e-05 gnorm: 0.37 [35m[1 day, 14:46:25<1 day, 10:33:26][39m
+[titan] 2025-09-09 08:22:25,343 - root - INFO - [31mstep: 21155 [32mloss: 2.7156 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.16 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7669 [37mglobal_avg_top_loss: 1.9487
+[titan] 2025-09-09 08:22:25,343 - root - INFO - [34mlr: 1.0318e-05 gnorm: 0.37 [35m[1 day, 14:46:57<1 day, 10:32:52][39m
+[titan] 2025-09-09 08:22:57,111 - root - INFO - [31mstep: 21160 [32mloss: 2.7876 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,315 [36mtflops: 491.60 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 0.8013 [37mglobal_avg_top_loss: 1.9863
+[titan] 2025-09-09 08:22:57,112 - root - INFO - [34mlr: 1.0315e-05 gnorm: 0.34 [35m[1 day, 14:47:29<1 day, 10:32:18][39m
+[titan] 2025-09-09 08:23:29,275 - root - INFO - [31mstep: 21165 [32mloss: 3.0356 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,188 [36mtflops: 485.56 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.9197 [37mglobal_avg_top_loss: 2.1159
+[titan] 2025-09-09 08:23:29,276 - root - INFO - [34mlr: 1.0311e-05 gnorm: 0.39 [35m[1 day, 14:48:01<1 day, 10:31:44][39m
+[titan] 2025-09-09 08:24:01,373 - root - INFO - [31mstep: 21170 [32mloss: 2.7337 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.56 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7779 [37mglobal_avg_top_loss: 1.9559
+[titan] 2025-09-09 08:24:01,373 - root - INFO - [34mlr: 1.0308e-05 gnorm: 0.34 [35m[1 day, 14:48:33<1 day, 10:31:10][39m
+[titan] 2025-09-09 08:24:33,246 - root - INFO - [31mstep: 21175 [32mloss: 3.2832 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.99 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 1.0789 [37mglobal_avg_top_loss: 2.2042
+[titan] 2025-09-09 08:24:33,246 - root - INFO - [34mlr: 1.0304e-05 gnorm: 0.38 [35m[1 day, 14:49:05<1 day, 10:30:36][39m
+[titan] 2025-09-09 08:25:05,150 - root - INFO - [31mstep: 21180 [32mloss: 2.7886 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.51 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8007 [37mglobal_avg_top_loss: 1.9879
+[titan] 2025-09-09 08:25:05,151 - root - INFO - [34mlr: 1.0301e-05 gnorm: 0.38 [35m[1 day, 14:49:37<1 day, 10:30:02][39m
+[titan] 2025-09-09 08:25:37,105 - root - INFO - [31mstep: 21185 [32mloss: 2.7363 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.74 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7782 [37mglobal_avg_top_loss: 1.9581
+[titan] 2025-09-09 08:25:37,105 - root - INFO - [34mlr: 1.0297e-05 gnorm: 0.32 [35m[1 day, 14:50:09<1 day, 10:29:28][39m
+[titan] 2025-09-09 08:26:09,014 - root - INFO - [31mstep: 21190 [32mloss: 2.7883 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.44 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7990 [37mglobal_avg_top_loss: 1.9894
+[titan] 2025-09-09 08:26:09,014 - root - INFO - [34mlr: 1.0294e-05 gnorm: 0.37 [35m[1 day, 14:50:41<1 day, 10:28:54][39m
+[titan] 2025-09-09 08:26:40,951 - root - INFO - [31mstep: 21195 [32mloss: 2.7609 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,260 [36mtflops: 489.00 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7866 [37mglobal_avg_top_loss: 1.9743
+[titan] 2025-09-09 08:26:40,952 - root - INFO - [34mlr: 1.0290e-05 gnorm: 0.35 [35m[1 day, 14:51:13<1 day, 10:28:21][39m
+[titan] 2025-09-09 08:27:06,456 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:27:12,857 - root - INFO - [31mstep: 21200 [32mloss: 2.8088 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.49 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.8095 [37mglobal_avg_top_loss: 1.9993
+[titan] 2025-09-09 08:27:12,857 - root - INFO - [34mlr: 1.0286e-05 gnorm: 0.34 [35m[1 day, 14:51:45<1 day, 10:27:47][39m
+[titan] 2025-09-09 08:27:44,761 - root - INFO - [31mstep: 21205 [32mloss: 2.8327 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.51 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8227 [37mglobal_avg_top_loss: 2.0100
+[titan] 2025-09-09 08:27:44,761 - root - INFO - [34mlr: 1.0283e-05 gnorm: 0.36 [35m[1 day, 14:52:17<1 day, 10:27:13][39m
+[titan] 2025-09-09 08:28:16,646 - root - INFO - [31mstep: 21210 [32mloss: 2.7428 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.81 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7750 [37mglobal_avg_top_loss: 1.9678
+[titan] 2025-09-09 08:28:16,646 - root - INFO - [34mlr: 1.0279e-05 gnorm: 0.37 [35m[1 day, 14:52:49<1 day, 10:26:39][39m
+[titan] 2025-09-09 08:28:48,498 - root - INFO - [31mstep: 21215 [32mloss: 2.7985 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.31 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.8044 [37mglobal_avg_top_loss: 1.9941
+[titan] 2025-09-09 08:28:48,498 - root - INFO - [34mlr: 1.0276e-05 gnorm: 0.41 [35m[1 day, 14:53:21<1 day, 10:26:05][39m
+[titan] 2025-09-09 08:29:20,626 - root - INFO - [31mstep: 21220 [32mloss: 2.7330 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,200 [36mtflops: 486.11 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7767 [37mglobal_avg_top_loss: 1.9564
+[titan] 2025-09-09 08:29:20,626 - root - INFO - [34mlr: 1.0272e-05 gnorm: 0.64 [35m[1 day, 14:53:53<1 day, 10:25:31][39m
+[titan] 2025-09-09 08:29:52,377 - root - INFO - [31mstep: 21225 [32mloss: 3.2566 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,321 [36mtflops: 491.87 [35mmfu: 49.73%[39m [37mglobal_avg_ntp_loss: 1.0652 [37mglobal_avg_top_loss: 2.1914
+[titan] 2025-09-09 08:29:52,377 - root - INFO - [34mlr: 1.0269e-05 gnorm: 0.35 [35m[1 day, 14:54:24<1 day, 10:24:57][39m
+[titan] 2025-09-09 08:30:24,267 - root - INFO - [31mstep: 21230 [32mloss: 2.7619 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.72 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7897 [37mglobal_avg_top_loss: 1.9723
+[titan] 2025-09-09 08:30:24,268 - root - INFO - [34mlr: 1.0265e-05 gnorm: 0.38 [35m[1 day, 14:54:56<1 day, 10:24:23][39m
+[titan] 2025-09-09 08:30:56,119 - root - INFO - [31mstep: 21235 [32mloss: 2.7596 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.32 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7843 [37mglobal_avg_top_loss: 1.9753
+[titan] 2025-09-09 08:30:56,120 - root - INFO - [34mlr: 1.0262e-05 gnorm: 0.36 [35m[1 day, 14:55:28<1 day, 10:23:49][39m
+[titan] 2025-09-09 08:31:27,996 - root - INFO - [31mstep: 21240 [32mloss: 2.6339 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.93 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7300 [37mglobal_avg_top_loss: 1.9039
+[titan] 2025-09-09 08:31:27,997 - root - INFO - [34mlr: 1.0258e-05 gnorm: 0.37 [35m[1 day, 14:56:00<1 day, 10:23:15][39m
+[titan] 2025-09-09 08:31:59,922 - root - INFO - [31mstep: 21245 [32mloss: 2.8927 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.18 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.8453 [37mglobal_avg_top_loss: 2.0474
+[titan] 2025-09-09 08:31:59,922 - root - INFO - [34mlr: 1.0254e-05 gnorm: 0.36 [35m[1 day, 14:56:32<1 day, 10:22:41][39m
+[titan] 2025-09-09 08:32:25,411 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:32:31,783 - root - INFO - [31mstep: 21250 [32mloss: 2.6490 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.17 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7374 [37mglobal_avg_top_loss: 1.9116
+[titan] 2025-09-09 08:32:31,783 - root - INFO - [34mlr: 1.0251e-05 gnorm: 0.37 [35m[1 day, 14:57:04<1 day, 10:22:07][39m
+[titan] 2025-09-09 08:33:03,883 - root - INFO - [31mstep: 21255 [32mloss: 2.9289 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,208 [36mtflops: 486.52 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.8814 [37mglobal_avg_top_loss: 2.0475
+[titan] 2025-09-09 08:33:03,884 - root - INFO - [34mlr: 1.0247e-05 gnorm: 0.39 [35m[1 day, 14:57:36<1 day, 10:21:33][39m
+[titan] 2025-09-09 08:33:35,861 - root - INFO - [31mstep: 21260 [32mloss: 2.7507 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.38 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7845 [37mglobal_avg_top_loss: 1.9663
+[titan] 2025-09-09 08:33:35,862 - root - INFO - [34mlr: 1.0244e-05 gnorm: 0.41 [35m[1 day, 14:58:08<1 day, 10:20:59][39m
+[titan] 2025-09-09 08:34:07,792 - root - INFO - [31mstep: 21265 [32mloss: 2.7694 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.10 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7900 [37mglobal_avg_top_loss: 1.9794
+[titan] 2025-09-09 08:34:07,792 - root - INFO - [34mlr: 1.0240e-05 gnorm: 0.35 [35m[1 day, 14:58:40<1 day, 10:20:25][39m
+[titan] 2025-09-09 08:34:39,695 - root - INFO - [31mstep: 21270 [32mloss: 3.2825 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.53 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 1.0775 [37mglobal_avg_top_loss: 2.2050
+[titan] 2025-09-09 08:34:39,696 - root - INFO - [34mlr: 1.0237e-05 gnorm: 0.36 [35m[1 day, 14:59:12<1 day, 10:19:51][39m
+[titan] 2025-09-09 08:35:11,643 - root - INFO - [31mstep: 21275 [32mloss: 2.7718 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.84 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7955 [37mglobal_avg_top_loss: 1.9763
+[titan] 2025-09-09 08:35:11,643 - root - INFO - [34mlr: 1.0233e-05 gnorm: 0.34 [35m[1 day, 14:59:44<1 day, 10:19:17][39m
+[titan] 2025-09-09 08:35:43,579 - root - INFO - [31mstep: 21280 [32mloss: 2.7849 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.02 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8014 [37mglobal_avg_top_loss: 1.9835
+[titan] 2025-09-09 08:35:43,579 - root - INFO - [34mlr: 1.0229e-05 gnorm: 0.38 [35m[1 day, 15:00:16<1 day, 10:18:43][39m
+[titan] 2025-09-09 08:36:15,843 - root - INFO - [31mstep: 21285 [32mloss: 2.8401 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,157 [36mtflops: 484.06 [35mmfu: 48.94%[39m [37mglobal_avg_ntp_loss: 0.8238 [37mglobal_avg_top_loss: 2.0163
+[titan] 2025-09-09 08:36:15,843 - root - INFO - [34mlr: 1.0226e-05 gnorm: 0.42 [35m[1 day, 15:00:48<1 day, 10:18:10][39m
+[titan] 2025-09-09 08:36:47,470 - root - INFO - [31mstep: 21290 [32mloss: 3.1267 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,361 [36mtflops: 493.80 [35mmfu: 49.93%[39m [37mglobal_avg_ntp_loss: 0.9904 [37mglobal_avg_top_loss: 2.1363
+[titan] 2025-09-09 08:36:47,470 - root - INFO - [34mlr: 1.0222e-05 gnorm: 0.48 [35m[1 day, 15:01:20<1 day, 10:17:36][39m
+[titan] 2025-09-09 08:37:19,400 - root - INFO - [31mstep: 21295 [32mloss: 2.7485 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.11 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7810 [37mglobal_avg_top_loss: 1.9674
+[titan] 2025-09-09 08:37:19,400 - root - INFO - [34mlr: 1.0219e-05 gnorm: 0.34 [35m[1 day, 15:01:51<1 day, 10:17:02][39m
+[titan] 2025-09-09 08:37:44,909 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:37:51,307 - root - INFO - [31mstep: 21300 [32mloss: 2.6898 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.46 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7528 [37mglobal_avg_top_loss: 1.9370
+[titan] 2025-09-09 08:37:51,308 - root - INFO - [34mlr: 1.0215e-05 gnorm: 0.63 [35m[1 day, 15:02:23<1 day, 10:16:28][39m
+[titan] 2025-09-09 08:38:23,328 - root - INFO - [31mstep: 21305 [32mloss: 3.2138 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.73 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 1.0453 [37mglobal_avg_top_loss: 2.1684
+[titan] 2025-09-09 08:38:23,328 - root - INFO - [34mlr: 1.0212e-05 gnorm: 0.33 [35m[1 day, 15:02:55<1 day, 10:15:54][39m
+[titan] 2025-09-09 08:38:55,457 - root - INFO - [31mstep: 21310 [32mloss: 2.6523 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,199 [36mtflops: 486.08 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7390 [37mglobal_avg_top_loss: 1.9133
+[titan] 2025-09-09 08:38:55,458 - root - INFO - [34mlr: 1.0208e-05 gnorm: 0.33 [35m[1 day, 15:03:28<1 day, 10:15:20][39m
+[titan] 2025-09-09 08:39:27,268 - root - INFO - [31mstep: 21315 [32mloss: 2.7725 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,301 [36mtflops: 490.95 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7902 [37mglobal_avg_top_loss: 1.9823
+[titan] 2025-09-09 08:39:27,268 - root - INFO - [34mlr: 1.0205e-05 gnorm: 0.38 [35m[1 day, 15:03:59<1 day, 10:14:46][39m
+[titan] 2025-09-09 08:39:59,304 - root - INFO - [31mstep: 21320 [32mloss: 3.2155 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.49 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 1.0474 [37mglobal_avg_top_loss: 2.1681
+[titan] 2025-09-09 08:39:59,305 - root - INFO - [34mlr: 1.0201e-05 gnorm: 0.37 [35m[1 day, 15:04:31<1 day, 10:14:12][39m
+[titan] 2025-09-09 08:40:31,203 - root - INFO - [31mstep: 21325 [32mloss: 2.7910 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.59 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8014 [37mglobal_avg_top_loss: 1.9895
+[titan] 2025-09-09 08:40:31,204 - root - INFO - [34mlr: 1.0197e-05 gnorm: 0.35 [35m[1 day, 15:05:03<1 day, 10:13:38][39m
+[titan] 2025-09-09 08:41:02,867 - root - INFO - [31mstep: 21330 [32mloss: 2.7792 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,349 [36mtflops: 493.22 [35mmfu: 49.87%[39m [37mglobal_avg_ntp_loss: 0.7908 [37mglobal_avg_top_loss: 1.9884
+[titan] 2025-09-09 08:41:02,868 - root - INFO - [34mlr: 1.0194e-05 gnorm: 0.75 [35m[1 day, 15:05:35<1 day, 10:13:04][39m
+[titan] 2025-09-09 08:41:34,736 - root - INFO - [31mstep: 21335 [32mloss: 2.8769 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.05 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.8415 [37mglobal_avg_top_loss: 2.0354
+[titan] 2025-09-09 08:41:34,737 - root - INFO - [34mlr: 1.0190e-05 gnorm: 0.36 [35m[1 day, 15:06:07<1 day, 10:12:30][39m
+[titan] 2025-09-09 08:42:06,671 - root - INFO - [31mstep: 21340 [32mloss: 2.8258 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.04 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8173 [37mglobal_avg_top_loss: 2.0085
+[titan] 2025-09-09 08:42:06,672 - root - INFO - [34mlr: 1.0187e-05 gnorm: 0.35 [35m[1 day, 15:06:39<1 day, 10:11:56][39m
+[titan] 2025-09-09 08:42:39,089 - root - INFO - [31mstep: 21345 [32mloss: 2.7732 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,109 [36mtflops: 481.76 [35mmfu: 48.71%[39m [37mglobal_avg_ntp_loss: 0.7933 [37mglobal_avg_top_loss: 1.9799
+[titan] 2025-09-09 08:42:39,089 - root - INFO - [34mlr: 1.0183e-05 gnorm: 0.35 [35m[1 day, 15:07:11<1 day, 10:11:23][39m
+[titan] 2025-09-09 08:43:04,629 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:43:11,000 - root - INFO - [31mstep: 21350 [32mloss: 3.1695 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.41 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 1.0284 [37mglobal_avg_top_loss: 2.1411
+[titan] 2025-09-09 08:43:11,000 - root - INFO - [34mlr: 1.0180e-05 gnorm: 0.44 [35m[1 day, 15:07:43<1 day, 10:10:49][39m
+[titan] 2025-09-09 08:43:43,087 - root - INFO - [31mstep: 21355 [32mloss: 2.8161 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.71 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.8140 [37mglobal_avg_top_loss: 2.0022
+[titan] 2025-09-09 08:43:43,088 - root - INFO - [34mlr: 1.0176e-05 gnorm: 0.34 [35m[1 day, 15:08:15<1 day, 10:10:15][39m
+[titan] 2025-09-09 08:44:15,025 - root - INFO - [31mstep: 21360 [32mloss: 2.8946 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,260 [36mtflops: 488.99 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8617 [37mglobal_avg_top_loss: 2.0329
+[titan] 2025-09-09 08:44:15,026 - root - INFO - [34mlr: 1.0173e-05 gnorm: 0.53 [35m[1 day, 15:08:47<1 day, 10:09:41][39m
+[titan] 2025-09-09 08:44:47,038 - root - INFO - [31mstep: 21365 [32mloss: 2.8272 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.86 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.8180 [37mglobal_avg_top_loss: 2.0092
+[titan] 2025-09-09 08:44:47,038 - root - INFO - [34mlr: 1.0169e-05 gnorm: 0.37 [35m[1 day, 15:09:19<1 day, 10:09:07][39m
+[titan] 2025-09-09 08:45:19,179 - root - INFO - [31mstep: 21370 [32mloss: 2.7363 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,195 [36mtflops: 485.91 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7739 [37mglobal_avg_top_loss: 1.9625
+[titan] 2025-09-09 08:45:19,179 - root - INFO - [34mlr: 1.0165e-05 gnorm: 0.58 [35m[1 day, 15:09:51<1 day, 10:08:34][39m
+[titan] 2025-09-09 08:45:51,007 - root - INFO - [31mstep: 21375 [32mloss: 2.6926 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.67 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7562 [37mglobal_avg_top_loss: 1.9364
+[titan] 2025-09-09 08:45:51,008 - root - INFO - [34mlr: 1.0162e-05 gnorm: 0.35 [35m[1 day, 15:10:23<1 day, 10:08:00][39m
+[titan] 2025-09-09 08:46:22,977 - root - INFO - [31mstep: 21380 [32mloss: 3.1665 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.50 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 1.0248 [37mglobal_avg_top_loss: 2.1417
+[titan] 2025-09-09 08:46:22,978 - root - INFO - [34mlr: 1.0158e-05 gnorm: 0.38 [35m[1 day, 15:10:55<1 day, 10:07:26][39m
+[titan] 2025-09-09 08:46:54,724 - root - INFO - [31mstep: 21385 [32mloss: 3.1693 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,322 [36mtflops: 491.94 [35mmfu: 49.74%[39m [37mglobal_avg_ntp_loss: 1.0278 [37mglobal_avg_top_loss: 2.1416
+[titan] 2025-09-09 08:46:54,725 - root - INFO - [34mlr: 1.0155e-05 gnorm: 0.38 [35m[1 day, 15:11:27<1 day, 10:06:52][39m
+[titan] 2025-09-09 08:47:26,715 - root - INFO - [31mstep: 21390 [32mloss: 2.7132 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.18 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7658 [37mglobal_avg_top_loss: 1.9475
+[titan] 2025-09-09 08:47:26,716 - root - INFO - [34mlr: 1.0151e-05 gnorm: 0.35 [35m[1 day, 15:11:59<1 day, 10:06:18][39m
+[titan] 2025-09-09 08:47:58,747 - root - INFO - [31mstep: 21395 [32mloss: 2.8333 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.57 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.8238 [37mglobal_avg_top_loss: 2.0096
+[titan] 2025-09-09 08:47:58,747 - root - INFO - [34mlr: 1.0148e-05 gnorm: 0.38 [35m[1 day, 15:12:31<1 day, 10:05:44][39m
+[titan] 2025-09-09 08:48:24,378 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:48:30,698 - root - INFO - [31mstep: 21400 [32mloss: 3.2217 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.79 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 1.0531 [37mglobal_avg_top_loss: 2.1686
+[titan] 2025-09-09 08:48:30,698 - root - INFO - [34mlr: 1.0144e-05 gnorm: 0.49 [35m[1 day, 15:13:03<1 day, 10:05:10][39m
+[titan] 2025-09-09 08:49:02,771 - root - INFO - [31mstep: 21405 [32mloss: 2.7985 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.93 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.8028 [37mglobal_avg_top_loss: 1.9957
+[titan] 2025-09-09 08:49:02,772 - root - INFO - [34mlr: 1.0141e-05 gnorm: 0.38 [35m[1 day, 15:13:35<1 day, 10:04:36][39m
+[titan] 2025-09-09 08:49:34,717 - root - INFO - [31mstep: 21410 [32mloss: 2.7025 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.88 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7627 [37mglobal_avg_top_loss: 1.9398
+[titan] 2025-09-09 08:49:34,717 - root - INFO - [34mlr: 1.0137e-05 gnorm: 0.39 [35m[1 day, 15:14:07<1 day, 10:04:02][39m
+[titan] 2025-09-09 08:50:06,822 - root - INFO - [31mstep: 21415 [32mloss: 3.2043 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.44 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 1.0499 [37mglobal_avg_top_loss: 2.1544
+[titan] 2025-09-09 08:50:06,823 - root - INFO - [34mlr: 1.0134e-05 gnorm: 0.41 [35m[1 day, 15:14:39<1 day, 10:03:29][39m
+[titan] 2025-09-09 08:50:38,964 - root - INFO - [31mstep: 21420 [32mloss: 2.7892 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,195 [36mtflops: 485.90 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.8005 [37mglobal_avg_top_loss: 1.9887
+[titan] 2025-09-09 08:50:38,964 - root - INFO - [34mlr: 1.0130e-05 gnorm: 0.41 [35m[1 day, 15:15:11<1 day, 10:02:55][39m
+[titan] 2025-09-09 08:51:10,696 - root - INFO - [31mstep: 21425 [32mloss: 2.8082 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,327 [36mtflops: 492.17 [35mmfu: 49.76%[39m [37mglobal_avg_ntp_loss: 0.8092 [37mglobal_avg_top_loss: 1.9990
+[titan] 2025-09-09 08:51:10,696 - root - INFO - [34mlr: 1.0126e-05 gnorm: 0.33 [35m[1 day, 15:15:43<1 day, 10:02:21][39m
+[titan] 2025-09-09 08:51:42,687 - root - INFO - [31mstep: 21430 [32mloss: 3.2729 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.18 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 1.0723 [37mglobal_avg_top_loss: 2.2005
+[titan] 2025-09-09 08:51:42,687 - root - INFO - [34mlr: 1.0123e-05 gnorm: 0.44 [35m[1 day, 15:16:15<1 day, 10:01:47][39m
+[titan] 2025-09-09 08:52:14,594 - root - INFO - [31mstep: 21435 [32mloss: 2.9409 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.47 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.8806 [37mglobal_avg_top_loss: 2.0603
+[titan] 2025-09-09 08:52:14,595 - root - INFO - [34mlr: 1.0119e-05 gnorm: 0.36 [35m[1 day, 15:16:47<1 day, 10:01:13][39m
+[titan] 2025-09-09 08:52:46,674 - root - INFO - [31mstep: 21440 [32mloss: 2.7742 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,215 [36mtflops: 486.83 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7939 [37mglobal_avg_top_loss: 1.9803
+[titan] 2025-09-09 08:52:46,675 - root - INFO - [34mlr: 1.0116e-05 gnorm: 0.34 [35m[1 day, 15:17:19<1 day, 10:00:39][39m
+[titan] 2025-09-09 08:53:18,594 - root - INFO - [31mstep: 21445 [32mloss: 2.9764 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.27 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.9063 [37mglobal_avg_top_loss: 2.0701
+[titan] 2025-09-09 08:53:18,594 - root - INFO - [34mlr: 1.0112e-05 gnorm: 0.37 [35m[1 day, 15:17:51<1 day, 10:00:05][39m
+[titan] 2025-09-09 08:53:44,001 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:53:50,615 - root - INFO - [31mstep: 21450 [32mloss: 2.8811 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.72 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.8518 [37mglobal_avg_top_loss: 2.0293
+[titan] 2025-09-09 08:53:50,616 - root - INFO - [34mlr: 1.0109e-05 gnorm: 0.59 [35m[1 day, 15:18:23<1 day, 9:59:32][39m
+[titan] 2025-09-09 08:54:23,030 - root - INFO - [31mstep: 21455 [32mloss: 2.7564 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,109 [36mtflops: 481.79 [35mmfu: 48.72%[39m [37mglobal_avg_ntp_loss: 0.7852 [37mglobal_avg_top_loss: 1.9713
+[titan] 2025-09-09 08:54:23,031 - root - INFO - [34mlr: 1.0105e-05 gnorm: 0.35 [35m[1 day, 15:18:55<1 day, 9:58:58][39m
+[titan] 2025-09-09 08:54:55,019 - root - INFO - [31mstep: 21460 [32mloss: 2.7226 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.22 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7708 [37mglobal_avg_top_loss: 1.9519
+[titan] 2025-09-09 08:54:55,019 - root - INFO - [34mlr: 1.0102e-05 gnorm: 0.34 [35m[1 day, 15:19:27<1 day, 9:58:24][39m
+[titan] 2025-09-09 08:55:27,041 - root - INFO - [31mstep: 21465 [32mloss: 2.7373 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.71 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7844 [37mglobal_avg_top_loss: 1.9530
+[titan] 2025-09-09 08:55:27,041 - root - INFO - [34mlr: 1.0098e-05 gnorm: 0.33 [35m[1 day, 15:19:59<1 day, 9:57:51][39m
+[titan] 2025-09-09 08:55:59,128 - root - INFO - [31mstep: 21470 [32mloss: 2.7267 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.72 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7742 [37mglobal_avg_top_loss: 1.9524
+[titan] 2025-09-09 08:55:59,128 - root - INFO - [34mlr: 1.0094e-05 gnorm: 0.34 [35m[1 day, 15:20:31<1 day, 9:57:17][39m
+[titan] 2025-09-09 08:56:30,924 - root - INFO - [31mstep: 21475 [32mloss: 2.7791 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,306 [36mtflops: 491.18 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7971 [37mglobal_avg_top_loss: 1.9820
+[titan] 2025-09-09 08:56:30,924 - root - INFO - [34mlr: 1.0091e-05 gnorm: 0.33 [35m[1 day, 15:21:03<1 day, 9:56:43][39m
+[titan] 2025-09-09 08:57:03,162 - root - INFO - [31mstep: 21480 [32mloss: 3.2365 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,165 [36mtflops: 484.44 [35mmfu: 48.98%[39m [37mglobal_avg_ntp_loss: 1.0599 [37mglobal_avg_top_loss: 2.1767
+[titan] 2025-09-09 08:57:03,162 - root - INFO - [34mlr: 1.0087e-05 gnorm: 0.36 [35m[1 day, 15:21:35<1 day, 9:56:09][39m
+[titan] 2025-09-09 08:57:35,295 - root - INFO - [31mstep: 21485 [32mloss: 2.8410 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,198 [36mtflops: 486.02 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.8233 [37mglobal_avg_top_loss: 2.0177
+[titan] 2025-09-09 08:57:35,295 - root - INFO - [34mlr: 1.0084e-05 gnorm: 0.33 [35m[1 day, 15:22:07<1 day, 9:55:35][39m
+[titan] 2025-09-09 08:58:07,572 - root - INFO - [31mstep: 21490 [32mloss: 2.7830 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,152 [36mtflops: 483.85 [35mmfu: 48.92%[39m [37mglobal_avg_ntp_loss: 0.8010 [37mglobal_avg_top_loss: 1.9821
+[titan] 2025-09-09 08:58:07,573 - root - INFO - [34mlr: 1.0080e-05 gnorm: 0.39 [35m[1 day, 15:22:40<1 day, 9:55:02][39m
+[titan] 2025-09-09 08:58:39,731 - root - INFO - [31mstep: 21495 [32mloss: 3.1936 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,190 [36mtflops: 485.64 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 1.0459 [37mglobal_avg_top_loss: 2.1477
+[titan] 2025-09-09 08:58:39,731 - root - INFO - [34mlr: 1.0077e-05 gnorm: 0.45 [35m[1 day, 15:23:12<1 day, 9:54:28][39m
+[titan] 2025-09-09 08:59:05,215 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 08:59:11,695 - root - INFO - [31mstep: 21500 [32mloss: 2.7727 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.60 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7926 [37mglobal_avg_top_loss: 1.9801
+[titan] 2025-09-09 08:59:11,695 - root - INFO - [34mlr: 1.0073e-05 gnorm: 0.34 [35m[1 day, 15:23:44<1 day, 9:53:54][39m
+[titan] 2025-09-09 08:59:37,553 - root - INFO - Dumping profiler traces at step 21504
+[titan] 2025-09-09 08:59:37,604 - root - INFO - Finished dumping profiler traces in 0.05 seconds
+[titan] 2025-09-09 08:59:43,999 - root - INFO - [31mstep: 21505 [32mloss: 2.8412 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,144 [36mtflops: 483.45 [35mmfu: 48.88%[39m [37mglobal_avg_ntp_loss: 0.8224 [37mglobal_avg_top_loss: 2.0188
+[titan] 2025-09-09 08:59:43,999 - root - INFO - [34mlr: 1.0070e-05 gnorm: 0.35 [35m[1 day, 15:24:16<1 day, 9:53:21][39m
+[titan] 2025-09-09 09:00:16,226 - root - INFO - [31mstep: 21510 [32mloss: 3.1718 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,168 [36mtflops: 484.60 [35mmfu: 49.00%[39m [37mglobal_avg_ntp_loss: 1.0284 [37mglobal_avg_top_loss: 2.1434
+[titan] 2025-09-09 09:00:16,227 - root - INFO - [34mlr: 1.0066e-05 gnorm: 0.53 [35m[1 day, 15:24:48<1 day, 9:52:47][39m
+[titan] 2025-09-09 09:00:48,288 - root - INFO - [31mstep: 21515 [32mloss: 2.8465 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.11 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.8217 [37mglobal_avg_top_loss: 2.0248
+[titan] 2025-09-09 09:00:48,289 - root - INFO - [34mlr: 1.0062e-05 gnorm: 0.36 [35m[1 day, 15:25:20<1 day, 9:52:13][39m
+[titan] 2025-09-09 09:01:20,370 - root - INFO - [31mstep: 21520 [32mloss: 2.7681 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,214 [36mtflops: 486.80 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7901 [37mglobal_avg_top_loss: 1.9780
+[titan] 2025-09-09 09:01:20,370 - root - INFO - [34mlr: 1.0059e-05 gnorm: 0.36 [35m[1 day, 15:25:52<1 day, 9:51:40][39m
+[titan] 2025-09-09 09:01:52,291 - root - INFO - [31mstep: 21525 [32mloss: 2.8162 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.25 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.8160 [37mglobal_avg_top_loss: 2.0002
+[titan] 2025-09-09 09:01:52,292 - root - INFO - [34mlr: 1.0055e-05 gnorm: 0.38 [35m[1 day, 15:26:24<1 day, 9:51:06][39m
+[titan] 2025-09-09 09:02:24,640 - root - INFO - [31mstep: 21530 [32mloss: 3.1600 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,130 [36mtflops: 482.78 [35mmfu: 48.82%[39m [37mglobal_avg_ntp_loss: 1.0201 [37mglobal_avg_top_loss: 2.1399
+[titan] 2025-09-09 09:02:24,641 - root - INFO - [34mlr: 1.0052e-05 gnorm: 0.34 [35m[1 day, 15:26:57<1 day, 9:50:32][39m
+[titan] 2025-09-09 09:02:56,773 - root - INFO - [31mstep: 21535 [32mloss: 2.8205 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,198 [36mtflops: 486.03 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.8149 [37mglobal_avg_top_loss: 2.0056
+[titan] 2025-09-09 09:02:56,773 - root - INFO - [34mlr: 1.0048e-05 gnorm: 0.34 [35m[1 day, 15:27:29<1 day, 9:49:58][39m
+[titan] 2025-09-09 09:03:28,861 - root - INFO - [31mstep: 21540 [32mloss: 2.8180 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.70 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.8195 [37mglobal_avg_top_loss: 1.9985
+[titan] 2025-09-09 09:03:28,862 - root - INFO - [34mlr: 1.0045e-05 gnorm: 0.34 [35m[1 day, 15:28:01<1 day, 9:49:25][39m
+[titan] 2025-09-09 09:04:00,701 - root - INFO - [31mstep: 21545 [32mloss: 2.6384 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.50 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7275 [37mglobal_avg_top_loss: 1.9108
+[titan] 2025-09-09 09:04:00,701 - root - INFO - [34mlr: 1.0041e-05 gnorm: 0.46 [35m[1 day, 15:28:33<1 day, 9:48:51][39m
+[titan] 2025-09-09 09:04:26,260 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:04:32,679 - root - INFO - [31mstep: 21550 [32mloss: 2.8645 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.38 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.8352 [37mglobal_avg_top_loss: 2.0293
+[titan] 2025-09-09 09:04:32,679 - root - INFO - [34mlr: 1.0038e-05 gnorm: 0.34 [35m[1 day, 15:29:05<1 day, 9:48:17][39m
+[titan] 2025-09-09 09:05:04,604 - root - INFO - [31mstep: 21555 [32mloss: 2.7747 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.20 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.8015 [37mglobal_avg_top_loss: 1.9732
+[titan] 2025-09-09 09:05:04,604 - root - INFO - [34mlr: 1.0034e-05 gnorm: 0.34 [35m[1 day, 15:29:37<1 day, 9:47:43][39m
+[titan] 2025-09-09 09:05:36,652 - root - INFO - [31mstep: 21560 [32mloss: 3.2656 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.31 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 1.0721 [37mglobal_avg_top_loss: 2.1935
+[titan] 2025-09-09 09:05:36,652 - root - INFO - [34mlr: 1.0031e-05 gnorm: 0.37 [35m[1 day, 15:30:09<1 day, 9:47:09][39m
+[titan] 2025-09-09 09:06:08,695 - root - INFO - [31mstep: 21565 [32mloss: 2.8106 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.39 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.8015 [37mglobal_avg_top_loss: 2.0091
+[titan] 2025-09-09 09:06:08,695 - root - INFO - [34mlr: 1.0027e-05 gnorm: 0.47 [35m[1 day, 15:30:41<1 day, 9:46:35][39m
+[titan] 2025-09-09 09:06:40,846 - root - INFO - [31mstep: 21570 [32mloss: 2.7767 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,192 [36mtflops: 485.75 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7938 [37mglobal_avg_top_loss: 1.9830
+[titan] 2025-09-09 09:06:40,846 - root - INFO - [34mlr: 1.0023e-05 gnorm: 0.33 [35m[1 day, 15:31:13<1 day, 9:46:02][39m
+[titan] 2025-09-09 09:07:12,820 - root - INFO - [31mstep: 21575 [32mloss: 3.2102 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.43 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 1.0477 [37mglobal_avg_top_loss: 2.1625
+[titan] 2025-09-09 09:07:12,821 - root - INFO - [34mlr: 1.0020e-05 gnorm: 0.36 [35m[1 day, 15:31:45<1 day, 9:45:28][39m
+[titan] 2025-09-09 09:07:44,834 - root - INFO - [31mstep: 21580 [32mloss: 2.8018 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.84 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7931 [37mglobal_avg_top_loss: 2.0087
+[titan] 2025-09-09 09:07:44,834 - root - INFO - [34mlr: 1.0016e-05 gnorm: 1.13 [35m[1 day, 15:32:17<1 day, 9:44:54][39m
+[titan] 2025-09-09 09:08:16,906 - root - INFO - [31mstep: 21585 [32mloss: 2.8776 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.96 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.8386 [37mglobal_avg_top_loss: 2.0390
+[titan] 2025-09-09 09:08:16,906 - root - INFO - [34mlr: 1.0013e-05 gnorm: 0.37 [35m[1 day, 15:32:49<1 day, 9:44:20][39m
+[titan] 2025-09-09 09:08:48,956 - root - INFO - [31mstep: 21590 [32mloss: 2.7611 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.27 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7877 [37mglobal_avg_top_loss: 1.9734
+[titan] 2025-09-09 09:08:48,957 - root - INFO - [34mlr: 1.0009e-05 gnorm: 0.35 [35m[1 day, 15:33:21<1 day, 9:43:47][39m
+[titan] 2025-09-09 09:09:20,964 - root - INFO - [31mstep: 21595 [32mloss: 2.6858 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.93 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7556 [37mglobal_avg_top_loss: 1.9303
+[titan] 2025-09-09 09:09:20,964 - root - INFO - [34mlr: 1.0006e-05 gnorm: 0.34 [35m[1 day, 15:33:53<1 day, 9:43:13][39m
+[titan] 2025-09-09 09:09:46,585 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:09:53,062 - root - INFO - [31mstep: 21600 [32mloss: 2.7319 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.55 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7754 [37mglobal_avg_top_loss: 1.9565
+[titan] 2025-09-09 09:09:53,063 - root - INFO - [34mlr: 1.0002e-05 gnorm: 0.34 [35m[1 day, 15:34:25<1 day, 9:42:39][39m
+[titan] 2025-09-09 09:10:25,171 - root - INFO - [31mstep: 21605 [32mloss: 2.8199 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.40 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.8112 [37mglobal_avg_top_loss: 2.0086
+[titan] 2025-09-09 09:10:25,171 - root - INFO - [34mlr: 9.9986e-06 gnorm: 0.34 [35m[1 day, 15:34:57<1 day, 9:42:05][39m
+[titan] 2025-09-09 09:10:57,199 - root - INFO - [31mstep: 21610 [32mloss: 3.2078 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,231 [36mtflops: 487.62 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 1.0454 [37mglobal_avg_top_loss: 2.1623
+[titan] 2025-09-09 09:10:57,199 - root - INFO - [34mlr: 9.9950e-06 gnorm: 0.34 [35m[1 day, 15:35:29<1 day, 9:41:32][39m
+[titan] 2025-09-09 09:11:29,325 - root - INFO - [31mstep: 21615 [32mloss: 2.8016 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,200 [36mtflops: 486.13 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.8056 [37mglobal_avg_top_loss: 1.9959
+[titan] 2025-09-09 09:11:29,325 - root - INFO - [34mlr: 9.9915e-06 gnorm: 0.36 [35m[1 day, 15:36:01<1 day, 9:40:58][39m
+[titan] 2025-09-09 09:12:01,269 - root - INFO - [31mstep: 21620 [32mloss: 2.7403 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.90 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7775 [37mglobal_avg_top_loss: 1.9628
+[titan] 2025-09-09 09:12:01,270 - root - INFO - [34mlr: 9.9879e-06 gnorm: 0.36 [35m[1 day, 15:36:33<1 day, 9:40:24][39m
+[titan] 2025-09-09 09:12:33,188 - root - INFO - [31mstep: 21625 [32mloss: 2.7520 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.29 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7841 [37mglobal_avg_top_loss: 1.9678
+[titan] 2025-09-09 09:12:33,188 - root - INFO - [34mlr: 9.9844e-06 gnorm: 0.35 [35m[1 day, 15:37:05<1 day, 9:39:50][39m
+[titan] 2025-09-09 09:13:04,993 - root - INFO - [31mstep: 21630 [32mloss: 2.7973 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.04 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.8051 [37mglobal_avg_top_loss: 1.9922
+[titan] 2025-09-09 09:13:04,993 - root - INFO - [34mlr: 9.9808e-06 gnorm: 0.34 [35m[1 day, 15:37:37<1 day, 9:39:16][39m
+[titan] 2025-09-09 09:13:37,258 - root - INFO - [31mstep: 21635 [32mloss: 2.8207 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,156 [36mtflops: 484.04 [35mmfu: 48.94%[39m [37mglobal_avg_ntp_loss: 0.8149 [37mglobal_avg_top_loss: 2.0058
+[titan] 2025-09-09 09:13:37,258 - root - INFO - [34mlr: 9.9773e-06 gnorm: 0.33 [35m[1 day, 15:38:09<1 day, 9:38:43][39m
+[titan] 2025-09-09 09:14:09,247 - root - INFO - [31mstep: 21640 [32mloss: 3.2058 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.21 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 1.0431 [37mglobal_avg_top_loss: 2.1627
+[titan] 2025-09-09 09:14:09,247 - root - INFO - [34mlr: 9.9737e-06 gnorm: 0.35 [35m[1 day, 15:38:41<1 day, 9:38:09][39m
+[titan] 2025-09-09 09:14:41,094 - root - INFO - [31mstep: 21645 [32mloss: 2.7891 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.40 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.8006 [37mglobal_avg_top_loss: 1.9886
+[titan] 2025-09-09 09:14:41,094 - root - INFO - [34mlr: 9.9702e-06 gnorm: 0.33 [35m[1 day, 15:39:13<1 day, 9:37:35][39m
+[titan] 2025-09-09 09:15:06,673 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:15:13,183 - root - INFO - [31mstep: 21650 [32mloss: 2.8429 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.69 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.8245 [37mglobal_avg_top_loss: 2.0184
+[titan] 2025-09-09 09:15:13,183 - root - INFO - [34mlr: 9.9667e-06 gnorm: 0.34 [35m[1 day, 15:39:45<1 day, 9:37:01][39m
+[titan] 2025-09-09 09:15:45,108 - root - INFO - [31mstep: 21655 [32mloss: 2.7378 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.18 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7868 [37mglobal_avg_top_loss: 1.9509
+[titan] 2025-09-09 09:15:45,109 - root - INFO - [34mlr: 9.9631e-06 gnorm: 0.34 [35m[1 day, 15:40:17<1 day, 9:36:27][39m
+[titan] 2025-09-09 09:16:17,071 - root - INFO - [31mstep: 21660 [32mloss: 2.7385 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.62 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7806 [37mglobal_avg_top_loss: 1.9579
+[titan] 2025-09-09 09:16:17,071 - root - INFO - [34mlr: 9.9596e-06 gnorm: 0.33 [35m[1 day, 15:40:49<1 day, 9:35:53][39m
+[titan] 2025-09-09 09:16:48,949 - root - INFO - [31mstep: 21665 [32mloss: 2.7924 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.91 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.8005 [37mglobal_avg_top_loss: 1.9919
+[titan] 2025-09-09 09:16:48,950 - root - INFO - [34mlr: 9.9560e-06 gnorm: 0.35 [35m[1 day, 15:41:21<1 day, 9:35:19][39m
+[titan] 2025-09-09 09:17:20,890 - root - INFO - [31mstep: 21670 [32mloss: 2.8056 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.96 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8157 [37mglobal_avg_top_loss: 1.9900
+[titan] 2025-09-09 09:17:20,890 - root - INFO - [34mlr: 9.9525e-06 gnorm: 0.33 [35m[1 day, 15:41:53<1 day, 9:34:46][39m
+[titan] 2025-09-09 09:17:53,055 - root - INFO - [31mstep: 21675 [32mloss: 3.1147 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,188 [36mtflops: 485.54 [35mmfu: 49.09%[39m [37mglobal_avg_ntp_loss: 0.9845 [37mglobal_avg_top_loss: 2.1303
+[titan] 2025-09-09 09:17:53,055 - root - INFO - [34mlr: 9.9489e-06 gnorm: 0.42 [35m[1 day, 15:42:25<1 day, 9:34:12][39m
+[titan] 2025-09-09 09:18:25,060 - root - INFO - [31mstep: 21680 [32mloss: 2.8162 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.97 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.8134 [37mglobal_avg_top_loss: 2.0027
+[titan] 2025-09-09 09:18:25,060 - root - INFO - [34mlr: 9.9454e-06 gnorm: 0.33 [35m[1 day, 15:42:57<1 day, 9:33:38][39m
+[titan] 2025-09-09 09:18:57,218 - root - INFO - [31mstep: 21685 [32mloss: 2.7674 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,190 [36mtflops: 485.64 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.7897 [37mglobal_avg_top_loss: 1.9777
+[titan] 2025-09-09 09:18:57,219 - root - INFO - [34mlr: 9.9418e-06 gnorm: 0.34 [35m[1 day, 15:43:29<1 day, 9:33:04][39m
+[titan] 2025-09-09 09:19:29,260 - root - INFO - [31mstep: 21690 [32mloss: 3.2513 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.41 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 1.0636 [37mglobal_avg_top_loss: 2.1877
+[titan] 2025-09-09 09:19:29,261 - root - INFO - [34mlr: 9.9383e-06 gnorm: 0.36 [35m[1 day, 15:44:01<1 day, 9:32:31][39m
+[titan] 2025-09-09 09:20:01,389 - root - INFO - [31mstep: 21695 [32mloss: 2.7729 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,199 [36mtflops: 486.09 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7945 [37mglobal_avg_top_loss: 1.9784
+[titan] 2025-09-09 09:20:01,390 - root - INFO - [34mlr: 9.9347e-06 gnorm: 0.35 [35m[1 day, 15:44:33<1 day, 9:31:57][39m
+[titan] 2025-09-09 09:20:27,089 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:20:33,537 - root - INFO - [31mstep: 21700 [32mloss: 3.2302 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,193 [36mtflops: 485.80 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 1.0528 [37mglobal_avg_top_loss: 2.1775
+[titan] 2025-09-09 09:20:33,538 - root - INFO - [34mlr: 9.9312e-06 gnorm: 0.39 [35m[1 day, 15:45:06<1 day, 9:31:23][39m
+[titan] 2025-09-09 09:21:05,525 - root - INFO - [31mstep: 21705 [32mloss: 2.8747 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.23 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.8379 [37mglobal_avg_top_loss: 2.0368
+[titan] 2025-09-09 09:21:05,525 - root - INFO - [34mlr: 9.9277e-06 gnorm: 0.35 [35m[1 day, 15:45:37<1 day, 9:30:50][39m
+[titan] 2025-09-09 09:21:37,445 - root - INFO - [31mstep: 21710 [32mloss: 2.7781 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.27 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7988 [37mglobal_avg_top_loss: 1.9793
+[titan] 2025-09-09 09:21:37,445 - root - INFO - [34mlr: 9.9241e-06 gnorm: 0.34 [35m[1 day, 15:46:09<1 day, 9:30:16][39m
+[titan] 2025-09-09 09:22:09,456 - root - INFO - [31mstep: 21715 [32mloss: 2.7318 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.88 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7771 [37mglobal_avg_top_loss: 1.9547
+[titan] 2025-09-09 09:22:09,457 - root - INFO - [34mlr: 9.9206e-06 gnorm: 0.34 [35m[1 day, 15:46:41<1 day, 9:29:42][39m
+[titan] 2025-09-09 09:22:41,500 - root - INFO - [31mstep: 21720 [32mloss: 2.9010 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.38 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.8719 [37mglobal_avg_top_loss: 2.0291
+[titan] 2025-09-09 09:22:41,500 - root - INFO - [34mlr: 9.9170e-06 gnorm: 0.35 [35m[1 day, 15:47:13<1 day, 9:29:08][39m
+[titan] 2025-09-09 09:23:13,407 - root - INFO - [31mstep: 21725 [32mloss: 2.8291 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.47 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.8200 [37mglobal_avg_top_loss: 2.0091
+[titan] 2025-09-09 09:23:13,407 - root - INFO - [34mlr: 9.9135e-06 gnorm: 0.34 [35m[1 day, 15:47:45<1 day, 9:28:34][39m
+[titan] 2025-09-09 09:23:45,374 - root - INFO - [31mstep: 21730 [32mloss: 2.8107 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.55 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.8108 [37mglobal_avg_top_loss: 2.0000
+[titan] 2025-09-09 09:23:45,375 - root - INFO - [34mlr: 9.9099e-06 gnorm: 0.46 [35m[1 day, 15:48:17<1 day, 9:28:00][39m
+[titan] 2025-09-09 09:24:17,342 - root - INFO - [31mstep: 21735 [32mloss: 2.7608 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.54 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7838 [37mglobal_avg_top_loss: 1.9770
+[titan] 2025-09-09 09:24:17,342 - root - INFO - [34mlr: 9.9064e-06 gnorm: 0.33 [35m[1 day, 15:48:49<1 day, 9:27:27][39m
+[titan] 2025-09-09 09:24:49,459 - root - INFO - [31mstep: 21740 [32mloss: 2.6843 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,203 [36mtflops: 486.26 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.7508 [37mglobal_avg_top_loss: 1.9335
+[titan] 2025-09-09 09:24:49,460 - root - INFO - [34mlr: 9.9028e-06 gnorm: 0.33 [35m[1 day, 15:49:21<1 day, 9:26:53][39m
+[titan] 2025-09-09 09:25:21,362 - root - INFO - [31mstep: 21745 [32mloss: 2.7867 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.54 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7984 [37mglobal_avg_top_loss: 1.9883
+[titan] 2025-09-09 09:25:21,362 - root - INFO - [34mlr: 9.8993e-06 gnorm: 0.33 [35m[1 day, 15:49:53<1 day, 9:26:19][39m
+[titan] 2025-09-09 09:25:46,807 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:25:53,228 - root - INFO - [31mstep: 21750 [32mloss: 2.7778 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.09 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7930 [37mglobal_avg_top_loss: 1.9848
+[titan] 2025-09-09 09:25:53,228 - root - INFO - [34mlr: 9.8958e-06 gnorm: 0.35 [35m[1 day, 15:50:25<1 day, 9:25:45][39m
+[titan] 2025-09-09 09:26:25,445 - root - INFO - [31mstep: 21755 [32mloss: 3.2527 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,171 [36mtflops: 484.75 [35mmfu: 49.01%[39m [37mglobal_avg_ntp_loss: 1.0648 [37mglobal_avg_top_loss: 2.1878
+[titan] 2025-09-09 09:26:25,445 - root - INFO - [34mlr: 9.8922e-06 gnorm: 0.37 [35m[1 day, 15:50:57<1 day, 9:25:12][39m
+[titan] 2025-09-09 09:26:57,489 - root - INFO - [31mstep: 21760 [32mloss: 2.8124 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.38 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.8115 [37mglobal_avg_top_loss: 2.0009
+[titan] 2025-09-09 09:26:57,489 - root - INFO - [34mlr: 9.8887e-06 gnorm: 0.34 [35m[1 day, 15:51:29<1 day, 9:24:38][39m
+[titan] 2025-09-09 09:27:29,539 - root - INFO - [31mstep: 21765 [32mloss: 2.7269 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.29 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7721 [37mglobal_avg_top_loss: 1.9549
+[titan] 2025-09-09 09:27:29,539 - root - INFO - [34mlr: 9.8851e-06 gnorm: 0.33 [35m[1 day, 15:52:01<1 day, 9:24:04][39m
+[titan] 2025-09-09 09:28:01,435 - root - INFO - [31mstep: 21770 [32mloss: 3.2764 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.63 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 1.0767 [37mglobal_avg_top_loss: 2.1997
+[titan] 2025-09-09 09:28:01,435 - root - INFO - [34mlr: 9.8816e-06 gnorm: 0.36 [35m[1 day, 15:52:33<1 day, 9:23:30][39m
+[titan] 2025-09-09 09:28:33,547 - root - INFO - [31mstep: 21775 [32mloss: 2.8499 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.34 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.8312 [37mglobal_avg_top_loss: 2.0187
+[titan] 2025-09-09 09:28:33,547 - root - INFO - [34mlr: 9.8781e-06 gnorm: 0.34 [35m[1 day, 15:53:05<1 day, 9:22:57][39m
+[titan] 2025-09-09 09:29:05,222 - root - INFO - [31mstep: 21780 [32mloss: 2.7963 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,345 [36mtflops: 493.05 [35mmfu: 49.85%[39m [37mglobal_avg_ntp_loss: 0.8028 [37mglobal_avg_top_loss: 1.9935
+[titan] 2025-09-09 09:29:05,223 - root - INFO - [34mlr: 9.8745e-06 gnorm: 0.35 [35m[1 day, 15:53:37<1 day, 9:22:22][39m
+[titan] 2025-09-09 09:29:37,193 - root - INFO - [31mstep: 21785 [32mloss: 2.8062 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.50 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.8093 [37mglobal_avg_top_loss: 1.9969
+[titan] 2025-09-09 09:29:37,193 - root - INFO - [34mlr: 9.8710e-06 gnorm: 0.34 [35m[1 day, 15:54:09<1 day, 9:21:49][39m
+[titan] 2025-09-09 09:30:08,984 - root - INFO - [31mstep: 21790 [32mloss: 2.7962 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,308 [36mtflops: 491.25 [35mmfu: 49.67%[39m [37mglobal_avg_ntp_loss: 0.8009 [37mglobal_avg_top_loss: 1.9953
+[titan] 2025-09-09 09:30:08,984 - root - INFO - [34mlr: 9.8674e-06 gnorm: 0.35 [35m[1 day, 15:54:41<1 day, 9:21:15][39m
+[titan] 2025-09-09 09:30:40,810 - root - INFO - [31mstep: 21795 [32mloss: 2.7956 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.71 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.8059 [37mglobal_avg_top_loss: 1.9897
+[titan] 2025-09-09 09:30:40,810 - root - INFO - [34mlr: 9.8639e-06 gnorm: 0.32 [35m[1 day, 15:55:13<1 day, 9:20:41][39m
+[titan] 2025-09-09 09:31:06,355 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:31:12,680 - root - INFO - [31mstep: 21800 [32mloss: 2.7703 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.04 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7934 [37mglobal_avg_top_loss: 1.9769
+[titan] 2025-09-09 09:31:12,680 - root - INFO - [34mlr: 9.8603e-06 gnorm: 0.43 [35m[1 day, 15:55:45<1 day, 9:20:07][39m
+[titan] 2025-09-09 09:31:44,624 - root - INFO - [31mstep: 21805 [32mloss: 2.7569 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.90 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7871 [37mglobal_avg_top_loss: 1.9699
+[titan] 2025-09-09 09:31:44,624 - root - INFO - [34mlr: 9.8568e-06 gnorm: 0.32 [35m[1 day, 15:56:17<1 day, 9:19:33][39m
+[titan] 2025-09-09 09:32:16,374 - root - INFO - [31mstep: 21810 [32mloss: 2.7460 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,321 [36mtflops: 491.88 [35mmfu: 49.74%[39m [37mglobal_avg_ntp_loss: 0.7822 [37mglobal_avg_top_loss: 1.9639
+[titan] 2025-09-09 09:32:16,375 - root - INFO - [34mlr: 9.8533e-06 gnorm: 0.35 [35m[1 day, 15:56:48<1 day, 9:18:59][39m
+[titan] 2025-09-09 09:32:48,303 - root - INFO - [31mstep: 21815 [32mloss: 2.6446 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.14 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7350 [37mglobal_avg_top_loss: 1.9095
+[titan] 2025-09-09 09:32:48,304 - root - INFO - [34mlr: 9.8497e-06 gnorm: 0.33 [35m[1 day, 15:57:20<1 day, 9:18:25][39m
+[titan] 2025-09-09 09:33:20,058 - root - INFO - [31mstep: 21820 [32mloss: 2.8282 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,319 [36mtflops: 491.81 [35mmfu: 49.73%[39m [37mglobal_avg_ntp_loss: 0.8196 [37mglobal_avg_top_loss: 2.0086
+[titan] 2025-09-09 09:33:20,058 - root - INFO - [34mlr: 9.8462e-06 gnorm: 0.35 [35m[1 day, 15:57:52<1 day, 9:17:51][39m
+[titan] 2025-09-09 09:33:52,089 - root - INFO - [31mstep: 21825 [32mloss: 2.8589 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.57 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.8374 [37mglobal_avg_top_loss: 2.0214
+[titan] 2025-09-09 09:33:52,090 - root - INFO - [34mlr: 9.8426e-06 gnorm: 0.35 [35m[1 day, 15:58:24<1 day, 9:17:18][39m
+[titan] 2025-09-09 09:34:23,933 - root - INFO - [31mstep: 21830 [32mloss: 2.7610 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.44 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7878 [37mglobal_avg_top_loss: 1.9732
+[titan] 2025-09-09 09:34:23,934 - root - INFO - [34mlr: 9.8391e-06 gnorm: 0.33 [35m[1 day, 15:58:56<1 day, 9:16:44][39m
+[titan] 2025-09-09 09:34:56,009 - root - INFO - [31mstep: 21835 [32mloss: 3.2460 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.89 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 1.0598 [37mglobal_avg_top_loss: 2.1862
+[titan] 2025-09-09 09:34:56,010 - root - INFO - [34mlr: 9.8356e-06 gnorm: 0.36 [35m[1 day, 15:59:28<1 day, 9:16:10][39m
+[titan] 2025-09-09 09:35:27,984 - root - INFO - [31mstep: 21840 [32mloss: 2.8116 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.43 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.8206 [37mglobal_avg_top_loss: 1.9911
+[titan] 2025-09-09 09:35:27,984 - root - INFO - [34mlr: 9.8320e-06 gnorm: 0.33 [35m[1 day, 16:00:00<1 day, 9:15:36][39m
+[titan] 2025-09-09 09:35:59,850 - root - INFO - [31mstep: 21845 [32mloss: 2.8150 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.10 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.8118 [37mglobal_avg_top_loss: 2.0031
+[titan] 2025-09-09 09:35:59,850 - root - INFO - [34mlr: 9.8285e-06 gnorm: 0.34 [35m[1 day, 16:00:32<1 day, 9:15:02][39m
+[titan] 2025-09-09 09:36:25,266 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:36:31,658 - root - INFO - [31mstep: 21850 [32mloss: 3.1993 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,302 [36mtflops: 490.98 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 1.0417 [37mglobal_avg_top_loss: 2.1576
+[titan] 2025-09-09 09:36:31,658 - root - INFO - [34mlr: 9.8249e-06 gnorm: 0.40 [35m[1 day, 16:01:04<1 day, 9:14:28][39m
+[titan] 2025-09-09 09:37:03,729 - root - INFO - [31mstep: 21855 [32mloss: 2.8371 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.97 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.8225 [37mglobal_avg_top_loss: 2.0146
+[titan] 2025-09-09 09:37:03,729 - root - INFO - [34mlr: 9.8214e-06 gnorm: 0.33 [35m[1 day, 16:01:36<1 day, 9:13:55][39m
+[titan] 2025-09-09 09:37:35,660 - root - INFO - [31mstep: 21860 [32mloss: 2.8166 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.10 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8123 [37mglobal_avg_top_loss: 2.0043
+[titan] 2025-09-09 09:37:35,660 - root - INFO - [34mlr: 9.8179e-06 gnorm: 0.35 [35m[1 day, 16:02:08<1 day, 9:13:21][39m
+[titan] 2025-09-09 09:38:07,597 - root - INFO - [31mstep: 21865 [32mloss: 3.2084 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,260 [36mtflops: 489.00 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 1.0441 [37mglobal_avg_top_loss: 2.1643
+[titan] 2025-09-09 09:38:07,597 - root - INFO - [34mlr: 9.8143e-06 gnorm: 0.41 [35m[1 day, 16:02:40<1 day, 9:12:47][39m
+[titan] 2025-09-09 09:38:39,439 - root - INFO - [31mstep: 21870 [32mloss: 2.7741 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.47 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7944 [37mglobal_avg_top_loss: 1.9797
+[titan] 2025-09-09 09:38:39,439 - root - INFO - [34mlr: 9.8108e-06 gnorm: 0.33 [35m[1 day, 16:03:11<1 day, 9:12:13][39m
+[titan] 2025-09-09 09:39:11,570 - root - INFO - [31mstep: 21875 [32mloss: 2.7826 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,198 [36mtflops: 486.05 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7949 [37mglobal_avg_top_loss: 1.9877
+[titan] 2025-09-09 09:39:11,570 - root - INFO - [34mlr: 9.8072e-06 gnorm: 0.36 [35m[1 day, 16:03:43<1 day, 9:11:39][39m
+[titan] 2025-09-09 09:39:43,333 - root - INFO - [31mstep: 21880 [32mloss: 2.7948 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,316 [36mtflops: 491.68 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 0.8035 [37mglobal_avg_top_loss: 1.9913
+[titan] 2025-09-09 09:39:43,334 - root - INFO - [34mlr: 9.8037e-06 gnorm: 0.35 [35m[1 day, 16:04:15<1 day, 9:11:05][39m
+[titan] 2025-09-09 09:40:15,140 - root - INFO - [31mstep: 21885 [32mloss: 2.8201 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,302 [36mtflops: 491.01 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.8154 [37mglobal_avg_top_loss: 2.0047
+[titan] 2025-09-09 09:40:15,141 - root - INFO - [34mlr: 9.8002e-06 gnorm: 0.38 [35m[1 day, 16:04:47<1 day, 9:10:31][39m
+[titan] 2025-09-09 09:40:46,740 - root - INFO - [31mstep: 21890 [32mloss: 2.7615 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,370 [36mtflops: 494.23 [35mmfu: 49.97%[39m [37mglobal_avg_ntp_loss: 0.7901 [37mglobal_avg_top_loss: 1.9714
+[titan] 2025-09-09 09:40:46,740 - root - INFO - [34mlr: 9.7966e-06 gnorm: 0.34 [35m[1 day, 16:05:19<1 day, 9:09:57][39m
+[titan] 2025-09-09 09:41:18,569 - root - INFO - [31mstep: 21895 [32mloss: 2.8081 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.67 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.8092 [37mglobal_avg_top_loss: 1.9988
+[titan] 2025-09-09 09:41:18,569 - root - INFO - [34mlr: 9.7931e-06 gnorm: 0.32 [35m[1 day, 16:05:50<1 day, 9:09:23][39m
+[titan] 2025-09-09 09:41:43,980 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:41:50,384 - root - INFO - [31mstep: 21900 [32mloss: 2.7632 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.88 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7893 [37mglobal_avg_top_loss: 1.9739
+[titan] 2025-09-09 09:41:50,384 - root - INFO - [34mlr: 9.7896e-06 gnorm: 0.36 [35m[1 day, 16:06:22<1 day, 9:08:50][39m
+[titan] 2025-09-09 09:42:22,286 - root - INFO - [31mstep: 21905 [32mloss: 2.6902 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.55 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7564 [37mglobal_avg_top_loss: 1.9338
+[titan] 2025-09-09 09:42:22,286 - root - INFO - [34mlr: 9.7860e-06 gnorm: 0.34 [35m[1 day, 16:06:54<1 day, 9:08:16][39m
+[titan] 2025-09-09 09:42:54,215 - root - INFO - [31mstep: 21910 [32mloss: 2.8081 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.13 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.8064 [37mglobal_avg_top_loss: 2.0017
+[titan] 2025-09-09 09:42:54,215 - root - INFO - [34mlr: 9.7825e-06 gnorm: 0.32 [35m[1 day, 16:07:26<1 day, 9:07:42][39m
+[titan] 2025-09-09 09:43:26,138 - root - INFO - [31mstep: 21915 [32mloss: 3.2417 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.21 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 1.0618 [37mglobal_avg_top_loss: 2.1799
+[titan] 2025-09-09 09:43:26,139 - root - INFO - [34mlr: 9.7789e-06 gnorm: 0.38 [35m[1 day, 16:07:58<1 day, 9:07:08][39m
+[titan] 2025-09-09 09:43:57,814 - root - INFO - [31mstep: 21920 [32mloss: 2.8170 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,345 [36mtflops: 493.04 [35mmfu: 49.85%[39m [37mglobal_avg_ntp_loss: 0.8148 [37mglobal_avg_top_loss: 2.0022
+[titan] 2025-09-09 09:43:57,815 - root - INFO - [34mlr: 9.7754e-06 gnorm: 0.33 [35m[1 day, 16:08:30<1 day, 9:06:34][39m
+[titan] 2025-09-09 09:44:29,578 - root - INFO - [31mstep: 21925 [32mloss: 2.7314 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,316 [36mtflops: 491.67 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 0.7739 [37mglobal_avg_top_loss: 1.9575
+[titan] 2025-09-09 09:44:29,579 - root - INFO - [34mlr: 9.7719e-06 gnorm: 0.32 [35m[1 day, 16:09:01<1 day, 9:06:00][39m
+[titan] 2025-09-09 09:45:01,467 - root - INFO - [31mstep: 21930 [32mloss: 3.2231 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.75 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 1.0525 [37mglobal_avg_top_loss: 2.1706
+[titan] 2025-09-09 09:45:01,468 - root - INFO - [34mlr: 9.7683e-06 gnorm: 0.37 [35m[1 day, 16:09:33<1 day, 9:05:26][39m
+[titan] 2025-09-09 09:45:33,163 - root - INFO - [31mstep: 21935 [32mloss: 2.6797 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,339 [36mtflops: 492.73 [35mmfu: 49.82%[39m [37mglobal_avg_ntp_loss: 0.7534 [37mglobal_avg_top_loss: 1.9263
+[titan] 2025-09-09 09:45:33,164 - root - INFO - [34mlr: 9.7648e-06 gnorm: 0.32 [35m[1 day, 16:10:05<1 day, 9:04:52][39m
+[titan] 2025-09-09 09:46:04,882 - root - INFO - [31mstep: 21940 [32mloss: 2.6642 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,331 [36mtflops: 492.37 [35mmfu: 49.79%[39m [37mglobal_avg_ntp_loss: 0.7409 [37mglobal_avg_top_loss: 1.9233
+[titan] 2025-09-09 09:46:04,882 - root - INFO - [34mlr: 9.7613e-06 gnorm: 0.36 [35m[1 day, 16:10:37<1 day, 9:04:18][39m
+[titan] 2025-09-09 09:46:36,664 - root - INFO - [31mstep: 21945 [32mloss: 3.2113 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,311 [36mtflops: 491.40 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 1.0472 [37mglobal_avg_top_loss: 2.1641
+[titan] 2025-09-09 09:46:36,664 - root - INFO - [34mlr: 9.7577e-06 gnorm: 0.42 [35m[1 day, 16:11:09<1 day, 9:03:44][39m
+[titan] 2025-09-09 09:47:02,172 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:47:08,577 - root - INFO - [31mstep: 21950 [32mloss: 2.8186 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,268 [36mtflops: 489.38 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.8139 [37mglobal_avg_top_loss: 2.0048
+[titan] 2025-09-09 09:47:08,577 - root - INFO - [34mlr: 9.7542e-06 gnorm: 0.33 [35m[1 day, 16:11:40<1 day, 9:03:11][39m
+[titan] 2025-09-09 09:47:40,404 - root - INFO - [31mstep: 21955 [32mloss: 2.8415 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.69 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.8217 [37mglobal_avg_top_loss: 2.0198
+[titan] 2025-09-09 09:47:40,405 - root - INFO - [34mlr: 9.7507e-06 gnorm: 0.35 [35m[1 day, 16:12:12<1 day, 9:02:37][39m
+[titan] 2025-09-09 09:48:12,411 - root - INFO - [31mstep: 21960 [32mloss: 2.7756 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.95 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7947 [37mglobal_avg_top_loss: 1.9810
+[titan] 2025-09-09 09:48:12,411 - root - INFO - [34mlr: 9.7471e-06 gnorm: 0.40 [35m[1 day, 16:12:44<1 day, 9:02:03][39m
+[titan] 2025-09-09 09:48:44,356 - root - INFO - [31mstep: 21965 [32mloss: 2.7766 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.88 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7952 [37mglobal_avg_top_loss: 1.9814
+[titan] 2025-09-09 09:48:44,357 - root - INFO - [34mlr: 9.7436e-06 gnorm: 0.34 [35m[1 day, 16:13:16<1 day, 9:01:29][39m
+[titan] 2025-09-09 09:49:16,244 - root - INFO - [31mstep: 21970 [32mloss: 2.8347 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.77 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8210 [37mglobal_avg_top_loss: 2.0137
+[titan] 2025-09-09 09:49:16,244 - root - INFO - [34mlr: 9.7401e-06 gnorm: 0.33 [35m[1 day, 16:13:48<1 day, 9:00:55][39m
+[titan] 2025-09-09 09:49:48,135 - root - INFO - [31mstep: 21975 [32mloss: 2.8245 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.71 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8236 [37mglobal_avg_top_loss: 2.0009
+[titan] 2025-09-09 09:49:48,135 - root - INFO - [34mlr: 9.7365e-06 gnorm: 0.41 [35m[1 day, 16:14:20<1 day, 9:00:21][39m
+[titan] 2025-09-09 09:50:19,972 - root - INFO - [31mstep: 21980 [32mloss: 2.7653 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.54 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7893 [37mglobal_avg_top_loss: 1.9760
+[titan] 2025-09-09 09:50:19,973 - root - INFO - [34mlr: 9.7330e-06 gnorm: 0.32 [35m[1 day, 16:14:52<1 day, 8:59:48][39m
+[titan] 2025-09-09 09:50:51,599 - root - INFO - [31mstep: 21985 [32mloss: 2.7592 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,361 [36mtflops: 493.80 [35mmfu: 49.93%[39m [37mglobal_avg_ntp_loss: 0.7876 [37mglobal_avg_top_loss: 1.9716
+[titan] 2025-09-09 09:50:51,600 - root - INFO - [34mlr: 9.7294e-06 gnorm: 0.34 [35m[1 day, 16:15:24<1 day, 8:59:13][39m
+[titan] 2025-09-09 09:51:23,425 - root - INFO - [31mstep: 21990 [32mloss: 2.7158 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.71 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.7660 [37mglobal_avg_top_loss: 1.9497
+[titan] 2025-09-09 09:51:23,426 - root - INFO - [34mlr: 9.7259e-06 gnorm: 0.34 [35m[1 day, 16:15:55<1 day, 8:58:40][39m
+[titan] 2025-09-09 09:51:55,246 - root - INFO - [31mstep: 21995 [32mloss: 3.1904 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.79 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 1.0343 [37mglobal_avg_top_loss: 2.1560
+[titan] 2025-09-09 09:51:55,247 - root - INFO - [34mlr: 9.7224e-06 gnorm: 0.40 [35m[1 day, 16:16:27<1 day, 8:58:06][39m
+[titan] 2025-09-09 09:52:20,945 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:52:27,529 - root - INFO - [31mstep: 22000 [32mloss: 2.7668 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,151 [36mtflops: 483.77 [35mmfu: 48.91%[39m [37mglobal_avg_ntp_loss: 0.7902 [37mglobal_avg_top_loss: 1.9766
+[titan] 2025-09-09 09:52:27,530 - root - INFO - [34mlr: 9.7188e-06 gnorm: 0.35 [35m[1 day, 16:16:59<1 day, 8:57:32][39m
+[titan] 2025-09-09 09:52:59,568 - root - INFO - [31mstep: 22005 [32mloss: 2.7883 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.46 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.8005 [37mglobal_avg_top_loss: 1.9878
+[titan] 2025-09-09 09:52:59,568 - root - INFO - [34mlr: 9.7153e-06 gnorm: 0.33 [35m[1 day, 16:17:31<1 day, 8:56:58][39m
+[titan] 2025-09-09 09:53:31,448 - root - INFO - [31mstep: 22010 [32mloss: 2.7886 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.88 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.8015 [37mglobal_avg_top_loss: 1.9871
+[titan] 2025-09-09 09:53:31,449 - root - INFO - [34mlr: 9.7118e-06 gnorm: 0.35 [35m[1 day, 16:18:03<1 day, 8:56:25][39m
+[titan] 2025-09-09 09:54:03,731 - root - INFO - [31mstep: 22015 [32mloss: 2.7549 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,151 [36mtflops: 483.77 [35mmfu: 48.91%[39m [37mglobal_avg_ntp_loss: 0.7886 [37mglobal_avg_top_loss: 1.9664
+[titan] 2025-09-09 09:54:03,732 - root - INFO - [34mlr: 9.7082e-06 gnorm: 0.33 [35m[1 day, 16:18:36<1 day, 8:55:51][39m
+[titan] 2025-09-09 09:54:10,340 - root - INFO - Dumping profiler traces at step 22016
+[titan] 2025-09-09 09:54:10,391 - root - INFO - Finished dumping profiler traces in 0.05 seconds
+[titan] 2025-09-09 09:54:35,687 - root - INFO - [31mstep: 22020 [32mloss: 2.7375 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.72 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7770 [37mglobal_avg_top_loss: 1.9605
+[titan] 2025-09-09 09:54:35,687 - root - INFO - [34mlr: 9.7047e-06 gnorm: 0.35 [35m[1 day, 16:19:08<1 day, 8:55:17][39m
+[titan] 2025-09-09 09:55:07,657 - root - INFO - [31mstep: 22025 [32mloss: 3.2441 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.50 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 1.0641 [37mglobal_avg_top_loss: 2.1800
+[titan] 2025-09-09 09:55:07,657 - root - INFO - [34mlr: 9.7012e-06 gnorm: 0.34 [35m[1 day, 16:19:40<1 day, 8:54:44][39m
+[titan] 2025-09-09 09:55:39,861 - root - INFO - [31mstep: 22030 [32mloss: 2.7467 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,176 [36mtflops: 484.96 [35mmfu: 49.04%[39m [37mglobal_avg_ntp_loss: 0.7864 [37mglobal_avg_top_loss: 1.9603
+[titan] 2025-09-09 09:55:39,861 - root - INFO - [34mlr: 9.6976e-06 gnorm: 0.33 [35m[1 day, 16:20:12<1 day, 8:54:10][39m
+[titan] 2025-09-09 09:56:11,601 - root - INFO - [31mstep: 22035 [32mloss: 2.8318 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,324 [36mtflops: 492.04 [35mmfu: 49.75%[39m [37mglobal_avg_ntp_loss: 0.8177 [37mglobal_avg_top_loss: 2.0142
+[titan] 2025-09-09 09:56:11,601 - root - INFO - [34mlr: 9.6941e-06 gnorm: 0.33 [35m[1 day, 16:20:43<1 day, 8:53:36][39m
+[titan] 2025-09-09 09:56:43,556 - root - INFO - [31mstep: 22040 [32mloss: 2.7866 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.73 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7991 [37mglobal_avg_top_loss: 1.9874
+[titan] 2025-09-09 09:56:43,556 - root - INFO - [34mlr: 9.6906e-06 gnorm: 0.34 [35m[1 day, 16:21:15<1 day, 8:53:02][39m
+[titan] 2025-09-09 09:57:15,359 - root - INFO - [31mstep: 22045 [32mloss: 2.7100 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,304 [36mtflops: 491.06 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7674 [37mglobal_avg_top_loss: 1.9426
+[titan] 2025-09-09 09:57:15,360 - root - INFO - [34mlr: 9.6871e-06 gnorm: 0.34 [35m[1 day, 16:21:47<1 day, 8:52:28][39m
+[titan] 2025-09-09 09:57:40,940 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 09:57:47,275 - root - INFO - [31mstep: 22050 [32mloss: 2.7349 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.34 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7748 [37mglobal_avg_top_loss: 1.9601
+[titan] 2025-09-09 09:57:47,275 - root - INFO - [34mlr: 9.6835e-06 gnorm: 0.35 [35m[1 day, 16:22:19<1 day, 8:51:55][39m
+[titan] 2025-09-09 09:58:19,127 - root - INFO - [31mstep: 22055 [32mloss: 2.7298 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.30 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7710 [37mglobal_avg_top_loss: 1.9588
+[titan] 2025-09-09 09:58:19,128 - root - INFO - [34mlr: 9.6800e-06 gnorm: 0.37 [35m[1 day, 16:22:51<1 day, 8:51:21][39m
+[titan] 2025-09-09 09:58:50,998 - root - INFO - [31mstep: 22060 [32mloss: 2.7477 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.03 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7843 [37mglobal_avg_top_loss: 1.9634
+[titan] 2025-09-09 09:58:50,998 - root - INFO - [34mlr: 9.6765e-06 gnorm: 0.35 [35m[1 day, 16:23:23<1 day, 8:50:47][39m
+[titan] 2025-09-09 09:59:22,838 - root - INFO - [31mstep: 22065 [32mloss: 2.7699 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.50 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7909 [37mglobal_avg_top_loss: 1.9789
+[titan] 2025-09-09 09:59:22,838 - root - INFO - [34mlr: 9.6729e-06 gnorm: 0.34 [35m[1 day, 16:23:55<1 day, 8:50:13][39m
+[titan] 2025-09-09 09:59:54,827 - root - INFO - [31mstep: 22070 [32mloss: 2.7495 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.21 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7871 [37mglobal_avg_top_loss: 1.9624
+[titan] 2025-09-09 09:59:54,827 - root - INFO - [34mlr: 9.6694e-06 gnorm: 0.38 [35m[1 day, 16:24:27<1 day, 8:49:39][39m
+[titan] 2025-09-09 10:00:26,935 - root - INFO - [31mstep: 22075 [32mloss: 3.2809 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.41 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 1.0810 [37mglobal_avg_top_loss: 2.1999
+[titan] 2025-09-09 10:00:26,935 - root - INFO - [34mlr: 9.6659e-06 gnorm: 0.33 [35m[1 day, 16:24:59<1 day, 8:49:06][39m
+[titan] 2025-09-09 10:00:58,596 - root - INFO - [31mstep: 22080 [32mloss: 2.8153 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,350 [36mtflops: 493.27 [35mmfu: 49.88%[39m [37mglobal_avg_ntp_loss: 0.8120 [37mglobal_avg_top_loss: 2.0034
+[titan] 2025-09-09 10:00:58,596 - root - INFO - [34mlr: 9.6623e-06 gnorm: 0.35 [35m[1 day, 16:25:30<1 day, 8:48:32][39m
+[titan] 2025-09-09 10:01:30,590 - root - INFO - [31mstep: 22085 [32mloss: 2.7460 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.14 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7811 [37mglobal_avg_top_loss: 1.9649
+[titan] 2025-09-09 10:01:30,590 - root - INFO - [34mlr: 9.6588e-06 gnorm: 0.33 [35m[1 day, 16:26:02<1 day, 8:47:58][39m
+[titan] 2025-09-09 10:02:02,362 - root - INFO - [31mstep: 22090 [32mloss: 2.7389 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,314 [36mtflops: 491.55 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.7754 [37mglobal_avg_top_loss: 1.9635
+[titan] 2025-09-09 10:02:02,362 - root - INFO - [34mlr: 9.6553e-06 gnorm: 0.34 [35m[1 day, 16:26:34<1 day, 8:47:24][39m
+[titan] 2025-09-09 10:02:34,290 - root - INFO - [31mstep: 22095 [32mloss: 2.7602 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.13 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7891 [37mglobal_avg_top_loss: 1.9711
+[titan] 2025-09-09 10:02:34,291 - root - INFO - [34mlr: 9.6517e-06 gnorm: 0.34 [35m[1 day, 16:27:06<1 day, 8:46:50][39m
+[titan] 2025-09-09 10:02:59,647 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 10:03:06,074 - root - INFO - [31mstep: 22100 [32mloss: 2.7562 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,310 [36mtflops: 491.36 [35mmfu: 49.68%[39m [37mglobal_avg_ntp_loss: 0.7872 [37mglobal_avg_top_loss: 1.9690
+[titan] 2025-09-09 10:03:06,075 - root - INFO - [34mlr: 9.6482e-06 gnorm: 0.36 [35m[1 day, 16:27:38<1 day, 8:46:16][39m
+[titan] 2025-09-09 10:03:37,842 - root - INFO - [31mstep: 22105 [32mloss: 3.1691 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,315 [36mtflops: 491.61 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 1.0253 [37mglobal_avg_top_loss: 2.1439
+[titan] 2025-09-09 10:03:37,842 - root - INFO - [34mlr: 9.6447e-06 gnorm: 0.37 [35m[1 day, 16:28:10<1 day, 8:45:42][39m
+[titan] 2025-09-09 10:04:09,641 - root - INFO - [31mstep: 22110 [32mloss: 2.8558 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,305 [36mtflops: 491.13 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.8291 [37mglobal_avg_top_loss: 2.0267
+[titan] 2025-09-09 10:04:09,641 - root - INFO - [34mlr: 9.6412e-06 gnorm: 0.33 [35m[1 day, 16:28:42<1 day, 8:45:08][39m
+[titan] 2025-09-09 10:04:41,365 - root - INFO - [31mstep: 22115 [32mloss: 2.6868 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,329 [36mtflops: 492.28 [35mmfu: 49.78%[39m [37mglobal_avg_ntp_loss: 0.7539 [37mglobal_avg_top_loss: 1.9328
+[titan] 2025-09-09 10:04:41,366 - root - INFO - [34mlr: 9.6376e-06 gnorm: 0.37 [35m[1 day, 16:29:13<1 day, 8:44:35][39m
+[titan] 2025-09-09 10:05:13,222 - root - INFO - [31mstep: 22120 [32mloss: 3.3268 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.25 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 1.1003 [37mglobal_avg_top_loss: 2.2265
+[titan] 2025-09-09 10:05:13,222 - root - INFO - [34mlr: 9.6341e-06 gnorm: 0.64 [35m[1 day, 16:29:45<1 day, 8:44:01][39m
+[titan] 2025-09-09 10:05:45,320 - root - INFO - [31mstep: 22125 [32mloss: 2.7736 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.55 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7928 [37mglobal_avg_top_loss: 1.9808
+[titan] 2025-09-09 10:05:45,320 - root - INFO - [34mlr: 9.6306e-06 gnorm: 0.33 [35m[1 day, 16:30:17<1 day, 8:43:27][39m
+[titan] 2025-09-09 10:06:17,222 - root - INFO - [31mstep: 22130 [32mloss: 2.8568 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.54 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8254 [37mglobal_avg_top_loss: 2.0314
+[titan] 2025-09-09 10:06:17,222 - root - INFO - [34mlr: 9.6270e-06 gnorm: 0.43 [35m[1 day, 16:30:49<1 day, 8:42:53][39m
+[titan] 2025-09-09 10:06:48,942 - root - INFO - [31mstep: 22135 [32mloss: 2.7616 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,331 [36mtflops: 492.35 [35mmfu: 49.78%[39m [37mglobal_avg_ntp_loss: 0.7892 [37mglobal_avg_top_loss: 1.9724
+[titan] 2025-09-09 10:06:48,943 - root - INFO - [34mlr: 9.6235e-06 gnorm: 0.35 [35m[1 day, 16:31:21<1 day, 8:42:19][39m
+[titan] 2025-09-09 10:07:20,631 - root - INFO - [31mstep: 22140 [32mloss: 2.7316 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,341 [36mtflops: 492.85 [35mmfu: 49.83%[39m [37mglobal_avg_ntp_loss: 0.7714 [37mglobal_avg_top_loss: 1.9602
+[titan] 2025-09-09 10:07:20,631 - root - INFO - [34mlr: 9.6200e-06 gnorm: 0.34 [35m[1 day, 16:31:53<1 day, 8:41:45][39m
+[titan] 2025-09-09 10:07:52,503 - root - INFO - [31mstep: 22145 [32mloss: 2.8411 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 490.00 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.8221 [37mglobal_avg_top_loss: 2.0190
+[titan] 2025-09-09 10:07:52,504 - root - INFO - [34mlr: 9.6165e-06 gnorm: 0.33 [35m[1 day, 16:32:24<1 day, 8:41:11][39m
+[titan] 2025-09-09 10:08:18,032 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 10:08:24,541 - root - INFO - [31mstep: 22150 [32mloss: 2.6662 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.47 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7439 [37mglobal_avg_top_loss: 1.9223
+[titan] 2025-09-09 10:08:24,541 - root - INFO - [34mlr: 9.6129e-06 gnorm: 0.33 [35m[1 day, 16:32:56<1 day, 8:40:38][39m
+[titan] 2025-09-09 10:08:56,313 - root - INFO - [31mstep: 22155 [32mloss: 2.7910 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,314 [36mtflops: 491.55 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.8034 [37mglobal_avg_top_loss: 1.9877
+[titan] 2025-09-09 10:08:56,314 - root - INFO - [34mlr: 9.6094e-06 gnorm: 0.34 [35m[1 day, 16:33:28<1 day, 8:40:04][39m
+[titan] 2025-09-09 10:09:28,146 - root - INFO - [31mstep: 22160 [32mloss: 2.7642 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.61 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7887 [37mglobal_avg_top_loss: 1.9755
+[titan] 2025-09-09 10:09:28,146 - root - INFO - [34mlr: 9.6059e-06 gnorm: 0.34 [35m[1 day, 16:34:00<1 day, 8:39:30][39m
+[titan] 2025-09-09 10:10:00,154 - root - INFO - [31mstep: 22165 [32mloss: 2.7905 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.93 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.8073 [37mglobal_avg_top_loss: 1.9832
+[titan] 2025-09-09 10:10:00,154 - root - INFO - [34mlr: 9.6024e-06 gnorm: 0.33 [35m[1 day, 16:34:32<1 day, 8:38:56][39m
+[titan] 2025-09-09 10:10:31,835 - root - INFO - [31mstep: 22170 [32mloss: 2.7554 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,343 [36mtflops: 492.95 [35mmfu: 49.84%[39m [37mglobal_avg_ntp_loss: 0.7862 [37mglobal_avg_top_loss: 1.9693
+[titan] 2025-09-09 10:10:31,835 - root - INFO - [34mlr: 9.5988e-06 gnorm: 0.32 [35m[1 day, 16:35:04<1 day, 8:38:22][39m
+[titan] 2025-09-09 10:11:03,637 - root - INFO - [31mstep: 22175 [32mloss: 2.7585 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,304 [36mtflops: 491.08 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7868 [37mglobal_avg_top_loss: 1.9716
+[titan] 2025-09-09 10:11:03,638 - root - INFO - [34mlr: 9.5953e-06 gnorm: 0.34 [35m[1 day, 16:35:36<1 day, 8:37:48][39m
+[titan] 2025-09-09 10:11:35,566 - root - INFO - [31mstep: 22180 [32mloss: 2.8919 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.14 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.8649 [37mglobal_avg_top_loss: 2.0270
+[titan] 2025-09-09 10:11:35,566 - root - INFO - [34mlr: 9.5918e-06 gnorm: 0.34 [35m[1 day, 16:36:07<1 day, 8:37:15][39m
+[titan] 2025-09-09 10:12:07,435 - root - INFO - [31mstep: 22185 [32mloss: 2.6708 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.05 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7526 [37mglobal_avg_top_loss: 1.9182
+[titan] 2025-09-09 10:12:07,435 - root - INFO - [34mlr: 9.5882e-06 gnorm: 0.32 [35m[1 day, 16:36:39<1 day, 8:36:41][39m
+[titan] 2025-09-09 10:12:39,290 - root - INFO - [31mstep: 22190 [32mloss: 2.7907 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.26 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.8036 [37mglobal_avg_top_loss: 1.9871
+[titan] 2025-09-09 10:12:39,291 - root - INFO - [34mlr: 9.5847e-06 gnorm: 0.36 [35m[1 day, 16:37:11<1 day, 8:36:07][39m
+[titan] 2025-09-09 10:13:11,256 - root - INFO - [31mstep: 22195 [32mloss: 2.8944 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.57 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.8586 [37mglobal_avg_top_loss: 2.0359
+[titan] 2025-09-09 10:13:11,256 - root - INFO - [34mlr: 9.5812e-06 gnorm: 0.34 [35m[1 day, 16:37:43<1 day, 8:35:33][39m
+[titan] 2025-09-09 10:13:36,672 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 10:13:43,040 - root - INFO - [31mstep: 22200 [32mloss: 2.7918 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,310 [36mtflops: 491.37 [35mmfu: 49.68%[39m [37mglobal_avg_ntp_loss: 0.8020 [37mglobal_avg_top_loss: 1.9898
+[titan] 2025-09-09 10:13:43,040 - root - INFO - [34mlr: 9.5777e-06 gnorm: 0.36 [35m[1 day, 16:38:15<1 day, 8:34:59][39m
+[titan] 2025-09-09 10:14:14,772 - root - INFO - [31mstep: 22205 [32mloss: 2.7796 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,327 [36mtflops: 492.16 [35mmfu: 49.76%[39m [37mglobal_avg_ntp_loss: 0.7992 [37mglobal_avg_top_loss: 1.9804
+[titan] 2025-09-09 10:14:14,773 - root - INFO - [34mlr: 9.5741e-06 gnorm: 0.39 [35m[1 day, 16:38:47<1 day, 8:34:25][39m
+[titan] 2025-09-09 10:14:46,616 - root - INFO - [31mstep: 22210 [32mloss: 2.7982 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.44 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.8021 [37mglobal_avg_top_loss: 1.9961
+[titan] 2025-09-09 10:14:46,617 - root - INFO - [34mlr: 9.5706e-06 gnorm: 0.34 [35m[1 day, 16:39:18<1 day, 8:33:52][39m
+[titan] 2025-09-09 10:15:18,345 - root - INFO - [31mstep: 22215 [32mloss: 2.6928 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,328 [36mtflops: 492.22 [35mmfu: 49.77%[39m [37mglobal_avg_ntp_loss: 0.7549 [37mglobal_avg_top_loss: 1.9379
+[titan] 2025-09-09 10:15:18,346 - root - INFO - [34mlr: 9.5671e-06 gnorm: 0.33 [35m[1 day, 16:39:50<1 day, 8:33:18][39m
+[titan] 2025-09-09 10:15:50,299 - root - INFO - [31mstep: 22220 [32mloss: 2.7928 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.75 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.8067 [37mglobal_avg_top_loss: 1.9861
+[titan] 2025-09-09 10:15:50,299 - root - INFO - [34mlr: 9.5636e-06 gnorm: 0.34 [35m[1 day, 16:40:22<1 day, 8:32:44][39m
+[titan] 2025-09-09 10:16:22,115 - root - INFO - [31mstep: 22225 [32mloss: 2.8799 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.87 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.8419 [37mglobal_avg_top_loss: 2.0380
+[titan] 2025-09-09 10:16:22,115 - root - INFO - [34mlr: 9.5600e-06 gnorm: 0.86 [35m[1 day, 16:40:54<1 day, 8:32:10][39m
+[titan] 2025-09-09 10:16:53,932 - root - INFO - [31mstep: 22230 [32mloss: 2.7991 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.86 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.8039 [37mglobal_avg_top_loss: 1.9952
+[titan] 2025-09-09 10:16:53,932 - root - INFO - [34mlr: 9.5565e-06 gnorm: 0.34 [35m[1 day, 16:41:26<1 day, 8:31:36][39m
+[titan] 2025-09-09 10:17:25,893 - root - INFO - [31mstep: 22235 [32mloss: 3.1268 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.64 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.9946 [37mglobal_avg_top_loss: 2.1322
+[titan] 2025-09-09 10:17:25,893 - root - INFO - [34mlr: 9.5530e-06 gnorm: 0.40 [35m[1 day, 16:41:58<1 day, 8:31:03][39m
+[titan] 2025-09-09 10:17:57,780 - root - INFO - [31mstep: 22240 [32mloss: 2.7001 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.78 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7597 [37mglobal_avg_top_loss: 1.9404
+[titan] 2025-09-09 10:17:57,780 - root - INFO - [34mlr: 9.5495e-06 gnorm: 0.40 [35m[1 day, 16:42:30<1 day, 8:30:29][39m
+[titan] 2025-09-09 10:18:29,767 - root - INFO - [31mstep: 22245 [32mloss: 2.8551 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,245 [36mtflops: 488.25 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.8313 [37mglobal_avg_top_loss: 2.0238
+[titan] 2025-09-09 10:18:29,767 - root - INFO - [34mlr: 9.5460e-06 gnorm: 0.36 [35m[1 day, 16:43:02<1 day, 8:29:55][39m
+[titan] 2025-09-09 10:18:55,315 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 10:19:01,713 - root - INFO - [31mstep: 22250 [32mloss: 2.8191 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.87 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.8132 [37mglobal_avg_top_loss: 2.0059
+[titan] 2025-09-09 10:19:01,713 - root - INFO - [34mlr: 9.5424e-06 gnorm: 0.36 [35m[1 day, 16:43:34<1 day, 8:29:21][39m
+[titan] 2025-09-09 10:19:33,612 - root - INFO - [31mstep: 22255 [32mloss: 2.7885 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.58 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8026 [37mglobal_avg_top_loss: 1.9859
+[titan] 2025-09-09 10:19:33,613 - root - INFO - [34mlr: 9.5389e-06 gnorm: 0.33 [35m[1 day, 16:44:05<1 day, 8:28:47][39m
+[titan] 2025-09-09 10:20:05,773 - root - INFO - [31mstep: 22260 [32mloss: 2.7400 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,189 [36mtflops: 485.60 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.7771 [37mglobal_avg_top_loss: 1.9629
+[titan] 2025-09-09 10:20:05,774 - root - INFO - [34mlr: 9.5354e-06 gnorm: 0.34 [35m[1 day, 16:44:38<1 day, 8:28:14][39m
+[titan] 2025-09-09 10:20:37,760 - root - INFO - [31mstep: 22265 [32mloss: 2.7462 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.24 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7826 [37mglobal_avg_top_loss: 1.9636
+[titan] 2025-09-09 10:20:37,761 - root - INFO - [34mlr: 9.5319e-06 gnorm: 0.34 [35m[1 day, 16:45:10<1 day, 8:27:40][39m
+[titan] 2025-09-09 10:21:09,496 - root - INFO - [31mstep: 22270 [32mloss: 2.7766 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,326 [36mtflops: 492.11 [35mmfu: 49.76%[39m [37mglobal_avg_ntp_loss: 0.7942 [37mglobal_avg_top_loss: 1.9823
+[titan] 2025-09-09 10:21:09,496 - root - INFO - [34mlr: 9.5283e-06 gnorm: 0.36 [35m[1 day, 16:45:41<1 day, 8:27:06][39m
+[titan] 2025-09-09 10:21:41,427 - root - INFO - [31mstep: 22275 [32mloss: 2.8120 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.10 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8139 [37mglobal_avg_top_loss: 1.9981
+[titan] 2025-09-09 10:21:41,427 - root - INFO - [34mlr: 9.5248e-06 gnorm: 0.33 [35m[1 day, 16:46:13<1 day, 8:26:33][39m
+[titan] 2025-09-09 10:22:13,141 - root - INFO - [31mstep: 22280 [32mloss: 2.7896 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,333 [36mtflops: 492.45 [35mmfu: 49.79%[39m [37mglobal_avg_ntp_loss: 0.8030 [37mglobal_avg_top_loss: 1.9865
+[titan] 2025-09-09 10:22:13,141 - root - INFO - [34mlr: 9.5213e-06 gnorm: 0.34 [35m[1 day, 16:46:45<1 day, 8:25:59][39m
+[titan] 2025-09-09 10:22:44,944 - root - INFO - [31mstep: 22285 [32mloss: 3.1378 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,304 [36mtflops: 491.06 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.9973 [37mglobal_avg_top_loss: 2.1405
+[titan] 2025-09-09 10:22:44,945 - root - INFO - [34mlr: 9.5178e-06 gnorm: 0.34 [35m[1 day, 16:47:17<1 day, 8:25:25][39m
+[titan] 2025-09-09 10:23:16,856 - root - INFO - [31mstep: 22290 [32mloss: 2.7761 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,268 [36mtflops: 489.39 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7948 [37mglobal_avg_top_loss: 1.9813
+[titan] 2025-09-09 10:23:16,857 - root - INFO - [34mlr: 9.5143e-06 gnorm: 0.37 [35m[1 day, 16:47:49<1 day, 8:24:51][39m
+[titan] 2025-09-09 10:23:48,837 - root - INFO - [31mstep: 22295 [32mloss: 3.2144 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.33 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 1.0470 [37mglobal_avg_top_loss: 2.1675
+[titan] 2025-09-09 10:23:48,838 - root - INFO - [34mlr: 9.5107e-06 gnorm: 0.35 [35m[1 day, 16:48:21<1 day, 8:24:17][39m
+[titan] 2025-09-09 10:24:14,316 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 10:24:20,731 - root - INFO - [31mstep: 22300 [32mloss: 2.7452 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.67 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7705 [37mglobal_avg_top_loss: 1.9747
+[titan] 2025-09-09 10:24:20,732 - root - INFO - [34mlr: 9.5072e-06 gnorm: 1.10 [35m[1 day, 16:48:53<1 day, 8:23:44][39m
+[titan] 2025-09-09 10:24:52,942 - root - INFO - [31mstep: 22305 [32mloss: 2.8242 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,173 [36mtflops: 484.85 [35mmfu: 49.02%[39m [37mglobal_avg_ntp_loss: 0.8184 [37mglobal_avg_top_loss: 2.0058
+[titan] 2025-09-09 10:24:52,942 - root - INFO - [34mlr: 9.5037e-06 gnorm: 0.39 [35m[1 day, 16:49:25<1 day, 8:23:10][39m
+[titan] 2025-09-09 10:25:24,735 - root - INFO - [31mstep: 22310 [32mloss: 3.0296 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,307 [36mtflops: 491.22 [35mmfu: 49.67%[39m [37mglobal_avg_ntp_loss: 0.9360 [37mglobal_avg_top_loss: 2.0937
+[titan] 2025-09-09 10:25:24,736 - root - INFO - [34mlr: 9.5002e-06 gnorm: 0.35 [35m[1 day, 16:49:57<1 day, 8:22:36][39m
+[titan] 2025-09-09 10:25:56,775 - root - INFO - [31mstep: 22315 [32mloss: 2.7146 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.44 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7729 [37mglobal_avg_top_loss: 1.9417
+[titan] 2025-09-09 10:25:56,775 - root - INFO - [34mlr: 9.4967e-06 gnorm: 0.38 [35m[1 day, 16:50:29<1 day, 8:22:02][39m
+[titan] 2025-09-09 10:26:28,848 - root - INFO - [31mstep: 22320 [32mloss: 2.7762 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.94 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7959 [37mglobal_avg_top_loss: 1.9804
+[titan] 2025-09-09 10:26:28,848 - root - INFO - [34mlr: 9.4931e-06 gnorm: 0.35 [35m[1 day, 16:51:01<1 day, 8:21:29][39m
+[titan] 2025-09-09 10:27:00,585 - root - INFO - [31mstep: 22325 [32mloss: 2.7520 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,325 [36mtflops: 492.09 [35mmfu: 49.76%[39m [37mglobal_avg_ntp_loss: 0.7834 [37mglobal_avg_top_loss: 1.9685
+[titan] 2025-09-09 10:27:00,586 - root - INFO - [34mlr: 9.4896e-06 gnorm: 0.35 [35m[1 day, 16:51:32<1 day, 8:20:55][39m
+[titan] 2025-09-09 10:27:32,400 - root - INFO - [31mstep: 22330 [32mloss: 2.7717 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.89 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7958 [37mglobal_avg_top_loss: 1.9759
+[titan] 2025-09-09 10:27:32,400 - root - INFO - [34mlr: 9.4861e-06 gnorm: 0.35 [35m[1 day, 16:52:04<1 day, 8:20:21][39m
+[titan] 2025-09-09 10:28:04,253 - root - INFO - [31mstep: 22335 [32mloss: 2.6690 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.30 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7426 [37mglobal_avg_top_loss: 1.9264
+[titan] 2025-09-09 10:28:04,253 - root - INFO - [34mlr: 9.4826e-06 gnorm: 1.22 [35m[1 day, 16:52:36<1 day, 8:19:47][39m
+[titan] 2025-09-09 10:28:36,221 - root - INFO - [31mstep: 22340 [32mloss: 2.7836 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.53 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.8001 [37mglobal_avg_top_loss: 1.9835
+[titan] 2025-09-09 10:28:36,222 - root - INFO - [34mlr: 9.4791e-06 gnorm: 0.33 [35m[1 day, 16:53:08<1 day, 8:19:14][39m
+[titan] 2025-09-09 10:29:08,159 - root - INFO - [31mstep: 22345 [32mloss: 2.9081 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,260 [36mtflops: 489.00 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8610 [37mglobal_avg_top_loss: 2.0471
+[titan] 2025-09-09 10:29:08,159 - root - INFO - [34mlr: 9.4755e-06 gnorm: 0.34 [35m[1 day, 16:53:40<1 day, 8:18:40][39m
+[titan] 2025-09-09 10:29:33,594 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 10:29:40,005 - root - INFO - [31mstep: 22350 [32mloss: 2.8288 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.40 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.8207 [37mglobal_avg_top_loss: 2.0080
+[titan] 2025-09-09 10:29:40,006 - root - INFO - [34mlr: 9.4720e-06 gnorm: 0.34 [35m[1 day, 16:54:12<1 day, 8:18:06][39m
+[titan] 2025-09-09 10:30:11,980 - root - INFO - [31mstep: 22355 [32mloss: 2.7864 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.43 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7995 [37mglobal_avg_top_loss: 1.9869
+[titan] 2025-09-09 10:30:11,980 - root - INFO - [34mlr: 9.4685e-06 gnorm: 0.33 [35m[1 day, 16:54:44<1 day, 8:17:32][39m
+[titan] 2025-09-09 10:30:44,054 - root - INFO - [31mstep: 22360 [32mloss: 2.8197 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.91 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.8168 [37mglobal_avg_top_loss: 2.0029
+[titan] 2025-09-09 10:30:44,055 - root - INFO - [34mlr: 9.4650e-06 gnorm: 0.34 [35m[1 day, 16:55:16<1 day, 8:16:59][39m
+[titan] 2025-09-09 10:31:15,806 - root - INFO - [31mstep: 22365 [32mloss: 3.1971 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,320 [36mtflops: 491.86 [35mmfu: 49.73%[39m [37mglobal_avg_ntp_loss: 1.0403 [37mglobal_avg_top_loss: 2.1568
+[titan] 2025-09-09 10:31:15,806 - root - INFO - [34mlr: 9.4615e-06 gnorm: 0.36 [35m[1 day, 16:55:48<1 day, 8:16:25][39m
+[titan] 2025-09-09 10:31:47,589 - root - INFO - [31mstep: 22370 [32mloss: 2.7473 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,310 [36mtflops: 491.37 [35mmfu: 49.68%[39m [37mglobal_avg_ntp_loss: 0.7877 [37mglobal_avg_top_loss: 1.9596
+[titan] 2025-09-09 10:31:47,590 - root - INFO - [34mlr: 9.4580e-06 gnorm: 0.39 [35m[1 day, 16:56:19<1 day, 8:15:51][39m
+[titan] 2025-09-09 10:32:19,470 - root - INFO - [31mstep: 22375 [32mloss: 3.7100 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.87 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 1.3271 [37mglobal_avg_top_loss: 2.3829
+[titan] 2025-09-09 10:32:19,471 - root - INFO - [34mlr: 9.4544e-06 gnorm: 0.36 [35m[1 day, 16:56:51<1 day, 8:15:17][39m
+[titan] 2025-09-09 10:32:51,533 - root - INFO - [31mstep: 22380 [32mloss: 2.7663 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,220 [36mtflops: 487.08 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7923 [37mglobal_avg_top_loss: 1.9740
+[titan] 2025-09-09 10:32:51,534 - root - INFO - [34mlr: 9.4509e-06 gnorm: 0.35 [35m[1 day, 16:57:23<1 day, 8:14:44][39m
+[titan] 2025-09-09 10:33:23,490 - root - INFO - [31mstep: 22385 [32mloss: 2.7515 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.71 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7823 [37mglobal_avg_top_loss: 1.9692
+[titan] 2025-09-09 10:33:23,491 - root - INFO - [34mlr: 9.4474e-06 gnorm: 0.33 [35m[1 day, 16:57:55<1 day, 8:14:10][39m
+[titan] 2025-09-09 10:33:55,253 - root - INFO - [31mstep: 22390 [32mloss: 2.8091 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,317 [36mtflops: 491.69 [35mmfu: 49.72%[39m [37mglobal_avg_ntp_loss: 0.8087 [37mglobal_avg_top_loss: 2.0004
+[titan] 2025-09-09 10:33:55,254 - root - INFO - [34mlr: 9.4439e-06 gnorm: 0.33 [35m[1 day, 16:58:27<1 day, 8:13:36][39m
+[titan] 2025-09-09 10:34:27,319 - root - INFO - [31mstep: 22395 [32mloss: 2.7495 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.05 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7844 [37mglobal_avg_top_loss: 1.9651
+[titan] 2025-09-09 10:34:27,319 - root - INFO - [34mlr: 9.4404e-06 gnorm: 0.34 [35m[1 day, 16:58:59<1 day, 8:13:02][39m
+[titan] 2025-09-09 10:34:52,743 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 10:34:59,120 - root - INFO - [31mstep: 22400 [32mloss: 2.7576 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,304 [36mtflops: 491.10 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7869 [37mglobal_avg_top_loss: 1.9707
+[titan] 2025-09-09 10:34:59,121 - root - INFO - [34mlr: 9.4369e-06 gnorm: 0.38 [35m[1 day, 16:59:31<1 day, 8:12:28][39m
+[titan] 2025-09-09 10:35:31,054 - root - INFO - [31mstep: 22405 [32mloss: 2.6767 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.06 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7545 [37mglobal_avg_top_loss: 1.9222
+[titan] 2025-09-09 10:35:31,054 - root - INFO - [34mlr: 9.4333e-06 gnorm: 0.35 [35m[1 day, 17:00:03<1 day, 8:11:55][39m
+[titan] 2025-09-09 10:36:02,956 - root - INFO - [31mstep: 22410 [32mloss: 2.8015 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.54 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8100 [37mglobal_avg_top_loss: 1.9915
+[titan] 2025-09-09 10:36:02,957 - root - INFO - [34mlr: 9.4298e-06 gnorm: 0.34 [35m[1 day, 17:00:35<1 day, 8:11:21][39m
+[titan] 2025-09-09 10:36:35,048 - root - INFO - [31mstep: 22415 [32mloss: 2.6917 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.66 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7588 [37mglobal_avg_top_loss: 1.9329
+[titan] 2025-09-09 10:36:35,048 - root - INFO - [34mlr: 9.4263e-06 gnorm: 0.45 [35m[1 day, 17:01:07<1 day, 8:10:47][39m
+[titan] 2025-09-09 10:37:06,842 - root - INFO - [31mstep: 22420 [32mloss: 2.6305 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,306 [36mtflops: 491.19 [35mmfu: 49.67%[39m [37mglobal_avg_ntp_loss: 0.7283 [37mglobal_avg_top_loss: 1.9022
+[titan] 2025-09-09 10:37:06,843 - root - INFO - [34mlr: 9.4228e-06 gnorm: 0.38 [35m[1 day, 17:01:39<1 day, 8:10:14][39m
+[titan] 2025-09-09 10:37:38,716 - root - INFO - [31mstep: 22425 [32mloss: 2.7768 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.98 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7934 [37mglobal_avg_top_loss: 1.9834
+[titan] 2025-09-09 10:37:38,716 - root - INFO - [34mlr: 9.4193e-06 gnorm: 0.35 [35m[1 day, 17:02:11<1 day, 8:09:40][39m
+[titan] 2025-09-09 10:38:10,781 - root - INFO - [31mstep: 22430 [32mloss: 2.7943 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.05 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.8103 [37mglobal_avg_top_loss: 1.9840
+[titan] 2025-09-09 10:38:10,782 - root - INFO - [34mlr: 9.4158e-06 gnorm: 0.36 [35m[1 day, 17:02:43<1 day, 8:09:06][39m
+[titan] 2025-09-09 10:38:42,775 - root - INFO - [31mstep: 22435 [32mloss: 2.7842 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.14 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7941 [37mglobal_avg_top_loss: 1.9901
+[titan] 2025-09-09 10:38:42,776 - root - INFO - [34mlr: 9.4123e-06 gnorm: 0.41 [35m[1 day, 17:03:15<1 day, 8:08:32][39m
+[titan] 2025-09-09 10:39:14,484 - root - INFO - [31mstep: 22440 [32mloss: 2.7109 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,335 [36mtflops: 492.54 [35mmfu: 49.80%[39m [37mglobal_avg_ntp_loss: 0.7639 [37mglobal_avg_top_loss: 1.9470
+[titan] 2025-09-09 10:39:14,484 - root - INFO - [34mlr: 9.4087e-06 gnorm: 0.39 [35m[1 day, 17:03:46<1 day, 8:07:59][39m
+[titan] 2025-09-09 10:39:46,414 - root - INFO - [31mstep: 22445 [32mloss: 3.2198 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.11 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 1.0513 [37mglobal_avg_top_loss: 2.1685
+[titan] 2025-09-09 10:39:46,414 - root - INFO - [34mlr: 9.4052e-06 gnorm: 0.35 [35m[1 day, 17:04:18<1 day, 8:07:25][39m
+[titan] 2025-09-09 10:40:11,863 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 10:40:18,265 - root - INFO - [31mstep: 22450 [32mloss: 2.7975 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.33 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.8064 [37mglobal_avg_top_loss: 1.9911
+[titan] 2025-09-09 10:40:18,265 - root - INFO - [34mlr: 9.4017e-06 gnorm: 0.33 [35m[1 day, 17:04:50<1 day, 8:06:51][39m
+[titan] 2025-09-09 10:40:50,369 - root - INFO - [31mstep: 22455 [32mloss: 3.1519 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.47 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 1.0199 [37mglobal_avg_top_loss: 2.1320
+[titan] 2025-09-09 10:40:50,369 - root - INFO - [34mlr: 9.3982e-06 gnorm: 0.39 [35m[1 day, 17:05:22<1 day, 8:06:17][39m
+[titan] 2025-09-09 10:41:22,176 - root - INFO - [31mstep: 22460 [32mloss: 2.7428 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,302 [36mtflops: 491.01 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7800 [37mglobal_avg_top_loss: 1.9628
+[titan] 2025-09-09 10:41:22,176 - root - INFO - [34mlr: 9.3947e-06 gnorm: 0.34 [35m[1 day, 17:05:54<1 day, 8:05:44][39m
+[titan] 2025-09-09 10:41:54,086 - root - INFO - [31mstep: 22465 [32mloss: 2.7963 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.42 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.8013 [37mglobal_avg_top_loss: 1.9951
+[titan] 2025-09-09 10:41:54,086 - root - INFO - [34mlr: 9.3912e-06 gnorm: 0.36 [35m[1 day, 17:06:26<1 day, 8:05:10][39m
+[titan] 2025-09-09 10:42:25,953 - root - INFO - [31mstep: 22470 [32mloss: 3.2035 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.07 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 1.0400 [37mglobal_avg_top_loss: 2.1635
+[titan] 2025-09-09 10:42:25,954 - root - INFO - [34mlr: 9.3877e-06 gnorm: 0.41 [35m[1 day, 17:06:58<1 day, 8:04:36][39m
+[titan] 2025-09-09 10:42:57,858 - root - INFO - [31mstep: 22475 [32mloss: 2.7818 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.51 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7972 [37mglobal_avg_top_loss: 1.9846
+[titan] 2025-09-09 10:42:57,858 - root - INFO - [34mlr: 9.3841e-06 gnorm: 0.32 [35m[1 day, 17:07:30<1 day, 8:04:02][39m
+[titan] 2025-09-09 10:43:29,724 - root - INFO - [31mstep: 22480 [32mloss: 2.7182 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.10 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7633 [37mglobal_avg_top_loss: 1.9549
+[titan] 2025-09-09 10:43:29,724 - root - INFO - [34mlr: 9.3806e-06 gnorm: 0.87 [35m[1 day, 17:08:02<1 day, 8:03:29][39m
+[titan] 2025-09-09 10:44:01,625 - root - INFO - [31mstep: 22485 [32mloss: 2.6214 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.55 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7243 [37mglobal_avg_top_loss: 1.8971
+[titan] 2025-09-09 10:44:01,626 - root - INFO - [34mlr: 9.3771e-06 gnorm: 0.38 [35m[1 day, 17:08:33<1 day, 8:02:55][39m
+[titan] 2025-09-09 10:44:33,371 - root - INFO - [31mstep: 22490 [32mloss: 2.7821 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,322 [36mtflops: 491.96 [35mmfu: 49.74%[39m [37mglobal_avg_ntp_loss: 0.7987 [37mglobal_avg_top_loss: 1.9834
+[titan] 2025-09-09 10:44:33,372 - root - INFO - [34mlr: 9.3736e-06 gnorm: 0.34 [35m[1 day, 17:09:05<1 day, 8:02:21][39m
+[titan] 2025-09-09 10:45:05,613 - root - INFO - [31mstep: 22495 [32mloss: 2.7225 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,164 [36mtflops: 484.39 [35mmfu: 48.98%[39m [37mglobal_avg_ntp_loss: 0.7685 [37mglobal_avg_top_loss: 1.9540
+[titan] 2025-09-09 10:45:05,613 - root - INFO - [34mlr: 9.3701e-06 gnorm: 0.33 [35m[1 day, 17:09:37<1 day, 8:01:48][39m
+[titan] 2025-09-09 10:45:31,000 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
+[titan] 2025-09-09 10:45:37,393 - root - INFO - [31mstep: 22500 [32mloss: 2.7479 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,311 [36mtflops: 491.42 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.7795 [37mglobal_avg_top_loss: 1.9683
+[titan] 2025-09-09 10:45:37,393 - root - INFO - [34mlr: 9.3666e-06 gnorm: 0.35 [35m[1 day, 17:10:09<1 day, 8:01:14][39m
+[titan] 2025-09-09 10:46:09,297 - root - INFO - [31mstep: 22505 [32mloss: 2.8345 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.51 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8235 [37mglobal_avg_top_loss: 2.0110
+[titan] 2025-09-09 10:46:09,297 - root - INFO - [34mlr: 9.3631e-06 gnorm: 0.34 [35m[1 day, 17:10:41<1 day, 8:00:40][39m
+[titan] 2025-09-09 10:46:41,250 - root - INFO - [31mstep: 22510 [32mloss: 2.7794 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.76 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7973 [37mglobal_avg_top_loss: 1.9821
+[titan] 2025-09-09 10:46:41,250 - root - INFO - [34mlr: 9.3596e-06 gnorm: 0.45 [35m[1 day, 17:11:13<1 day, 8:00:06][39m
+[titan] 2025-09-09 10:47:13,222 - root - INFO - [31mstep: 22515 [32mloss: 2.8173 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.47 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.8121 [37mglobal_avg_top_loss: 2.0052
+[titan] 2025-09-09 10:47:13,222 - root - INFO - [34mlr: 9.3561e-06 gnorm: 0.37 [35m[1 day, 17:11:45<1 day, 7:59:33][39m
+[titan] 2025-09-09 10:47:45,148 - root - INFO - [31mstep: 22520 [32mloss: 2.6888 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.18 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7589 [37mglobal_avg_top_loss: 1.9299
+[titan] 2025-09-09 10:47:45,148 - root - INFO - [34mlr: 9.3526e-06 gnorm: 0.33 [35m[1 day, 17:12:17<1 day, 7:58:59][39m
+[titan] 2025-09-09 10:48:17,091 - root - INFO - [31mstep: 22525 [32mloss: 3.2359 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.92 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 1.0577 [37mglobal_avg_top_loss: 2.1781
+[titan] 2025-09-09 10:48:17,091 - root - INFO - [34mlr: 9.3490e-06 gnorm: 0.33 [35m[1 day, 17:12:49<1 day, 7:58:25][39m
+[titan] 2025-09-09 10:48:36,555 - root - INFO - Dumping profiler traces at step 22528
+[titan] 2025-09-09 10:48:36,623 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 10:48:49,411 - root - INFO - [31mstep: 22530 [32mloss: 2.8530 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,139 [36mtflops: 483.21 [35mmfu: 48.86%[39m [37mglobal_avg_ntp_loss: 0.8325 [37mglobal_avg_top_loss: 2.0205
+[titan] 2025-09-09 10:48:49,411 - root - INFO - [34mlr: 9.3455e-06 gnorm: 0.33 [35m[1 day, 17:13:21<1 day, 7:57:52][39m
+[titan] 2025-09-09 10:49:21,402 - root - INFO - [31mstep: 22535 [32mloss: 3.2439 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.18 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 1.0600 [37mglobal_avg_top_loss: 2.1838
+[titan] 2025-09-09 10:49:21,403 - root - INFO - [34mlr: 9.3420e-06 gnorm: 0.33 [35m[1 day, 17:13:53<1 day, 7:57:18][39m
+[titan] 2025-09-09 10:49:53,344 - root - INFO - [31mstep: 22540 [32mloss: 2.6626 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.94 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7424 [37mglobal_avg_top_loss: 1.9202
+[titan] 2025-09-09 10:49:53,344 - root - INFO - [34mlr: 9.3385e-06 gnorm: 0.49 [35m[1 day, 17:14:25<1 day, 7:56:44][39m
+[titan] 2025-09-09 10:50:25,321 - root - INFO - [31mstep: 22545 [32mloss: 2.7366 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.40 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7754 [37mglobal_avg_top_loss: 1.9612
+[titan] 2025-09-09 10:50:25,321 - root - INFO - [34mlr: 9.3350e-06 gnorm: 0.38 [35m[1 day, 17:14:57<1 day, 7:56:11][39m
+[titan] 2025-09-09 10:50:50,664 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
+[titan] 2025-09-09 10:50:57,066 - root - INFO - [31mstep: 22550 [32mloss: 2.7777 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,322 [36mtflops: 491.96 [35mmfu: 49.74%[39m [37mglobal_avg_ntp_loss: 0.7964 [37mglobal_avg_top_loss: 1.9813
+[titan] 2025-09-09 10:50:57,066 - root - INFO - [34mlr: 9.3315e-06 gnorm: 0.34 [35m[1 day, 17:15:29<1 day, 7:55:37][39m
+[titan] 2025-09-09 10:51:28,927 - root - INFO - [31mstep: 22555 [32mloss: 2.8046 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.18 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.8085 [37mglobal_avg_top_loss: 1.9961
+[titan] 2025-09-09 10:51:28,927 - root - INFO - [34mlr: 9.3280e-06 gnorm: 0.34 [35m[1 day, 17:16:01<1 day, 7:55:03][39m
+[titan] 2025-09-09 10:52:00,706 - root - INFO - [31mstep: 22560 [32mloss: 2.8238 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,312 [36mtflops: 491.44 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.8159 [37mglobal_avg_top_loss: 2.0079
+[titan] 2025-09-09 10:52:00,706 - root - INFO - [34mlr: 9.3245e-06 gnorm: 0.35 [35m[1 day, 17:16:32<1 day, 7:54:29][39m
+[titan] 2025-09-09 10:52:32,649 - root - INFO - [31mstep: 22565 [32mloss: 2.7995 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.91 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.8084 [37mglobal_avg_top_loss: 1.9911
+[titan] 2025-09-09 10:52:32,650 - root - INFO - [34mlr: 9.3210e-06 gnorm: 0.33 [35m[1 day, 17:17:04<1 day, 7:53:56][39m
+[titan] 2025-09-09 10:53:04,351 - root - INFO - [31mstep: 22570 [32mloss: 2.7344 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,337 [36mtflops: 492.64 [35mmfu: 49.81%[39m [37mglobal_avg_ntp_loss: 0.7736 [37mglobal_avg_top_loss: 1.9607
+[titan] 2025-09-09 10:53:04,351 - root - INFO - [34mlr: 9.3175e-06 gnorm: 0.33 [35m[1 day, 17:17:36<1 day, 7:53:22][39m
+[titan] 2025-09-09 10:53:36,468 - root - INFO - [31mstep: 22575 [32mloss: 2.7945 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,203 [36mtflops: 486.26 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.8036 [37mglobal_avg_top_loss: 1.9910
+[titan] 2025-09-09 10:53:36,469 - root - INFO - [34mlr: 9.3140e-06 gnorm: 0.34 [35m[1 day, 17:18:08<1 day, 7:52:48][39m
+[titan] 2025-09-09 10:54:08,276 - root - INFO - [31mstep: 22580 [32mloss: 2.8650 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,302 [36mtflops: 490.99 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.8340 [37mglobal_avg_top_loss: 2.0310
+[titan] 2025-09-09 10:54:08,277 - root - INFO - [34mlr: 9.3105e-06 gnorm: 0.37 [35m[1 day, 17:18:40<1 day, 7:52:14][39m
+[titan] 2025-09-09 10:54:40,256 - root - INFO - [31mstep: 22585 [32mloss: 2.8324 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.35 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.8186 [37mglobal_avg_top_loss: 2.0138
+[titan] 2025-09-09 10:54:40,257 - root - INFO - [34mlr: 9.3069e-06 gnorm: 0.34 [35m[1 day, 17:19:12<1 day, 7:51:41][39m
+[titan] 2025-09-09 10:55:12,134 - root - INFO - [31mstep: 22590 [32mloss: 2.8197 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.92 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.8118 [37mglobal_avg_top_loss: 2.0079
+[titan] 2025-09-09 10:55:12,134 - root - INFO - [34mlr: 9.3034e-06 gnorm: 0.36 [35m[1 day, 17:19:44<1 day, 7:51:07][39m
+[titan] 2025-09-09 10:55:44,077 - root - INFO - [31mstep: 22595 [32mloss: 2.8490 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.92 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8302 [37mglobal_avg_top_loss: 2.0188
+[titan] 2025-09-09 10:55:44,077 - root - INFO - [34mlr: 9.2999e-06 gnorm: 0.33 [35m[1 day, 17:20:16<1 day, 7:50:33][39m
+[titan] 2025-09-09 10:56:09,586 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 10:56:15,911 - root - INFO - [31mstep: 22600 [32mloss: 2.7900 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.58 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7988 [37mglobal_avg_top_loss: 1.9912
+[titan] 2025-09-09 10:56:15,912 - root - INFO - [34mlr: 9.2964e-06 gnorm: 0.33 [35m[1 day, 17:20:48<1 day, 7:49:59][39m
+[titan] 2025-09-09 10:56:47,763 - root - INFO - [31mstep: 22605 [32mloss: 3.3223 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.32 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 1.1002 [37mglobal_avg_top_loss: 2.2221
+[titan] 2025-09-09 10:56:47,764 - root - INFO - [34mlr: 9.2929e-06 gnorm: 0.32 [35m[1 day, 17:21:20<1 day, 7:49:26][39m
+[titan] 2025-09-09 10:57:19,784 - root - INFO - [31mstep: 22610 [32mloss: 2.6639 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.72 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7431 [37mglobal_avg_top_loss: 1.9208
+[titan] 2025-09-09 10:57:19,785 - root - INFO - [34mlr: 9.2894e-06 gnorm: 0.33 [35m[1 day, 17:21:52<1 day, 7:48:52][39m
+[titan] 2025-09-09 10:57:51,810 - root - INFO - [31mstep: 22615 [32mloss: 3.2190 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.65 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 1.0484 [37mglobal_avg_top_loss: 2.1705
+[titan] 2025-09-09 10:57:51,811 - root - INFO - [34mlr: 9.2859e-06 gnorm: 0.34 [35m[1 day, 17:22:24<1 day, 7:48:18][39m
+[titan] 2025-09-09 10:58:23,853 - root - INFO - [31mstep: 22620 [32mloss: 2.6631 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.40 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7442 [37mglobal_avg_top_loss: 1.9189
+[titan] 2025-09-09 10:58:23,853 - root - INFO - [34mlr: 9.2824e-06 gnorm: 0.48 [35m[1 day, 17:22:56<1 day, 7:47:45][39m
+[titan] 2025-09-09 10:58:55,700 - root - INFO - [31mstep: 22625 [32mloss: 2.6856 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,289 [36mtflops: 490.38 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7538 [37mglobal_avg_top_loss: 1.9318
+[titan] 2025-09-09 10:58:55,701 - root - INFO - [34mlr: 9.2789e-06 gnorm: 0.36 [35m[1 day, 17:23:27<1 day, 7:47:11][39m
+[titan] 2025-09-09 10:59:27,447 - root - INFO - [31mstep: 22630 [32mloss: 2.7753 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,322 [36mtflops: 491.94 [35mmfu: 49.74%[39m [37mglobal_avg_ntp_loss: 0.7984 [37mglobal_avg_top_loss: 1.9769
+[titan] 2025-09-09 10:59:27,448 - root - INFO - [34mlr: 9.2754e-06 gnorm: 0.34 [35m[1 day, 17:23:59<1 day, 7:46:37][39m
+[titan] 2025-09-09 10:59:59,425 - root - INFO - [31mstep: 22635 [32mloss: 2.8106 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.38 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.8130 [37mglobal_avg_top_loss: 1.9975
+[titan] 2025-09-09 10:59:59,426 - root - INFO - [34mlr: 9.2719e-06 gnorm: 0.34 [35m[1 day, 17:24:31<1 day, 7:46:04][39m
+[titan] 2025-09-09 11:00:31,577 - root - INFO - [31mstep: 22640 [32mloss: 2.7696 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,192 [36mtflops: 485.75 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 0.7899 [37mglobal_avg_top_loss: 1.9797
+[titan] 2025-09-09 11:00:31,577 - root - INFO - [34mlr: 9.2684e-06 gnorm: 0.33 [35m[1 day, 17:25:03<1 day, 7:45:30][39m
+[titan] 2025-09-09 11:01:03,377 - root - INFO - [31mstep: 22645 [32mloss: 2.8397 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,304 [36mtflops: 491.10 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.8273 [37mglobal_avg_top_loss: 2.0124
+[titan] 2025-09-09 11:01:03,378 - root - INFO - [34mlr: 9.2649e-06 gnorm: 0.35 [35m[1 day, 17:25:35<1 day, 7:44:56][39m
+[titan] 2025-09-09 11:01:28,758 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:01:35,230 - root - INFO - [31mstep: 22650 [32mloss: 2.8608 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.30 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.8327 [37mglobal_avg_top_loss: 2.0280
+[titan] 2025-09-09 11:01:35,231 - root - INFO - [34mlr: 9.2614e-06 gnorm: 0.35 [35m[1 day, 17:26:07<1 day, 7:44:22][39m
+[titan] 2025-09-09 11:02:07,070 - root - INFO - [31mstep: 22655 [32mloss: 2.7537 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.50 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7841 [37mglobal_avg_top_loss: 1.9696
+[titan] 2025-09-09 11:02:07,070 - root - INFO - [34mlr: 9.2579e-06 gnorm: 0.35 [35m[1 day, 17:26:39<1 day, 7:43:49][39m
+[titan] 2025-09-09 11:02:38,890 - root - INFO - [31mstep: 22660 [32mloss: 2.7692 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.80 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7933 [37mglobal_avg_top_loss: 1.9759
+[titan] 2025-09-09 11:02:38,891 - root - INFO - [34mlr: 9.2544e-06 gnorm: 0.33 [35m[1 day, 17:27:11<1 day, 7:43:15][39m
+[titan] 2025-09-09 11:03:10,840 - root - INFO - [31mstep: 22665 [32mloss: 2.8392 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.81 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.8259 [37mglobal_avg_top_loss: 2.0133
+[titan] 2025-09-09 11:03:10,840 - root - INFO - [34mlr: 9.2509e-06 gnorm: 0.36 [35m[1 day, 17:27:43<1 day, 7:42:41][39m
+[titan] 2025-09-09 11:03:42,657 - root - INFO - [31mstep: 22670 [32mloss: 2.7571 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.85 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7886 [37mglobal_avg_top_loss: 1.9686
+[titan] 2025-09-09 11:03:42,657 - root - INFO - [34mlr: 9.2474e-06 gnorm: 0.35 [35m[1 day, 17:28:14<1 day, 7:42:07][39m
+[titan] 2025-09-09 11:04:14,528 - root - INFO - [31mstep: 22675 [32mloss: 2.7855 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.03 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7986 [37mglobal_avg_top_loss: 1.9869
+[titan] 2025-09-09 11:04:14,528 - root - INFO - [34mlr: 9.2439e-06 gnorm: 0.34 [35m[1 day, 17:28:46<1 day, 7:41:34][39m
+[titan] 2025-09-09 11:04:46,139 - root - INFO - [31mstep: 22680 [32mloss: 2.7507 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,366 [36mtflops: 494.04 [35mmfu: 49.95%[39m [37mglobal_avg_ntp_loss: 0.7822 [37mglobal_avg_top_loss: 1.9685
+[titan] 2025-09-09 11:04:46,140 - root - INFO - [34mlr: 9.2404e-06 gnorm: 0.34 [35m[1 day, 17:29:18<1 day, 7:41:00][39m
+[titan] 2025-09-09 11:05:18,104 - root - INFO - [31mstep: 22685 [32mloss: 2.7724 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.59 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7908 [37mglobal_avg_top_loss: 1.9816
+[titan] 2025-09-09 11:05:18,104 - root - INFO - [34mlr: 9.2369e-06 gnorm: 0.34 [35m[1 day, 17:29:50<1 day, 7:40:26][39m
+[titan] 2025-09-09 11:05:49,931 - root - INFO - [31mstep: 22690 [32mloss: 2.7801 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.70 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.8051 [37mglobal_avg_top_loss: 1.9750
+[titan] 2025-09-09 11:05:49,931 - root - INFO - [34mlr: 9.2334e-06 gnorm: 0.33 [35m[1 day, 17:30:22<1 day, 7:39:52][39m
+[titan] 2025-09-09 11:06:21,895 - root - INFO - [31mstep: 22695 [32mloss: 2.7162 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.59 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7669 [37mglobal_avg_top_loss: 1.9493
+[titan] 2025-09-09 11:06:21,895 - root - INFO - [34mlr: 9.2299e-06 gnorm: 0.33 [35m[1 day, 17:30:54<1 day, 7:39:19][39m
+[titan] 2025-09-09 11:06:47,431 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:06:53,760 - root - INFO - [31mstep: 22700 [32mloss: 2.8097 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,284 [36mtflops: 490.11 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.8060 [37mglobal_avg_top_loss: 2.0037
+[titan] 2025-09-09 11:06:53,761 - root - INFO - [34mlr: 9.2264e-06 gnorm: 0.34 [35m[1 day, 17:31:26<1 day, 7:38:45][39m
+[titan] 2025-09-09 11:07:25,749 - root - INFO - [31mstep: 22705 [32mloss: 2.7664 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.21 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7870 [37mglobal_avg_top_loss: 1.9794
+[titan] 2025-09-09 11:07:25,750 - root - INFO - [34mlr: 9.2229e-06 gnorm: 0.33 [35m[1 day, 17:31:58<1 day, 7:38:11][39m
+[titan] 2025-09-09 11:07:57,582 - root - INFO - [31mstep: 22710 [32mloss: 2.7659 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.61 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7872 [37mglobal_avg_top_loss: 1.9787
+[titan] 2025-09-09 11:07:57,583 - root - INFO - [34mlr: 9.2194e-06 gnorm: 0.34 [35m[1 day, 17:32:29<1 day, 7:37:38][39m
+[titan] 2025-09-09 11:08:29,518 - root - INFO - [31mstep: 22715 [32mloss: 2.6822 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.03 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7539 [37mglobal_avg_top_loss: 1.9284
+[titan] 2025-09-09 11:08:29,518 - root - INFO - [34mlr: 9.2159e-06 gnorm: 0.38 [35m[1 day, 17:33:01<1 day, 7:37:04][39m
+[titan] 2025-09-09 11:09:01,396 - root - INFO - [31mstep: 22720 [32mloss: 2.7197 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.91 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7646 [37mglobal_avg_top_loss: 1.9552
+[titan] 2025-09-09 11:09:01,396 - root - INFO - [34mlr: 9.2124e-06 gnorm: 0.37 [35m[1 day, 17:33:33<1 day, 7:36:30][39m
+[titan] 2025-09-09 11:09:33,488 - root - INFO - [31mstep: 22725 [32mloss: 2.7337 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.64 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7739 [37mglobal_avg_top_loss: 1.9597
+[titan] 2025-09-09 11:09:33,488 - root - INFO - [34mlr: 9.2089e-06 gnorm: 0.37 [35m[1 day, 17:34:05<1 day, 7:35:57][39m
+[titan] 2025-09-09 11:10:05,420 - root - INFO - [31mstep: 22730 [32mloss: 2.7944 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.07 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8004 [37mglobal_avg_top_loss: 1.9940
+[titan] 2025-09-09 11:10:05,421 - root - INFO - [34mlr: 9.2054e-06 gnorm: 0.34 [35m[1 day, 17:34:37<1 day, 7:35:23][39m
+[titan] 2025-09-09 11:10:37,327 - root - INFO - [31mstep: 22735 [32mloss: 2.8260 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.48 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.8181 [37mglobal_avg_top_loss: 2.0079
+[titan] 2025-09-09 11:10:37,327 - root - INFO - [34mlr: 9.2019e-06 gnorm: 0.35 [35m[1 day, 17:35:09<1 day, 7:34:49][39m
+[titan] 2025-09-09 11:11:09,288 - root - INFO - [31mstep: 22740 [32mloss: 2.7214 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.64 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7718 [37mglobal_avg_top_loss: 1.9496
+[titan] 2025-09-09 11:11:09,288 - root - INFO - [34mlr: 9.1984e-06 gnorm: 0.34 [35m[1 day, 17:35:41<1 day, 7:34:16][39m
+[titan] 2025-09-09 11:11:41,418 - root - INFO - [31mstep: 22745 [32mloss: 2.7742 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,199 [36mtflops: 486.07 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7975 [37mglobal_avg_top_loss: 1.9766
+[titan] 2025-09-09 11:11:41,418 - root - INFO - [34mlr: 9.1949e-06 gnorm: 0.34 [35m[1 day, 17:36:13<1 day, 7:33:42][39m
+[titan] 2025-09-09 11:12:06,924 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:12:13,283 - root - INFO - [31mstep: 22750 [32mloss: 2.8156 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.10 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.8110 [37mglobal_avg_top_loss: 2.0046
+[titan] 2025-09-09 11:12:13,284 - root - INFO - [34mlr: 9.1914e-06 gnorm: 0.34 [35m[1 day, 17:36:45<1 day, 7:33:08][39m
+[titan] 2025-09-09 11:12:45,201 - root - INFO - [31mstep: 22755 [32mloss: 2.6956 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.30 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7568 [37mglobal_avg_top_loss: 1.9388
+[titan] 2025-09-09 11:12:45,202 - root - INFO - [34mlr: 9.1879e-06 gnorm: 0.38 [35m[1 day, 17:37:17<1 day, 7:32:35][39m
+[titan] 2025-09-09 11:13:16,846 - root - INFO - [31mstep: 22760 [32mloss: 2.9061 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,355 [36mtflops: 493.52 [35mmfu: 49.90%[39m [37mglobal_avg_ntp_loss: 0.8713 [37mglobal_avg_top_loss: 2.0348
+[titan] 2025-09-09 11:13:16,847 - root - INFO - [34mlr: 9.1844e-06 gnorm: 0.33 [35m[1 day, 17:37:49<1 day, 7:32:01][39m
+[titan] 2025-09-09 11:13:48,720 - root - INFO - [31mstep: 22765 [32mloss: 2.6836 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.98 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7558 [37mglobal_avg_top_loss: 1.9277
+[titan] 2025-09-09 11:13:48,720 - root - INFO - [34mlr: 9.1809e-06 gnorm: 0.34 [35m[1 day, 17:38:20<1 day, 7:31:27][39m
+[titan] 2025-09-09 11:14:20,695 - root - INFO - [31mstep: 22770 [32mloss: 2.9051 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.42 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.8531 [37mglobal_avg_top_loss: 2.0520
+[titan] 2025-09-09 11:14:20,695 - root - INFO - [34mlr: 9.1774e-06 gnorm: 0.38 [35m[1 day, 17:38:52<1 day, 7:30:53][39m
+[titan] 2025-09-09 11:14:52,570 - root - INFO - [31mstep: 22775 [32mloss: 2.7745 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.96 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7923 [37mglobal_avg_top_loss: 1.9822
+[titan] 2025-09-09 11:14:52,571 - root - INFO - [34mlr: 9.1739e-06 gnorm: 0.35 [35m[1 day, 17:39:24<1 day, 7:30:20][39m
+[titan] 2025-09-09 11:15:24,495 - root - INFO - [31mstep: 22780 [32mloss: 2.7520 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.19 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7901 [37mglobal_avg_top_loss: 1.9619
+[titan] 2025-09-09 11:15:24,496 - root - INFO - [34mlr: 9.1704e-06 gnorm: 0.33 [35m[1 day, 17:39:56<1 day, 7:29:46][39m
+[titan] 2025-09-09 11:15:56,459 - root - INFO - [31mstep: 22785 [32mloss: 2.7848 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.61 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7977 [37mglobal_avg_top_loss: 1.9870
+[titan] 2025-09-09 11:15:56,459 - root - INFO - [34mlr: 9.1669e-06 gnorm: 0.35 [35m[1 day, 17:40:28<1 day, 7:29:12][39m
+[titan] 2025-09-09 11:16:28,305 - root - INFO - [31mstep: 22790 [32mloss: 2.8074 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.41 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.8169 [37mglobal_avg_top_loss: 1.9905
+[titan] 2025-09-09 11:16:28,305 - root - INFO - [34mlr: 9.1634e-06 gnorm: 0.36 [35m[1 day, 17:41:00<1 day, 7:28:39][39m
+[titan] 2025-09-09 11:17:00,082 - root - INFO - [31mstep: 22795 [32mloss: 2.7460 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,312 [36mtflops: 491.46 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.7854 [37mglobal_avg_top_loss: 1.9607
+[titan] 2025-09-09 11:17:00,083 - root - INFO - [34mlr: 9.1599e-06 gnorm: 0.36 [35m[1 day, 17:41:32<1 day, 7:28:05][39m
+[titan] 2025-09-09 11:17:25,548 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:17:31,876 - root - INFO - [31mstep: 22800 [32mloss: 2.7609 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,307 [36mtflops: 491.21 [35mmfu: 49.67%[39m [37mglobal_avg_ntp_loss: 0.7932 [37mglobal_avg_top_loss: 1.9677
+[titan] 2025-09-09 11:17:31,877 - root - INFO - [34mlr: 9.1564e-06 gnorm: 0.34 [35m[1 day, 17:42:04<1 day, 7:27:31][39m
+[titan] 2025-09-09 11:18:03,734 - root - INFO - [31mstep: 22805 [32mloss: 2.7826 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.22 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7975 [37mglobal_avg_top_loss: 1.9850
+[titan] 2025-09-09 11:18:03,735 - root - INFO - [34mlr: 9.1529e-06 gnorm: 0.34 [35m[1 day, 17:42:35<1 day, 7:26:57][39m
+[titan] 2025-09-09 11:18:35,894 - root - INFO - [31mstep: 22810 [32mloss: 2.8022 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,190 [36mtflops: 485.62 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.8065 [37mglobal_avg_top_loss: 1.9957
+[titan] 2025-09-09 11:18:35,894 - root - INFO - [34mlr: 9.1494e-06 gnorm: 0.36 [35m[1 day, 17:43:08<1 day, 7:26:24][39m
+[titan] 2025-09-09 11:19:07,699 - root - INFO - [31mstep: 22815 [32mloss: 2.7565 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.03 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7833 [37mglobal_avg_top_loss: 1.9732
+[titan] 2025-09-09 11:19:07,700 - root - INFO - [34mlr: 9.1460e-06 gnorm: 0.34 [35m[1 day, 17:43:39<1 day, 7:25:50][39m
+[titan] 2025-09-09 11:19:39,497 - root - INFO - [31mstep: 22820 [32mloss: 2.7649 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,305 [36mtflops: 491.15 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7868 [37mglobal_avg_top_loss: 1.9781
+[titan] 2025-09-09 11:19:39,497 - root - INFO - [34mlr: 9.1425e-06 gnorm: 0.34 [35m[1 day, 17:44:11<1 day, 7:25:16][39m
+[titan] 2025-09-09 11:20:11,555 - root - INFO - [31mstep: 22825 [32mloss: 2.8329 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,222 [36mtflops: 487.17 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.8225 [37mglobal_avg_top_loss: 2.0104
+[titan] 2025-09-09 11:20:11,555 - root - INFO - [34mlr: 9.1390e-06 gnorm: 0.33 [35m[1 day, 17:44:43<1 day, 7:24:43][39m
+[titan] 2025-09-09 11:20:43,313 - root - INFO - [31mstep: 22830 [32mloss: 2.7845 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,318 [36mtflops: 491.75 [35mmfu: 49.72%[39m [37mglobal_avg_ntp_loss: 0.7975 [37mglobal_avg_top_loss: 1.9870
+[titan] 2025-09-09 11:20:43,314 - root - INFO - [34mlr: 9.1355e-06 gnorm: 0.34 [35m[1 day, 17:45:15<1 day, 7:24:09][39m
+[titan] 2025-09-09 11:21:15,287 - root - INFO - [31mstep: 22835 [32mloss: 2.7507 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.44 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7929 [37mglobal_avg_top_loss: 1.9578
+[titan] 2025-09-09 11:21:15,288 - root - INFO - [34mlr: 9.1320e-06 gnorm: 0.33 [35m[1 day, 17:45:47<1 day, 7:23:35][39m
+[titan] 2025-09-09 11:21:47,194 - root - INFO - [31mstep: 22840 [32mloss: 2.7794 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.47 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7953 [37mglobal_avg_top_loss: 1.9841
+[titan] 2025-09-09 11:21:47,195 - root - INFO - [34mlr: 9.1285e-06 gnorm: 0.36 [35m[1 day, 17:46:19<1 day, 7:23:02][39m
+[titan] 2025-09-09 11:22:19,156 - root - INFO - [31mstep: 22845 [32mloss: 2.7638 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.64 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7921 [37mglobal_avg_top_loss: 1.9717
+[titan] 2025-09-09 11:22:19,156 - root - INFO - [34mlr: 9.1250e-06 gnorm: 0.37 [35m[1 day, 17:46:51<1 day, 7:22:28][39m
+[titan] 2025-09-09 11:22:44,617 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:22:51,039 - root - INFO - [31mstep: 22850 [32mloss: 2.8019 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,278 [36mtflops: 489.83 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.8064 [37mglobal_avg_top_loss: 1.9954
+[titan] 2025-09-09 11:22:51,040 - root - INFO - [34mlr: 9.1215e-06 gnorm: 0.35 [35m[1 day, 17:47:23<1 day, 7:21:54][39m
+[titan] 2025-09-09 11:23:22,972 - root - INFO - [31mstep: 22855 [32mloss: 3.1268 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.07 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 1.0072 [37mglobal_avg_top_loss: 2.1197
+[titan] 2025-09-09 11:23:22,972 - root - INFO - [34mlr: 9.1180e-06 gnorm: 0.35 [35m[1 day, 17:47:55<1 day, 7:21:21][39m
+[titan] 2025-09-09 11:23:54,718 - root - INFO - [31mstep: 22860 [32mloss: 2.7160 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,322 [36mtflops: 491.95 [35mmfu: 49.74%[39m [37mglobal_avg_ntp_loss: 0.7664 [37mglobal_avg_top_loss: 1.9496
+[titan] 2025-09-09 11:23:54,718 - root - INFO - [34mlr: 9.1145e-06 gnorm: 0.35 [35m[1 day, 17:48:26<1 day, 7:20:47][39m
+[titan] 2025-09-09 11:24:26,551 - root - INFO - [31mstep: 22865 [32mloss: 2.7752 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.60 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7947 [37mglobal_avg_top_loss: 1.9805
+[titan] 2025-09-09 11:24:26,552 - root - INFO - [34mlr: 9.1110e-06 gnorm: 0.34 [35m[1 day, 17:48:58<1 day, 7:20:13][39m
+[titan] 2025-09-09 11:24:58,450 - root - INFO - [31mstep: 22870 [32mloss: 2.7930 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.59 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8056 [37mglobal_avg_top_loss: 1.9874
+[titan] 2025-09-09 11:24:58,451 - root - INFO - [34mlr: 9.1075e-06 gnorm: 0.35 [35m[1 day, 17:49:30<1 day, 7:19:39][39m
+[titan] 2025-09-09 11:25:30,304 - root - INFO - [31mstep: 22875 [32mloss: 2.5558 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.29 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.6957 [37mglobal_avg_top_loss: 1.8601
+[titan] 2025-09-09 11:25:30,304 - root - INFO - [34mlr: 9.1041e-06 gnorm: 0.39 [35m[1 day, 17:50:02<1 day, 7:19:06][39m
+[titan] 2025-09-09 11:26:02,104 - root - INFO - [31mstep: 22880 [32mloss: 2.6741 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,305 [36mtflops: 491.11 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7454 [37mglobal_avg_top_loss: 1.9287
+[titan] 2025-09-09 11:26:02,105 - root - INFO - [34mlr: 9.1006e-06 gnorm: 0.51 [35m[1 day, 17:50:34<1 day, 7:18:32][39m
+[titan] 2025-09-09 11:26:33,809 - root - INFO - [31mstep: 22885 [32mloss: 2.7962 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,336 [36mtflops: 492.59 [35mmfu: 49.81%[39m [37mglobal_avg_ntp_loss: 0.8071 [37mglobal_avg_top_loss: 1.9891
+[titan] 2025-09-09 11:26:33,809 - root - INFO - [34mlr: 9.0971e-06 gnorm: 0.35 [35m[1 day, 17:51:06<1 day, 7:17:58][39m
+[titan] 2025-09-09 11:27:05,669 - root - INFO - [31mstep: 22890 [32mloss: 2.8700 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.19 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.8373 [37mglobal_avg_top_loss: 2.0327
+[titan] 2025-09-09 11:27:05,669 - root - INFO - [34mlr: 9.0936e-06 gnorm: 0.37 [35m[1 day, 17:51:37<1 day, 7:17:24][39m
+[titan] 2025-09-09 11:27:37,617 - root - INFO - [31mstep: 22895 [32mloss: 2.8032 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.83 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.8068 [37mglobal_avg_top_loss: 1.9964
+[titan] 2025-09-09 11:27:37,618 - root - INFO - [34mlr: 9.0901e-06 gnorm: 0.35 [35m[1 day, 17:52:09<1 day, 7:16:51][39m
+[titan] 2025-09-09 11:28:03,041 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:28:09,363 - root - INFO - [31mstep: 22900 [32mloss: 2.8294 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,322 [36mtflops: 491.95 [35mmfu: 49.74%[39m [37mglobal_avg_ntp_loss: 0.8210 [37mglobal_avg_top_loss: 2.0084
+[titan] 2025-09-09 11:28:09,364 - root - INFO - [34mlr: 9.0866e-06 gnorm: 0.35 [35m[1 day, 17:52:41<1 day, 7:16:17][39m
+[titan] 2025-09-09 11:28:41,201 - root - INFO - [31mstep: 22905 [32mloss: 2.7928 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.53 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.8006 [37mglobal_avg_top_loss: 1.9921
+[titan] 2025-09-09 11:28:41,202 - root - INFO - [34mlr: 9.0831e-06 gnorm: 0.33 [35m[1 day, 17:53:13<1 day, 7:15:43][39m
+[titan] 2025-09-09 11:29:12,985 - root - INFO - [31mstep: 22910 [32mloss: 2.7151 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,310 [36mtflops: 491.37 [35mmfu: 49.68%[39m [37mglobal_avg_ntp_loss: 0.7683 [37mglobal_avg_top_loss: 1.9468
+[titan] 2025-09-09 11:29:12,985 - root - INFO - [34mlr: 9.0796e-06 gnorm: 0.35 [35m[1 day, 17:53:45<1 day, 7:15:09][39m
+[titan] 2025-09-09 11:29:44,871 - root - INFO - [31mstep: 22915 [32mloss: 2.8219 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.79 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8126 [37mglobal_avg_top_loss: 2.0093
+[titan] 2025-09-09 11:29:44,871 - root - INFO - [34mlr: 9.0761e-06 gnorm: 0.36 [35m[1 day, 17:54:17<1 day, 7:14:36][39m
+[titan] 2025-09-09 11:30:16,700 - root - INFO - [31mstep: 22920 [32mloss: 2.7748 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.66 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7942 [37mglobal_avg_top_loss: 1.9806
+[titan] 2025-09-09 11:30:16,700 - root - INFO - [34mlr: 9.0727e-06 gnorm: 0.37 [35m[1 day, 17:54:48<1 day, 7:14:02][39m
+[titan] 2025-09-09 11:30:48,436 - root - INFO - [31mstep: 22925 [32mloss: 2.7282 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,325 [36mtflops: 492.10 [35mmfu: 49.76%[39m [37mglobal_avg_ntp_loss: 0.7720 [37mglobal_avg_top_loss: 1.9562
+[titan] 2025-09-09 11:30:48,437 - root - INFO - [34mlr: 9.0692e-06 gnorm: 0.35 [35m[1 day, 17:55:20<1 day, 7:13:28][39m
+[titan] 2025-09-09 11:31:20,306 - root - INFO - [31mstep: 22930 [32mloss: 2.8364 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.05 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.8199 [37mglobal_avg_top_loss: 2.0165
+[titan] 2025-09-09 11:31:20,306 - root - INFO - [34mlr: 9.0657e-06 gnorm: 0.39 [35m[1 day, 17:55:52<1 day, 7:12:55][39m
+[titan] 2025-09-09 11:31:52,015 - root - INFO - [31mstep: 22935 [32mloss: 2.6347 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,334 [36mtflops: 492.52 [35mmfu: 49.80%[39m [37mglobal_avg_ntp_loss: 0.7312 [37mglobal_avg_top_loss: 1.9035
+[titan] 2025-09-09 11:31:52,015 - root - INFO - [34mlr: 9.0622e-06 gnorm: 0.33 [35m[1 day, 17:56:24<1 day, 7:12:21][39m
+[titan] 2025-09-09 11:32:23,867 - root - INFO - [31mstep: 22940 [32mloss: 2.7362 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.31 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7749 [37mglobal_avg_top_loss: 1.9613
+[titan] 2025-09-09 11:32:23,868 - root - INFO - [34mlr: 9.0587e-06 gnorm: 0.34 [35m[1 day, 17:56:56<1 day, 7:11:47][39m
+[titan] 2025-09-09 11:32:55,767 - root - INFO - [31mstep: 22945 [32mloss: 2.7867 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.58 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7998 [37mglobal_avg_top_loss: 1.9869
+[titan] 2025-09-09 11:32:55,768 - root - INFO - [34mlr: 9.0552e-06 gnorm: 0.34 [35m[1 day, 17:57:27<1 day, 7:11:13][39m
+[titan] 2025-09-09 11:33:21,346 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:33:27,657 - root - INFO - [31mstep: 22950 [32mloss: 2.8209 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.73 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8159 [37mglobal_avg_top_loss: 2.0050
+[titan] 2025-09-09 11:33:27,657 - root - INFO - [34mlr: 9.0517e-06 gnorm: 0.34 [35m[1 day, 17:57:59<1 day, 7:10:40][39m
+[titan] 2025-09-09 11:33:59,558 - root - INFO - [31mstep: 22955 [32mloss: 2.7284 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.56 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7689 [37mglobal_avg_top_loss: 1.9596
+[titan] 2025-09-09 11:33:59,559 - root - INFO - [34mlr: 9.0483e-06 gnorm: 0.33 [35m[1 day, 17:58:31<1 day, 7:10:06][39m
+[titan] 2025-09-09 11:34:31,523 - root - INFO - [31mstep: 22960 [32mloss: 2.6068 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.58 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7167 [37mglobal_avg_top_loss: 1.8901
+[titan] 2025-09-09 11:34:31,523 - root - INFO - [34mlr: 9.0448e-06 gnorm: 0.41 [35m[1 day, 17:59:03<1 day, 7:09:32][39m
+[titan] 2025-09-09 11:35:03,263 - root - INFO - [31mstep: 22965 [32mloss: 2.7925 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,324 [36mtflops: 492.04 [35mmfu: 49.75%[39m [37mglobal_avg_ntp_loss: 0.8003 [37mglobal_avg_top_loss: 1.9922
+[titan] 2025-09-09 11:35:03,264 - root - INFO - [34mlr: 9.0413e-06 gnorm: 0.34 [35m[1 day, 17:59:35<1 day, 7:08:59][39m
+[titan] 2025-09-09 11:35:35,175 - root - INFO - [31mstep: 22970 [32mloss: 2.6798 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.39 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7490 [37mglobal_avg_top_loss: 1.9308
+[titan] 2025-09-09 11:35:35,176 - root - INFO - [34mlr: 9.0378e-06 gnorm: 0.34 [35m[1 day, 18:00:07<1 day, 7:08:25][39m
+[titan] 2025-09-09 11:36:07,033 - root - INFO - [31mstep: 22975 [32mloss: 2.7753 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.22 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7944 [37mglobal_avg_top_loss: 1.9809
+[titan] 2025-09-09 11:36:07,034 - root - INFO - [34mlr: 9.0343e-06 gnorm: 0.34 [35m[1 day, 18:00:39<1 day, 7:07:51][39m
+[titan] 2025-09-09 11:36:38,804 - root - INFO - [31mstep: 22980 [32mloss: 2.7525 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,314 [36mtflops: 491.56 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.7825 [37mglobal_avg_top_loss: 1.9700
+[titan] 2025-09-09 11:36:38,805 - root - INFO - [34mlr: 9.0308e-06 gnorm: 0.34 [35m[1 day, 18:01:11<1 day, 7:07:17][39m
+[titan] 2025-09-09 11:37:10,727 - root - INFO - [31mstep: 22985 [32mloss: 2.7750 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.22 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7942 [37mglobal_avg_top_loss: 1.9808
+[titan] 2025-09-09 11:37:10,728 - root - INFO - [34mlr: 9.0274e-06 gnorm: 0.33 [35m[1 day, 18:01:42<1 day, 7:06:44][39m
+[titan] 2025-09-09 11:37:42,724 - root - INFO - [31mstep: 22990 [32mloss: 2.7206 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.09 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7689 [37mglobal_avg_top_loss: 1.9517
+[titan] 2025-09-09 11:37:42,725 - root - INFO - [34mlr: 9.0239e-06 gnorm: 0.34 [35m[1 day, 18:02:14<1 day, 7:06:10][39m
+[titan] 2025-09-09 11:38:14,616 - root - INFO - [31mstep: 22995 [32mloss: 2.7913 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.71 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8000 [37mglobal_avg_top_loss: 1.9913
+[titan] 2025-09-09 11:38:14,616 - root - INFO - [34mlr: 9.0204e-06 gnorm: 0.34 [35m[1 day, 18:02:46<1 day, 7:05:37][39m
+[titan] 2025-09-09 11:38:40,130 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:38:46,600 - root - INFO - [31mstep: 23000 [32mloss: 2.7613 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,245 [36mtflops: 488.29 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7908 [37mglobal_avg_top_loss: 1.9706
+[titan] 2025-09-09 11:38:46,600 - root - INFO - [34mlr: 9.0169e-06 gnorm: 0.34 [35m[1 day, 18:03:18<1 day, 7:05:03][39m
+[titan] 2025-09-09 11:39:18,600 - root - INFO - [31mstep: 23005 [32mloss: 2.8584 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.05 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.8325 [37mglobal_avg_top_loss: 2.0259
+[titan] 2025-09-09 11:39:18,600 - root - INFO - [34mlr: 9.0134e-06 gnorm: 0.36 [35m[1 day, 18:03:50<1 day, 7:04:29][39m
+[titan] 2025-09-09 11:39:50,502 - root - INFO - [31mstep: 23010 [32mloss: 3.0171 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.53 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.9313 [37mglobal_avg_top_loss: 2.0859
+[titan] 2025-09-09 11:39:50,503 - root - INFO - [34mlr: 9.0099e-06 gnorm: 0.35 [35m[1 day, 18:04:22<1 day, 7:03:56][39m
+[titan] 2025-09-09 11:40:22,576 - root - INFO - [31mstep: 23015 [32mloss: 2.7704 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.93 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7901 [37mglobal_avg_top_loss: 1.9803
+[titan] 2025-09-09 11:40:22,576 - root - INFO - [34mlr: 9.0065e-06 gnorm: 0.48 [35m[1 day, 18:04:54<1 day, 7:03:22][39m
+[titan] 2025-09-09 11:40:54,414 - root - INFO - [31mstep: 23020 [32mloss: 2.8252 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.53 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.8169 [37mglobal_avg_top_loss: 2.0083
+[titan] 2025-09-09 11:40:54,414 - root - INFO - [34mlr: 9.0030e-06 gnorm: 0.40 [35m[1 day, 18:05:26<1 day, 7:02:48][39m
+[titan] 2025-09-09 11:41:26,341 - root - INFO - [31mstep: 23025 [32mloss: 2.7760 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.16 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7956 [37mglobal_avg_top_loss: 1.9804
+[titan] 2025-09-09 11:41:26,342 - root - INFO - [34mlr: 8.9995e-06 gnorm: 0.38 [35m[1 day, 18:05:58<1 day, 7:02:15][39m
+[titan] 2025-09-09 11:41:58,349 - root - INFO - [31mstep: 23030 [32mloss: 3.0877 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.93 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.9368 [37mglobal_avg_top_loss: 2.1509
+[titan] 2025-09-09 11:41:58,349 - root - INFO - [34mlr: 8.9960e-06 gnorm: 0.45 [35m[1 day, 18:06:30<1 day, 7:01:41][39m
+[titan] 2025-09-09 11:42:30,275 - root - INFO - [31mstep: 23035 [32mloss: 2.6858 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.18 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7464 [37mglobal_avg_top_loss: 1.9394
+[titan] 2025-09-09 11:42:30,275 - root - INFO - [34mlr: 8.9925e-06 gnorm: 0.47 [35m[1 day, 18:07:02<1 day, 7:01:08][39m
+[titan] 2025-09-09 11:43:02,324 - root - INFO - [31mstep: 23040 [32mloss: 2.7471 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.29 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7782 [37mglobal_avg_top_loss: 1.9689
+[titan] 2025-09-09 11:43:02,325 - root - INFO - [34mlr: 8.9891e-06 gnorm: 0.34 [35m[1 day, 18:07:34<1 day, 7:00:34][39m
+[titan] 2025-09-09 11:43:02,609 - root - INFO - Dumping profiler traces at step 23040
+[titan] 2025-09-09 11:43:02,679 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 11:43:34,417 - root - INFO - [31mstep: 23045 [32mloss: 2.7461 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.63 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7832 [37mglobal_avg_top_loss: 1.9629
+[titan] 2025-09-09 11:43:34,418 - root - INFO - [34mlr: 8.9856e-06 gnorm: 0.37 [35m[1 day, 18:08:06<1 day, 7:00:01][39m
+[titan] 2025-09-09 11:43:59,967 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:44:06,339 - root - INFO - [31mstep: 23050 [32mloss: 2.7869 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.24 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.8034 [37mglobal_avg_top_loss: 1.9834
+[titan] 2025-09-09 11:44:06,340 - root - INFO - [34mlr: 8.9821e-06 gnorm: 0.36 [35m[1 day, 18:08:38<1 day, 6:59:27][39m
+[titan] 2025-09-09 11:44:38,083 - root - INFO - [31mstep: 23055 [32mloss: 2.6814 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,323 [36mtflops: 491.98 [35mmfu: 49.75%[39m [37mglobal_avg_ntp_loss: 0.7484 [37mglobal_avg_top_loss: 1.9330
+[titan] 2025-09-09 11:44:38,084 - root - INFO - [34mlr: 8.9786e-06 gnorm: 0.35 [35m[1 day, 18:09:10<1 day, 6:58:53][39m
+[titan] 2025-09-09 11:45:10,114 - root - INFO - [31mstep: 23060 [32mloss: 2.8875 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,231 [36mtflops: 487.58 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.8427 [37mglobal_avg_top_loss: 2.0448
+[titan] 2025-09-09 11:45:10,114 - root - INFO - [34mlr: 8.9751e-06 gnorm: 0.35 [35m[1 day, 18:09:42<1 day, 6:58:20][39m
+[titan] 2025-09-09 11:45:41,908 - root - INFO - [31mstep: 23065 [32mloss: 2.7740 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,307 [36mtflops: 491.21 [35mmfu: 49.67%[39m [37mglobal_avg_ntp_loss: 0.7953 [37mglobal_avg_top_loss: 1.9787
+[titan] 2025-09-09 11:45:41,908 - root - INFO - [34mlr: 8.9717e-06 gnorm: 0.38 [35m[1 day, 18:10:14<1 day, 6:57:46][39m
+[titan] 2025-09-09 11:46:13,769 - root - INFO - [31mstep: 23070 [32mloss: 2.8134 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.18 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.8113 [37mglobal_avg_top_loss: 2.0021
+[titan] 2025-09-09 11:46:13,769 - root - INFO - [34mlr: 8.9682e-06 gnorm: 0.34 [35m[1 day, 18:10:45<1 day, 6:57:12][39m
+[titan] 2025-09-09 11:46:45,656 - root - INFO - [31mstep: 23075 [32mloss: 2.8088 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.76 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8091 [37mglobal_avg_top_loss: 1.9996
+[titan] 2025-09-09 11:46:45,657 - root - INFO - [34mlr: 8.9647e-06 gnorm: 0.37 [35m[1 day, 18:11:17<1 day, 6:56:39][39m
+[titan] 2025-09-09 11:47:17,644 - root - INFO - [31mstep: 23080 [32mloss: 2.7354 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.24 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7726 [37mglobal_avg_top_loss: 1.9629
+[titan] 2025-09-09 11:47:17,644 - root - INFO - [34mlr: 8.9612e-06 gnorm: 0.35 [35m[1 day, 18:11:49<1 day, 6:56:05][39m
+[titan] 2025-09-09 11:47:49,422 - root - INFO - [31mstep: 23085 [32mloss: 2.8107 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,312 [36mtflops: 491.46 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.8102 [37mglobal_avg_top_loss: 2.0005
+[titan] 2025-09-09 11:47:49,422 - root - INFO - [34mlr: 8.9578e-06 gnorm: 0.35 [35m[1 day, 18:12:21<1 day, 6:55:31][39m
+[titan] 2025-09-09 11:48:21,076 - root - INFO - [31mstep: 23090 [32mloss: 2.7969 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,352 [36mtflops: 493.37 [35mmfu: 49.89%[39m [37mglobal_avg_ntp_loss: 0.8008 [37mglobal_avg_top_loss: 1.9961
+[titan] 2025-09-09 11:48:21,077 - root - INFO - [34mlr: 8.9543e-06 gnorm: 0.35 [35m[1 day, 18:12:53<1 day, 6:54:57][39m
+[titan] 2025-09-09 11:48:53,000 - root - INFO - [31mstep: 23095 [32mloss: 2.7270 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.22 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7717 [37mglobal_avg_top_loss: 1.9553
+[titan] 2025-09-09 11:48:53,000 - root - INFO - [34mlr: 8.9508e-06 gnorm: 0.35 [35m[1 day, 18:13:25<1 day, 6:54:24][39m
+[titan] 2025-09-09 11:49:18,548 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:49:24,954 - root - INFO - [31mstep: 23100 [32mloss: 2.7597 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.75 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7865 [37mglobal_avg_top_loss: 1.9731
+[titan] 2025-09-09 11:49:24,954 - root - INFO - [34mlr: 8.9473e-06 gnorm: 0.36 [35m[1 day, 18:13:57<1 day, 6:53:50][39m
+[titan] 2025-09-09 11:49:56,770 - root - INFO - [31mstep: 23105 [32mloss: 2.6881 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.86 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7603 [37mglobal_avg_top_loss: 1.9278
+[titan] 2025-09-09 11:49:56,771 - root - INFO - [34mlr: 8.9439e-06 gnorm: 0.43 [35m[1 day, 18:14:28<1 day, 6:53:16][39m
+[titan] 2025-09-09 11:50:28,605 - root - INFO - [31mstep: 23110 [32mloss: 2.7222 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.59 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7715 [37mglobal_avg_top_loss: 1.9506
+[titan] 2025-09-09 11:50:28,605 - root - INFO - [34mlr: 8.9404e-06 gnorm: 0.38 [35m[1 day, 18:15:00<1 day, 6:52:43][39m
+[titan] 2025-09-09 11:51:00,852 - root - INFO - [31mstep: 23115 [32mloss: 3.1842 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,162 [36mtflops: 484.30 [35mmfu: 48.97%[39m [37mglobal_avg_ntp_loss: 1.0304 [37mglobal_avg_top_loss: 2.1538
+[titan] 2025-09-09 11:51:00,852 - root - INFO - [34mlr: 8.9369e-06 gnorm: 0.34 [35m[1 day, 18:15:33<1 day, 6:52:09][39m
+[titan] 2025-09-09 11:51:32,818 - root - INFO - [31mstep: 23120 [32mloss: 2.7098 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.56 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7635 [37mglobal_avg_top_loss: 1.9463
+[titan] 2025-09-09 11:51:32,818 - root - INFO - [34mlr: 8.9334e-06 gnorm: 0.34 [35m[1 day, 18:16:04<1 day, 6:51:36][39m
+[titan] 2025-09-09 11:52:04,703 - root - INFO - [31mstep: 23125 [32mloss: 2.7399 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.81 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7820 [37mglobal_avg_top_loss: 1.9579
+[titan] 2025-09-09 11:52:04,703 - root - INFO - [34mlr: 8.9300e-06 gnorm: 0.39 [35m[1 day, 18:16:36<1 day, 6:51:02][39m
+[titan] 2025-09-09 11:52:36,806 - root - INFO - [31mstep: 23130 [32mloss: 2.7554 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.47 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7868 [37mglobal_avg_top_loss: 1.9686
+[titan] 2025-09-09 11:52:36,807 - root - INFO - [34mlr: 8.9265e-06 gnorm: 0.37 [35m[1 day, 18:17:08<1 day, 6:50:29][39m
+[titan] 2025-09-09 11:53:08,936 - root - INFO - [31mstep: 23135 [32mloss: 2.8058 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,199 [36mtflops: 486.07 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.8041 [37mglobal_avg_top_loss: 2.0017
+[titan] 2025-09-09 11:53:08,937 - root - INFO - [34mlr: 8.9230e-06 gnorm: 0.35 [35m[1 day, 18:17:41<1 day, 6:49:55][39m
+[titan] 2025-09-09 11:53:40,833 - root - INFO - [31mstep: 23140 [32mloss: 2.7882 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.63 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7986 [37mglobal_avg_top_loss: 1.9897
+[titan] 2025-09-09 11:53:40,833 - root - INFO - [34mlr: 8.9195e-06 gnorm: 0.38 [35m[1 day, 18:18:12<1 day, 6:49:21][39m
+[titan] 2025-09-09 11:54:12,844 - root - INFO - [31mstep: 23145 [32mloss: 2.6345 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.87 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7333 [37mglobal_avg_top_loss: 1.9012
+[titan] 2025-09-09 11:54:12,845 - root - INFO - [34mlr: 8.9161e-06 gnorm: 0.35 [35m[1 day, 18:18:45<1 day, 6:48:48][39m
+[titan] 2025-09-09 11:54:38,348 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 11:54:44,752 - root - INFO - [31mstep: 23150 [32mloss: 2.7469 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.46 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7794 [37mglobal_avg_top_loss: 1.9675
+[titan] 2025-09-09 11:54:44,753 - root - INFO - [34mlr: 8.9126e-06 gnorm: 0.34 [35m[1 day, 18:19:16<1 day, 6:48:14][39m
+[titan] 2025-09-09 11:55:16,689 - root - INFO - [31mstep: 23155 [32mloss: 2.7659 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.01 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7974 [37mglobal_avg_top_loss: 1.9685
+[titan] 2025-09-09 11:55:16,689 - root - INFO - [34mlr: 8.9091e-06 gnorm: 0.36 [35m[1 day, 18:19:48<1 day, 6:47:41][39m
+[titan] 2025-09-09 11:55:48,590 - root - INFO - [31mstep: 23160 [32mloss: 2.7921 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.56 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8010 [37mglobal_avg_top_loss: 1.9911
+[titan] 2025-09-09 11:55:48,590 - root - INFO - [34mlr: 8.9056e-06 gnorm: 0.35 [35m[1 day, 18:20:20<1 day, 6:47:07][39m
+[titan] 2025-09-09 11:56:20,500 - root - INFO - [31mstep: 23165 [32mloss: 2.8496 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.41 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.8365 [37mglobal_avg_top_loss: 2.0131
+[titan] 2025-09-09 11:56:20,501 - root - INFO - [34mlr: 8.9022e-06 gnorm: 0.34 [35m[1 day, 18:20:52<1 day, 6:46:33][39m
+[titan] 2025-09-09 11:56:52,479 - root - INFO - [31mstep: 23170 [32mloss: 2.7570 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.37 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7863 [37mglobal_avg_top_loss: 1.9707
+[titan] 2025-09-09 11:56:52,480 - root - INFO - [34mlr: 8.8987e-06 gnorm: 0.36 [35m[1 day, 18:21:24<1 day, 6:46:00][39m
+[titan] 2025-09-09 11:57:24,376 - root - INFO - [31mstep: 23175 [32mloss: 2.7902 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.63 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.8023 [37mglobal_avg_top_loss: 1.9879
+[titan] 2025-09-09 11:57:24,376 - root - INFO - [34mlr: 8.8952e-06 gnorm: 0.37 [35m[1 day, 18:21:56<1 day, 6:45:26][39m
+[titan] 2025-09-09 11:57:56,644 - root - INFO - [31mstep: 23180 [32mloss: 2.6998 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,155 [36mtflops: 483.99 [35mmfu: 48.94%[39m [37mglobal_avg_ntp_loss: 0.7586 [37mglobal_avg_top_loss: 1.9412
+[titan] 2025-09-09 11:57:56,644 - root - INFO - [34mlr: 8.8918e-06 gnorm: 0.36 [35m[1 day, 18:22:28<1 day, 6:44:53][39m
+[titan] 2025-09-09 11:58:28,425 - root - INFO - [31mstep: 23185 [32mloss: 2.7027 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,311 [36mtflops: 491.40 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.7590 [37mglobal_avg_top_loss: 1.9437
+[titan] 2025-09-09 11:58:28,426 - root - INFO - [34mlr: 8.8883e-06 gnorm: 0.34 [35m[1 day, 18:23:00<1 day, 6:44:19][39m
+[titan] 2025-09-09 11:59:00,240 - root - INFO - [31mstep: 23190 [32mloss: 2.7335 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.89 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7693 [37mglobal_avg_top_loss: 1.9642
+[titan] 2025-09-09 11:59:00,241 - root - INFO - [34mlr: 8.8848e-06 gnorm: 0.35 [35m[1 day, 18:23:32<1 day, 6:43:45][39m
+[titan] 2025-09-09 11:59:32,237 - root - INFO - [31mstep: 23195 [32mloss: 3.1848 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.10 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 1.0295 [37mglobal_avg_top_loss: 2.1552
+[titan] 2025-09-09 11:59:32,237 - root - INFO - [34mlr: 8.8813e-06 gnorm: 0.34 [35m[1 day, 18:24:04<1 day, 6:43:12][39m
+[titan] 2025-09-09 11:59:57,814 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:00:04,191 - root - INFO - [31mstep: 23200 [32mloss: 2.8072 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.74 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.8083 [37mglobal_avg_top_loss: 1.9988
+[titan] 2025-09-09 12:00:04,192 - root - INFO - [34mlr: 8.8779e-06 gnorm: 0.37 [35m[1 day, 18:24:36<1 day, 6:42:38][39m
+[titan] 2025-09-09 12:00:36,129 - root - INFO - [31mstep: 23205 [32mloss: 2.8785 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,260 [36mtflops: 489.00 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8585 [37mglobal_avg_top_loss: 2.0200
+[titan] 2025-09-09 12:00:36,130 - root - INFO - [34mlr: 8.8744e-06 gnorm: 1.67 [35m[1 day, 18:25:08<1 day, 6:42:05][39m
+[titan] 2025-09-09 12:01:08,076 - root - INFO - [31mstep: 23210 [32mloss: 2.8774 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.85 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.8501 [37mglobal_avg_top_loss: 2.0273
+[titan] 2025-09-09 12:01:08,077 - root - INFO - [34mlr: 8.8709e-06 gnorm: 0.56 [35m[1 day, 18:25:40<1 day, 6:41:31][39m
+[titan] 2025-09-09 12:01:39,999 - root - INFO - [31mstep: 23215 [32mloss: 2.7257 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.23 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7716 [37mglobal_avg_top_loss: 1.9541
+[titan] 2025-09-09 12:01:40,000 - root - INFO - [34mlr: 8.8675e-06 gnorm: 0.33 [35m[1 day, 18:26:12<1 day, 6:40:57][39m
+[titan] 2025-09-09 12:02:11,927 - root - INFO - [31mstep: 23220 [32mloss: 2.7426 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.15 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7794 [37mglobal_avg_top_loss: 1.9631
+[titan] 2025-09-09 12:02:11,928 - root - INFO - [34mlr: 8.8640e-06 gnorm: 0.39 [35m[1 day, 18:26:44<1 day, 6:40:24][39m
+[titan] 2025-09-09 12:02:43,813 - root - INFO - [31mstep: 23225 [32mloss: 2.7552 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.79 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7856 [37mglobal_avg_top_loss: 1.9696
+[titan] 2025-09-09 12:02:43,814 - root - INFO - [34mlr: 8.8605e-06 gnorm: 0.38 [35m[1 day, 18:27:15<1 day, 6:39:50][39m
+[titan] 2025-09-09 12:03:15,806 - root - INFO - [31mstep: 23230 [32mloss: 2.8304 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.16 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8263 [37mglobal_avg_top_loss: 2.0041
+[titan] 2025-09-09 12:03:15,806 - root - INFO - [34mlr: 8.8571e-06 gnorm: 0.42 [35m[1 day, 18:27:47<1 day, 6:39:17][39m
+[titan] 2025-09-09 12:03:47,624 - root - INFO - [31mstep: 23235 [32mloss: 2.8164 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.84 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.8141 [37mglobal_avg_top_loss: 2.0023
+[titan] 2025-09-09 12:03:47,624 - root - INFO - [34mlr: 8.8536e-06 gnorm: 0.35 [35m[1 day, 18:28:19<1 day, 6:38:43][39m
+[titan] 2025-09-09 12:04:19,567 - root - INFO - [31mstep: 23240 [32mloss: 2.7515 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.92 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7795 [37mglobal_avg_top_loss: 1.9721
+[titan] 2025-09-09 12:04:19,567 - root - INFO - [34mlr: 8.8501e-06 gnorm: 0.38 [35m[1 day, 18:28:51<1 day, 6:38:09][39m
+[titan] 2025-09-09 12:04:51,346 - root - INFO - [31mstep: 23245 [32mloss: 2.6992 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,311 [36mtflops: 491.43 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.7580 [37mglobal_avg_top_loss: 1.9413
+[titan] 2025-09-09 12:04:51,347 - root - INFO - [34mlr: 8.8467e-06 gnorm: 0.42 [35m[1 day, 18:29:23<1 day, 6:37:36][39m
+[titan] 2025-09-09 12:05:17,083 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:05:23,483 - root - INFO - [31mstep: 23250 [32mloss: 2.7266 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,197 [36mtflops: 485.98 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.7712 [37mglobal_avg_top_loss: 1.9554
+[titan] 2025-09-09 12:05:23,483 - root - INFO - [34mlr: 8.8432e-06 gnorm: 0.36 [35m[1 day, 18:29:55<1 day, 6:37:02][39m
+[titan] 2025-09-09 12:05:55,347 - root - INFO - [31mstep: 23255 [32mloss: 2.7597 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,284 [36mtflops: 490.11 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7876 [37mglobal_avg_top_loss: 1.9721
+[titan] 2025-09-09 12:05:55,348 - root - INFO - [34mlr: 8.8397e-06 gnorm: 0.37 [35m[1 day, 18:30:27<1 day, 6:36:29][39m
+[titan] 2025-09-09 12:06:27,342 - root - INFO - [31mstep: 23260 [32mloss: 2.7848 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.13 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8092 [37mglobal_avg_top_loss: 1.9756
+[titan] 2025-09-09 12:06:27,342 - root - INFO - [34mlr: 8.8363e-06 gnorm: 0.34 [35m[1 day, 18:30:59<1 day, 6:35:55][39m
+[titan] 2025-09-09 12:06:59,311 - root - INFO - [31mstep: 23265 [32mloss: 2.6614 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.52 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7415 [37mglobal_avg_top_loss: 1.9199
+[titan] 2025-09-09 12:06:59,311 - root - INFO - [34mlr: 8.8328e-06 gnorm: 0.36 [35m[1 day, 18:31:31<1 day, 6:35:21][39m
+[titan] 2025-09-09 12:07:31,117 - root - INFO - [31mstep: 23270 [32mloss: 2.7784 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.01 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7939 [37mglobal_avg_top_loss: 1.9845
+[titan] 2025-09-09 12:07:31,118 - root - INFO - [34mlr: 8.8293e-06 gnorm: 0.37 [35m[1 day, 18:32:03<1 day, 6:34:48][39m
+[titan] 2025-09-09 12:08:03,210 - root - INFO - [31mstep: 23275 [32mloss: 3.1576 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.63 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 1.0188 [37mglobal_avg_top_loss: 2.1388
+[titan] 2025-09-09 12:08:03,211 - root - INFO - [34mlr: 8.8259e-06 gnorm: 0.33 [35m[1 day, 18:32:35<1 day, 6:34:14][39m
+[titan] 2025-09-09 12:08:35,038 - root - INFO - [31mstep: 23280 [32mloss: 2.6968 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.68 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7566 [37mglobal_avg_top_loss: 1.9402
+[titan] 2025-09-09 12:08:35,039 - root - INFO - [34mlr: 8.8224e-06 gnorm: 0.32 [35m[1 day, 18:33:07<1 day, 6:33:41][39m
+[titan] 2025-09-09 12:09:06,992 - root - INFO - [31mstep: 23285 [32mloss: 2.7542 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.76 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7828 [37mglobal_avg_top_loss: 1.9714
+[titan] 2025-09-09 12:09:06,992 - root - INFO - [34mlr: 8.8189e-06 gnorm: 0.34 [35m[1 day, 18:33:39<1 day, 6:33:07][39m
+[titan] 2025-09-09 12:09:39,069 - root - INFO - [31mstep: 23290 [32mloss: 2.8147 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.87 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.8125 [37mglobal_avg_top_loss: 2.0022
+[titan] 2025-09-09 12:09:39,069 - root - INFO - [34mlr: 8.8155e-06 gnorm: 0.35 [35m[1 day, 18:34:11<1 day, 6:32:33][39m
+[titan] 2025-09-09 12:10:11,005 - root - INFO - [31mstep: 23295 [32mloss: 2.7415 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.02 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7804 [37mglobal_avg_top_loss: 1.9612
+[titan] 2025-09-09 12:10:11,006 - root - INFO - [34mlr: 8.8120e-06 gnorm: 0.35 [35m[1 day, 18:34:43<1 day, 6:32:00][39m
+[titan] 2025-09-09 12:10:36,637 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:10:43,076 - root - INFO - [31mstep: 23300 [32mloss: 2.7410 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.96 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7760 [37mglobal_avg_top_loss: 1.9650
+[titan] 2025-09-09 12:10:43,077 - root - INFO - [34mlr: 8.8086e-06 gnorm: 0.35 [35m[1 day, 18:35:15<1 day, 6:31:26][39m
+[titan] 2025-09-09 12:11:15,031 - root - INFO - [31mstep: 23305 [32mloss: 2.7628 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.73 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7868 [37mglobal_avg_top_loss: 1.9760
+[titan] 2025-09-09 12:11:15,032 - root - INFO - [34mlr: 8.8051e-06 gnorm: 0.35 [35m[1 day, 18:35:47<1 day, 6:30:53][39m
+[titan] 2025-09-09 12:11:47,044 - root - INFO - [31mstep: 23310 [32mloss: 2.7991 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.85 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.8023 [37mglobal_avg_top_loss: 1.9968
+[titan] 2025-09-09 12:11:47,044 - root - INFO - [34mlr: 8.8016e-06 gnorm: 0.53 [35m[1 day, 18:36:19<1 day, 6:30:19][39m
+[titan] 2025-09-09 12:12:19,090 - root - INFO - [31mstep: 23315 [32mloss: 2.8062 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.35 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.8075 [37mglobal_avg_top_loss: 1.9987
+[titan] 2025-09-09 12:12:19,090 - root - INFO - [34mlr: 8.7982e-06 gnorm: 0.38 [35m[1 day, 18:36:51<1 day, 6:29:46][39m
+[titan] 2025-09-09 12:12:50,890 - root - INFO - [31mstep: 23320 [32mloss: 2.7351 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,305 [36mtflops: 491.11 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7736 [37mglobal_avg_top_loss: 1.9614
+[titan] 2025-09-09 12:12:50,890 - root - INFO - [34mlr: 8.7947e-06 gnorm: 0.34 [35m[1 day, 18:37:23<1 day, 6:29:12][39m
+[titan] 2025-09-09 12:13:22,908 - root - INFO - [31mstep: 23325 [32mloss: 2.6636 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.77 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7399 [37mglobal_avg_top_loss: 1.9236
+[titan] 2025-09-09 12:13:22,908 - root - INFO - [34mlr: 8.7912e-06 gnorm: 0.36 [35m[1 day, 18:37:55<1 day, 6:28:39][39m
+[titan] 2025-09-09 12:13:54,931 - root - INFO - [31mstep: 23330 [32mloss: 2.6635 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.69 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7441 [37mglobal_avg_top_loss: 1.9194
+[titan] 2025-09-09 12:13:54,932 - root - INFO - [34mlr: 8.7878e-06 gnorm: 0.33 [35m[1 day, 18:38:27<1 day, 6:28:05][39m
+[titan] 2025-09-09 12:14:26,892 - root - INFO - [31mstep: 23335 [32mloss: 2.5600 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.65 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.6974 [37mglobal_avg_top_loss: 1.8626
+[titan] 2025-09-09 12:14:26,892 - root - INFO - [34mlr: 8.7843e-06 gnorm: 0.41 [35m[1 day, 18:38:59<1 day, 6:27:31][39m
+[titan] 2025-09-09 12:14:58,920 - root - INFO - [31mstep: 23340 [32mloss: 2.8411 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,231 [36mtflops: 487.62 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.8424 [37mglobal_avg_top_loss: 1.9986
+[titan] 2025-09-09 12:14:58,920 - root - INFO - [34mlr: 8.7809e-06 gnorm: 0.34 [35m[1 day, 18:39:31<1 day, 6:26:58][39m
+[titan] 2025-09-09 12:15:30,777 - root - INFO - [31mstep: 23345 [32mloss: 2.7061 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.23 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7633 [37mglobal_avg_top_loss: 1.9427
+[titan] 2025-09-09 12:15:30,778 - root - INFO - [34mlr: 8.7774e-06 gnorm: 0.35 [35m[1 day, 18:40:02<1 day, 6:26:24][39m
+[titan] 2025-09-09 12:15:56,333 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:16:02,742 - root - INFO - [31mstep: 23350 [32mloss: 2.7458 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.59 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7796 [37mglobal_avg_top_loss: 1.9662
+[titan] 2025-09-09 12:16:02,742 - root - INFO - [34mlr: 8.7740e-06 gnorm: 0.38 [35m[1 day, 18:40:34<1 day, 6:25:51][39m
+[titan] 2025-09-09 12:16:34,601 - root - INFO - [31mstep: 23355 [32mloss: 3.2101 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.20 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 1.0401 [37mglobal_avg_top_loss: 2.1699
+[titan] 2025-09-09 12:16:34,602 - root - INFO - [34mlr: 8.7705e-06 gnorm: 0.60 [35m[1 day, 18:41:06<1 day, 6:25:17][39m
+[titan] 2025-09-09 12:17:06,548 - root - INFO - [31mstep: 23360 [32mloss: 2.7253 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.85 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7679 [37mglobal_avg_top_loss: 1.9574
+[titan] 2025-09-09 12:17:06,549 - root - INFO - [34mlr: 8.7670e-06 gnorm: 0.35 [35m[1 day, 18:41:38<1 day, 6:24:43][39m
+[titan] 2025-09-09 12:17:38,393 - root - INFO - [31mstep: 23365 [32mloss: 2.7058 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.42 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7605 [37mglobal_avg_top_loss: 1.9453
+[titan] 2025-09-09 12:17:38,393 - root - INFO - [34mlr: 8.7636e-06 gnorm: 0.37 [35m[1 day, 18:42:10<1 day, 6:24:10][39m
+[titan] 2025-09-09 12:18:10,211 - root - INFO - [31mstep: 23370 [32mloss: 2.7983 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.84 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.8055 [37mglobal_avg_top_loss: 1.9927
+[titan] 2025-09-09 12:18:10,212 - root - INFO - [34mlr: 8.7601e-06 gnorm: 0.42 [35m[1 day, 18:42:42<1 day, 6:23:36][39m
+[titan] 2025-09-09 12:18:42,398 - root - INFO - [31mstep: 23375 [32mloss: 2.7652 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,181 [36mtflops: 485.21 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7873 [37mglobal_avg_top_loss: 1.9779
+[titan] 2025-09-09 12:18:42,399 - root - INFO - [34mlr: 8.7567e-06 gnorm: 0.35 [35m[1 day, 18:43:14<1 day, 6:23:03][39m
+[titan] 2025-09-09 12:19:14,226 - root - INFO - [31mstep: 23380 [32mloss: 2.7757 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.68 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7929 [37mglobal_avg_top_loss: 1.9828
+[titan] 2025-09-09 12:19:14,227 - root - INFO - [34mlr: 8.7532e-06 gnorm: 0.37 [35m[1 day, 18:43:46<1 day, 6:22:29][39m
+[titan] 2025-09-09 12:19:46,052 - root - INFO - [31mstep: 23385 [32mloss: 2.7610 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.72 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.7895 [37mglobal_avg_top_loss: 1.9714
+[titan] 2025-09-09 12:19:46,053 - root - INFO - [34mlr: 8.7497e-06 gnorm: 0.40 [35m[1 day, 18:44:18<1 day, 6:21:55][39m
+[titan] 2025-09-09 12:20:17,962 - root - INFO - [31mstep: 23390 [32mloss: 3.0177 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.42 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.9169 [37mglobal_avg_top_loss: 2.1008
+[titan] 2025-09-09 12:20:17,963 - root - INFO - [34mlr: 8.7463e-06 gnorm: 0.46 [35m[1 day, 18:44:50<1 day, 6:21:22][39m
+[titan] 2025-09-09 12:20:50,114 - root - INFO - [31mstep: 23395 [32mloss: 2.8390 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,192 [36mtflops: 485.74 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 0.8225 [37mglobal_avg_top_loss: 2.0165
+[titan] 2025-09-09 12:20:50,115 - root - INFO - [34mlr: 8.7428e-06 gnorm: 0.35 [35m[1 day, 18:45:22<1 day, 6:20:48][39m
+[titan] 2025-09-09 12:21:15,917 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:21:22,196 - root - INFO - [31mstep: 23400 [32mloss: 2.7674 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,214 [36mtflops: 486.80 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7905 [37mglobal_avg_top_loss: 1.9769
+[titan] 2025-09-09 12:21:22,196 - root - INFO - [34mlr: 8.7394e-06 gnorm: 0.34 [35m[1 day, 18:45:54<1 day, 6:20:15][39m
+[titan] 2025-09-09 12:21:54,058 - root - INFO - [31mstep: 23405 [32mloss: 2.7830 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.16 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7948 [37mglobal_avg_top_loss: 1.9882
+[titan] 2025-09-09 12:21:54,058 - root - INFO - [34mlr: 8.7359e-06 gnorm: 0.40 [35m[1 day, 18:46:26<1 day, 6:19:41][39m
+[titan] 2025-09-09 12:22:25,950 - root - INFO - [31mstep: 23410 [32mloss: 2.8381 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.70 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8232 [37mglobal_avg_top_loss: 2.0149
+[titan] 2025-09-09 12:22:25,950 - root - INFO - [34mlr: 8.7325e-06 gnorm: 0.38 [35m[1 day, 18:46:58<1 day, 6:19:08][39m
+[titan] 2025-09-09 12:22:58,010 - root - INFO - [31mstep: 23415 [32mloss: 2.6745 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.12 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7517 [37mglobal_avg_top_loss: 1.9228
+[titan] 2025-09-09 12:22:58,011 - root - INFO - [34mlr: 8.7290e-06 gnorm: 0.35 [35m[1 day, 18:47:30<1 day, 6:18:34][39m
+[titan] 2025-09-09 12:23:29,890 - root - INFO - [31mstep: 23420 [32mloss: 2.7963 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.88 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.8049 [37mglobal_avg_top_loss: 1.9914
+[titan] 2025-09-09 12:23:29,891 - root - INFO - [34mlr: 8.7256e-06 gnorm: 0.39 [35m[1 day, 18:48:01<1 day, 6:18:01][39m
+[titan] 2025-09-09 12:24:01,956 - root - INFO - [31mstep: 23425 [32mloss: 2.6885 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.05 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7613 [37mglobal_avg_top_loss: 1.9272
+[titan] 2025-09-09 12:24:01,956 - root - INFO - [34mlr: 8.7221e-06 gnorm: 0.39 [35m[1 day, 18:48:34<1 day, 6:17:27][39m
+[titan] 2025-09-09 12:24:33,893 - root - INFO - [31mstep: 23430 [32mloss: 2.7514 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.01 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7817 [37mglobal_avg_top_loss: 1.9698
+[titan] 2025-09-09 12:24:33,893 - root - INFO - [34mlr: 8.7187e-06 gnorm: 0.36 [35m[1 day, 18:49:05<1 day, 6:16:54][39m
+[titan] 2025-09-09 12:25:05,823 - root - INFO - [31mstep: 23435 [32mloss: 3.1636 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.11 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 1.0187 [37mglobal_avg_top_loss: 2.1450
+[titan] 2025-09-09 12:25:05,824 - root - INFO - [34mlr: 8.7152e-06 gnorm: 0.43 [35m[1 day, 18:49:37<1 day, 6:16:20][39m
+[titan] 2025-09-09 12:25:37,661 - root - INFO - [31mstep: 23440 [32mloss: 2.7340 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.54 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7726 [37mglobal_avg_top_loss: 1.9614
+[titan] 2025-09-09 12:25:37,661 - root - INFO - [34mlr: 8.7117e-06 gnorm: 0.36 [35m[1 day, 18:50:09<1 day, 6:15:46][39m
+[titan] 2025-09-09 12:26:09,802 - root - INFO - [31mstep: 23445 [32mloss: 2.7629 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,195 [36mtflops: 485.90 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7870 [37mglobal_avg_top_loss: 1.9759
+[titan] 2025-09-09 12:26:09,803 - root - INFO - [34mlr: 8.7083e-06 gnorm: 0.37 [35m[1 day, 18:50:41<1 day, 6:15:13][39m
+[titan] 2025-09-09 12:26:35,446 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:26:41,842 - root - INFO - [31mstep: 23450 [32mloss: 2.7548 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.45 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7878 [37mglobal_avg_top_loss: 1.9670
+[titan] 2025-09-09 12:26:41,842 - root - INFO - [34mlr: 8.7048e-06 gnorm: 0.41 [35m[1 day, 18:51:13<1 day, 6:14:39][39m
+[titan] 2025-09-09 12:27:13,893 - root - INFO - [31mstep: 23455 [32mloss: 2.7984 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.26 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.8033 [37mglobal_avg_top_loss: 1.9950
+[titan] 2025-09-09 12:27:13,893 - root - INFO - [34mlr: 8.7014e-06 gnorm: 0.35 [35m[1 day, 18:51:45<1 day, 6:14:06][39m
+[titan] 2025-09-09 12:27:45,931 - root - INFO - [31mstep: 23460 [32mloss: 2.7268 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.46 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7748 [37mglobal_avg_top_loss: 1.9520
+[titan] 2025-09-09 12:27:45,932 - root - INFO - [34mlr: 8.6979e-06 gnorm: 0.34 [35m[1 day, 18:52:18<1 day, 6:13:32][39m
+[titan] 2025-09-09 12:28:17,892 - root - INFO - [31mstep: 23465 [32mloss: 2.8195 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.65 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.8143 [37mglobal_avg_top_loss: 2.0052
+[titan] 2025-09-09 12:28:17,892 - root - INFO - [34mlr: 8.6945e-06 gnorm: 0.37 [35m[1 day, 18:52:49<1 day, 6:12:59][39m
+[titan] 2025-09-09 12:28:49,979 - root - INFO - [31mstep: 23470 [32mloss: 3.1037 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.72 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.9470 [37mglobal_avg_top_loss: 2.1566
+[titan] 2025-09-09 12:28:49,979 - root - INFO - [34mlr: 8.6910e-06 gnorm: 0.39 [35m[1 day, 18:53:22<1 day, 6:12:25][39m
+[titan] 2025-09-09 12:29:21,796 - root - INFO - [31mstep: 23475 [32mloss: 2.6998 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.86 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7643 [37mglobal_avg_top_loss: 1.9354
+[titan] 2025-09-09 12:29:21,796 - root - INFO - [34mlr: 8.6876e-06 gnorm: 0.35 [35m[1 day, 18:53:53<1 day, 6:11:52][39m
+[titan] 2025-09-09 12:29:54,060 - root - INFO - [31mstep: 23480 [32mloss: 2.7357 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,156 [36mtflops: 484.05 [35mmfu: 48.94%[39m [37mglobal_avg_ntp_loss: 0.7758 [37mglobal_avg_top_loss: 1.9598
+[titan] 2025-09-09 12:29:54,060 - root - INFO - [34mlr: 8.6841e-06 gnorm: 0.34 [35m[1 day, 18:54:26<1 day, 6:11:18][39m
+[titan] 2025-09-09 12:30:26,233 - root - INFO - [31mstep: 23485 [32mloss: 2.8005 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,185 [36mtflops: 485.42 [35mmfu: 49.08%[39m [37mglobal_avg_ntp_loss: 0.8039 [37mglobal_avg_top_loss: 1.9966
+[titan] 2025-09-09 12:30:26,234 - root - INFO - [34mlr: 8.6807e-06 gnorm: 0.34 [35m[1 day, 18:54:58<1 day, 6:10:45][39m
+[titan] 2025-09-09 12:30:58,284 - root - INFO - [31mstep: 23490 [32mloss: 2.7755 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.26 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7952 [37mglobal_avg_top_loss: 1.9804
+[titan] 2025-09-09 12:30:58,285 - root - INFO - [34mlr: 8.6772e-06 gnorm: 0.34 [35m[1 day, 18:55:30<1 day, 6:10:12][39m
+[titan] 2025-09-09 12:31:30,359 - root - INFO - [31mstep: 23495 [32mloss: 2.6809 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.92 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7512 [37mglobal_avg_top_loss: 1.9297
+[titan] 2025-09-09 12:31:30,359 - root - INFO - [34mlr: 8.6738e-06 gnorm: 0.35 [35m[1 day, 18:56:02<1 day, 6:09:38][39m
+[titan] 2025-09-09 12:31:56,191 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:32:02,547 - root - INFO - [31mstep: 23500 [32mloss: 2.7743 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,180 [36mtflops: 485.19 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7926 [37mglobal_avg_top_loss: 1.9817
+[titan] 2025-09-09 12:32:02,547 - root - INFO - [34mlr: 8.6703e-06 gnorm: 0.37 [35m[1 day, 18:56:34<1 day, 6:09:05][39m
+[titan] 2025-09-09 12:32:34,616 - root - INFO - [31mstep: 23505 [32mloss: 2.6703 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.99 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7554 [37mglobal_avg_top_loss: 1.9149
+[titan] 2025-09-09 12:32:34,616 - root - INFO - [34mlr: 8.6669e-06 gnorm: 0.35 [35m[1 day, 18:57:06<1 day, 6:08:31][39m
+[titan] 2025-09-09 12:33:06,980 - root - INFO - [31mstep: 23510 [32mloss: 2.7271 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,125 [36mtflops: 482.55 [35mmfu: 48.79%[39m [37mglobal_avg_ntp_loss: 0.7726 [37mglobal_avg_top_loss: 1.9545
+[titan] 2025-09-09 12:33:06,980 - root - INFO - [34mlr: 8.6634e-06 gnorm: 0.34 [35m[1 day, 18:57:39<1 day, 6:07:58][39m
+[titan] 2025-09-09 12:33:39,118 - root - INFO - [31mstep: 23515 [32mloss: 3.1399 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,196 [36mtflops: 485.96 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.9570 [37mglobal_avg_top_loss: 2.1829
+[titan] 2025-09-09 12:33:39,118 - root - INFO - [34mlr: 8.6600e-06 gnorm: 0.39 [35m[1 day, 18:58:11<1 day, 6:07:25][39m
+[titan] 2025-09-09 12:34:11,249 - root - INFO - [31mstep: 23520 [32mloss: 2.7517 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,198 [36mtflops: 486.05 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7878 [37mglobal_avg_top_loss: 1.9640
+[titan] 2025-09-09 12:34:11,250 - root - INFO - [34mlr: 8.6566e-06 gnorm: 0.36 [35m[1 day, 18:58:43<1 day, 6:06:51][39m
+[titan] 2025-09-09 12:34:43,178 - root - INFO - [31mstep: 23525 [32mloss: 2.6464 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.14 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7354 [37mglobal_avg_top_loss: 1.9110
+[titan] 2025-09-09 12:34:43,178 - root - INFO - [34mlr: 8.6531e-06 gnorm: 0.36 [35m[1 day, 18:59:15<1 day, 6:06:18][39m
+[titan] 2025-09-09 12:35:15,250 - root - INFO - [31mstep: 23530 [32mloss: 2.8099 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.95 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.8116 [37mglobal_avg_top_loss: 1.9983
+[titan] 2025-09-09 12:35:15,250 - root - INFO - [34mlr: 8.6497e-06 gnorm: 0.35 [35m[1 day, 18:59:47<1 day, 6:05:44][39m
+[titan] 2025-09-09 12:35:47,336 - root - INFO - [31mstep: 23535 [32mloss: 2.6512 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.73 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7391 [37mglobal_avg_top_loss: 1.9121
+[titan] 2025-09-09 12:35:47,337 - root - INFO - [34mlr: 8.6462e-06 gnorm: 0.36 [35m[1 day, 19:00:19<1 day, 6:05:11][39m
+[titan] 2025-09-09 12:36:19,208 - root - INFO - [31mstep: 23540 [32mloss: 2.7836 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.01 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7960 [37mglobal_avg_top_loss: 1.9876
+[titan] 2025-09-09 12:36:19,208 - root - INFO - [34mlr: 8.6428e-06 gnorm: 0.35 [35m[1 day, 19:00:51<1 day, 6:04:37][39m
+[titan] 2025-09-09 12:36:51,143 - root - INFO - [31mstep: 23545 [32mloss: 2.8085 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.03 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8067 [37mglobal_avg_top_loss: 2.0018
+[titan] 2025-09-09 12:36:51,144 - root - INFO - [34mlr: 8.6393e-06 gnorm: 0.34 [35m[1 day, 19:01:23<1 day, 6:04:03][39m
+[titan] 2025-09-09 12:37:16,853 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:37:23,333 - root - INFO - [31mstep: 23550 [32mloss: 2.7229 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,180 [36mtflops: 485.17 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7713 [37mglobal_avg_top_loss: 1.9516
+[titan] 2025-09-09 12:37:23,333 - root - INFO - [34mlr: 8.6359e-06 gnorm: 0.37 [35m[1 day, 19:01:55<1 day, 6:03:30][39m
+[titan] 2025-09-09 12:37:36,306 - root - INFO - Dumping profiler traces at step 23552
+[titan] 2025-09-09 12:37:36,375 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 12:37:55,411 - root - INFO - [31mstep: 23555 [32mloss: 2.7637 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,215 [36mtflops: 486.85 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7858 [37mglobal_avg_top_loss: 1.9779
+[titan] 2025-09-09 12:37:55,412 - root - INFO - [34mlr: 8.6324e-06 gnorm: 0.40 [35m[1 day, 19:02:27<1 day, 6:02:57][39m
+[titan] 2025-09-09 12:38:27,564 - root - INFO - [31mstep: 23560 [32mloss: 2.7298 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,191 [36mtflops: 485.72 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 0.7723 [37mglobal_avg_top_loss: 1.9575
+[titan] 2025-09-09 12:38:27,565 - root - INFO - [34mlr: 8.6290e-06 gnorm: 0.34 [35m[1 day, 19:02:59<1 day, 6:02:23][39m
+[titan] 2025-09-09 12:38:59,577 - root - INFO - [31mstep: 23565 [32mloss: 2.7718 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.86 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7922 [37mglobal_avg_top_loss: 1.9796
+[titan] 2025-09-09 12:38:59,577 - root - INFO - [34mlr: 8.6255e-06 gnorm: 0.36 [35m[1 day, 19:03:31<1 day, 6:01:50][39m
+[titan] 2025-09-09 12:39:31,578 - root - INFO - [31mstep: 23570 [32mloss: 3.0739 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.03 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.9525 [37mglobal_avg_top_loss: 2.1214
+[titan] 2025-09-09 12:39:31,578 - root - INFO - [34mlr: 8.6221e-06 gnorm: 0.35 [35m[1 day, 19:04:03<1 day, 6:01:16][39m
+[titan] 2025-09-09 12:40:03,797 - root - INFO - [31mstep: 23575 [32mloss: 2.6237 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,171 [36mtflops: 484.73 [35mmfu: 49.01%[39m [37mglobal_avg_ntp_loss: 0.7232 [37mglobal_avg_top_loss: 1.9004
+[titan] 2025-09-09 12:40:03,798 - root - INFO - [34mlr: 8.6187e-06 gnorm: 0.34 [35m[1 day, 19:04:35<1 day, 6:00:43][39m
+[titan] 2025-09-09 12:40:35,545 - root - INFO - [31mstep: 23580 [32mloss: 2.7272 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,322 [36mtflops: 491.93 [35mmfu: 49.74%[39m [37mglobal_avg_ntp_loss: 0.7693 [37mglobal_avg_top_loss: 1.9579
+[titan] 2025-09-09 12:40:35,545 - root - INFO - [34mlr: 8.6152e-06 gnorm: 0.34 [35m[1 day, 19:05:07<1 day, 6:00:09][39m
+[titan] 2025-09-09 12:41:07,454 - root - INFO - [31mstep: 23585 [32mloss: 2.6902 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.44 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7624 [37mglobal_avg_top_loss: 1.9278
+[titan] 2025-09-09 12:41:07,454 - root - INFO - [34mlr: 8.6118e-06 gnorm: 0.35 [35m[1 day, 19:05:39<1 day, 5:59:36][39m
+[titan] 2025-09-09 12:41:39,552 - root - INFO - [31mstep: 23590 [32mloss: 2.7357 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.55 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7731 [37mglobal_avg_top_loss: 1.9626
+[titan] 2025-09-09 12:41:39,552 - root - INFO - [34mlr: 8.6083e-06 gnorm: 0.34 [35m[1 day, 19:06:11<1 day, 5:59:02][39m
+[titan] 2025-09-09 12:42:11,437 - root - INFO - [31mstep: 23595 [32mloss: 2.6897 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.80 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7595 [37mglobal_avg_top_loss: 1.9302
+[titan] 2025-09-09 12:42:11,438 - root - INFO - [34mlr: 8.6049e-06 gnorm: 0.37 [35m[1 day, 19:06:43<1 day, 5:58:29][39m
+[titan] 2025-09-09 12:42:36,967 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:42:43,327 - root - INFO - [31mstep: 23600 [32mloss: 2.7191 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.73 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7666 [37mglobal_avg_top_loss: 1.9524
+[titan] 2025-09-09 12:42:43,328 - root - INFO - [34mlr: 8.6015e-06 gnorm: 0.35 [35m[1 day, 19:07:15<1 day, 5:57:55][39m
+[titan] 2025-09-09 12:43:15,415 - root - INFO - [31mstep: 23605 [32mloss: 2.9160 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.72 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.8723 [37mglobal_avg_top_loss: 2.0437
+[titan] 2025-09-09 12:43:15,415 - root - INFO - [34mlr: 8.5980e-06 gnorm: 0.37 [35m[1 day, 19:07:47<1 day, 5:57:21][39m
+[titan] 2025-09-09 12:43:47,503 - root - INFO - [31mstep: 23610 [32mloss: 2.6779 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.71 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7476 [37mglobal_avg_top_loss: 1.9303
+[titan] 2025-09-09 12:43:47,503 - root - INFO - [34mlr: 8.5946e-06 gnorm: 0.34 [35m[1 day, 19:08:19<1 day, 5:56:48][39m
+[titan] 2025-09-09 12:44:19,582 - root - INFO - [31mstep: 23615 [32mloss: 2.7448 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,215 [36mtflops: 486.84 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7784 [37mglobal_avg_top_loss: 1.9664
+[titan] 2025-09-09 12:44:19,583 - root - INFO - [34mlr: 8.5911e-06 gnorm: 0.35 [35m[1 day, 19:08:51<1 day, 5:56:15][39m
+[titan] 2025-09-09 12:44:51,379 - root - INFO - [31mstep: 23620 [32mloss: 2.7088 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,306 [36mtflops: 491.17 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7653 [37mglobal_avg_top_loss: 1.9435
+[titan] 2025-09-09 12:44:51,379 - root - INFO - [34mlr: 8.5877e-06 gnorm: 0.37 [35m[1 day, 19:09:23<1 day, 5:55:41][39m
+[titan] 2025-09-09 12:45:23,511 - root - INFO - [31mstep: 23625 [32mloss: 2.8010 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,198 [36mtflops: 486.03 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.8065 [37mglobal_avg_top_loss: 1.9945
+[titan] 2025-09-09 12:45:23,512 - root - INFO - [34mlr: 8.5843e-06 gnorm: 0.35 [35m[1 day, 19:09:55<1 day, 5:55:08][39m
+[titan] 2025-09-09 12:45:55,505 - root - INFO - [31mstep: 23630 [32mloss: 2.9796 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.14 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8844 [37mglobal_avg_top_loss: 2.0952
+[titan] 2025-09-09 12:45:55,505 - root - INFO - [34mlr: 8.5808e-06 gnorm: 0.36 [35m[1 day, 19:10:27<1 day, 5:54:34][39m
+[titan] 2025-09-09 12:46:27,671 - root - INFO - [31mstep: 23635 [32mloss: 2.7300 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,187 [36mtflops: 485.52 [35mmfu: 49.09%[39m [37mglobal_avg_ntp_loss: 0.7748 [37mglobal_avg_top_loss: 1.9553
+[titan] 2025-09-09 12:46:27,672 - root - INFO - [34mlr: 8.5774e-06 gnorm: 0.36 [35m[1 day, 19:10:59<1 day, 5:54:01][39m
+[titan] 2025-09-09 12:46:59,665 - root - INFO - [31mstep: 23640 [32mloss: 2.7168 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.15 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7695 [37mglobal_avg_top_loss: 1.9473
+[titan] 2025-09-09 12:46:59,665 - root - INFO - [34mlr: 8.5739e-06 gnorm: 0.36 [35m[1 day, 19:11:31<1 day, 5:53:27][39m
+[titan] 2025-09-09 12:47:31,752 - root - INFO - [31mstep: 23645 [32mloss: 2.7886 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.75 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7986 [37mglobal_avg_top_loss: 1.9900
+[titan] 2025-09-09 12:47:31,752 - root - INFO - [34mlr: 8.5705e-06 gnorm: 0.35 [35m[1 day, 19:12:03<1 day, 5:52:54][39m
+[titan] 2025-09-09 12:47:57,384 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:48:03,772 - root - INFO - [31mstep: 23650 [32mloss: 2.8259 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.75 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.8204 [37mglobal_avg_top_loss: 2.0055
+[titan] 2025-09-09 12:48:03,772 - root - INFO - [34mlr: 8.5671e-06 gnorm: 0.41 [35m[1 day, 19:12:35<1 day, 5:52:20][39m
+[titan] 2025-09-09 12:48:36,003 - root - INFO - [31mstep: 23655 [32mloss: 2.7630 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,167 [36mtflops: 484.54 [35mmfu: 48.99%[39m [37mglobal_avg_ntp_loss: 0.7897 [37mglobal_avg_top_loss: 1.9733
+[titan] 2025-09-09 12:48:36,003 - root - INFO - [34mlr: 8.5636e-06 gnorm: 0.41 [35m[1 day, 19:13:08<1 day, 5:51:47][39m
+[titan] 2025-09-09 12:49:07,821 - root - INFO - [31mstep: 23660 [32mloss: 2.7133 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.84 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7639 [37mglobal_avg_top_loss: 1.9494
+[titan] 2025-09-09 12:49:07,821 - root - INFO - [34mlr: 8.5602e-06 gnorm: 0.38 [35m[1 day, 19:13:39<1 day, 5:51:13][39m
+[titan] 2025-09-09 12:49:39,932 - root - INFO - [31mstep: 23665 [32mloss: 2.7182 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.36 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7685 [37mglobal_avg_top_loss: 1.9496
+[titan] 2025-09-09 12:49:39,932 - root - INFO - [34mlr: 8.5568e-06 gnorm: 0.34 [35m[1 day, 19:14:11<1 day, 5:50:40][39m
+[titan] 2025-09-09 12:50:11,942 - root - INFO - [31mstep: 23670 [32mloss: 2.7891 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.89 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7985 [37mglobal_avg_top_loss: 1.9905
+[titan] 2025-09-09 12:50:11,942 - root - INFO - [34mlr: 8.5533e-06 gnorm: 0.36 [35m[1 day, 19:14:43<1 day, 5:50:06][39m
+[titan] 2025-09-09 12:50:43,859 - root - INFO - [31mstep: 23675 [32mloss: 2.7823 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.30 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.8097 [37mglobal_avg_top_loss: 1.9726
+[titan] 2025-09-09 12:50:43,860 - root - INFO - [34mlr: 8.5499e-06 gnorm: 0.34 [35m[1 day, 19:15:15<1 day, 5:49:33][39m
+[titan] 2025-09-09 12:51:15,789 - root - INFO - [31mstep: 23680 [32mloss: 2.8048 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.12 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.8042 [37mglobal_avg_top_loss: 2.0006
+[titan] 2025-09-09 12:51:15,790 - root - INFO - [34mlr: 8.5464e-06 gnorm: 0.36 [35m[1 day, 19:15:47<1 day, 5:48:59][39m
+[titan] 2025-09-09 12:51:47,681 - root - INFO - [31mstep: 23685 [32mloss: 3.0752 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.71 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.9326 [37mglobal_avg_top_loss: 2.1426
+[titan] 2025-09-09 12:51:47,681 - root - INFO - [34mlr: 8.5430e-06 gnorm: 0.52 [35m[1 day, 19:16:19<1 day, 5:48:26][39m
+[titan] 2025-09-09 12:52:19,455 - root - INFO - [31mstep: 23690 [32mloss: 2.7414 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,313 [36mtflops: 491.51 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.7755 [37mglobal_avg_top_loss: 1.9659
+[titan] 2025-09-09 12:52:19,456 - root - INFO - [34mlr: 8.5396e-06 gnorm: 0.35 [35m[1 day, 19:16:51<1 day, 5:47:52][39m
+[titan] 2025-09-09 12:52:51,258 - root - INFO - [31mstep: 23695 [32mloss: 2.8033 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,304 [36mtflops: 491.07 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.8079 [37mglobal_avg_top_loss: 1.9955
+[titan] 2025-09-09 12:52:51,259 - root - INFO - [34mlr: 8.5361e-06 gnorm: 0.35 [35m[1 day, 19:17:23<1 day, 5:47:18][39m
+[titan] 2025-09-09 12:53:17,026 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:53:23,372 - root - INFO - [31mstep: 23700 [32mloss: 2.7465 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,204 [36mtflops: 486.31 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.7785 [37mglobal_avg_top_loss: 1.9680
+[titan] 2025-09-09 12:53:23,373 - root - INFO - [34mlr: 8.5327e-06 gnorm: 0.36 [35m[1 day, 19:17:55<1 day, 5:46:45][39m
+[titan] 2025-09-09 12:53:55,414 - root - INFO - [31mstep: 23705 [32mloss: 2.7954 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.40 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.8048 [37mglobal_avg_top_loss: 1.9906
+[titan] 2025-09-09 12:53:55,415 - root - INFO - [34mlr: 8.5293e-06 gnorm: 0.35 [35m[1 day, 19:18:27<1 day, 5:46:11][39m
+[titan] 2025-09-09 12:54:27,389 - root - INFO - [31mstep: 23710 [32mloss: 2.8390 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.43 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.8229 [37mglobal_avg_top_loss: 2.0161
+[titan] 2025-09-09 12:54:27,390 - root - INFO - [34mlr: 8.5258e-06 gnorm: 0.38 [35m[1 day, 19:18:59<1 day, 5:45:38][39m
+[titan] 2025-09-09 12:54:59,612 - root - INFO - [31mstep: 23715 [32mloss: 2.7635 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,169 [36mtflops: 484.67 [35mmfu: 49.01%[39m [37mglobal_avg_ntp_loss: 0.7885 [37mglobal_avg_top_loss: 1.9750
+[titan] 2025-09-09 12:54:59,613 - root - INFO - [34mlr: 8.5224e-06 gnorm: 0.35 [35m[1 day, 19:19:31<1 day, 5:45:05][39m
+[titan] 2025-09-09 12:55:31,684 - root - INFO - [31mstep: 23720 [32mloss: 2.7533 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.95 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7840 [37mglobal_avg_top_loss: 1.9693
+[titan] 2025-09-09 12:55:31,685 - root - INFO - [34mlr: 8.5190e-06 gnorm: 0.37 [35m[1 day, 19:20:03<1 day, 5:44:31][39m
+[titan] 2025-09-09 12:56:03,627 - root - INFO - [31mstep: 23725 [32mloss: 2.7679 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.93 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7896 [37mglobal_avg_top_loss: 1.9782
+[titan] 2025-09-09 12:56:03,627 - root - INFO - [34mlr: 8.5156e-06 gnorm: 0.35 [35m[1 day, 19:20:35<1 day, 5:43:58][39m
+[titan] 2025-09-09 12:56:35,610 - root - INFO - [31mstep: 23730 [32mloss: 2.7012 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.30 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7586 [37mglobal_avg_top_loss: 1.9426
+[titan] 2025-09-09 12:56:35,611 - root - INFO - [34mlr: 8.5121e-06 gnorm: 0.33 [35m[1 day, 19:21:07<1 day, 5:43:24][39m
+[titan] 2025-09-09 12:57:07,595 - root - INFO - [31mstep: 23735 [32mloss: 2.6789 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,245 [36mtflops: 488.27 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7473 [37mglobal_avg_top_loss: 1.9316
+[titan] 2025-09-09 12:57:07,596 - root - INFO - [34mlr: 8.5087e-06 gnorm: 0.34 [35m[1 day, 19:21:39<1 day, 5:42:51][39m
+[titan] 2025-09-09 12:57:39,489 - root - INFO - [31mstep: 23740 [32mloss: 2.7761 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.68 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7952 [37mglobal_avg_top_loss: 1.9809
+[titan] 2025-09-09 12:57:39,489 - root - INFO - [34mlr: 8.5053e-06 gnorm: 0.34 [35m[1 day, 19:22:11<1 day, 5:42:17][39m
+[titan] 2025-09-09 12:58:11,609 - root - INFO - [31mstep: 23745 [32mloss: 2.7724 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,202 [36mtflops: 486.22 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.7937 [37mglobal_avg_top_loss: 1.9787
+[titan] 2025-09-09 12:58:11,609 - root - INFO - [34mlr: 8.5018e-06 gnorm: 0.35 [35m[1 day, 19:22:43<1 day, 5:41:44][39m
+[titan] 2025-09-09 12:58:37,354 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 12:58:43,798 - root - INFO - [31mstep: 23750 [32mloss: 2.7206 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,180 [36mtflops: 485.18 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7687 [37mglobal_avg_top_loss: 1.9519
+[titan] 2025-09-09 12:58:43,798 - root - INFO - [34mlr: 8.4984e-06 gnorm: 0.35 [35m[1 day, 19:23:15<1 day, 5:41:10][39m
+[titan] 2025-09-09 12:59:15,640 - root - INFO - [31mstep: 23755 [32mloss: 2.7171 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.46 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7654 [37mglobal_avg_top_loss: 1.9517
+[titan] 2025-09-09 12:59:15,640 - root - INFO - [34mlr: 8.4950e-06 gnorm: 0.34 [35m[1 day, 19:23:47<1 day, 5:40:37][39m
+[titan] 2025-09-09 12:59:47,612 - root - INFO - [31mstep: 23760 [32mloss: 2.7101 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.47 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7638 [37mglobal_avg_top_loss: 1.9463
+[titan] 2025-09-09 12:59:47,612 - root - INFO - [34mlr: 8.4915e-06 gnorm: 0.37 [35m[1 day, 19:24:19<1 day, 5:40:03][39m
+[titan] 2025-09-09 13:00:20,051 - root - INFO - [31mstep: 23765 [32mloss: 2.7622 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,102 [36mtflops: 481.44 [35mmfu: 48.68%[39m [37mglobal_avg_ntp_loss: 0.7887 [37mglobal_avg_top_loss: 1.9735
+[titan] 2025-09-09 13:00:20,051 - root - INFO - [34mlr: 8.4881e-06 gnorm: 0.35 [35m[1 day, 19:24:52<1 day, 5:39:30][39m
+[titan] 2025-09-09 13:00:51,953 - root - INFO - [31mstep: 23770 [32mloss: 2.6923 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.54 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7562 [37mglobal_avg_top_loss: 1.9362
+[titan] 2025-09-09 13:00:51,953 - root - INFO - [34mlr: 8.4847e-06 gnorm: 0.39 [35m[1 day, 19:25:23<1 day, 5:38:56][39m
+[titan] 2025-09-09 13:01:24,150 - root - INFO - [31mstep: 23775 [32mloss: 2.7301 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.06 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7714 [37mglobal_avg_top_loss: 1.9587
+[titan] 2025-09-09 13:01:24,150 - root - INFO - [34mlr: 8.4813e-06 gnorm: 0.44 [35m[1 day, 19:25:56<1 day, 5:38:23][39m
+[titan] 2025-09-09 13:01:56,234 - root - INFO - [31mstep: 23780 [32mloss: 2.7156 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.77 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7650 [37mglobal_avg_top_loss: 1.9506
+[titan] 2025-09-09 13:01:56,234 - root - INFO - [34mlr: 8.4778e-06 gnorm: 0.34 [35m[1 day, 19:26:28<1 day, 5:37:50][39m
+[titan] 2025-09-09 13:02:28,321 - root - INFO - [31mstep: 23785 [32mloss: 2.7678 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.72 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7887 [37mglobal_avg_top_loss: 1.9791
+[titan] 2025-09-09 13:02:28,321 - root - INFO - [34mlr: 8.4744e-06 gnorm: 0.36 [35m[1 day, 19:27:00<1 day, 5:37:16][39m
+[titan] 2025-09-09 13:03:00,254 - root - INFO - [31mstep: 23790 [32mloss: 2.7838 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.07 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8106 [37mglobal_avg_top_loss: 1.9732
+[titan] 2025-09-09 13:03:00,254 - root - INFO - [34mlr: 8.4710e-06 gnorm: 0.35 [35m[1 day, 19:27:32<1 day, 5:36:43][39m
+[titan] 2025-09-09 13:03:32,362 - root - INFO - [31mstep: 23795 [32mloss: 2.7578 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.40 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7861 [37mglobal_avg_top_loss: 1.9717
+[titan] 2025-09-09 13:03:32,363 - root - INFO - [34mlr: 8.4676e-06 gnorm: 0.41 [35m[1 day, 19:28:04<1 day, 5:36:09][39m
+[titan] 2025-09-09 13:03:58,011 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:04:04,398 - root - INFO - [31mstep: 23800 [32mloss: 2.6799 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.50 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7520 [37mglobal_avg_top_loss: 1.9279
+[titan] 2025-09-09 13:04:04,399 - root - INFO - [34mlr: 8.4641e-06 gnorm: 0.37 [35m[1 day, 19:28:36<1 day, 5:35:36][39m
+[titan] 2025-09-09 13:04:36,295 - root - INFO - [31mstep: 23805 [32mloss: 2.7871 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.63 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7972 [37mglobal_avg_top_loss: 1.9899
+[titan] 2025-09-09 13:04:36,295 - root - INFO - [34mlr: 8.4607e-06 gnorm: 0.35 [35m[1 day, 19:29:08<1 day, 5:35:02][39m
+[titan] 2025-09-09 13:05:08,114 - root - INFO - [31mstep: 23810 [32mloss: 2.7643 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.82 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7882 [37mglobal_avg_top_loss: 1.9761
+[titan] 2025-09-09 13:05:08,115 - root - INFO - [34mlr: 8.4573e-06 gnorm: 0.37 [35m[1 day, 19:29:40<1 day, 5:34:29][39m
+[titan] 2025-09-09 13:05:40,072 - root - INFO - [31mstep: 23815 [32mloss: 2.8376 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.70 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.8248 [37mglobal_avg_top_loss: 2.0128
+[titan] 2025-09-09 13:05:40,072 - root - INFO - [34mlr: 8.4539e-06 gnorm: 0.39 [35m[1 day, 19:30:12<1 day, 5:33:55][39m
+[titan] 2025-09-09 13:06:12,175 - root - INFO - [31mstep: 23820 [32mloss: 2.7753 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.47 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7953 [37mglobal_avg_top_loss: 1.9800
+[titan] 2025-09-09 13:06:12,175 - root - INFO - [34mlr: 8.4504e-06 gnorm: 0.38 [35m[1 day, 19:30:44<1 day, 5:33:22][39m
+[titan] 2025-09-09 13:06:44,115 - root - INFO - [31mstep: 23825 [32mloss: 2.7898 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,260 [36mtflops: 488.96 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8021 [37mglobal_avg_top_loss: 1.9877
+[titan] 2025-09-09 13:06:44,115 - root - INFO - [34mlr: 8.4470e-06 gnorm: 0.36 [35m[1 day, 19:31:16<1 day, 5:32:48][39m
+[titan] 2025-09-09 13:07:15,890 - root - INFO - [31mstep: 23830 [32mloss: 3.1567 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,313 [36mtflops: 491.51 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 1.0199 [37mglobal_avg_top_loss: 2.1368
+[titan] 2025-09-09 13:07:15,890 - root - INFO - [34mlr: 8.4436e-06 gnorm: 0.44 [35m[1 day, 19:31:47<1 day, 5:32:15][39m
+[titan] 2025-09-09 13:07:47,767 - root - INFO - [31mstep: 23835 [32mloss: 2.7924 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.92 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7984 [37mglobal_avg_top_loss: 1.9939
+[titan] 2025-09-09 13:07:47,768 - root - INFO - [34mlr: 8.4402e-06 gnorm: 0.35 [35m[1 day, 19:32:19<1 day, 5:31:41][39m
+[titan] 2025-09-09 13:08:19,663 - root - INFO - [31mstep: 23840 [32mloss: 2.6894 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.64 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7556 [37mglobal_avg_top_loss: 1.9338
+[titan] 2025-09-09 13:08:19,663 - root - INFO - [34mlr: 8.4367e-06 gnorm: 0.36 [35m[1 day, 19:32:51<1 day, 5:31:08][39m
+[titan] 2025-09-09 13:08:51,714 - root - INFO - [31mstep: 23845 [32mloss: 2.7534 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.27 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7857 [37mglobal_avg_top_loss: 1.9676
+[titan] 2025-09-09 13:08:51,714 - root - INFO - [34mlr: 8.4333e-06 gnorm: 0.35 [35m[1 day, 19:33:23<1 day, 5:30:34][39m
+[titan] 2025-09-09 13:09:17,454 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:09:23,828 - root - INFO - [31mstep: 23850 [32mloss: 2.7687 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,204 [36mtflops: 486.31 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.7978 [37mglobal_avg_top_loss: 1.9710
+[titan] 2025-09-09 13:09:23,828 - root - INFO - [34mlr: 8.4299e-06 gnorm: 0.38 [35m[1 day, 19:33:55<1 day, 5:30:01][39m
+[titan] 2025-09-09 13:09:55,720 - root - INFO - [31mstep: 23855 [32mloss: 2.7126 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.70 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7691 [37mglobal_avg_top_loss: 1.9434
+[titan] 2025-09-09 13:09:55,720 - root - INFO - [34mlr: 8.4265e-06 gnorm: 0.34 [35m[1 day, 19:34:27<1 day, 5:29:27][39m
+[titan] 2025-09-09 13:10:27,633 - root - INFO - [31mstep: 23860 [32mloss: 2.7057 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,268 [36mtflops: 489.37 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7603 [37mglobal_avg_top_loss: 1.9454
+[titan] 2025-09-09 13:10:27,633 - root - INFO - [34mlr: 8.4231e-06 gnorm: 0.78 [35m[1 day, 19:34:59<1 day, 5:28:54][39m
+[titan] 2025-09-09 13:10:59,580 - root - INFO - [31mstep: 23865 [32mloss: 2.7233 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.86 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7694 [37mglobal_avg_top_loss: 1.9539
+[titan] 2025-09-09 13:10:59,580 - root - INFO - [34mlr: 8.4196e-06 gnorm: 0.37 [35m[1 day, 19:35:31<1 day, 5:28:20][39m
+[titan] 2025-09-09 13:11:31,518 - root - INFO - [31mstep: 23870 [32mloss: 2.7358 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,260 [36mtflops: 488.99 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7752 [37mglobal_avg_top_loss: 1.9606
+[titan] 2025-09-09 13:11:31,518 - root - INFO - [34mlr: 8.4162e-06 gnorm: 0.35 [35m[1 day, 19:36:03<1 day, 5:27:47][39m
+[titan] 2025-09-09 13:12:03,634 - root - INFO - [31mstep: 23875 [32mloss: 2.7294 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,203 [36mtflops: 486.27 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.7740 [37mglobal_avg_top_loss: 1.9553
+[titan] 2025-09-09 13:12:03,635 - root - INFO - [34mlr: 8.4128e-06 gnorm: 0.35 [35m[1 day, 19:36:35<1 day, 5:27:13][39m
+[titan] 2025-09-09 13:12:35,543 - root - INFO - [31mstep: 23880 [32mloss: 2.6747 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.45 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7496 [37mglobal_avg_top_loss: 1.9251
+[titan] 2025-09-09 13:12:35,543 - root - INFO - [34mlr: 8.4094e-06 gnorm: 0.36 [35m[1 day, 19:37:07<1 day, 5:26:40][39m
+[titan] 2025-09-09 13:13:07,652 - root - INFO - [31mstep: 23885 [32mloss: 2.7978 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.39 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.8052 [37mglobal_avg_top_loss: 1.9926
+[titan] 2025-09-09 13:13:07,652 - root - INFO - [34mlr: 8.4060e-06 gnorm: 0.35 [35m[1 day, 19:37:39<1 day, 5:26:06][39m
+[titan] 2025-09-09 13:13:39,698 - root - INFO - [31mstep: 23890 [32mloss: 2.6990 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.34 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7588 [37mglobal_avg_top_loss: 1.9402
+[titan] 2025-09-09 13:13:39,699 - root - INFO - [34mlr: 8.4025e-06 gnorm: 0.36 [35m[1 day, 19:38:11<1 day, 5:25:33][39m
+[titan] 2025-09-09 13:14:11,783 - root - INFO - [31mstep: 23895 [32mloss: 2.7727 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.76 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7720 [37mglobal_avg_top_loss: 2.0007
+[titan] 2025-09-09 13:14:11,784 - root - INFO - [34mlr: 8.3991e-06 gnorm: 1.27 [35m[1 day, 19:38:43<1 day, 5:24:59][39m
+[titan] 2025-09-09 13:14:37,525 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:14:43,968 - root - INFO - [31mstep: 23900 [32mloss: 2.7869 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,181 [36mtflops: 485.24 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7982 [37mglobal_avg_top_loss: 1.9887
+[titan] 2025-09-09 13:14:43,968 - root - INFO - [34mlr: 8.3957e-06 gnorm: 0.36 [35m[1 day, 19:39:15<1 day, 5:24:26][39m
+[titan] 2025-09-09 13:15:15,840 - root - INFO - [31mstep: 23905 [32mloss: 2.7571 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 490.01 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7893 [37mglobal_avg_top_loss: 1.9677
+[titan] 2025-09-09 13:15:15,840 - root - INFO - [34mlr: 8.3923e-06 gnorm: 0.79 [35m[1 day, 19:39:47<1 day, 5:23:53][39m
+[titan] 2025-09-09 13:15:47,681 - root - INFO - [31mstep: 23910 [32mloss: 3.0514 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.48 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.9716 [37mglobal_avg_top_loss: 2.0798
+[titan] 2025-09-09 13:15:47,682 - root - INFO - [34mlr: 8.3889e-06 gnorm: 0.35 [35m[1 day, 19:40:19<1 day, 5:23:19][39m
+[titan] 2025-09-09 13:16:19,629 - root - INFO - [31mstep: 23915 [32mloss: 2.7363 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.84 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7733 [37mglobal_avg_top_loss: 1.9630
+[titan] 2025-09-09 13:16:19,630 - root - INFO - [34mlr: 8.3855e-06 gnorm: 0.35 [35m[1 day, 19:40:51<1 day, 5:22:45][39m
+[titan] 2025-09-09 13:16:51,675 - root - INFO - [31mstep: 23920 [32mloss: 2.6696 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.35 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7477 [37mglobal_avg_top_loss: 1.9219
+[titan] 2025-09-09 13:16:51,675 - root - INFO - [34mlr: 8.3820e-06 gnorm: 0.35 [35m[1 day, 19:41:23<1 day, 5:22:12][39m
+[titan] 2025-09-09 13:17:23,605 - root - INFO - [31mstep: 23925 [32mloss: 2.6664 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.11 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7422 [37mglobal_avg_top_loss: 1.9242
+[titan] 2025-09-09 13:17:23,605 - root - INFO - [34mlr: 8.3786e-06 gnorm: 0.36 [35m[1 day, 19:41:55<1 day, 5:21:38][39m
+[titan] 2025-09-09 13:17:55,554 - root - INFO - [31mstep: 23930 [32mloss: 2.7392 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.83 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7781 [37mglobal_avg_top_loss: 1.9611
+[titan] 2025-09-09 13:17:55,554 - root - INFO - [34mlr: 8.3752e-06 gnorm: 0.36 [35m[1 day, 19:42:27<1 day, 5:21:05][39m
+[titan] 2025-09-09 13:18:27,600 - root - INFO - [31mstep: 23935 [32mloss: 2.7458 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.33 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7804 [37mglobal_avg_top_loss: 1.9654
+[titan] 2025-09-09 13:18:27,601 - root - INFO - [34mlr: 8.3718e-06 gnorm: 0.44 [35m[1 day, 19:42:59<1 day, 5:20:32][39m
+[titan] 2025-09-09 13:18:59,407 - root - INFO - [31mstep: 23940 [32mloss: 2.6329 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.01 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7268 [37mglobal_avg_top_loss: 1.9061
+[titan] 2025-09-09 13:18:59,408 - root - INFO - [34mlr: 8.3684e-06 gnorm: 0.60 [35m[1 day, 19:43:31<1 day, 5:19:58][39m
+[titan] 2025-09-09 13:19:31,349 - root - INFO - [31mstep: 23945 [32mloss: 2.8411 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.93 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8344 [37mglobal_avg_top_loss: 2.0067
+[titan] 2025-09-09 13:19:31,350 - root - INFO - [34mlr: 8.3650e-06 gnorm: 0.36 [35m[1 day, 19:44:03<1 day, 5:19:24][39m
+[titan] 2025-09-09 13:19:56,911 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:20:03,314 - root - INFO - [31mstep: 23950 [32mloss: 2.7655 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.59 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7908 [37mglobal_avg_top_loss: 1.9747
+[titan] 2025-09-09 13:20:03,314 - root - INFO - [34mlr: 8.3616e-06 gnorm: 0.35 [35m[1 day, 19:44:35<1 day, 5:18:51][39m
+[titan] 2025-09-09 13:20:35,440 - root - INFO - [31mstep: 23955 [32mloss: 2.7691 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,200 [36mtflops: 486.12 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7909 [37mglobal_avg_top_loss: 1.9782
+[titan] 2025-09-09 13:20:35,441 - root - INFO - [34mlr: 8.3581e-06 gnorm: 0.35 [35m[1 day, 19:45:07<1 day, 5:18:18][39m
+[titan] 2025-09-09 13:21:07,624 - root - INFO - [31mstep: 23960 [32mloss: 2.7289 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,182 [36mtflops: 485.27 [35mmfu: 49.07%[39m [37mglobal_avg_ntp_loss: 0.7718 [37mglobal_avg_top_loss: 1.9571
+[titan] 2025-09-09 13:21:07,624 - root - INFO - [34mlr: 8.3547e-06 gnorm: 0.36 [35m[1 day, 19:45:39<1 day, 5:17:44][39m
+[titan] 2025-09-09 13:21:39,703 - root - INFO - [31mstep: 23965 [32mloss: 2.7192 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,215 [36mtflops: 486.84 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7663 [37mglobal_avg_top_loss: 1.9529
+[titan] 2025-09-09 13:21:39,703 - root - INFO - [34mlr: 8.3513e-06 gnorm: 0.36 [35m[1 day, 19:46:11<1 day, 5:17:11][39m
+[titan] 2025-09-09 13:22:11,722 - root - INFO - [31mstep: 23970 [32mloss: 2.7345 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.76 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7679 [37mglobal_avg_top_loss: 1.9666
+[titan] 2025-09-09 13:22:11,722 - root - INFO - [34mlr: 8.3479e-06 gnorm: 0.37 [35m[1 day, 19:46:43<1 day, 5:16:37][39m
+[titan] 2025-09-09 13:22:43,657 - root - INFO - [31mstep: 23975 [32mloss: 3.5589 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.03 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 1.2390 [37mglobal_avg_top_loss: 2.3199
+[titan] 2025-09-09 13:22:43,658 - root - INFO - [34mlr: 8.3445e-06 gnorm: 0.36 [35m[1 day, 19:47:15<1 day, 5:16:04][39m
+[titan] 2025-09-09 13:23:15,707 - root - INFO - [31mstep: 23980 [32mloss: 2.8295 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.29 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.8171 [37mglobal_avg_top_loss: 2.0124
+[titan] 2025-09-09 13:23:15,707 - root - INFO - [34mlr: 8.3411e-06 gnorm: 0.37 [35m[1 day, 19:47:47<1 day, 5:15:30][39m
+[titan] 2025-09-09 13:23:47,768 - root - INFO - [31mstep: 23985 [32mloss: 2.6171 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.11 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7251 [37mglobal_avg_top_loss: 1.8921
+[titan] 2025-09-09 13:23:47,769 - root - INFO - [34mlr: 8.3377e-06 gnorm: 0.44 [35m[1 day, 19:48:19<1 day, 5:14:57][39m
+[titan] 2025-09-09 13:24:19,781 - root - INFO - [31mstep: 23990 [32mloss: 3.2770 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.85 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 1.0733 [37mglobal_avg_top_loss: 2.2038
+[titan] 2025-09-09 13:24:19,782 - root - INFO - [34mlr: 8.3343e-06 gnorm: 0.38 [35m[1 day, 19:48:51<1 day, 5:14:24][39m
+[titan] 2025-09-09 13:24:52,008 - root - INFO - [31mstep: 23995 [32mloss: 2.7645 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,168 [36mtflops: 484.62 [35mmfu: 49.00%[39m [37mglobal_avg_ntp_loss: 0.7881 [37mglobal_avg_top_loss: 1.9764
+[titan] 2025-09-09 13:24:52,008 - root - INFO - [34mlr: 8.3309e-06 gnorm: 0.35 [35m[1 day, 19:49:23<1 day, 5:13:50][39m
+[titan] 2025-09-09 13:25:17,570 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:25:23,948 - root - INFO - [31mstep: 24000 [32mloss: 2.6465 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.95 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7351 [37mglobal_avg_top_loss: 1.9114
+[titan] 2025-09-09 13:25:23,949 - root - INFO - [34mlr: 8.3274e-06 gnorm: 0.35 [35m[1 day, 19:49:55<1 day, 5:13:17][39m
+[titan] 2025-09-09 13:25:56,153 - root - INFO - [31mstep: 24005 [32mloss: 2.7422 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,175 [36mtflops: 484.94 [35mmfu: 49.03%[39m [37mglobal_avg_ntp_loss: 0.7750 [37mglobal_avg_top_loss: 1.9672
+[titan] 2025-09-09 13:25:56,153 - root - INFO - [34mlr: 8.3240e-06 gnorm: 0.48 [35m[1 day, 19:50:28<1 day, 5:12:43][39m
+[titan] 2025-09-09 13:26:27,923 - root - INFO - [31mstep: 24010 [32mloss: 2.7806 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,314 [36mtflops: 491.57 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.7938 [37mglobal_avg_top_loss: 1.9868
+[titan] 2025-09-09 13:26:27,924 - root - INFO - [34mlr: 8.3206e-06 gnorm: 0.39 [35m[1 day, 19:50:59<1 day, 5:12:10][39m
+[titan] 2025-09-09 13:26:59,996 - root - INFO - [31mstep: 24015 [32mloss: 2.9127 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.94 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.8678 [37mglobal_avg_top_loss: 2.0449
+[titan] 2025-09-09 13:26:59,996 - root - INFO - [34mlr: 8.3172e-06 gnorm: 0.40 [35m[1 day, 19:51:31<1 day, 5:11:36][39m
+[titan] 2025-09-09 13:27:31,819 - root - INFO - [31mstep: 24020 [32mloss: 2.6760 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,297 [36mtflops: 490.76 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.7498 [37mglobal_avg_top_loss: 1.9262
+[titan] 2025-09-09 13:27:31,819 - root - INFO - [34mlr: 8.3138e-06 gnorm: 0.54 [35m[1 day, 19:52:03<1 day, 5:11:03][39m
+[titan] 2025-09-09 13:28:03,919 - root - INFO - [31mstep: 24025 [32mloss: 2.6722 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,208 [36mtflops: 486.52 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7470 [37mglobal_avg_top_loss: 1.9252
+[titan] 2025-09-09 13:28:03,919 - root - INFO - [34mlr: 8.3104e-06 gnorm: 0.34 [35m[1 day, 19:52:35<1 day, 5:10:29][39m
+[titan] 2025-09-09 13:28:35,728 - root - INFO - [31mstep: 24030 [32mloss: 2.8901 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,302 [36mtflops: 490.97 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.8443 [37mglobal_avg_top_loss: 2.0458
+[titan] 2025-09-09 13:28:35,729 - root - INFO - [34mlr: 8.3070e-06 gnorm: 0.40 [35m[1 day, 19:53:07<1 day, 5:09:56][39m
+[titan] 2025-09-09 13:29:07,702 - root - INFO - [31mstep: 24035 [32mloss: 2.7610 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.45 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7864 [37mglobal_avg_top_loss: 1.9745
+[titan] 2025-09-09 13:29:07,702 - root - INFO - [34mlr: 8.3036e-06 gnorm: 0.38 [35m[1 day, 19:53:39<1 day, 5:09:22][39m
+[titan] 2025-09-09 13:29:39,793 - root - INFO - [31mstep: 24040 [32mloss: 2.6985 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.66 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7585 [37mglobal_avg_top_loss: 1.9401
+[titan] 2025-09-09 13:29:39,793 - root - INFO - [34mlr: 8.3002e-06 gnorm: 0.40 [35m[1 day, 19:54:11<1 day, 5:08:49][39m
+[titan] 2025-09-09 13:30:11,646 - root - INFO - [31mstep: 24045 [32mloss: 3.0818 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.30 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.9355 [37mglobal_avg_top_loss: 2.1463
+[titan] 2025-09-09 13:30:11,646 - root - INFO - [34mlr: 8.2968e-06 gnorm: 0.43 [35m[1 day, 19:54:43<1 day, 5:08:15][39m
+[titan] 2025-09-09 13:30:37,177 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:30:43,693 - root - INFO - [31mstep: 24050 [32mloss: 2.7219 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.33 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7698 [37mglobal_avg_top_loss: 1.9521
+[titan] 2025-09-09 13:30:43,693 - root - INFO - [34mlr: 8.2934e-06 gnorm: 0.35 [35m[1 day, 19:55:15<1 day, 5:07:42][39m
+[titan] 2025-09-09 13:31:15,798 - root - INFO - [31mstep: 24055 [32mloss: 3.1788 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.45 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 1.0313 [37mglobal_avg_top_loss: 2.1476
+[titan] 2025-09-09 13:31:15,798 - root - INFO - [34mlr: 8.2900e-06 gnorm: 0.37 [35m[1 day, 19:55:47<1 day, 5:07:09][39m
+[titan] 2025-09-09 13:31:47,562 - root - INFO - [31mstep: 24060 [32mloss: 2.6929 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,316 [36mtflops: 491.67 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 0.7555 [37mglobal_avg_top_loss: 1.9374
+[titan] 2025-09-09 13:31:47,562 - root - INFO - [34mlr: 8.2866e-06 gnorm: 0.38 [35m[1 day, 19:56:19<1 day, 5:06:35][39m
+[titan] 2025-09-09 13:32:13,694 - root - INFO - Dumping profiler traces at step 24064
+[titan] 2025-09-09 13:32:13,766 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 13:32:19,995 - root - INFO - [31mstep: 24065 [32mloss: 2.6309 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,103 [36mtflops: 481.52 [35mmfu: 48.69%[39m [37mglobal_avg_ntp_loss: 0.7302 [37mglobal_avg_top_loss: 1.9008
+[titan] 2025-09-09 13:32:19,995 - root - INFO - [34mlr: 8.2832e-06 gnorm: 0.33 [35m[1 day, 19:56:51<1 day, 5:06:02][39m
+[titan] 2025-09-09 13:32:51,952 - root - INFO - [31mstep: 24070 [32mloss: 2.6834 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.70 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7503 [37mglobal_avg_top_loss: 1.9331
+[titan] 2025-09-09 13:32:51,952 - root - INFO - [34mlr: 8.2798e-06 gnorm: 0.35 [35m[1 day, 19:57:23<1 day, 5:05:28][39m
+[titan] 2025-09-09 13:33:23,868 - root - INFO - [31mstep: 24075 [32mloss: 2.7221 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.33 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7654 [37mglobal_avg_top_loss: 1.9567
+[titan] 2025-09-09 13:33:23,868 - root - INFO - [34mlr: 8.2764e-06 gnorm: 0.35 [35m[1 day, 19:57:55<1 day, 5:04:55][39m
+[titan] 2025-09-09 13:33:55,860 - root - INFO - [31mstep: 24080 [32mloss: 2.7347 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.17 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7893 [37mglobal_avg_top_loss: 1.9455
+[titan] 2025-09-09 13:33:55,861 - root - INFO - [34mlr: 8.2730e-06 gnorm: 0.52 [35m[1 day, 19:58:27<1 day, 5:04:21][39m
+[titan] 2025-09-09 13:34:27,919 - root - INFO - [31mstep: 24085 [32mloss: 2.7412 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.15 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7809 [37mglobal_avg_top_loss: 1.9603
+[titan] 2025-09-09 13:34:27,920 - root - INFO - [34mlr: 8.2696e-06 gnorm: 0.36 [35m[1 day, 19:58:59<1 day, 5:03:48][39m
+[titan] 2025-09-09 13:35:00,212 - root - INFO - [31mstep: 24090 [32mloss: 2.7386 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,147 [36mtflops: 483.62 [35mmfu: 48.90%[39m [37mglobal_avg_ntp_loss: 0.7762 [37mglobal_avg_top_loss: 1.9623
+[titan] 2025-09-09 13:35:00,212 - root - INFO - [34mlr: 8.2662e-06 gnorm: 0.35 [35m[1 day, 19:59:32<1 day, 5:03:15][39m
+[titan] 2025-09-09 13:35:32,409 - root - INFO - [31mstep: 24095 [32mloss: 2.8068 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.06 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.8017 [37mglobal_avg_top_loss: 2.0051
+[titan] 2025-09-09 13:35:32,410 - root - INFO - [34mlr: 8.2627e-06 gnorm: 0.68 [35m[1 day, 20:00:04<1 day, 5:02:42][39m
+[titan] 2025-09-09 13:35:58,030 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:36:04,421 - root - INFO - [31mstep: 24100 [32mloss: 2.7521 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.86 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7877 [37mglobal_avg_top_loss: 1.9645
+[titan] 2025-09-09 13:36:04,422 - root - INFO - [34mlr: 8.2593e-06 gnorm: 0.78 [35m[1 day, 20:00:36<1 day, 5:02:08][39m
+[titan] 2025-09-09 13:36:36,524 - root - INFO - [31mstep: 24105 [32mloss: 2.7440 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,208 [36mtflops: 486.49 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7771 [37mglobal_avg_top_loss: 1.9669
+[titan] 2025-09-09 13:36:36,524 - root - INFO - [34mlr: 8.2559e-06 gnorm: 0.35 [35m[1 day, 20:01:08<1 day, 5:01:35][39m
+[titan] 2025-09-09 13:37:08,694 - root - INFO - [31mstep: 24110 [32mloss: 2.7829 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,186 [36mtflops: 485.46 [35mmfu: 49.09%[39m [37mglobal_avg_ntp_loss: 0.7966 [37mglobal_avg_top_loss: 1.9863
+[titan] 2025-09-09 13:37:08,695 - root - INFO - [34mlr: 8.2525e-06 gnorm: 0.35 [35m[1 day, 20:01:40<1 day, 5:01:01][39m
+[titan] 2025-09-09 13:37:40,675 - root - INFO - [31mstep: 24115 [32mloss: 2.7578 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.34 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7857 [37mglobal_avg_top_loss: 1.9722
+[titan] 2025-09-09 13:37:40,676 - root - INFO - [34mlr: 8.2491e-06 gnorm: 0.35 [35m[1 day, 20:02:12<1 day, 5:00:28][39m
+[titan] 2025-09-09 13:38:12,695 - root - INFO - [31mstep: 24120 [32mloss: 2.7801 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.74 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7919 [37mglobal_avg_top_loss: 1.9882
+[titan] 2025-09-09 13:38:12,696 - root - INFO - [34mlr: 8.2458e-06 gnorm: 0.36 [35m[1 day, 20:02:44<1 day, 4:59:54][39m
+[titan] 2025-09-09 13:38:44,627 - root - INFO - [31mstep: 24125 [32mloss: 2.7460 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.08 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7799 [37mglobal_avg_top_loss: 1.9661
+[titan] 2025-09-09 13:38:44,628 - root - INFO - [34mlr: 8.2424e-06 gnorm: 0.35 [35m[1 day, 20:03:16<1 day, 4:59:21][39m
+[titan] 2025-09-09 13:39:16,379 - root - INFO - [31mstep: 24130 [32mloss: 2.7060 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,320 [36mtflops: 491.86 [35mmfu: 49.73%[39m [37mglobal_avg_ntp_loss: 0.7611 [37mglobal_avg_top_loss: 1.9450
+[titan] 2025-09-09 13:39:16,380 - root - INFO - [34mlr: 8.2390e-06 gnorm: 0.36 [35m[1 day, 20:03:48<1 day, 4:58:47][39m
+[titan] 2025-09-09 13:39:48,502 - root - INFO - [31mstep: 24135 [32mloss: 3.1977 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,201 [36mtflops: 486.18 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 1.0387 [37mglobal_avg_top_loss: 2.1590
+[titan] 2025-09-09 13:39:48,502 - root - INFO - [34mlr: 8.2356e-06 gnorm: 0.37 [35m[1 day, 20:04:20<1 day, 4:58:14][39m
+[titan] 2025-09-09 13:40:20,686 - root - INFO - [31mstep: 24140 [32mloss: 2.8257 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,182 [36mtflops: 485.26 [35mmfu: 49.07%[39m [37mglobal_avg_ntp_loss: 0.8179 [37mglobal_avg_top_loss: 2.0078
+[titan] 2025-09-09 13:40:20,686 - root - INFO - [34mlr: 8.2322e-06 gnorm: 0.37 [35m[1 day, 20:04:52<1 day, 4:57:41][39m
+[titan] 2025-09-09 13:40:52,635 - root - INFO - [31mstep: 24145 [32mloss: 2.7141 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.83 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7642 [37mglobal_avg_top_loss: 1.9499
+[titan] 2025-09-09 13:40:52,635 - root - INFO - [34mlr: 8.2288e-06 gnorm: 0.37 [35m[1 day, 20:05:24<1 day, 4:57:07][39m
+[titan] 2025-09-09 13:41:18,162 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:41:24,562 - root - INFO - [31mstep: 24150 [32mloss: 3.0353 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.15 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.9486 [37mglobal_avg_top_loss: 2.0866
+[titan] 2025-09-09 13:41:24,563 - root - INFO - [34mlr: 8.2254e-06 gnorm: 0.39 [35m[1 day, 20:05:56<1 day, 4:56:34][39m
+[titan] 2025-09-09 13:41:56,895 - root - INFO - [31mstep: 24155 [32mloss: 3.0151 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,135 [36mtflops: 483.02 [35mmfu: 48.84%[39m [37mglobal_avg_ntp_loss: 0.9217 [37mglobal_avg_top_loss: 2.0934
+[titan] 2025-09-09 13:41:56,896 - root - INFO - [34mlr: 8.2220e-06 gnorm: 0.35 [35m[1 day, 20:06:28<1 day, 4:56:01][39m
+[titan] 2025-09-09 13:42:28,889 - root - INFO - [31mstep: 24160 [32mloss: 2.7939 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.15 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8035 [37mglobal_avg_top_loss: 1.9904
+[titan] 2025-09-09 13:42:28,889 - root - INFO - [34mlr: 8.2186e-06 gnorm: 0.37 [35m[1 day, 20:07:00<1 day, 4:55:27][39m
+[titan] 2025-09-09 13:43:00,976 - root - INFO - [31mstep: 24165 [32mloss: 2.6934 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.72 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7547 [37mglobal_avg_top_loss: 1.9387
+[titan] 2025-09-09 13:43:00,976 - root - INFO - [34mlr: 8.2152e-06 gnorm: 0.35 [35m[1 day, 20:07:32<1 day, 4:54:54][39m
+[titan] 2025-09-09 13:43:32,860 - root - INFO - [31mstep: 24170 [32mloss: 2.6097 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.82 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7178 [37mglobal_avg_top_loss: 1.8919
+[titan] 2025-09-09 13:43:32,861 - root - INFO - [34mlr: 8.2118e-06 gnorm: 0.35 [35m[1 day, 20:08:04<1 day, 4:54:20][39m
+[titan] 2025-09-09 13:44:04,719 - root - INFO - [31mstep: 24175 [32mloss: 2.7924 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.22 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.8012 [37mglobal_avg_top_loss: 1.9913
+[titan] 2025-09-09 13:44:04,719 - root - INFO - [34mlr: 8.2084e-06 gnorm: 0.53 [35m[1 day, 20:08:36<1 day, 4:53:47][39m
+[titan] 2025-09-09 13:44:36,913 - root - INFO - [31mstep: 24180 [32mloss: 2.7876 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.10 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7980 [37mglobal_avg_top_loss: 1.9896
+[titan] 2025-09-09 13:44:36,914 - root - INFO - [34mlr: 8.2050e-06 gnorm: 0.37 [35m[1 day, 20:09:08<1 day, 4:53:13][39m
+[titan] 2025-09-09 13:45:08,972 - root - INFO - [31mstep: 24185 [32mloss: 2.7621 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,222 [36mtflops: 487.15 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7901 [37mglobal_avg_top_loss: 1.9720
+[titan] 2025-09-09 13:45:08,973 - root - INFO - [34mlr: 8.2016e-06 gnorm: 0.35 [35m[1 day, 20:09:40<1 day, 4:52:40][39m
+[titan] 2025-09-09 13:45:40,840 - root - INFO - [31mstep: 24190 [32mloss: 2.8281 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.07 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.8131 [37mglobal_avg_top_loss: 2.0150
+[titan] 2025-09-09 13:45:40,840 - root - INFO - [34mlr: 8.1982e-06 gnorm: 0.34 [35m[1 day, 20:10:12<1 day, 4:52:06][39m
+[titan] 2025-09-09 13:46:12,871 - root - INFO - [31mstep: 24195 [32mloss: 2.7647 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.57 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7854 [37mglobal_avg_top_loss: 1.9793
+[titan] 2025-09-09 13:46:12,871 - root - INFO - [34mlr: 8.1948e-06 gnorm: 0.36 [35m[1 day, 20:10:44<1 day, 4:51:33][39m
+[titan] 2025-09-09 13:46:38,454 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:46:44,925 - root - INFO - [31mstep: 24200 [32mloss: 2.7885 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.22 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.8007 [37mglobal_avg_top_loss: 1.9878
+[titan] 2025-09-09 13:46:44,925 - root - INFO - [34mlr: 8.1914e-06 gnorm: 0.42 [35m[1 day, 20:11:16<1 day, 4:51:00][39m
+[titan] 2025-09-09 13:47:16,947 - root - INFO - [31mstep: 24205 [32mloss: 2.7038 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.71 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7580 [37mglobal_avg_top_loss: 1.9458
+[titan] 2025-09-09 13:47:16,947 - root - INFO - [34mlr: 8.1880e-06 gnorm: 0.46 [35m[1 day, 20:11:48<1 day, 4:50:26][39m
+[titan] 2025-09-09 13:47:49,054 - root - INFO - [31mstep: 24210 [32mloss: 2.8731 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.42 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.8388 [37mglobal_avg_top_loss: 2.0343
+[titan] 2025-09-09 13:47:49,054 - root - INFO - [34mlr: 8.1846e-06 gnorm: 0.55 [35m[1 day, 20:12:20<1 day, 4:49:53][39m
+[titan] 2025-09-09 13:48:21,089 - root - INFO - [31mstep: 24215 [32mloss: 3.4447 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.51 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 1.1791 [37mglobal_avg_top_loss: 2.2656
+[titan] 2025-09-09 13:48:21,089 - root - INFO - [34mlr: 8.1813e-06 gnorm: 0.41 [35m[1 day, 20:12:53<1 day, 4:49:19][39m
+[titan] 2025-09-09 13:48:52,860 - root - INFO - [31mstep: 24220 [32mloss: 2.7942 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,314 [36mtflops: 491.55 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.8011 [37mglobal_avg_top_loss: 1.9931
+[titan] 2025-09-09 13:48:52,861 - root - INFO - [34mlr: 8.1779e-06 gnorm: 0.38 [35m[1 day, 20:13:24<1 day, 4:48:46][39m
+[titan] 2025-09-09 13:49:24,901 - root - INFO - [31mstep: 24225 [32mloss: 2.7667 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.42 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7918 [37mglobal_avg_top_loss: 1.9749
+[titan] 2025-09-09 13:49:24,902 - root - INFO - [34mlr: 8.1745e-06 gnorm: 0.35 [35m[1 day, 20:13:56<1 day, 4:48:12][39m
+[titan] 2025-09-09 13:49:57,046 - root - INFO - [31mstep: 24230 [32mloss: 2.7424 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,194 [36mtflops: 485.84 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7777 [37mglobal_avg_top_loss: 1.9647
+[titan] 2025-09-09 13:49:57,047 - root - INFO - [34mlr: 8.1711e-06 gnorm: 0.40 [35m[1 day, 20:14:28<1 day, 4:47:39][39m
+[titan] 2025-09-09 13:50:29,068 - root - INFO - [31mstep: 24235 [32mloss: 2.7678 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.72 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7885 [37mglobal_avg_top_loss: 1.9793
+[titan] 2025-09-09 13:50:29,068 - root - INFO - [34mlr: 8.1677e-06 gnorm: 0.35 [35m[1 day, 20:15:01<1 day, 4:47:06][39m
+[titan] 2025-09-09 13:51:00,968 - root - INFO - [31mstep: 24240 [32mloss: 2.8187 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.58 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8246 [37mglobal_avg_top_loss: 1.9941
+[titan] 2025-09-09 13:51:00,968 - root - INFO - [34mlr: 8.1643e-06 gnorm: 0.41 [35m[1 day, 20:15:32<1 day, 4:46:32][39m
+[titan] 2025-09-09 13:51:33,059 - root - INFO - [31mstep: 24245 [32mloss: 2.6846 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.66 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7499 [37mglobal_avg_top_loss: 1.9347
+[titan] 2025-09-09 13:51:33,060 - root - INFO - [34mlr: 8.1609e-06 gnorm: 0.38 [35m[1 day, 20:16:04<1 day, 4:45:59][39m
+[titan] 2025-09-09 13:51:58,559 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:52:04,983 - root - INFO - [31mstep: 24250 [32mloss: 2.7423 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.21 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7762 [37mglobal_avg_top_loss: 1.9661
+[titan] 2025-09-09 13:52:04,983 - root - INFO - [34mlr: 8.1575e-06 gnorm: 0.40 [35m[1 day, 20:16:36<1 day, 4:45:25][39m
+[titan] 2025-09-09 13:52:36,875 - root - INFO - [31mstep: 24255 [32mloss: 2.7311 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.70 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7760 [37mglobal_avg_top_loss: 1.9550
+[titan] 2025-09-09 13:52:36,875 - root - INFO - [34mlr: 8.1541e-06 gnorm: 0.53 [35m[1 day, 20:17:08<1 day, 4:44:52][39m
+[titan] 2025-09-09 13:53:09,121 - root - INFO - [31mstep: 24260 [32mloss: 2.7513 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,162 [36mtflops: 484.31 [35mmfu: 48.97%[39m [37mglobal_avg_ntp_loss: 0.7812 [37mglobal_avg_top_loss: 1.9701
+[titan] 2025-09-09 13:53:09,122 - root - INFO - [34mlr: 8.1508e-06 gnorm: 0.35 [35m[1 day, 20:17:41<1 day, 4:44:19][39m
+[titan] 2025-09-09 13:53:41,070 - root - INFO - [31mstep: 24265 [32mloss: 2.5740 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.84 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7042 [37mglobal_avg_top_loss: 1.8698
+[titan] 2025-09-09 13:53:41,070 - root - INFO - [34mlr: 8.1474e-06 gnorm: 0.36 [35m[1 day, 20:18:13<1 day, 4:43:45][39m
+[titan] 2025-09-09 13:54:13,046 - root - INFO - [31mstep: 24270 [32mloss: 2.7462 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.41 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7769 [37mglobal_avg_top_loss: 1.9692
+[titan] 2025-09-09 13:54:13,046 - root - INFO - [34mlr: 8.1440e-06 gnorm: 0.36 [35m[1 day, 20:18:44<1 day, 4:43:12][39m
+[titan] 2025-09-09 13:54:44,868 - root - INFO - [31mstep: 24275 [32mloss: 2.7525 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,297 [36mtflops: 490.77 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.7841 [37mglobal_avg_top_loss: 1.9684
+[titan] 2025-09-09 13:54:44,869 - root - INFO - [34mlr: 8.1406e-06 gnorm: 0.37 [35m[1 day, 20:19:16<1 day, 4:42:38][39m
+[titan] 2025-09-09 13:55:16,883 - root - INFO - [31mstep: 24280 [32mloss: 2.7304 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.82 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7685 [37mglobal_avg_top_loss: 1.9619
+[titan] 2025-09-09 13:55:16,883 - root - INFO - [34mlr: 8.1372e-06 gnorm: 0.35 [35m[1 day, 20:19:48<1 day, 4:42:05][39m
+[titan] 2025-09-09 13:55:48,743 - root - INFO - [31mstep: 24285 [32mloss: 2.7590 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.19 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7876 [37mglobal_avg_top_loss: 1.9713
+[titan] 2025-09-09 13:55:48,743 - root - INFO - [34mlr: 8.1338e-06 gnorm: 0.35 [35m[1 day, 20:20:20<1 day, 4:41:31][39m
+[titan] 2025-09-09 13:56:20,549 - root - INFO - [31mstep: 24290 [32mloss: 2.7280 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.03 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7720 [37mglobal_avg_top_loss: 1.9559
+[titan] 2025-09-09 13:56:20,549 - root - INFO - [34mlr: 8.1305e-06 gnorm: 0.46 [35m[1 day, 20:20:52<1 day, 4:40:58][39m
+[titan] 2025-09-09 13:56:52,585 - root - INFO - [31mstep: 24295 [32mloss: 3.7614 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.49 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 1.3523 [37mglobal_avg_top_loss: 2.4091
+[titan] 2025-09-09 13:56:52,585 - root - INFO - [34mlr: 8.1271e-06 gnorm: 0.44 [35m[1 day, 20:21:24<1 day, 4:40:24][39m
+[titan] 2025-09-09 13:57:18,113 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 13:57:24,604 - root - INFO - [31mstep: 24300 [32mloss: 2.7631 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.75 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7887 [37mglobal_avg_top_loss: 1.9745
+[titan] 2025-09-09 13:57:24,604 - root - INFO - [34mlr: 8.1237e-06 gnorm: 0.35 [35m[1 day, 20:21:56<1 day, 4:39:51][39m
+[titan] 2025-09-09 13:57:56,423 - root - INFO - [31mstep: 24305 [32mloss: 2.7928 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.82 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7999 [37mglobal_avg_top_loss: 1.9929
+[titan] 2025-09-09 13:57:56,423 - root - INFO - [34mlr: 8.1203e-06 gnorm: 0.35 [35m[1 day, 20:22:28<1 day, 4:39:17][39m
+[titan] 2025-09-09 13:58:28,385 - root - INFO - [31mstep: 24310 [32mloss: 3.2507 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.63 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 1.0666 [37mglobal_avg_top_loss: 2.1841
+[titan] 2025-09-09 13:58:28,385 - root - INFO - [34mlr: 8.1169e-06 gnorm: 0.42 [35m[1 day, 20:23:00<1 day, 4:38:44][39m
+[titan] 2025-09-09 13:59:00,442 - root - INFO - [31mstep: 24315 [32mloss: 2.8062 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,222 [36mtflops: 487.18 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.8062 [37mglobal_avg_top_loss: 2.0000
+[titan] 2025-09-09 13:59:00,442 - root - INFO - [34mlr: 8.1135e-06 gnorm: 0.35 [35m[1 day, 20:23:32<1 day, 4:38:10][39m
+[titan] 2025-09-09 13:59:32,433 - root - INFO - [31mstep: 24320 [32mloss: 2.6860 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.18 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7554 [37mglobal_avg_top_loss: 1.9306
+[titan] 2025-09-09 13:59:32,433 - root - INFO - [34mlr: 8.1102e-06 gnorm: 0.35 [35m[1 day, 20:24:04<1 day, 4:37:37][39m
+[titan] 2025-09-09 14:00:04,473 - root - INFO - [31mstep: 24325 [32mloss: 2.7479 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.43 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7839 [37mglobal_avg_top_loss: 1.9641
+[titan] 2025-09-09 14:00:04,474 - root - INFO - [34mlr: 8.1068e-06 gnorm: 0.35 [35m[1 day, 20:24:36<1 day, 4:37:04][39m
+[titan] 2025-09-09 14:00:36,320 - root - INFO - [31mstep: 24330 [32mloss: 2.7769 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.39 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7931 [37mglobal_avg_top_loss: 1.9839
+[titan] 2025-09-09 14:00:36,321 - root - INFO - [34mlr: 8.1034e-06 gnorm: 0.59 [35m[1 day, 20:25:08<1 day, 4:36:30][39m
+[titan] 2025-09-09 14:01:08,166 - root - INFO - [31mstep: 24335 [32mloss: 2.6969 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.40 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7575 [37mglobal_avg_top_loss: 1.9394
+[titan] 2025-09-09 14:01:08,167 - root - INFO - [34mlr: 8.1000e-06 gnorm: 0.46 [35m[1 day, 20:25:40<1 day, 4:35:57][39m
+[titan] 2025-09-09 14:01:40,149 - root - INFO - [31mstep: 24340 [32mloss: 2.6555 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.31 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7437 [37mglobal_avg_top_loss: 1.9118
+[titan] 2025-09-09 14:01:40,149 - root - INFO - [34mlr: 8.0966e-06 gnorm: 0.35 [35m[1 day, 20:26:12<1 day, 4:35:23][39m
+[titan] 2025-09-09 14:02:12,035 - root - INFO - [31mstep: 24345 [32mloss: 3.1250 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.78 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 1.0039 [37mglobal_avg_top_loss: 2.1211
+[titan] 2025-09-09 14:02:12,035 - root - INFO - [34mlr: 8.0933e-06 gnorm: 0.41 [35m[1 day, 20:26:43<1 day, 4:34:50][39m
+[titan] 2025-09-09 14:02:37,510 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:02:43,884 - root - INFO - [31mstep: 24350 [32mloss: 2.7159 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,289 [36mtflops: 490.36 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7667 [37mglobal_avg_top_loss: 1.9492
+[titan] 2025-09-09 14:02:43,885 - root - INFO - [34mlr: 8.0899e-06 gnorm: 0.35 [35m[1 day, 20:27:15<1 day, 4:34:16][39m
+[titan] 2025-09-09 14:03:15,714 - root - INFO - [31mstep: 24355 [32mloss: 2.6675 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.66 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7449 [37mglobal_avg_top_loss: 1.9226
+[titan] 2025-09-09 14:03:15,714 - root - INFO - [34mlr: 8.0865e-06 gnorm: 0.35 [35m[1 day, 20:27:47<1 day, 4:33:43][39m
+[titan] 2025-09-09 14:03:47,788 - root - INFO - [31mstep: 24360 [32mloss: 2.7716 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.92 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7906 [37mglobal_avg_top_loss: 1.9810
+[titan] 2025-09-09 14:03:47,788 - root - INFO - [34mlr: 8.0831e-06 gnorm: 0.37 [35m[1 day, 20:28:19<1 day, 4:33:09][39m
+[titan] 2025-09-09 14:04:19,866 - root - INFO - [31mstep: 24365 [32mloss: 2.8802 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,215 [36mtflops: 486.86 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.8467 [37mglobal_avg_top_loss: 2.0336
+[titan] 2025-09-09 14:04:19,866 - root - INFO - [34mlr: 8.0797e-06 gnorm: 0.36 [35m[1 day, 20:28:51<1 day, 4:32:36][39m
+[titan] 2025-09-09 14:04:51,742 - root - INFO - [31mstep: 24370 [32mloss: 2.7575 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.94 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7834 [37mglobal_avg_top_loss: 1.9741
+[titan] 2025-09-09 14:04:51,742 - root - INFO - [34mlr: 8.0764e-06 gnorm: 0.37 [35m[1 day, 20:29:23<1 day, 4:32:02][39m
+[titan] 2025-09-09 14:05:23,958 - root - INFO - [31mstep: 24375 [32mloss: 3.2172 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,172 [36mtflops: 484.78 [35mmfu: 49.02%[39m [37mglobal_avg_ntp_loss: 1.0461 [37mglobal_avg_top_loss: 2.1712
+[titan] 2025-09-09 14:05:23,958 - root - INFO - [34mlr: 8.0730e-06 gnorm: 0.37 [35m[1 day, 20:29:55<1 day, 4:31:29][39m
+[titan] 2025-09-09 14:05:55,934 - root - INFO - [31mstep: 24380 [32mloss: 2.8040 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.41 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.8065 [37mglobal_avg_top_loss: 1.9975
+[titan] 2025-09-09 14:05:55,934 - root - INFO - [34mlr: 8.0696e-06 gnorm: 0.34 [35m[1 day, 20:30:27<1 day, 4:30:56][39m
+[titan] 2025-09-09 14:06:27,730 - root - INFO - [31mstep: 24385 [32mloss: 2.7604 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,306 [36mtflops: 491.18 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7856 [37mglobal_avg_top_loss: 1.9748
+[titan] 2025-09-09 14:06:27,730 - root - INFO - [34mlr: 8.0662e-06 gnorm: 0.36 [35m[1 day, 20:30:59<1 day, 4:30:22][39m
+[titan] 2025-09-09 14:06:59,607 - root - INFO - [31mstep: 24390 [32mloss: 3.1832 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.93 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 1.0331 [37mglobal_avg_top_loss: 2.1501
+[titan] 2025-09-09 14:06:59,607 - root - INFO - [34mlr: 8.0629e-06 gnorm: 0.37 [35m[1 day, 20:31:31<1 day, 4:29:49][39m
+[titan] 2025-09-09 14:07:31,570 - root - INFO - [31mstep: 24395 [32mloss: 3.1253 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.61 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.9935 [37mglobal_avg_top_loss: 2.1319
+[titan] 2025-09-09 14:07:31,570 - root - INFO - [34mlr: 8.0595e-06 gnorm: 0.34 [35m[1 day, 20:32:03<1 day, 4:29:15][39m
+[titan] 2025-09-09 14:07:57,159 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:08:03,581 - root - INFO - [31mstep: 24400 [32mloss: 2.7249 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.87 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7699 [37mglobal_avg_top_loss: 1.9550
+[titan] 2025-09-09 14:08:03,582 - root - INFO - [34mlr: 8.0561e-06 gnorm: 0.35 [35m[1 day, 20:32:35<1 day, 4:28:42][39m
+[titan] 2025-09-09 14:08:35,429 - root - INFO - [31mstep: 24405 [32mloss: 2.7365 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,289 [36mtflops: 490.38 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7742 [37mglobal_avg_top_loss: 1.9623
+[titan] 2025-09-09 14:08:35,430 - root - INFO - [34mlr: 8.0527e-06 gnorm: 0.36 [35m[1 day, 20:33:07<1 day, 4:28:08][39m
+[titan] 2025-09-09 14:09:07,322 - root - INFO - [31mstep: 24410 [32mloss: 2.5786 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.69 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7040 [37mglobal_avg_top_loss: 1.8746
+[titan] 2025-09-09 14:09:07,322 - root - INFO - [34mlr: 8.0494e-06 gnorm: 0.34 [35m[1 day, 20:33:39<1 day, 4:27:35][39m
+[titan] 2025-09-09 14:09:38,999 - root - INFO - [31mstep: 24415 [32mloss: 2.7584 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,345 [36mtflops: 493.01 [35mmfu: 49.85%[39m [37mglobal_avg_ntp_loss: 0.7876 [37mglobal_avg_top_loss: 1.9708
+[titan] 2025-09-09 14:09:39,000 - root - INFO - [34mlr: 8.0460e-06 gnorm: 0.36 [35m[1 day, 20:34:10<1 day, 4:27:01][39m
+[titan] 2025-09-09 14:10:10,907 - root - INFO - [31mstep: 24420 [32mloss: 2.7637 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.46 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7908 [37mglobal_avg_top_loss: 1.9729
+[titan] 2025-09-09 14:10:10,907 - root - INFO - [34mlr: 8.0426e-06 gnorm: 0.35 [35m[1 day, 20:34:42<1 day, 4:26:28][39m
+[titan] 2025-09-09 14:10:42,813 - root - INFO - [31mstep: 24425 [32mloss: 3.0768 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.49 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.9865 [37mglobal_avg_top_loss: 2.0903
+[titan] 2025-09-09 14:10:42,813 - root - INFO - [34mlr: 8.0393e-06 gnorm: 0.38 [35m[1 day, 20:35:14<1 day, 4:25:54][39m
+[titan] 2025-09-09 14:11:14,959 - root - INFO - [31mstep: 24430 [32mloss: 2.6405 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,194 [36mtflops: 485.82 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7345 [37mglobal_avg_top_loss: 1.9060
+[titan] 2025-09-09 14:11:14,960 - root - INFO - [34mlr: 8.0359e-06 gnorm: 0.37 [35m[1 day, 20:35:46<1 day, 4:25:21][39m
+[titan] 2025-09-09 14:11:46,820 - root - INFO - [31mstep: 24435 [32mloss: 2.7144 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.18 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7668 [37mglobal_avg_top_loss: 1.9476
+[titan] 2025-09-09 14:11:46,820 - root - INFO - [34mlr: 8.0325e-06 gnorm: 0.36 [35m[1 day, 20:36:18<1 day, 4:24:48][39m
+[titan] 2025-09-09 14:12:18,694 - root - INFO - [31mstep: 24440 [32mloss: 2.7474 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.96 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7800 [37mglobal_avg_top_loss: 1.9674
+[titan] 2025-09-09 14:12:18,695 - root - INFO - [34mlr: 8.0291e-06 gnorm: 0.38 [35m[1 day, 20:36:50<1 day, 4:24:14][39m
+[titan] 2025-09-09 14:12:50,660 - root - INFO - [31mstep: 24445 [32mloss: 2.7675 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.57 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7923 [37mglobal_avg_top_loss: 1.9751
+[titan] 2025-09-09 14:12:50,660 - root - INFO - [34mlr: 8.0258e-06 gnorm: 0.35 [35m[1 day, 20:37:22<1 day, 4:23:41][39m
+[titan] 2025-09-09 14:13:16,180 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:13:22,525 - root - INFO - [31mstep: 24450 [32mloss: 2.6339 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,284 [36mtflops: 490.12 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7252 [37mglobal_avg_top_loss: 1.9088
+[titan] 2025-09-09 14:13:22,525 - root - INFO - [34mlr: 8.0224e-06 gnorm: 0.53 [35m[1 day, 20:37:54<1 day, 4:23:07][39m
+[titan] 2025-09-09 14:13:54,340 - root - INFO - [31mstep: 24455 [32mloss: 3.2545 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.88 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 1.0613 [37mglobal_avg_top_loss: 2.1931
+[titan] 2025-09-09 14:13:54,340 - root - INFO - [34mlr: 8.0190e-06 gnorm: 0.39 [35m[1 day, 20:38:26<1 day, 4:22:34][39m
+[titan] 2025-09-09 14:14:26,271 - root - INFO - [31mstep: 24460 [32mloss: 2.7233 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.10 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7694 [37mglobal_avg_top_loss: 1.9539
+[titan] 2025-09-09 14:14:26,272 - root - INFO - [34mlr: 8.0157e-06 gnorm: 0.36 [35m[1 day, 20:38:58<1 day, 4:22:00][39m
+[titan] 2025-09-09 14:14:58,325 - root - INFO - [31mstep: 24465 [32mloss: 2.7286 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.23 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7700 [37mglobal_avg_top_loss: 1.9586
+[titan] 2025-09-09 14:14:58,325 - root - INFO - [34mlr: 8.0123e-06 gnorm: 0.36 [35m[1 day, 20:39:30<1 day, 4:21:27][39m
+[titan] 2025-09-09 14:15:30,262 - root - INFO - [31mstep: 24470 [32mloss: 3.2392 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.01 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 1.0598 [37mglobal_avg_top_loss: 2.1794
+[titan] 2025-09-09 14:15:30,262 - root - INFO - [34mlr: 8.0089e-06 gnorm: 0.39 [35m[1 day, 20:40:02<1 day, 4:20:53][39m
+[titan] 2025-09-09 14:16:02,029 - root - INFO - [31mstep: 24475 [32mloss: 2.7670 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,315 [36mtflops: 491.62 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 0.7878 [37mglobal_avg_top_loss: 1.9792
+[titan] 2025-09-09 14:16:02,029 - root - INFO - [34mlr: 8.0056e-06 gnorm: 0.36 [35m[1 day, 20:40:33<1 day, 4:20:20][39m
+[titan] 2025-09-09 14:16:33,793 - root - INFO - [31mstep: 24480 [32mloss: 2.7746 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,316 [36mtflops: 491.67 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 0.7928 [37mglobal_avg_top_loss: 1.9818
+[titan] 2025-09-09 14:16:33,794 - root - INFO - [34mlr: 8.0022e-06 gnorm: 0.35 [35m[1 day, 20:41:05<1 day, 4:19:46][39m
+[titan] 2025-09-09 14:17:05,709 - root - INFO - [31mstep: 24485 [32mloss: 2.8442 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.33 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.8201 [37mglobal_avg_top_loss: 2.0241
+[titan] 2025-09-09 14:17:05,710 - root - INFO - [34mlr: 7.9988e-06 gnorm: 0.37 [35m[1 day, 20:41:37<1 day, 4:19:13][39m
+[titan] 2025-09-09 14:17:37,523 - root - INFO - [31mstep: 24490 [32mloss: 2.7197 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.91 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7665 [37mglobal_avg_top_loss: 1.9532
+[titan] 2025-09-09 14:17:37,523 - root - INFO - [34mlr: 7.9955e-06 gnorm: 0.35 [35m[1 day, 20:42:09<1 day, 4:18:39][39m
+[titan] 2025-09-09 14:18:09,650 - root - INFO - [31mstep: 24495 [32mloss: 2.8103 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,200 [36mtflops: 486.11 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.8092 [37mglobal_avg_top_loss: 2.0011
+[titan] 2025-09-09 14:18:09,650 - root - INFO - [34mlr: 7.9921e-06 gnorm: 0.34 [35m[1 day, 20:42:41<1 day, 4:18:06][39m
+[titan] 2025-09-09 14:18:35,131 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:18:41,666 - root - INFO - [31mstep: 24500 [32mloss: 2.7095 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.80 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7635 [37mglobal_avg_top_loss: 1.9460
+[titan] 2025-09-09 14:18:41,666 - root - INFO - [34mlr: 7.9887e-06 gnorm: 0.35 [35m[1 day, 20:43:13<1 day, 4:17:33][39m
+[titan] 2025-09-09 14:19:13,423 - root - INFO - [31mstep: 24505 [32mloss: 3.1674 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,319 [36mtflops: 491.78 [35mmfu: 49.72%[39m [37mglobal_avg_ntp_loss: 1.0246 [37mglobal_avg_top_loss: 2.1428
+[titan] 2025-09-09 14:19:13,423 - root - INFO - [34mlr: 7.9854e-06 gnorm: 0.36 [35m[1 day, 20:43:45<1 day, 4:16:59][39m
+[titan] 2025-09-09 14:19:45,316 - root - INFO - [31mstep: 24510 [32mloss: 2.7075 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.67 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7621 [37mglobal_avg_top_loss: 1.9453
+[titan] 2025-09-09 14:19:45,317 - root - INFO - [34mlr: 7.9820e-06 gnorm: 0.34 [35m[1 day, 20:44:17<1 day, 4:16:26][39m
+[titan] 2025-09-09 14:20:17,117 - root - INFO - [31mstep: 24515 [32mloss: 2.6781 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,305 [36mtflops: 491.11 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7481 [37mglobal_avg_top_loss: 1.9299
+[titan] 2025-09-09 14:20:17,117 - root - INFO - [34mlr: 7.9787e-06 gnorm: 0.38 [35m[1 day, 20:44:48<1 day, 4:15:52][39m
+[titan] 2025-09-09 14:20:49,105 - root - INFO - [31mstep: 24520 [32mloss: 2.5948 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.22 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7119 [37mglobal_avg_top_loss: 1.8829
+[titan] 2025-09-09 14:20:49,106 - root - INFO - [34mlr: 7.9753e-06 gnorm: 0.41 [35m[1 day, 20:45:20<1 day, 4:15:19][39m
+[titan] 2025-09-09 14:21:21,055 - root - INFO - [31mstep: 24525 [32mloss: 2.8212 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.81 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.8133 [37mglobal_avg_top_loss: 2.0079
+[titan] 2025-09-09 14:21:21,056 - root - INFO - [34mlr: 7.9719e-06 gnorm: 0.38 [35m[1 day, 20:45:52<1 day, 4:14:45][39m
+[titan] 2025-09-09 14:21:53,160 - root - INFO - [31mstep: 24530 [32mloss: 2.7070 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.46 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7627 [37mglobal_avg_top_loss: 1.9444
+[titan] 2025-09-09 14:21:53,160 - root - INFO - [34mlr: 7.9686e-06 gnorm: 0.38 [35m[1 day, 20:46:25<1 day, 4:14:12][39m
+[titan] 2025-09-09 14:22:25,241 - root - INFO - [31mstep: 24535 [32mloss: 2.7028 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,214 [36mtflops: 486.80 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7609 [37mglobal_avg_top_loss: 1.9418
+[titan] 2025-09-09 14:22:25,242 - root - INFO - [34mlr: 7.9652e-06 gnorm: 0.43 [35m[1 day, 20:46:57<1 day, 4:13:39][39m
+[titan] 2025-09-09 14:22:57,345 - root - INFO - [31mstep: 24540 [32mloss: 2.7080 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.47 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7649 [37mglobal_avg_top_loss: 1.9431
+[titan] 2025-09-09 14:22:57,345 - root - INFO - [34mlr: 7.9618e-06 gnorm: 0.34 [35m[1 day, 20:47:29<1 day, 4:13:05][39m
+[titan] 2025-09-09 14:23:29,243 - root - INFO - [31mstep: 24545 [32mloss: 2.7602 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.60 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7864 [37mglobal_avg_top_loss: 1.9739
+[titan] 2025-09-09 14:23:29,244 - root - INFO - [34mlr: 7.9585e-06 gnorm: 0.36 [35m[1 day, 20:48:01<1 day, 4:12:32][39m
+[titan] 2025-09-09 14:23:54,694 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:24:01,078 - root - INFO - [31mstep: 24550 [32mloss: 2.7308 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.59 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7721 [37mglobal_avg_top_loss: 1.9586
+[titan] 2025-09-09 14:24:01,078 - root - INFO - [34mlr: 7.9551e-06 gnorm: 0.42 [35m[1 day, 20:48:32<1 day, 4:11:58][39m
+[titan] 2025-09-09 14:24:33,037 - root - INFO - [31mstep: 24555 [32mloss: 2.7520 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.67 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7840 [37mglobal_avg_top_loss: 1.9680
+[titan] 2025-09-09 14:24:33,037 - root - INFO - [34mlr: 7.9518e-06 gnorm: 0.36 [35m[1 day, 20:49:04<1 day, 4:11:25][39m
+[titan] 2025-09-09 14:25:04,856 - root - INFO - [31mstep: 24560 [32mloss: 2.7458 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.81 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7833 [37mglobal_avg_top_loss: 1.9625
+[titan] 2025-09-09 14:25:04,857 - root - INFO - [34mlr: 7.9484e-06 gnorm: 0.38 [35m[1 day, 20:49:36<1 day, 4:10:51][39m
+[titan] 2025-09-09 14:25:36,808 - root - INFO - [31mstep: 24565 [32mloss: 2.7147 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.78 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7641 [37mglobal_avg_top_loss: 1.9505
+[titan] 2025-09-09 14:25:36,809 - root - INFO - [34mlr: 7.9451e-06 gnorm: 0.35 [35m[1 day, 20:50:08<1 day, 4:10:18][39m
+[titan] 2025-09-09 14:26:08,795 - root - INFO - [31mstep: 24570 [32mloss: 2.6328 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,245 [36mtflops: 488.25 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7274 [37mglobal_avg_top_loss: 1.9054
+[titan] 2025-09-09 14:26:08,795 - root - INFO - [34mlr: 7.9417e-06 gnorm: 0.37 [35m[1 day, 20:50:40<1 day, 4:09:44][39m
+[titan] 2025-09-09 14:26:40,892 - root - INFO - [31mstep: 24575 [32mloss: 2.8019 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.56 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.8043 [37mglobal_avg_top_loss: 1.9976
+[titan] 2025-09-09 14:26:40,893 - root - INFO - [34mlr: 7.9383e-06 gnorm: 0.36 [35m[1 day, 20:51:12<1 day, 4:09:11][39m
+[titan] 2025-09-09 14:26:47,479 - root - INFO - Dumping profiler traces at step 24576
+[titan] 2025-09-09 14:26:47,548 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 14:27:12,818 - root - INFO - [31mstep: 24580 [32mloss: 2.7852 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.18 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7954 [37mglobal_avg_top_loss: 1.9898
+[titan] 2025-09-09 14:27:12,818 - root - INFO - [34mlr: 7.9350e-06 gnorm: 0.38 [35m[1 day, 20:51:44<1 day, 4:08:38][39m
+[titan] 2025-09-09 14:27:44,784 - root - INFO - [31mstep: 24585 [32mloss: 3.1717 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.57 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 1.0256 [37mglobal_avg_top_loss: 2.1461
+[titan] 2025-09-09 14:27:44,784 - root - INFO - [34mlr: 7.9316e-06 gnorm: 0.34 [35m[1 day, 20:52:16<1 day, 4:08:04][39m
+[titan] 2025-09-09 14:28:16,876 - root - INFO - [31mstep: 24590 [32mloss: 2.6412 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.64 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7344 [37mglobal_avg_top_loss: 1.9068
+[titan] 2025-09-09 14:28:16,877 - root - INFO - [34mlr: 7.9283e-06 gnorm: 0.34 [35m[1 day, 20:52:48<1 day, 4:07:31][39m
+[titan] 2025-09-09 14:28:48,799 - root - INFO - [31mstep: 24595 [32mloss: 2.6406 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.23 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7305 [37mglobal_avg_top_loss: 1.9101
+[titan] 2025-09-09 14:28:48,799 - root - INFO - [34mlr: 7.9249e-06 gnorm: 0.36 [35m[1 day, 20:53:20<1 day, 4:06:58][39m
+[titan] 2025-09-09 14:29:14,287 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:29:20,650 - root - INFO - [31mstep: 24600 [32mloss: 2.6756 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.32 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7485 [37mglobal_avg_top_loss: 1.9271
+[titan] 2025-09-09 14:29:20,651 - root - INFO - [34mlr: 7.9216e-06 gnorm: 0.35 [35m[1 day, 20:53:52<1 day, 4:06:24][39m
+[titan] 2025-09-09 14:29:52,464 - root - INFO - [31mstep: 24605 [32mloss: 2.7393 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.90 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7822 [37mglobal_avg_top_loss: 1.9571
+[titan] 2025-09-09 14:29:52,465 - root - INFO - [34mlr: 7.9182e-06 gnorm: 0.36 [35m[1 day, 20:54:24<1 day, 4:05:51][39m
+[titan] 2025-09-09 14:30:24,386 - root - INFO - [31mstep: 24610 [32mloss: 2.7336 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.24 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7730 [37mglobal_avg_top_loss: 1.9606
+[titan] 2025-09-09 14:30:24,386 - root - INFO - [34mlr: 7.9148e-06 gnorm: 0.36 [35m[1 day, 20:54:56<1 day, 4:05:17][39m
+[titan] 2025-09-09 14:30:56,469 - root - INFO - [31mstep: 24615 [32mloss: 2.7362 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,214 [36mtflops: 486.79 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7740 [37mglobal_avg_top_loss: 1.9622
+[titan] 2025-09-09 14:30:56,469 - root - INFO - [34mlr: 7.9115e-06 gnorm: 0.35 [35m[1 day, 20:55:28<1 day, 4:04:44][39m
+[titan] 2025-09-09 14:31:28,485 - root - INFO - [31mstep: 24620 [32mloss: 2.7094 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.80 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7571 [37mglobal_avg_top_loss: 1.9523
+[titan] 2025-09-09 14:31:28,485 - root - INFO - [34mlr: 7.9081e-06 gnorm: 0.54 [35m[1 day, 20:56:00<1 day, 4:04:10][39m
+[titan] 2025-09-09 14:32:00,388 - root - INFO - [31mstep: 24625 [32mloss: 2.6534 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.53 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7373 [37mglobal_avg_top_loss: 1.9160
+[titan] 2025-09-09 14:32:00,388 - root - INFO - [34mlr: 7.9048e-06 gnorm: 0.37 [35m[1 day, 20:56:32<1 day, 4:03:37][39m
+[titan] 2025-09-09 14:32:32,442 - root - INFO - [31mstep: 24630 [32mloss: 2.7817 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.21 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7958 [37mglobal_avg_top_loss: 1.9859
+[titan] 2025-09-09 14:32:32,443 - root - INFO - [34mlr: 7.9014e-06 gnorm: 0.39 [35m[1 day, 20:57:04<1 day, 4:03:04][39m
+[titan] 2025-09-09 14:33:04,238 - root - INFO - [31mstep: 24635 [32mloss: 2.7515 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,306 [36mtflops: 491.18 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7818 [37mglobal_avg_top_loss: 1.9697
+[titan] 2025-09-09 14:33:04,239 - root - INFO - [34mlr: 7.8981e-06 gnorm: 0.34 [35m[1 day, 20:57:36<1 day, 4:02:30][39m
+[titan] 2025-09-09 14:33:36,037 - root - INFO - [31mstep: 24640 [32mloss: 2.7131 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,305 [36mtflops: 491.13 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7644 [37mglobal_avg_top_loss: 1.9488
+[titan] 2025-09-09 14:33:36,038 - root - INFO - [34mlr: 7.8947e-06 gnorm: 0.35 [35m[1 day, 20:58:07<1 day, 4:01:57][39m
+[titan] 2025-09-09 14:34:07,907 - root - INFO - [31mstep: 24645 [32mloss: 2.7299 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.05 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7729 [37mglobal_avg_top_loss: 1.9570
+[titan] 2025-09-09 14:34:07,907 - root - INFO - [34mlr: 7.8914e-06 gnorm: 0.34 [35m[1 day, 20:58:39<1 day, 4:01:23][39m
+[titan] 2025-09-09 14:34:33,349 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:34:39,733 - root - INFO - [31mstep: 24650 [32mloss: 2.7870 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.71 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.7977 [37mglobal_avg_top_loss: 1.9893
+[titan] 2025-09-09 14:34:39,733 - root - INFO - [34mlr: 7.8880e-06 gnorm: 0.36 [35m[1 day, 20:59:11<1 day, 4:00:50][39m
+[titan] 2025-09-09 14:35:11,660 - root - INFO - [31mstep: 24655 [32mloss: 2.7815 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.16 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7969 [37mglobal_avg_top_loss: 1.9845
+[titan] 2025-09-09 14:35:11,661 - root - INFO - [34mlr: 7.8847e-06 gnorm: 0.34 [35m[1 day, 20:59:43<1 day, 4:00:16][39m
+[titan] 2025-09-09 14:35:43,537 - root - INFO - [31mstep: 24660 [32mloss: 2.7108 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.93 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7669 [37mglobal_avg_top_loss: 1.9439
+[titan] 2025-09-09 14:35:43,538 - root - INFO - [34mlr: 7.8813e-06 gnorm: 0.34 [35m[1 day, 21:00:15<1 day, 3:59:43][39m
+[titan] 2025-09-09 14:36:15,608 - root - INFO - [31mstep: 24665 [32mloss: 2.7022 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.96 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7633 [37mglobal_avg_top_loss: 1.9389
+[titan] 2025-09-09 14:36:15,609 - root - INFO - [34mlr: 7.8780e-06 gnorm: 0.36 [35m[1 day, 21:00:47<1 day, 3:59:09][39m
+[titan] 2025-09-09 14:36:47,445 - root - INFO - [31mstep: 24670 [32mloss: 2.6612 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.55 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7436 [37mglobal_avg_top_loss: 1.9175
+[titan] 2025-09-09 14:36:47,446 - root - INFO - [34mlr: 7.8746e-06 gnorm: 0.33 [35m[1 day, 21:01:19<1 day, 3:58:36][39m
+[titan] 2025-09-09 14:37:19,288 - root - INFO - [31mstep: 24675 [32mloss: 2.7244 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.46 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7718 [37mglobal_avg_top_loss: 1.9526
+[titan] 2025-09-09 14:37:19,288 - root - INFO - [34mlr: 7.8713e-06 gnorm: 0.37 [35m[1 day, 21:01:51<1 day, 3:58:03][39m
+[titan] 2025-09-09 14:37:51,363 - root - INFO - [31mstep: 24680 [32mloss: 2.7588 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.90 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7815 [37mglobal_avg_top_loss: 1.9773
+[titan] 2025-09-09 14:37:51,363 - root - INFO - [34mlr: 7.8679e-06 gnorm: 0.35 [35m[1 day, 21:02:23<1 day, 3:57:29][39m
+[titan] 2025-09-09 14:38:23,298 - root - INFO - [31mstep: 24685 [32mloss: 2.7681 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.04 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7891 [37mglobal_avg_top_loss: 1.9790
+[titan] 2025-09-09 14:38:23,299 - root - INFO - [34mlr: 7.8646e-06 gnorm: 0.39 [35m[1 day, 21:02:55<1 day, 3:56:56][39m
+[titan] 2025-09-09 14:38:55,253 - root - INFO - [31mstep: 24690 [32mloss: 2.7401 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.73 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7722 [37mglobal_avg_top_loss: 1.9679
+[titan] 2025-09-09 14:38:55,253 - root - INFO - [34mlr: 7.8613e-06 gnorm: 0.37 [35m[1 day, 21:03:27<1 day, 3:56:22][39m
+[titan] 2025-09-09 14:39:26,993 - root - INFO - [31mstep: 24695 [32mloss: 2.8241 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,324 [36mtflops: 492.05 [35mmfu: 49.75%[39m [37mglobal_avg_ntp_loss: 0.8151 [37mglobal_avg_top_loss: 2.0090
+[titan] 2025-09-09 14:39:26,993 - root - INFO - [34mlr: 7.8579e-06 gnorm: 0.35 [35m[1 day, 21:03:58<1 day, 3:55:49][39m
+[titan] 2025-09-09 14:39:52,703 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:39:59,087 - root - INFO - [31mstep: 24700 [32mloss: 2.8121 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,210 [36mtflops: 486.61 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7910 [37mglobal_avg_top_loss: 2.0212
+[titan] 2025-09-09 14:39:59,088 - root - INFO - [34mlr: 7.8546e-06 gnorm: 3.70 [35m[1 day, 21:04:30<1 day, 3:55:16][39m
+[titan] 2025-09-09 14:40:30,950 - root - INFO - [31mstep: 24705 [32mloss: 2.7303 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,284 [36mtflops: 490.15 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7716 [37mglobal_avg_top_loss: 1.9588
+[titan] 2025-09-09 14:40:30,950 - root - INFO - [34mlr: 7.8512e-06 gnorm: 0.35 [35m[1 day, 21:05:02<1 day, 3:54:42][39m
+[titan] 2025-09-09 14:41:03,059 - root - INFO - [31mstep: 24710 [32mloss: 2.6409 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.38 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7367 [37mglobal_avg_top_loss: 1.9042
+[titan] 2025-09-09 14:41:03,060 - root - INFO - [34mlr: 7.8479e-06 gnorm: 0.35 [35m[1 day, 21:05:34<1 day, 3:54:09][39m
+[titan] 2025-09-09 14:41:34,968 - root - INFO - [31mstep: 24715 [32mloss: 2.7260 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.44 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7745 [37mglobal_avg_top_loss: 1.9516
+[titan] 2025-09-09 14:41:34,968 - root - INFO - [34mlr: 7.8445e-06 gnorm: 0.35 [35m[1 day, 21:06:06<1 day, 3:53:35][39m
+[titan] 2025-09-09 14:42:06,966 - root - INFO - [31mstep: 24720 [32mloss: 2.7557 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.07 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7831 [37mglobal_avg_top_loss: 1.9726
+[titan] 2025-09-09 14:42:06,967 - root - INFO - [34mlr: 7.8412e-06 gnorm: 0.36 [35m[1 day, 21:06:38<1 day, 3:53:02][39m
+[titan] 2025-09-09 14:42:38,908 - root - INFO - [31mstep: 24725 [32mloss: 2.6834 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.93 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7521 [37mglobal_avg_top_loss: 1.9313
+[titan] 2025-09-09 14:42:38,909 - root - INFO - [34mlr: 7.8378e-06 gnorm: 0.34 [35m[1 day, 21:07:10<1 day, 3:52:29][39m
+[titan] 2025-09-09 14:43:10,814 - root - INFO - [31mstep: 24730 [32mloss: 2.7838 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.49 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7999 [37mglobal_avg_top_loss: 1.9839
+[titan] 2025-09-09 14:43:10,815 - root - INFO - [34mlr: 7.8345e-06 gnorm: 0.35 [35m[1 day, 21:07:42<1 day, 3:51:55][39m
+[titan] 2025-09-09 14:43:42,705 - root - INFO - [31mstep: 24735 [32mloss: 2.7348 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.72 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7740 [37mglobal_avg_top_loss: 1.9608
+[titan] 2025-09-09 14:43:42,705 - root - INFO - [34mlr: 7.8312e-06 gnorm: 0.35 [35m[1 day, 21:08:14<1 day, 3:51:22][39m
+[titan] 2025-09-09 14:44:14,645 - root - INFO - [31mstep: 24740 [32mloss: 2.7169 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,260 [36mtflops: 488.97 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7670 [37mglobal_avg_top_loss: 1.9499
+[titan] 2025-09-09 14:44:14,645 - root - INFO - [34mlr: 7.8278e-06 gnorm: 0.34 [35m[1 day, 21:08:46<1 day, 3:50:48][39m
+[titan] 2025-09-09 14:44:46,722 - root - INFO - [31mstep: 24745 [32mloss: 2.6963 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.87 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7644 [37mglobal_avg_top_loss: 1.9319
+[titan] 2025-09-09 14:44:46,723 - root - INFO - [34mlr: 7.8245e-06 gnorm: 0.36 [35m[1 day, 21:09:18<1 day, 3:50:15][39m
+[titan] 2025-09-09 14:45:12,224 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:45:18,632 - root - INFO - [31mstep: 24750 [32mloss: 2.7410 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.42 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7766 [37mglobal_avg_top_loss: 1.9644
+[titan] 2025-09-09 14:45:18,632 - root - INFO - [34mlr: 7.8211e-06 gnorm: 0.36 [35m[1 day, 21:09:50<1 day, 3:49:42][39m
+[titan] 2025-09-09 14:45:50,412 - root - INFO - [31mstep: 24755 [32mloss: 2.6954 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,311 [36mtflops: 491.42 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.7550 [37mglobal_avg_top_loss: 1.9404
+[titan] 2025-09-09 14:45:50,413 - root - INFO - [34mlr: 7.8178e-06 gnorm: 0.35 [35m[1 day, 21:10:22<1 day, 3:49:08][39m
+[titan] 2025-09-09 14:46:22,365 - root - INFO - [31mstep: 24760 [32mloss: 3.7430 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.77 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 1.3471 [37mglobal_avg_top_loss: 2.3960
+[titan] 2025-09-09 14:46:22,365 - root - INFO - [34mlr: 7.8145e-06 gnorm: 0.39 [35m[1 day, 21:10:54<1 day, 3:48:35][39m
+[titan] 2025-09-09 14:46:54,412 - root - INFO - [31mstep: 24765 [32mloss: 2.7549 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.33 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7850 [37mglobal_avg_top_loss: 1.9699
+[titan] 2025-09-09 14:46:54,412 - root - INFO - [34mlr: 7.8111e-06 gnorm: 0.34 [35m[1 day, 21:11:26<1 day, 3:48:01][39m
+[titan] 2025-09-09 14:47:26,371 - root - INFO - [31mstep: 24770 [32mloss: 2.8128 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.67 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.8189 [37mglobal_avg_top_loss: 1.9939
+[titan] 2025-09-09 14:47:26,371 - root - INFO - [34mlr: 7.8078e-06 gnorm: 0.34 [35m[1 day, 21:11:58<1 day, 3:47:28][39m
+[titan] 2025-09-09 14:47:58,294 - root - INFO - [31mstep: 24775 [32mloss: 2.7395 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.22 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7772 [37mglobal_avg_top_loss: 1.9623
+[titan] 2025-09-09 14:47:58,295 - root - INFO - [34mlr: 7.8044e-06 gnorm: 0.37 [35m[1 day, 21:12:30<1 day, 3:46:54][39m
+[titan] 2025-09-09 14:48:30,389 - root - INFO - [31mstep: 24780 [32mloss: 2.7780 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,210 [36mtflops: 486.60 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7968 [37mglobal_avg_top_loss: 1.9812
+[titan] 2025-09-09 14:48:30,390 - root - INFO - [34mlr: 7.8011e-06 gnorm: 0.35 [35m[1 day, 21:13:02<1 day, 3:46:21][39m
+[titan] 2025-09-09 14:49:02,456 - root - INFO - [31mstep: 24785 [32mloss: 2.7093 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.02 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7584 [37mglobal_avg_top_loss: 1.9509
+[titan] 2025-09-09 14:49:02,457 - root - INFO - [34mlr: 7.7978e-06 gnorm: 0.41 [35m[1 day, 21:13:34<1 day, 3:45:48][39m
+[titan] 2025-09-09 14:49:34,287 - root - INFO - [31mstep: 24790 [32mloss: 2.6898 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.65 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7607 [37mglobal_avg_top_loss: 1.9290
+[titan] 2025-09-09 14:49:34,287 - root - INFO - [34mlr: 7.7944e-06 gnorm: 0.34 [35m[1 day, 21:14:06<1 day, 3:45:14][39m
+[titan] 2025-09-09 14:50:06,094 - root - INFO - [31mstep: 24795 [32mloss: 3.1757 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,302 [36mtflops: 490.99 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 1.0245 [37mglobal_avg_top_loss: 2.1511
+[titan] 2025-09-09 14:50:06,095 - root - INFO - [34mlr: 7.7911e-06 gnorm: 0.36 [35m[1 day, 21:14:37<1 day, 3:44:41][39m
+[titan] 2025-09-09 14:50:31,674 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:50:38,103 - root - INFO - [31mstep: 24800 [32mloss: 2.7427 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.92 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7777 [37mglobal_avg_top_loss: 1.9651
+[titan] 2025-09-09 14:50:38,103 - root - INFO - [34mlr: 7.7878e-06 gnorm: 0.36 [35m[1 day, 21:15:09<1 day, 3:44:08][39m
+[titan] 2025-09-09 14:51:10,096 - root - INFO - [31mstep: 24805 [32mloss: 2.7060 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.15 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7642 [37mglobal_avg_top_loss: 1.9418
+[titan] 2025-09-09 14:51:10,097 - root - INFO - [34mlr: 7.7844e-06 gnorm: 0.37 [35m[1 day, 21:15:41<1 day, 3:43:34][39m
+[titan] 2025-09-09 14:51:42,191 - root - INFO - [31mstep: 24810 [32mloss: 2.7922 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,210 [36mtflops: 486.60 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7986 [37mglobal_avg_top_loss: 1.9936
+[titan] 2025-09-09 14:51:42,191 - root - INFO - [34mlr: 7.7811e-06 gnorm: 0.36 [35m[1 day, 21:16:14<1 day, 3:43:01][39m
+[titan] 2025-09-09 14:52:14,027 - root - INFO - [31mstep: 24815 [32mloss: 2.7399 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.55 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7776 [37mglobal_avg_top_loss: 1.9623
+[titan] 2025-09-09 14:52:14,028 - root - INFO - [34mlr: 7.7778e-06 gnorm: 0.34 [35m[1 day, 21:16:45<1 day, 3:42:27][39m
+[titan] 2025-09-09 14:52:45,978 - root - INFO - [31mstep: 24820 [32mloss: 2.8046 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.80 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.8083 [37mglobal_avg_top_loss: 1.9964
+[titan] 2025-09-09 14:52:45,978 - root - INFO - [34mlr: 7.7744e-06 gnorm: 0.37 [35m[1 day, 21:17:17<1 day, 3:41:54][39m
+[titan] 2025-09-09 14:53:18,076 - root - INFO - [31mstep: 24825 [32mloss: 2.7670 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.55 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7887 [37mglobal_avg_top_loss: 1.9784
+[titan] 2025-09-09 14:53:18,077 - root - INFO - [34mlr: 7.7711e-06 gnorm: 0.35 [35m[1 day, 21:17:49<1 day, 3:41:21][39m
+[titan] 2025-09-09 14:53:50,118 - root - INFO - [31mstep: 24830 [32mloss: 2.5914 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.41 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7095 [37mglobal_avg_top_loss: 1.8820
+[titan] 2025-09-09 14:53:50,118 - root - INFO - [34mlr: 7.7678e-06 gnorm: 0.33 [35m[1 day, 21:18:21<1 day, 3:40:47][39m
+[titan] 2025-09-09 14:54:22,208 - root - INFO - [31mstep: 24835 [32mloss: 2.7687 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.67 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7876 [37mglobal_avg_top_loss: 1.9811
+[titan] 2025-09-09 14:54:22,208 - root - INFO - [34mlr: 7.7644e-06 gnorm: 0.36 [35m[1 day, 21:18:54<1 day, 3:40:14][39m
+[titan] 2025-09-09 14:54:54,169 - root - INFO - [31mstep: 24840 [32mloss: 3.1903 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.65 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 1.0353 [37mglobal_avg_top_loss: 2.1550
+[titan] 2025-09-09 14:54:54,169 - root - INFO - [34mlr: 7.7611e-06 gnorm: 0.36 [35m[1 day, 21:19:25<1 day, 3:39:41][39m
+[titan] 2025-09-09 14:55:26,267 - root - INFO - [31mstep: 24845 [32mloss: 2.7710 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.54 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7889 [37mglobal_avg_top_loss: 1.9821
+[titan] 2025-09-09 14:55:26,268 - root - INFO - [34mlr: 7.7578e-06 gnorm: 0.45 [35m[1 day, 21:19:58<1 day, 3:39:07][39m
+[titan] 2025-09-09 14:55:51,734 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 14:55:58,123 - root - INFO - [31mstep: 24850 [32mloss: 2.6850 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.25 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7535 [37mglobal_avg_top_loss: 1.9316
+[titan] 2025-09-09 14:55:58,124 - root - INFO - [34mlr: 7.7544e-06 gnorm: 0.38 [35m[1 day, 21:20:29<1 day, 3:38:34][39m
+[titan] 2025-09-09 14:56:30,002 - root - INFO - [31mstep: 24855 [32mloss: 2.6466 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.91 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7419 [37mglobal_avg_top_loss: 1.9047
+[titan] 2025-09-09 14:56:30,002 - root - INFO - [34mlr: 7.7511e-06 gnorm: 0.35 [35m[1 day, 21:21:01<1 day, 3:38:00][39m
+[titan] 2025-09-09 14:57:01,912 - root - INFO - [31mstep: 24860 [32mloss: 2.8290 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.41 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.8163 [37mglobal_avg_top_loss: 2.0127
+[titan] 2025-09-09 14:57:01,913 - root - INFO - [34mlr: 7.7478e-06 gnorm: 0.39 [35m[1 day, 21:21:33<1 day, 3:37:27][39m
+[titan] 2025-09-09 14:57:33,748 - root - INFO - [31mstep: 24865 [32mloss: 2.7765 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.57 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7959 [37mglobal_avg_top_loss: 1.9806
+[titan] 2025-09-09 14:57:33,748 - root - INFO - [34mlr: 7.7445e-06 gnorm: 0.39 [35m[1 day, 21:22:05<1 day, 3:36:54][39m
+[titan] 2025-09-09 14:58:05,708 - root - INFO - [31mstep: 24870 [32mloss: 2.5812 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.65 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7076 [37mglobal_avg_top_loss: 1.8736
+[titan] 2025-09-09 14:58:05,708 - root - INFO - [34mlr: 7.7411e-06 gnorm: 0.38 [35m[1 day, 21:22:37<1 day, 3:36:20][39m
+[titan] 2025-09-09 14:58:37,698 - root - INFO - [31mstep: 24875 [32mloss: 2.7751 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.21 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7920 [37mglobal_avg_top_loss: 1.9831
+[titan] 2025-09-09 14:58:37,698 - root - INFO - [34mlr: 7.7378e-06 gnorm: 0.35 [35m[1 day, 21:23:09<1 day, 3:35:47][39m
+[titan] 2025-09-09 14:59:09,602 - root - INFO - [31mstep: 24880 [32mloss: 2.8366 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.51 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8213 [37mglobal_avg_top_loss: 2.0153
+[titan] 2025-09-09 14:59:09,603 - root - INFO - [34mlr: 7.7345e-06 gnorm: 0.37 [35m[1 day, 21:23:41<1 day, 3:35:13][39m
+[titan] 2025-09-09 14:59:41,691 - root - INFO - [31mstep: 24885 [32mloss: 2.7641 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.70 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7857 [37mglobal_avg_top_loss: 1.9783
+[titan] 2025-09-09 14:59:41,691 - root - INFO - [34mlr: 7.7312e-06 gnorm: 0.36 [35m[1 day, 21:24:13<1 day, 3:34:40][39m
+[titan] 2025-09-09 15:00:13,595 - root - INFO - [31mstep: 24890 [32mloss: 2.7429 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.51 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7790 [37mglobal_avg_top_loss: 1.9639
+[titan] 2025-09-09 15:00:13,595 - root - INFO - [34mlr: 7.7278e-06 gnorm: 0.36 [35m[1 day, 21:24:45<1 day, 3:34:07][39m
+[titan] 2025-09-09 15:00:45,716 - root - INFO - [31mstep: 24895 [32mloss: 2.7298 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,202 [36mtflops: 486.20 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.7738 [37mglobal_avg_top_loss: 1.9559
+[titan] 2025-09-09 15:00:45,717 - root - INFO - [34mlr: 7.7245e-06 gnorm: 0.36 [35m[1 day, 21:25:17<1 day, 3:33:33][39m
+[titan] 2025-09-09 15:01:11,130 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 15:01:17,533 - root - INFO - [31mstep: 24900 [32mloss: 3.1743 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.86 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 1.0251 [37mglobal_avg_top_loss: 2.1492
+[titan] 2025-09-09 15:01:17,533 - root - INFO - [34mlr: 7.7212e-06 gnorm: 0.51 [35m[1 day, 21:25:49<1 day, 3:33:00][39m
+[titan] 2025-09-09 15:01:49,285 - root - INFO - [31mstep: 24905 [32mloss: 2.7488 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,320 [36mtflops: 491.84 [35mmfu: 49.73%[39m [37mglobal_avg_ntp_loss: 0.7819 [37mglobal_avg_top_loss: 1.9670
+[titan] 2025-09-09 15:01:49,286 - root - INFO - [34mlr: 7.7178e-06 gnorm: 0.36 [35m[1 day, 21:26:21<1 day, 3:32:27][39m
+[titan] 2025-09-09 15:02:21,257 - root - INFO - [31mstep: 24910 [32mloss: 2.7033 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.48 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7582 [37mglobal_avg_top_loss: 1.9451
+[titan] 2025-09-09 15:02:21,257 - root - INFO - [34mlr: 7.7145e-06 gnorm: 0.36 [35m[1 day, 21:26:53<1 day, 3:31:53][39m
+[titan] 2025-09-09 15:02:53,178 - root - INFO - [31mstep: 24915 [32mloss: 2.7503 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.25 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7838 [37mglobal_avg_top_loss: 1.9664
+[titan] 2025-09-09 15:02:53,179 - root - INFO - [34mlr: 7.7112e-06 gnorm: 0.35 [35m[1 day, 21:27:24<1 day, 3:31:20][39m
+[titan] 2025-09-09 15:03:24,949 - root - INFO - [31mstep: 24920 [32mloss: 3.1861 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,314 [36mtflops: 491.57 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 1.0351 [37mglobal_avg_top_loss: 2.1510
+[titan] 2025-09-09 15:03:24,949 - root - INFO - [34mlr: 7.7079e-06 gnorm: 0.36 [35m[1 day, 21:27:56<1 day, 3:30:46][39m
+[titan] 2025-09-09 15:03:56,819 - root - INFO - [31mstep: 24925 [32mloss: 2.7772 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.03 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7955 [37mglobal_avg_top_loss: 1.9817
+[titan] 2025-09-09 15:03:56,820 - root - INFO - [34mlr: 7.7046e-06 gnorm: 0.37 [35m[1 day, 21:28:28<1 day, 3:30:13][39m
+[titan] 2025-09-09 15:04:28,916 - root - INFO - [31mstep: 24930 [32mloss: 2.7256 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.57 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7724 [37mglobal_avg_top_loss: 1.9532
+[titan] 2025-09-09 15:04:28,917 - root - INFO - [34mlr: 7.7012e-06 gnorm: 0.36 [35m[1 day, 21:29:00<1 day, 3:29:40][39m
+[titan] 2025-09-09 15:05:00,579 - root - INFO - [31mstep: 24935 [32mloss: 2.6950 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,349 [36mtflops: 493.24 [35mmfu: 49.87%[39m [37mglobal_avg_ntp_loss: 0.7613 [37mglobal_avg_top_loss: 1.9337
+[titan] 2025-09-09 15:05:00,579 - root - INFO - [34mlr: 7.6979e-06 gnorm: 0.36 [35m[1 day, 21:29:32<1 day, 3:29:06][39m
+[titan] 2025-09-09 15:05:32,534 - root - INFO - [31mstep: 24940 [32mloss: 2.6885 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.73 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7537 [37mglobal_avg_top_loss: 1.9348
+[titan] 2025-09-09 15:05:32,534 - root - INFO - [34mlr: 7.6946e-06 gnorm: 0.34 [35m[1 day, 21:30:04<1 day, 3:28:33][39m
+[titan] 2025-09-09 15:06:04,529 - root - INFO - [31mstep: 24945 [32mloss: 2.6884 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.12 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7578 [37mglobal_avg_top_loss: 1.9305
+[titan] 2025-09-09 15:06:04,530 - root - INFO - [34mlr: 7.6913e-06 gnorm: 0.36 [35m[1 day, 21:30:36<1 day, 3:27:59][39m
+[titan] 2025-09-09 15:06:30,063 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 15:06:36,469 - root - INFO - [31mstep: 24950 [32mloss: 2.7473 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.96 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7793 [37mglobal_avg_top_loss: 1.9680
+[titan] 2025-09-09 15:06:36,470 - root - INFO - [34mlr: 7.6880e-06 gnorm: 0.34 [35m[1 day, 21:31:08<1 day, 3:27:26][39m
+[titan] 2025-09-09 15:07:08,507 - root - INFO - [31mstep: 24955 [32mloss: 2.9943 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.48 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.9099 [37mglobal_avg_top_loss: 2.0844
+[titan] 2025-09-09 15:07:08,507 - root - INFO - [34mlr: 7.6846e-06 gnorm: 0.35 [35m[1 day, 21:31:40<1 day, 3:26:53][39m
+[titan] 2025-09-09 15:07:40,317 - root - INFO - [31mstep: 24960 [32mloss: 2.7376 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,301 [36mtflops: 490.95 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7739 [37mglobal_avg_top_loss: 1.9637
+[titan] 2025-09-09 15:07:40,318 - root - INFO - [34mlr: 7.6813e-06 gnorm: 0.35 [35m[1 day, 21:32:12<1 day, 3:26:19][39m
+[titan] 2025-09-09 15:08:12,213 - root - INFO - [31mstep: 24965 [32mloss: 2.7561 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.64 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7854 [37mglobal_avg_top_loss: 1.9707
+[titan] 2025-09-09 15:08:12,213 - root - INFO - [34mlr: 7.6780e-06 gnorm: 0.35 [35m[1 day, 21:32:44<1 day, 3:25:46][39m
+[titan] 2025-09-09 15:08:44,173 - root - INFO - [31mstep: 24970 [32mloss: 3.2382 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.65 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 1.0568 [37mglobal_avg_top_loss: 2.1814
+[titan] 2025-09-09 15:08:44,173 - root - INFO - [34mlr: 7.6747e-06 gnorm: 0.36 [35m[1 day, 21:33:15<1 day, 3:25:12][39m
+[titan] 2025-09-09 15:09:16,220 - root - INFO - [31mstep: 24975 [32mloss: 2.7016 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.32 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7614 [37mglobal_avg_top_loss: 1.9402
+[titan] 2025-09-09 15:09:16,221 - root - INFO - [34mlr: 7.6714e-06 gnorm: 0.35 [35m[1 day, 21:33:48<1 day, 3:24:39][39m
+[titan] 2025-09-09 15:09:48,045 - root - INFO - [31mstep: 24980 [32mloss: 2.8174 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,297 [36mtflops: 490.74 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.8100 [37mglobal_avg_top_loss: 2.0074
+[titan] 2025-09-09 15:09:48,045 - root - INFO - [34mlr: 7.6680e-06 gnorm: 0.36 [35m[1 day, 21:34:19<1 day, 3:24:06][39m
+[titan] 2025-09-09 15:10:19,922 - root - INFO - [31mstep: 24985 [32mloss: 2.7007 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.93 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7611 [37mglobal_avg_top_loss: 1.9396
+[titan] 2025-09-09 15:10:19,922 - root - INFO - [34mlr: 7.6647e-06 gnorm: 0.38 [35m[1 day, 21:34:51<1 day, 3:23:32][39m
+[titan] 2025-09-09 15:10:51,843 - root - INFO - [31mstep: 24990 [32mloss: 2.7268 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.26 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7742 [37mglobal_avg_top_loss: 1.9525
+[titan] 2025-09-09 15:10:51,843 - root - INFO - [34mlr: 7.6614e-06 gnorm: 0.34 [35m[1 day, 21:35:23<1 day, 3:22:59][39m
+[titan] 2025-09-09 15:11:23,641 - root - INFO - [31mstep: 24995 [32mloss: 2.7406 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,305 [36mtflops: 491.15 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7777 [37mglobal_avg_top_loss: 1.9629
+[titan] 2025-09-09 15:11:23,641 - root - INFO - [34mlr: 7.6581e-06 gnorm: 0.35 [35m[1 day, 21:35:55<1 day, 3:22:25][39m
+[titan] 2025-09-09 15:11:49,083 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 15:11:55,515 - root - INFO - [31mstep: 25000 [32mloss: 3.2441 [33mmemory: 122.03GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.97 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 1.0620 [37mglobal_avg_top_loss: 2.1820
+[titan] 2025-09-09 15:11:55,515 - root - INFO - [34mlr: 7.6548e-06 gnorm: 0.34 [35m[1 day, 21:36:27<1 day, 3:21:52][39m
+[titan] 2025-09-09 15:11:55,515 - root - INFO - Saving the checkpoint (or staging if async is enabled).
+[titan] 2025-09-09 15:12:27,709 - root - INFO - [GC] GC collection invoked by checkpointer. 0.01 seconds.
+[titan] 2025-09-09 15:12:27,710 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 32.19 seconds.
+[titan] 2025-09-09 15:12:27,710 - root - INFO - Ensuring repository zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757 exists...
+[titan] 2025-09-09 15:12:28,272 - root - INFO - Repository zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757 ensured.
+[titan] 2025-09-09 15:12:28,272 - root - INFO - Uploading exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/checkpoint/step-25000 to zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757/step-25000 on Hugging Face Hub...
+Processing Files (9 / 9) : 100%|██████████| 83.3GB / 83.3GB, 0.00B/s
+New Data Upload : 100%|██████████| 83.3GB / 83.3GB, 0.00B/s
+ ...ine/checkpoint/step-25000/.metadata: 100%|██████████| 2.47MB / 2.47MB
+ .../checkpoint/step-25000/__1_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-25000/__4_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-25000/__7_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-25000/__2_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-25000/__0_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-25000/__5_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-25000/__3_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-25000/__6_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+[titan] 2025-09-09 15:30:21,940 - root - INFO - Successfully uploaded step 25000 to zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757.
+[titan] 2025-09-09 15:30:51,806 - root - INFO - [31mstep: 25005 [32mloss: 2.6619 [33mmemory: 122.04GiB(87.57%) [34mtps: 288 [36mtflops: 13.74 [35mmfu: 1.39%[39m [37mglobal_avg_ntp_loss: 0.7419 [37mglobal_avg_top_loss: 1.9200
+[titan] 2025-09-09 15:30:51,806 - root - INFO - [34mlr: 7.6515e-06 gnorm: 0.34 [35m[1 day, 21:55:23<1 day, 3:32:21][39m
+[titan] 2025-09-09 15:31:21,791 - root - INFO - [31mstep: 25010 [32mloss: 2.6994 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,929 [36mtflops: 520.85 [35mmfu: 52.66%[39m [37mglobal_avg_ntp_loss: 0.7595 [37mglobal_avg_top_loss: 1.9399
+[titan] 2025-09-09 15:31:21,791 - root - INFO - [34mlr: 7.6482e-06 gnorm: 0.34 [35m[1 day, 21:55:53<1 day, 3:31:46][39m
+[titan] 2025-09-09 15:31:51,816 - root - INFO - [31mstep: 25015 [32mloss: 2.7680 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,914 [36mtflops: 520.14 [35mmfu: 52.59%[39m [37mglobal_avg_ntp_loss: 0.7980 [37mglobal_avg_top_loss: 1.9700
+[titan] 2025-09-09 15:31:51,817 - root - INFO - [34mlr: 7.6448e-06 gnorm: 0.39 [35m[1 day, 21:56:23<1 day, 3:31:11][39m
+[titan] 2025-09-09 15:32:22,100 - root - INFO - [31mstep: 25020 [32mloss: 2.8093 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,821 [36mtflops: 515.70 [35mmfu: 52.14%[39m [37mglobal_avg_ntp_loss: 0.8147 [37mglobal_avg_top_loss: 1.9946
+[titan] 2025-09-09 15:32:22,101 - root - INFO - [34mlr: 7.6415e-06 gnorm: 0.35 [35m[1 day, 21:56:53<1 day, 3:30:36][39m
+[titan] 2025-09-09 15:32:52,563 - root - INFO - [31mstep: 25025 [32mloss: 2.7062 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,757 [36mtflops: 512.67 [35mmfu: 51.84%[39m [37mglobal_avg_ntp_loss: 0.7619 [37mglobal_avg_top_loss: 1.9443
+[titan] 2025-09-09 15:32:52,564 - root - INFO - [34mlr: 7.6382e-06 gnorm: 0.34 [35m[1 day, 21:57:24<1 day, 3:30:02][39m
+[titan] 2025-09-09 15:33:23,014 - root - INFO - [31mstep: 25030 [32mloss: 2.7342 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,762 [36mtflops: 512.89 [35mmfu: 51.86%[39m [37mglobal_avg_ntp_loss: 0.7730 [37mglobal_avg_top_loss: 1.9612
+[titan] 2025-09-09 15:33:23,014 - root - INFO - [34mlr: 7.6349e-06 gnorm: 0.35 [35m[1 day, 21:57:54<1 day, 3:29:27][39m
+[titan] 2025-09-09 15:33:53,496 - root - INFO - [31mstep: 25035 [32mloss: 2.7302 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,750 [36mtflops: 512.34 [35mmfu: 51.80%[39m [37mglobal_avg_ntp_loss: 0.7741 [37mglobal_avg_top_loss: 1.9561
+[titan] 2025-09-09 15:33:53,497 - root - INFO - [34mlr: 7.6316e-06 gnorm: 0.36 [35m[1 day, 21:58:25<1 day, 3:28:52][39m
+[titan] 2025-09-09 15:34:24,349 - root - INFO - [31mstep: 25040 [32mloss: 2.6863 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,621 [36mtflops: 506.19 [35mmfu: 51.18%[39m [37mglobal_avg_ntp_loss: 0.7550 [37mglobal_avg_top_loss: 1.9313
+[titan] 2025-09-09 15:34:24,349 - root - INFO - [34mlr: 7.6283e-06 gnorm: 0.35 [35m[1 day, 21:58:56<1 day, 3:28:18][39m
+[titan] 2025-09-09 15:34:55,417 - root - INFO - [31mstep: 25045 [32mloss: 2.7444 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,548 [36mtflops: 502.69 [35mmfu: 50.83%[39m [37mglobal_avg_ntp_loss: 0.7779 [37mglobal_avg_top_loss: 1.9664
+[titan] 2025-09-09 15:34:55,417 - root - INFO - [34mlr: 7.6250e-06 gnorm: 0.34 [35m[1 day, 21:59:27<1 day, 3:27:44][39m
+[titan] 2025-09-09 15:35:20,334 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 15:35:26,657 - root - INFO - [31mstep: 25050 [32mloss: 3.2923 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,489 [36mtflops: 499.92 [35mmfu: 50.55%[39m [37mglobal_avg_ntp_loss: 1.0811 [37mglobal_avg_top_loss: 2.2112
+[titan] 2025-09-09 15:35:26,658 - root - INFO - [34mlr: 7.6217e-06 gnorm: 0.37 [35m[1 day, 21:59:58<1 day, 3:27:10][39m
+[titan] 2025-09-09 15:35:58,018 - root - INFO - [31mstep: 25055 [32mloss: 2.7621 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,449 [36mtflops: 497.99 [35mmfu: 50.35%[39m [37mglobal_avg_ntp_loss: 0.7882 [37mglobal_avg_top_loss: 1.9739
+[titan] 2025-09-09 15:35:58,019 - root - INFO - [34mlr: 7.6184e-06 gnorm: 0.34 [35m[1 day, 22:00:29<1 day, 3:26:36][39m
+[titan] 2025-09-09 15:36:29,514 - root - INFO - [31mstep: 25060 [32mloss: 2.6972 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,404 [36mtflops: 495.85 [35mmfu: 50.14%[39m [37mglobal_avg_ntp_loss: 0.7655 [37mglobal_avg_top_loss: 1.9317
+[titan] 2025-09-09 15:36:29,515 - root - INFO - [34mlr: 7.6151e-06 gnorm: 0.36 [35m[1 day, 22:01:01<1 day, 3:26:02][39m
+[titan] 2025-09-09 15:37:01,088 - root - INFO - [31mstep: 25065 [32mloss: 2.7690 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,379 [36mtflops: 494.64 [35mmfu: 50.01%[39m [37mglobal_avg_ntp_loss: 0.7900 [37mglobal_avg_top_loss: 1.9790
+[titan] 2025-09-09 15:37:01,089 - root - INFO - [34mlr: 7.6117e-06 gnorm: 0.36 [35m[1 day, 22:01:32<1 day, 3:25:28][39m
+[titan] 2025-09-09 15:37:32,800 - root - INFO - [31mstep: 25070 [32mloss: 2.6993 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,333 [36mtflops: 492.48 [35mmfu: 49.80%[39m [37mglobal_avg_ntp_loss: 0.7578 [37mglobal_avg_top_loss: 1.9415
+[titan] 2025-09-09 15:37:32,801 - root - INFO - [34mlr: 7.6084e-06 gnorm: 0.34 [35m[1 day, 22:02:04<1 day, 3:24:54][39m
+[titan] 2025-09-09 15:38:04,486 - root - INFO - [31mstep: 25075 [32mloss: 2.7475 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,342 [36mtflops: 492.88 [35mmfu: 49.84%[39m [37mglobal_avg_ntp_loss: 0.7803 [37mglobal_avg_top_loss: 1.9672
+[titan] 2025-09-09 15:38:04,487 - root - INFO - [34mlr: 7.6051e-06 gnorm: 0.36 [35m[1 day, 22:02:36<1 day, 3:24:20][39m
+[titan] 2025-09-09 15:38:36,007 - root - INFO - [31mstep: 25080 [32mloss: 3.2259 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,396 [36mtflops: 495.47 [35mmfu: 50.10%[39m [37mglobal_avg_ntp_loss: 1.0516 [37mglobal_avg_top_loss: 2.1744
+[titan] 2025-09-09 15:38:36,007 - root - INFO - [34mlr: 7.6018e-06 gnorm: 0.37 [35m[1 day, 22:03:07<1 day, 3:23:46][39m
+[titan] 2025-09-09 15:39:07,903 - root - INFO - [31mstep: 25085 [32mloss: 2.8085 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.63 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.8177 [37mglobal_avg_top_loss: 1.9908
+[titan] 2025-09-09 15:39:07,903 - root - INFO - [34mlr: 7.5985e-06 gnorm: 0.36 [35m[1 day, 22:03:39<1 day, 3:23:12][39m
+[titan] 2025-09-09 15:39:27,232 - root - INFO - Dumping profiler traces at step 25088
+[titan] 2025-09-09 15:39:27,290 - root - INFO - Finished dumping profiler traces in 0.06 seconds
+[titan] 2025-09-09 15:39:39,964 - root - INFO - [31mstep: 25090 [32mloss: 2.7552 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.12 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7837 [37mglobal_avg_top_loss: 1.9715
+[titan] 2025-09-09 15:39:39,964 - root - INFO - [34mlr: 7.5952e-06 gnorm: 0.35 [35m[1 day, 22:04:11<1 day, 3:22:39][39m
+[titan] 2025-09-09 15:40:11,634 - root - INFO - [31mstep: 25095 [32mloss: 2.6368 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,347 [36mtflops: 493.14 [35mmfu: 49.86%[39m [37mglobal_avg_ntp_loss: 0.7343 [37mglobal_avg_top_loss: 1.9025
+[titan] 2025-09-09 15:40:11,634 - root - INFO - [34mlr: 7.5919e-06 gnorm: 0.35 [35m[1 day, 22:04:43<1 day, 3:22:05][39m
+[titan] 2025-09-09 15:40:37,141 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 15:40:43,505 - root - INFO - [31mstep: 25100 [32mloss: 2.7106 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.01 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7623 [37mglobal_avg_top_loss: 1.9483
+[titan] 2025-09-09 15:40:43,505 - root - INFO - [34mlr: 7.5886e-06 gnorm: 0.36 [35m[1 day, 22:05:15<1 day, 3:21:31][39m
+[titan] 2025-09-09 15:41:15,404 - root - INFO - [31mstep: 25105 [32mloss: 2.8491 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.60 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8388 [37mglobal_avg_top_loss: 2.0103
+[titan] 2025-09-09 15:41:15,404 - root - INFO - [34mlr: 7.5853e-06 gnorm: 0.36 [35m[1 day, 22:05:47<1 day, 3:20:57][39m
+[titan] 2025-09-09 15:41:47,217 - root - INFO - [31mstep: 25110 [32mloss: 3.1185 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.91 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.9999 [37mglobal_avg_top_loss: 2.1186
+[titan] 2025-09-09 15:41:47,217 - root - INFO - [34mlr: 7.5820e-06 gnorm: 0.36 [35m[1 day, 22:06:18<1 day, 3:20:24][39m
+[titan] 2025-09-09 15:42:19,236 - root - INFO - [31mstep: 25115 [32mloss: 2.7550 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.74 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7815 [37mglobal_avg_top_loss: 1.9735
+[titan] 2025-09-09 15:42:19,237 - root - INFO - [34mlr: 7.5787e-06 gnorm: 0.37 [35m[1 day, 22:06:50<1 day, 3:19:50][39m
+[titan] 2025-09-09 15:42:51,092 - root - INFO - [31mstep: 25120 [32mloss: 2.6678 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.26 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7426 [37mglobal_avg_top_loss: 1.9252
+[titan] 2025-09-09 15:42:51,092 - root - INFO - [34mlr: 7.5754e-06 gnorm: 0.35 [35m[1 day, 22:07:22<1 day, 3:19:16][39m
+[titan] 2025-09-09 15:43:22,873 - root - INFO - [31mstep: 25125 [32mloss: 2.7577 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,311 [36mtflops: 491.41 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.7896 [37mglobal_avg_top_loss: 1.9681
+[titan] 2025-09-09 15:43:22,873 - root - INFO - [34mlr: 7.5721e-06 gnorm: 0.35 [35m[1 day, 22:07:54<1 day, 3:18:42][39m
+[titan] 2025-09-09 15:43:55,009 - root - INFO - [31mstep: 25130 [32mloss: 3.8224 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,197 [36mtflops: 485.97 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 1.3779 [37mglobal_avg_top_loss: 2.4445
+[titan] 2025-09-09 15:43:55,009 - root - INFO - [34mlr: 7.5688e-06 gnorm: 0.37 [35m[1 day, 22:08:26<1 day, 3:18:09][39m
+[titan] 2025-09-09 15:44:26,868 - root - INFO - [31mstep: 25135 [32mloss: 2.7911 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.21 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.8004 [37mglobal_avg_top_loss: 1.9907
+[titan] 2025-09-09 15:44:26,868 - root - INFO - [34mlr: 7.5655e-06 gnorm: 0.36 [35m[1 day, 22:08:58<1 day, 3:17:35][39m
+[titan] 2025-09-09 15:44:58,746 - root - INFO - [31mstep: 25140 [32mloss: 2.7238 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.91 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7708 [37mglobal_avg_top_loss: 1.9530
+[titan] 2025-09-09 15:44:58,746 - root - INFO - [34mlr: 7.5622e-06 gnorm: 0.37 [35m[1 day, 22:09:30<1 day, 3:17:01][39m
+[titan] 2025-09-09 15:45:30,719 - root - INFO - [31mstep: 25145 [32mloss: 2.6947 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.46 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7596 [37mglobal_avg_top_loss: 1.9351
+[titan] 2025-09-09 15:45:30,720 - root - INFO - [34mlr: 7.5589e-06 gnorm: 0.39 [35m[1 day, 22:10:02<1 day, 3:16:28][39m
+[titan] 2025-09-09 15:45:56,231 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 15:46:02,680 - root - INFO - [31mstep: 25150 [32mloss: 2.7373 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.64 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7793 [37mglobal_avg_top_loss: 1.9580
+[titan] 2025-09-09 15:46:02,681 - root - INFO - [34mlr: 7.5556e-06 gnorm: 0.37 [35m[1 day, 22:10:34<1 day, 3:15:54][39m
+[titan] 2025-09-09 15:46:34,515 - root - INFO - [31mstep: 25155 [32mloss: 2.8187 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.59 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.8108 [37mglobal_avg_top_loss: 2.0079
+[titan] 2025-09-09 15:46:34,515 - root - INFO - [34mlr: 7.5523e-06 gnorm: 0.36 [35m[1 day, 22:11:06<1 day, 3:15:20][39m
+[titan] 2025-09-09 15:47:06,566 - root - INFO - [31mstep: 25160 [32mloss: 3.1470 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.27 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 1.0122 [37mglobal_avg_top_loss: 2.1347
+[titan] 2025-09-09 15:47:06,566 - root - INFO - [34mlr: 7.5490e-06 gnorm: 0.53 [35m[1 day, 22:11:38<1 day, 3:14:46][39m
+[titan] 2025-09-09 15:47:38,505 - root - INFO - [31mstep: 25165 [32mloss: 2.7516 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,260 [36mtflops: 488.97 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7797 [37mglobal_avg_top_loss: 1.9719
+[titan] 2025-09-09 15:47:38,505 - root - INFO - [34mlr: 7.5457e-06 gnorm: 0.34 [35m[1 day, 22:12:10<1 day, 3:14:13][39m
+[titan] 2025-09-09 15:48:10,738 - root - INFO - [31mstep: 25170 [32mloss: 2.6569 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,166 [36mtflops: 484.51 [35mmfu: 48.99%[39m [37mglobal_avg_ntp_loss: 0.7375 [37mglobal_avg_top_loss: 1.9194
+[titan] 2025-09-09 15:48:10,739 - root - INFO - [34mlr: 7.5424e-06 gnorm: 0.39 [35m[1 day, 22:12:42<1 day, 3:13:39][39m
+[titan] 2025-09-09 15:48:43,104 - root - INFO - [31mstep: 25175 [32mloss: 2.7905 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,125 [36mtflops: 482.54 [35mmfu: 48.79%[39m [37mglobal_avg_ntp_loss: 0.8050 [37mglobal_avg_top_loss: 1.9855
+[titan] 2025-09-09 15:48:43,104 - root - INFO - [34mlr: 7.5391e-06 gnorm: 0.38 [35m[1 day, 22:13:14<1 day, 3:13:06][39m
+[titan] 2025-09-09 15:49:15,524 - root - INFO - [31mstep: 25180 [32mloss: 2.6982 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,107 [36mtflops: 481.71 [35mmfu: 48.71%[39m [37mglobal_avg_ntp_loss: 0.7601 [37mglobal_avg_top_loss: 1.9381
+[titan] 2025-09-09 15:49:15,525 - root - INFO - [34mlr: 7.5358e-06 gnorm: 0.35 [35m[1 day, 22:13:47<1 day, 3:12:32][39m
+[titan] 2025-09-09 15:49:47,605 - root - INFO - [31mstep: 25185 [32mloss: 2.7802 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,215 [36mtflops: 486.83 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7961 [37mglobal_avg_top_loss: 1.9841
+[titan] 2025-09-09 15:49:47,605 - root - INFO - [34mlr: 7.5325e-06 gnorm: 0.35 [35m[1 day, 22:14:19<1 day, 3:11:59][39m
+[titan] 2025-09-09 15:50:19,580 - root - INFO - [31mstep: 25190 [32mloss: 2.6150 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.42 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7175 [37mglobal_avg_top_loss: 1.8975
+[titan] 2025-09-09 15:50:19,581 - root - INFO - [34mlr: 7.5292e-06 gnorm: 0.35 [35m[1 day, 22:14:51<1 day, 3:11:25][39m
+[titan] 2025-09-09 15:50:51,496 - root - INFO - [31mstep: 25195 [32mloss: 2.7843 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.33 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7977 [37mglobal_avg_top_loss: 1.9866
+[titan] 2025-09-09 15:50:51,497 - root - INFO - [34mlr: 7.5259e-06 gnorm: 0.36 [35m[1 day, 22:15:23<1 day, 3:10:51][39m
+[titan] 2025-09-09 15:51:17,078 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 15:51:23,453 - root - INFO - [31mstep: 25200 [32mloss: 2.8446 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.70 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.8239 [37mglobal_avg_top_loss: 2.0207
+[titan] 2025-09-09 15:51:23,454 - root - INFO - [34mlr: 7.5226e-06 gnorm: 0.35 [35m[1 day, 22:15:55<1 day, 3:10:18][39m
+[titan] 2025-09-09 15:51:55,522 - root - INFO - [31mstep: 25205 [32mloss: 2.7168 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,218 [36mtflops: 487.00 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7654 [37mglobal_avg_top_loss: 1.9514
+[titan] 2025-09-09 15:51:55,522 - root - INFO - [34mlr: 7.5194e-06 gnorm: 0.34 [35m[1 day, 22:16:27<1 day, 3:09:44][39m
+[titan] 2025-09-09 15:52:27,545 - root - INFO - [31mstep: 25210 [32mloss: 3.2243 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.69 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 1.0547 [37mglobal_avg_top_loss: 2.1697
+[titan] 2025-09-09 15:52:27,545 - root - INFO - [34mlr: 7.5161e-06 gnorm: 0.36 [35m[1 day, 22:16:59<1 day, 3:09:10][39m
+[titan] 2025-09-09 15:52:59,589 - root - INFO - [31mstep: 25215 [32mloss: 2.7330 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.37 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7738 [37mglobal_avg_top_loss: 1.9592
+[titan] 2025-09-09 15:52:59,590 - root - INFO - [34mlr: 7.5128e-06 gnorm: 0.35 [35m[1 day, 22:17:31<1 day, 3:08:37][39m
+[titan] 2025-09-09 15:53:31,498 - root - INFO - [31mstep: 25220 [32mloss: 2.7844 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.44 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7998 [37mglobal_avg_top_loss: 1.9846
+[titan] 2025-09-09 15:53:31,499 - root - INFO - [34mlr: 7.5095e-06 gnorm: 0.36 [35m[1 day, 22:18:03<1 day, 3:08:03][39m
+[titan] 2025-09-09 15:54:03,681 - root - INFO - [31mstep: 25225 [32mloss: 3.2497 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,182 [36mtflops: 485.28 [35mmfu: 49.07%[39m [37mglobal_avg_ntp_loss: 1.0622 [37mglobal_avg_top_loss: 2.1876
+[titan] 2025-09-09 15:54:03,681 - root - INFO - [34mlr: 7.5062e-06 gnorm: 0.38 [35m[1 day, 22:18:35<1 day, 3:07:29][39m
+[titan] 2025-09-09 15:54:35,599 - root - INFO - [31mstep: 25230 [32mloss: 2.7190 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.30 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7677 [37mglobal_avg_top_loss: 1.9513
+[titan] 2025-09-09 15:54:35,600 - root - INFO - [34mlr: 7.5029e-06 gnorm: 0.34 [35m[1 day, 22:19:07<1 day, 3:06:56][39m
+[titan] 2025-09-09 15:55:07,717 - root - INFO - [31mstep: 25235 [32mloss: 2.7784 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,203 [36mtflops: 486.25 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.7948 [37mglobal_avg_top_loss: 1.9836
+[titan] 2025-09-09 15:55:07,718 - root - INFO - [34mlr: 7.4996e-06 gnorm: 0.36 [35m[1 day, 22:19:39<1 day, 3:06:22][39m
+[titan] 2025-09-09 15:55:39,841 - root - INFO - [31mstep: 25240 [32mloss: 2.7457 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,201 [36mtflops: 486.16 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.7792 [37mglobal_avg_top_loss: 1.9665
+[titan] 2025-09-09 15:55:39,842 - root - INFO - [34mlr: 7.4963e-06 gnorm: 0.37 [35m[1 day, 22:20:11<1 day, 3:05:49][39m
+[titan] 2025-09-09 15:56:11,940 - root - INFO - [31mstep: 25245 [32mloss: 2.9717 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.54 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.9030 [37mglobal_avg_top_loss: 2.0687
+[titan] 2025-09-09 15:56:11,940 - root - INFO - [34mlr: 7.4930e-06 gnorm: 0.37 [35m[1 day, 22:20:43<1 day, 3:05:15][39m
+[titan] 2025-09-09 15:56:37,249 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 15:56:43,757 - root - INFO - [31mstep: 25250 [32mloss: 2.7741 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.85 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7955 [37mglobal_avg_top_loss: 1.9786
+[titan] 2025-09-09 15:56:43,757 - root - INFO - [34mlr: 7.4897e-06 gnorm: 0.35 [35m[1 day, 22:21:15<1 day, 3:04:41][39m
+[titan] 2025-09-09 15:57:15,958 - root - INFO - [31mstep: 25255 [32mloss: 2.6937 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,177 [36mtflops: 485.01 [35mmfu: 49.04%[39m [37mglobal_avg_ntp_loss: 0.7581 [37mglobal_avg_top_loss: 1.9355
+[titan] 2025-09-09 15:57:15,958 - root - INFO - [34mlr: 7.4865e-06 gnorm: 0.35 [35m[1 day, 22:21:47<1 day, 3:04:08][39m
+[titan] 2025-09-09 15:57:48,186 - root - INFO - [31mstep: 25260 [32mloss: 2.7593 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,168 [36mtflops: 484.58 [35mmfu: 49.00%[39m [37mglobal_avg_ntp_loss: 0.7973 [37mglobal_avg_top_loss: 1.9620
+[titan] 2025-09-09 15:57:48,187 - root - INFO - [34mlr: 7.4832e-06 gnorm: 0.35 [35m[1 day, 22:22:19<1 day, 3:03:34][39m
+[titan] 2025-09-09 15:58:20,115 - root - INFO - [31mstep: 25265 [32mloss: 2.7804 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.13 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7960 [37mglobal_avg_top_loss: 1.9844
+[titan] 2025-09-09 15:58:20,116 - root - INFO - [34mlr: 7.4799e-06 gnorm: 0.35 [35m[1 day, 22:22:51<1 day, 3:03:00][39m
+[titan] 2025-09-09 15:58:52,148 - root - INFO - [31mstep: 25270 [32mloss: 2.6477 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.55 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7338 [37mglobal_avg_top_loss: 1.9139
+[titan] 2025-09-09 15:58:52,148 - root - INFO - [34mlr: 7.4766e-06 gnorm: 0.37 [35m[1 day, 22:23:23<1 day, 3:02:27][39m
+[titan] 2025-09-09 15:59:24,266 - root - INFO - [31mstep: 25275 [32mloss: 3.2774 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,203 [36mtflops: 486.26 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 1.0742 [37mglobal_avg_top_loss: 2.2032
+[titan] 2025-09-09 15:59:24,266 - root - INFO - [34mlr: 7.4733e-06 gnorm: 0.35 [35m[1 day, 22:23:55<1 day, 3:01:53][39m
+[titan] 2025-09-09 15:59:56,183 - root - INFO - [31mstep: 25280 [32mloss: 2.6913 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.31 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7585 [37mglobal_avg_top_loss: 1.9327
+[titan] 2025-09-09 15:59:56,183 - root - INFO - [34mlr: 7.4700e-06 gnorm: 0.34 [35m[1 day, 22:24:27<1 day, 3:01:20][39m
+[titan] 2025-09-09 16:00:28,156 - root - INFO - [31mstep: 25285 [32mloss: 2.7369 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.45 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7726 [37mglobal_avg_top_loss: 1.9643
+[titan] 2025-09-09 16:00:28,157 - root - INFO - [34mlr: 7.4668e-06 gnorm: 0.36 [35m[1 day, 22:24:59<1 day, 3:00:46][39m
+[titan] 2025-09-09 16:01:00,271 - root - INFO - [31mstep: 25290 [32mloss: 3.2298 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,204 [36mtflops: 486.30 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 1.0548 [37mglobal_avg_top_loss: 2.1750
+[titan] 2025-09-09 16:01:00,271 - root - INFO - [34mlr: 7.4635e-06 gnorm: 0.38 [35m[1 day, 22:25:31<1 day, 3:00:12][39m
+[titan] 2025-09-09 16:01:32,358 - root - INFO - [31mstep: 25295 [32mloss: 2.6449 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.72 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7307 [37mglobal_avg_top_loss: 1.9141
+[titan] 2025-09-09 16:01:32,359 - root - INFO - [34mlr: 7.4602e-06 gnorm: 0.36 [35m[1 day, 22:26:04<1 day, 2:59:39][39m
+[titan] 2025-09-09 16:01:58,088 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:02:04,546 - root - INFO - [31mstep: 25300 [32mloss: 2.7873 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,181 [36mtflops: 485.20 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7996 [37mglobal_avg_top_loss: 1.9877
+[titan] 2025-09-09 16:02:04,546 - root - INFO - [34mlr: 7.4569e-06 gnorm: 0.37 [35m[1 day, 22:26:36<1 day, 2:59:05][39m
+[titan] 2025-09-09 16:02:36,423 - root - INFO - [31mstep: 25305 [32mloss: 3.1935 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.92 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 1.0325 [37mglobal_avg_top_loss: 2.1610
+[titan] 2025-09-09 16:02:36,424 - root - INFO - [34mlr: 7.4536e-06 gnorm: 0.42 [35m[1 day, 22:27:08<1 day, 2:58:31][39m
+[titan] 2025-09-09 16:03:08,493 - root - INFO - [31mstep: 25310 [32mloss: 2.7130 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.99 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7612 [37mglobal_avg_top_loss: 1.9518
+[titan] 2025-09-09 16:03:08,493 - root - INFO - [34mlr: 7.4503e-06 gnorm: 0.39 [35m[1 day, 22:27:40<1 day, 2:57:58][39m
+[titan] 2025-09-09 16:03:40,384 - root - INFO - [31mstep: 25315 [32mloss: 2.7946 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.71 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8052 [37mglobal_avg_top_loss: 1.9894
+[titan] 2025-09-09 16:03:40,384 - root - INFO - [34mlr: 7.4471e-06 gnorm: 0.37 [35m[1 day, 22:28:12<1 day, 2:57:24][39m
+[titan] 2025-09-09 16:04:12,553 - root - INFO - [31mstep: 25320 [32mloss: 2.7190 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,187 [36mtflops: 485.48 [35mmfu: 49.09%[39m [37mglobal_avg_ntp_loss: 0.7686 [37mglobal_avg_top_loss: 1.9503
+[titan] 2025-09-09 16:04:12,553 - root - INFO - [34mlr: 7.4438e-06 gnorm: 0.37 [35m[1 day, 22:28:44<1 day, 2:56:51][39m
+[titan] 2025-09-09 16:04:44,447 - root - INFO - [31mstep: 25325 [32mloss: 2.7165 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.66 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7663 [37mglobal_avg_top_loss: 1.9502
+[titan] 2025-09-09 16:04:44,447 - root - INFO - [34mlr: 7.4405e-06 gnorm: 0.36 [35m[1 day, 22:29:16<1 day, 2:56:17][39m
+[titan] 2025-09-09 16:05:16,590 - root - INFO - [31mstep: 25330 [32mloss: 2.7882 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,195 [36mtflops: 485.88 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7978 [37mglobal_avg_top_loss: 1.9904
+[titan] 2025-09-09 16:05:16,590 - root - INFO - [34mlr: 7.4372e-06 gnorm: 0.36 [35m[1 day, 22:29:48<1 day, 2:55:43][39m
+[titan] 2025-09-09 16:05:48,482 - root - INFO - [31mstep: 25335 [32mloss: 2.6578 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.69 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7428 [37mglobal_avg_top_loss: 1.9151
+[titan] 2025-09-09 16:05:48,483 - root - INFO - [34mlr: 7.4339e-06 gnorm: 0.34 [35m[1 day, 22:30:20<1 day, 2:55:10][39m
+[titan] 2025-09-09 16:06:20,500 - root - INFO - [31mstep: 25340 [32mloss: 2.7553 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.78 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7829 [37mglobal_avg_top_loss: 1.9724
+[titan] 2025-09-09 16:06:20,500 - root - INFO - [34mlr: 7.4307e-06 gnorm: 0.50 [35m[1 day, 22:30:52<1 day, 2:54:36][39m
+[titan] 2025-09-09 16:06:52,374 - root - INFO - [31mstep: 25345 [32mloss: 2.7815 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.97 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7959 [37mglobal_avg_top_loss: 1.9856
+[titan] 2025-09-09 16:06:52,374 - root - INFO - [34mlr: 7.4274e-06 gnorm: 0.35 [35m[1 day, 22:31:24<1 day, 2:54:02][39m
+[titan] 2025-09-09 16:07:17,986 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:07:24,276 - root - INFO - [31mstep: 25350 [32mloss: 2.9346 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.54 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.8872 [37mglobal_avg_top_loss: 2.0474
+[titan] 2025-09-09 16:07:24,277 - root - INFO - [34mlr: 7.4241e-06 gnorm: 0.35 [35m[1 day, 22:31:55<1 day, 2:53:29][39m
+[titan] 2025-09-09 16:07:56,252 - root - INFO - [31mstep: 25355 [32mloss: 3.1781 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.41 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 1.0354 [37mglobal_avg_top_loss: 2.1427
+[titan] 2025-09-09 16:07:56,252 - root - INFO - [34mlr: 7.4208e-06 gnorm: 0.35 [35m[1 day, 22:32:27<1 day, 2:52:55][39m
+[titan] 2025-09-09 16:08:28,188 - root - INFO - [31mstep: 25360 [32mloss: 2.7347 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.02 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7702 [37mglobal_avg_top_loss: 1.9646
+[titan] 2025-09-09 16:08:28,189 - root - INFO - [34mlr: 7.4176e-06 gnorm: 0.37 [35m[1 day, 22:32:59<1 day, 2:52:21][39m
+[titan] 2025-09-09 16:09:00,346 - root - INFO - [31mstep: 25365 [32mloss: 2.6873 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,190 [36mtflops: 485.66 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 0.7526 [37mglobal_avg_top_loss: 1.9347
+[titan] 2025-09-09 16:09:00,346 - root - INFO - [34mlr: 7.4143e-06 gnorm: 0.36 [35m[1 day, 22:33:32<1 day, 2:51:48][39m
+[titan] 2025-09-09 16:09:32,367 - root - INFO - [31mstep: 25370 [32mloss: 3.2046 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.73 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 1.0395 [37mglobal_avg_top_loss: 2.1651
+[titan] 2025-09-09 16:09:32,367 - root - INFO - [34mlr: 7.4110e-06 gnorm: 0.36 [35m[1 day, 22:34:04<1 day, 2:51:14][39m
+[titan] 2025-09-09 16:10:04,450 - root - INFO - [31mstep: 25375 [32mloss: 2.6200 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,214 [36mtflops: 486.78 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7211 [37mglobal_avg_top_loss: 1.8989
+[titan] 2025-09-09 16:10:04,450 - root - INFO - [34mlr: 7.4077e-06 gnorm: 0.34 [35m[1 day, 22:34:36<1 day, 2:50:40][39m
+[titan] 2025-09-09 16:10:36,322 - root - INFO - [31mstep: 25380 [32mloss: 2.7924 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,281 [36mtflops: 490.00 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7981 [37mglobal_avg_top_loss: 1.9943
+[titan] 2025-09-09 16:10:36,323 - root - INFO - [34mlr: 7.4045e-06 gnorm: 0.37 [35m[1 day, 22:35:08<1 day, 2:50:07][39m
+[titan] 2025-09-09 16:11:08,390 - root - INFO - [31mstep: 25385 [32mloss: 3.2676 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.02 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 1.0810 [37mglobal_avg_top_loss: 2.1866
+[titan] 2025-09-09 16:11:08,390 - root - INFO - [34mlr: 7.4012e-06 gnorm: 0.39 [35m[1 day, 22:35:40<1 day, 2:49:33][39m
+[titan] 2025-09-09 16:11:40,275 - root - INFO - [31mstep: 25390 [32mloss: 2.7992 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.80 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7962 [37mglobal_avg_top_loss: 2.0030
+[titan] 2025-09-09 16:11:40,276 - root - INFO - [34mlr: 7.3979e-06 gnorm: 1.10 [35m[1 day, 22:36:11<1 day, 2:48:59][39m
+[titan] 2025-09-09 16:12:12,338 - root - INFO - [31mstep: 25395 [32mloss: 2.6125 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,220 [36mtflops: 487.09 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7189 [37mglobal_avg_top_loss: 1.8936
+[titan] 2025-09-09 16:12:12,338 - root - INFO - [34mlr: 7.3947e-06 gnorm: 0.34 [35m[1 day, 22:36:44<1 day, 2:48:26][39m
+[titan] 2025-09-09 16:12:37,996 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:12:44,312 - root - INFO - [31mstep: 25400 [32mloss: 2.6897 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.45 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7574 [37mglobal_avg_top_loss: 1.9323
+[titan] 2025-09-09 16:12:44,312 - root - INFO - [34mlr: 7.3914e-06 gnorm: 0.38 [35m[1 day, 22:37:16<1 day, 2:47:52][39m
+[titan] 2025-09-09 16:13:16,348 - root - INFO - [31mstep: 25405 [32mloss: 2.6516 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.50 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7371 [37mglobal_avg_top_loss: 1.9145
+[titan] 2025-09-09 16:13:16,348 - root - INFO - [34mlr: 7.3881e-06 gnorm: 0.35 [35m[1 day, 22:37:48<1 day, 2:47:19][39m
+[titan] 2025-09-09 16:13:48,361 - root - INFO - [31mstep: 25410 [32mloss: 2.8164 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.84 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.8323 [37mglobal_avg_top_loss: 1.9841
+[titan] 2025-09-09 16:13:48,361 - root - INFO - [34mlr: 7.3848e-06 gnorm: 0.40 [35m[1 day, 22:38:20<1 day, 2:46:45][39m
+[titan] 2025-09-09 16:14:20,324 - root - INFO - [31mstep: 25415 [32mloss: 2.6447 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.61 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7317 [37mglobal_avg_top_loss: 1.9130
+[titan] 2025-09-09 16:14:20,325 - root - INFO - [34mlr: 7.3816e-06 gnorm: 0.56 [35m[1 day, 22:38:52<1 day, 2:46:11][39m
+[titan] 2025-09-09 16:14:52,409 - root - INFO - [31mstep: 25420 [32mloss: 2.6779 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.75 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7545 [37mglobal_avg_top_loss: 1.9234
+[titan] 2025-09-09 16:14:52,410 - root - INFO - [34mlr: 7.3783e-06 gnorm: 0.35 [35m[1 day, 22:39:24<1 day, 2:45:38][39m
+[titan] 2025-09-09 16:15:24,375 - root - INFO - [31mstep: 25425 [32mloss: 2.6319 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.58 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7303 [37mglobal_avg_top_loss: 1.9015
+[titan] 2025-09-09 16:15:24,375 - root - INFO - [34mlr: 7.3750e-06 gnorm: 0.41 [35m[1 day, 22:39:56<1 day, 2:45:04][39m
+[titan] 2025-09-09 16:15:56,587 - root - INFO - [31mstep: 25430 [32mloss: 2.7570 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,173 [36mtflops: 484.83 [35mmfu: 49.02%[39m [37mglobal_avg_ntp_loss: 0.7801 [37mglobal_avg_top_loss: 1.9769
+[titan] 2025-09-09 16:15:56,587 - root - INFO - [34mlr: 7.3718e-06 gnorm: 0.37 [35m[1 day, 22:40:28<1 day, 2:44:31][39m
+[titan] 2025-09-09 16:16:28,929 - root - INFO - [31mstep: 25435 [32mloss: 3.1302 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,132 [36mtflops: 482.88 [35mmfu: 48.83%[39m [37mglobal_avg_ntp_loss: 1.0074 [37mglobal_avg_top_loss: 2.1228
+[titan] 2025-09-09 16:16:28,929 - root - INFO - [34mlr: 7.3685e-06 gnorm: 0.38 [35m[1 day, 22:41:00<1 day, 2:43:57][39m
+[titan] 2025-09-09 16:17:00,957 - root - INFO - [31mstep: 25440 [32mloss: 2.5854 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,231 [36mtflops: 487.61 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7078 [37mglobal_avg_top_loss: 1.8776
+[titan] 2025-09-09 16:17:00,958 - root - INFO - [34mlr: 7.3652e-06 gnorm: 0.33 [35m[1 day, 22:41:32<1 day, 2:43:23][39m
+[titan] 2025-09-09 16:17:33,262 - root - INFO - [31mstep: 25445 [32mloss: 2.7243 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,144 [36mtflops: 483.44 [35mmfu: 48.88%[39m [37mglobal_avg_ntp_loss: 0.7702 [37mglobal_avg_top_loss: 1.9541
+[titan] 2025-09-09 16:17:33,263 - root - INFO - [34mlr: 7.3620e-06 gnorm: 0.39 [35m[1 day, 22:42:04<1 day, 2:42:50][39m
+[titan] 2025-09-09 16:17:58,885 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:18:05,281 - root - INFO - [31mstep: 25450 [32mloss: 3.2906 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.76 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 1.0822 [37mglobal_avg_top_loss: 2.2084
+[titan] 2025-09-09 16:18:05,281 - root - INFO - [34mlr: 7.3587e-06 gnorm: 0.37 [35m[1 day, 22:42:36<1 day, 2:42:16][39m
+[titan] 2025-09-09 16:18:37,229 - root - INFO - [31mstep: 25455 [32mloss: 2.8179 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.84 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.8167 [37mglobal_avg_top_loss: 2.0013
+[titan] 2025-09-09 16:18:37,229 - root - INFO - [34mlr: 7.3554e-06 gnorm: 0.37 [35m[1 day, 22:43:08<1 day, 2:41:43][39m
+[titan] 2025-09-09 16:19:09,122 - root - INFO - [31mstep: 25460 [32mloss: 2.7257 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.68 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7717 [37mglobal_avg_top_loss: 1.9540
+[titan] 2025-09-09 16:19:09,122 - root - INFO - [34mlr: 7.3522e-06 gnorm: 0.43 [35m[1 day, 22:43:40<1 day, 2:41:09][39m
+[titan] 2025-09-09 16:19:41,167 - root - INFO - [31mstep: 25465 [32mloss: 3.0916 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.36 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.9822 [37mglobal_avg_top_loss: 2.1094
+[titan] 2025-09-09 16:19:41,167 - root - INFO - [34mlr: 7.3489e-06 gnorm: 0.40 [35m[1 day, 22:44:12<1 day, 2:40:35][39m
+[titan] 2025-09-09 16:20:13,272 - root - INFO - [31mstep: 25470 [32mloss: 2.7736 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.45 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7890 [37mglobal_avg_top_loss: 1.9846
+[titan] 2025-09-09 16:20:13,273 - root - INFO - [34mlr: 7.3457e-06 gnorm: 0.46 [35m[1 day, 22:44:44<1 day, 2:40:02][39m
+[titan] 2025-09-09 16:20:45,235 - root - INFO - [31mstep: 25475 [32mloss: 2.7916 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.61 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.8181 [37mglobal_avg_top_loss: 1.9735
+[titan] 2025-09-09 16:20:45,236 - root - INFO - [34mlr: 7.3424e-06 gnorm: 0.59 [35m[1 day, 22:45:16<1 day, 2:39:28][39m
+[titan] 2025-09-09 16:21:17,231 - root - INFO - [31mstep: 25480 [32mloss: 2.7603 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.11 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7877 [37mglobal_avg_top_loss: 1.9726
+[titan] 2025-09-09 16:21:17,231 - root - INFO - [34mlr: 7.3391e-06 gnorm: 0.37 [35m[1 day, 22:45:48<1 day, 2:38:55][39m
+[titan] 2025-09-09 16:21:49,241 - root - INFO - [31mstep: 25485 [32mloss: 2.7340 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.89 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7740 [37mglobal_avg_top_loss: 1.9601
+[titan] 2025-09-09 16:21:49,242 - root - INFO - [34mlr: 7.3359e-06 gnorm: 0.37 [35m[1 day, 22:46:20<1 day, 2:38:21][39m
+[titan] 2025-09-09 16:22:21,439 - root - INFO - [31mstep: 25490 [32mloss: 2.6630 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,177 [36mtflops: 485.04 [35mmfu: 49.04%[39m [37mglobal_avg_ntp_loss: 0.7415 [37mglobal_avg_top_loss: 1.9215
+[titan] 2025-09-09 16:22:21,440 - root - INFO - [34mlr: 7.3326e-06 gnorm: 0.36 [35m[1 day, 22:46:53<1 day, 2:37:47][39m
+[titan] 2025-09-09 16:22:53,657 - root - INFO - [31mstep: 25495 [32mloss: 3.2009 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,171 [36mtflops: 484.75 [35mmfu: 49.01%[39m [37mglobal_avg_ntp_loss: 1.0382 [37mglobal_avg_top_loss: 2.1628
+[titan] 2025-09-09 16:22:53,657 - root - INFO - [34mlr: 7.3293e-06 gnorm: 0.37 [35m[1 day, 22:47:25<1 day, 2:37:14][39m
+[titan] 2025-09-09 16:23:19,122 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:23:25,492 - root - INFO - [31mstep: 25500 [32mloss: 2.7314 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.58 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7747 [37mglobal_avg_top_loss: 1.9567
+[titan] 2025-09-09 16:23:25,492 - root - INFO - [34mlr: 7.3261e-06 gnorm: 0.36 [35m[1 day, 22:47:57<1 day, 2:36:40][39m
+[titan] 2025-09-09 16:23:57,488 - root - INFO - [31mstep: 25505 [32mloss: 2.6509 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.09 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7394 [37mglobal_avg_top_loss: 1.9115
+[titan] 2025-09-09 16:23:57,489 - root - INFO - [34mlr: 7.3228e-06 gnorm: 0.35 [35m[1 day, 22:48:29<1 day, 2:36:07][39m
+[titan] 2025-09-09 16:24:29,550 - root - INFO - [31mstep: 25510 [32mloss: 2.8351 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.11 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.8257 [37mglobal_avg_top_loss: 2.0094
+[titan] 2025-09-09 16:24:29,550 - root - INFO - [34mlr: 7.3196e-06 gnorm: 0.38 [35m[1 day, 22:49:01<1 day, 2:35:33][39m
+[titan] 2025-09-09 16:25:01,466 - root - INFO - [31mstep: 25515 [32mloss: 3.2158 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.33 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 1.0482 [37mglobal_avg_top_loss: 2.1676
+[titan] 2025-09-09 16:25:01,466 - root - INFO - [34mlr: 7.3163e-06 gnorm: 0.36 [35m[1 day, 22:49:33<1 day, 2:34:59][39m
+[titan] 2025-09-09 16:25:33,569 - root - INFO - [31mstep: 25520 [32mloss: 2.7626 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.47 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7863 [37mglobal_avg_top_loss: 1.9763
+[titan] 2025-09-09 16:25:33,570 - root - INFO - [34mlr: 7.3131e-06 gnorm: 0.43 [35m[1 day, 22:50:05<1 day, 2:34:26][39m
+[titan] 2025-09-09 16:26:05,485 - root - INFO - [31mstep: 25525 [32mloss: 2.7257 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.34 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7780 [37mglobal_avg_top_loss: 1.9477
+[titan] 2025-09-09 16:26:05,486 - root - INFO - [34mlr: 7.3098e-06 gnorm: 0.39 [35m[1 day, 22:50:37<1 day, 2:33:52][39m
+[titan] 2025-09-09 16:26:37,376 - root - INFO - [31mstep: 25530 [32mloss: 2.8116 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.71 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8103 [37mglobal_avg_top_loss: 2.0013
+[titan] 2025-09-09 16:26:37,377 - root - INFO - [34mlr: 7.3065e-06 gnorm: 0.37 [35m[1 day, 22:51:09<1 day, 2:33:18][39m
+[titan] 2025-09-09 16:27:09,382 - root - INFO - [31mstep: 25535 [32mloss: 2.7881 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.96 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.8013 [37mglobal_avg_top_loss: 1.9869
+[titan] 2025-09-09 16:27:09,382 - root - INFO - [34mlr: 7.3033e-06 gnorm: 0.40 [35m[1 day, 22:51:41<1 day, 2:32:45][39m
+[titan] 2025-09-09 16:27:41,250 - root - INFO - [31mstep: 25540 [32mloss: 2.7665 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.07 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7870 [37mglobal_avg_top_loss: 1.9794
+[titan] 2025-09-09 16:27:41,250 - root - INFO - [34mlr: 7.3000e-06 gnorm: 0.36 [35m[1 day, 22:52:12<1 day, 2:32:11][39m
+[titan] 2025-09-09 16:28:13,155 - root - INFO - [31mstep: 25545 [32mloss: 2.6871 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.49 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7578 [37mglobal_avg_top_loss: 1.9293
+[titan] 2025-09-09 16:28:13,156 - root - INFO - [34mlr: 7.2968e-06 gnorm: 0.35 [35m[1 day, 22:52:44<1 day, 2:31:37][39m
+[titan] 2025-09-09 16:28:38,632 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:28:44,999 - root - INFO - [31mstep: 25550 [32mloss: 2.6822 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.44 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7496 [37mglobal_avg_top_loss: 1.9326
+[titan] 2025-09-09 16:28:44,999 - root - INFO - [34mlr: 7.2935e-06 gnorm: 0.37 [35m[1 day, 22:53:16<1 day, 2:31:04][39m
+[titan] 2025-09-09 16:29:16,906 - root - INFO - [31mstep: 25555 [32mloss: 2.7471 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.47 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7891 [37mglobal_avg_top_loss: 1.9580
+[titan] 2025-09-09 16:29:16,906 - root - INFO - [34mlr: 7.2903e-06 gnorm: 0.38 [35m[1 day, 22:53:48<1 day, 2:30:30][39m
+[titan] 2025-09-09 16:29:49,018 - root - INFO - [31mstep: 25560 [32mloss: 2.8651 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.35 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.8428 [37mglobal_avg_top_loss: 2.0224
+[titan] 2025-09-09 16:29:49,018 - root - INFO - [34mlr: 7.2870e-06 gnorm: 0.37 [35m[1 day, 22:54:20<1 day, 2:29:57][39m
+[titan] 2025-09-09 16:30:20,955 - root - INFO - [31mstep: 25565 [32mloss: 2.7229 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,260 [36mtflops: 489.01 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7705 [37mglobal_avg_top_loss: 1.9524
+[titan] 2025-09-09 16:30:20,955 - root - INFO - [34mlr: 7.2838e-06 gnorm: 0.35 [35m[1 day, 22:54:52<1 day, 2:29:23][39m
+[titan] 2025-09-09 16:30:52,822 - root - INFO - [31mstep: 25570 [32mloss: 2.7319 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.08 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7726 [37mglobal_avg_top_loss: 1.9592
+[titan] 2025-09-09 16:30:52,823 - root - INFO - [34mlr: 7.2805e-06 gnorm: 0.36 [35m[1 day, 22:55:24<1 day, 2:28:49][39m
+[titan] 2025-09-09 16:31:24,711 - root - INFO - [31mstep: 25575 [32mloss: 3.2013 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.75 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 1.0378 [37mglobal_avg_top_loss: 2.1635
+[titan] 2025-09-09 16:31:24,711 - root - INFO - [34mlr: 7.2773e-06 gnorm: 0.37 [35m[1 day, 22:55:56<1 day, 2:28:16][39m
+[titan] 2025-09-09 16:31:56,873 - root - INFO - [31mstep: 25580 [32mloss: 2.7531 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,189 [36mtflops: 485.59 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.7817 [37mglobal_avg_top_loss: 1.9714
+[titan] 2025-09-09 16:31:56,873 - root - INFO - [34mlr: 7.2740e-06 gnorm: 0.37 [35m[1 day, 22:56:28<1 day, 2:27:42][39m
+[titan] 2025-09-09 16:32:28,799 - root - INFO - [31mstep: 25585 [32mloss: 2.6784 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.18 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7474 [37mglobal_avg_top_loss: 1.9310
+[titan] 2025-09-09 16:32:28,799 - root - INFO - [34mlr: 7.2708e-06 gnorm: 0.35 [35m[1 day, 22:57:00<1 day, 2:27:08][39m
+[titan] 2025-09-09 16:33:00,631 - root - INFO - [31mstep: 25590 [32mloss: 2.6783 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.61 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7502 [37mglobal_avg_top_loss: 1.9281
+[titan] 2025-09-09 16:33:00,632 - root - INFO - [34mlr: 7.2675e-06 gnorm: 0.35 [35m[1 day, 22:57:32<1 day, 2:26:35][39m
+[titan] 2025-09-09 16:33:32,642 - root - INFO - [31mstep: 25595 [32mloss: 3.1604 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.88 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 1.0208 [37mglobal_avg_top_loss: 2.1396
+[titan] 2025-09-09 16:33:32,643 - root - INFO - [34mlr: 7.2643e-06 gnorm: 0.35 [35m[1 day, 22:58:04<1 day, 2:26:01][39m
+[titan] 2025-09-09 16:33:58,235 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:34:04,695 - root - INFO - [31mstep: 25600 [32mloss: 2.6280 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.25 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7240 [37mglobal_avg_top_loss: 1.9040
+[titan] 2025-09-09 16:34:04,695 - root - INFO - [34mlr: 7.2610e-06 gnorm: 0.34 [35m[1 day, 22:58:36<1 day, 2:25:27][39m
+[titan] 2025-09-09 16:34:04,950 - root - INFO - Dumping profiler traces at step 25600
+[titan] 2025-09-09 16:34:05,010 - root - INFO - Finished dumping profiler traces in 0.06 seconds
+[titan] 2025-09-09 16:34:36,890 - root - INFO - [31mstep: 25605 [32mloss: 2.7729 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.09 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7875 [37mglobal_avg_top_loss: 1.9855
+[titan] 2025-09-09 16:34:36,890 - root - INFO - [34mlr: 7.2578e-06 gnorm: 0.36 [35m[1 day, 22:59:08<1 day, 2:24:54][39m
+[titan] 2025-09-09 16:35:08,770 - root - INFO - [31mstep: 25610 [32mloss: 2.7704 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.87 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7873 [37mglobal_avg_top_loss: 1.9831
+[titan] 2025-09-09 16:35:08,770 - root - INFO - [34mlr: 7.2545e-06 gnorm: 0.37 [35m[1 day, 22:59:40<1 day, 2:24:20][39m
+[titan] 2025-09-09 16:35:40,590 - root - INFO - [31mstep: 25615 [32mloss: 2.6083 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,299 [36mtflops: 490.85 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7160 [37mglobal_avg_top_loss: 1.8923
+[titan] 2025-09-09 16:35:40,591 - root - INFO - [34mlr: 7.2513e-06 gnorm: 0.38 [35m[1 day, 23:00:12<1 day, 2:23:47][39m
+[titan] 2025-09-09 16:36:12,515 - root - INFO - [31mstep: 25620 [32mloss: 2.8002 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.19 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.8017 [37mglobal_avg_top_loss: 1.9985
+[titan] 2025-09-09 16:36:12,516 - root - INFO - [34mlr: 7.2480e-06 gnorm: 0.36 [35m[1 day, 23:00:44<1 day, 2:23:13][39m
+[titan] 2025-09-09 16:36:44,513 - root - INFO - [31mstep: 25625 [32mloss: 2.7163 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.08 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7670 [37mglobal_avg_top_loss: 1.9494
+[titan] 2025-09-09 16:36:44,514 - root - INFO - [34mlr: 7.2448e-06 gnorm: 0.35 [35m[1 day, 23:01:16<1 day, 2:22:39][39m
+[titan] 2025-09-09 16:37:16,448 - root - INFO - [31mstep: 25630 [32mloss: 2.7114 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.05 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7650 [37mglobal_avg_top_loss: 1.9465
+[titan] 2025-09-09 16:37:16,448 - root - INFO - [34mlr: 7.2416e-06 gnorm: 0.37 [35m[1 day, 23:01:48<1 day, 2:22:06][39m
+[titan] 2025-09-09 16:37:48,514 - root - INFO - [31mstep: 25635 [32mloss: 2.7578 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.04 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7860 [37mglobal_avg_top_loss: 1.9718
+[titan] 2025-09-09 16:37:48,514 - root - INFO - [34mlr: 7.2383e-06 gnorm: 0.36 [35m[1 day, 23:02:20<1 day, 2:21:32][39m
+[titan] 2025-09-09 16:38:20,467 - root - INFO - [31mstep: 25640 [32mloss: 2.6960 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.75 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7554 [37mglobal_avg_top_loss: 1.9406
+[titan] 2025-09-09 16:38:20,468 - root - INFO - [34mlr: 7.2351e-06 gnorm: 0.35 [35m[1 day, 23:02:52<1 day, 2:20:58][39m
+[titan] 2025-09-09 16:38:52,615 - root - INFO - [31mstep: 25645 [32mloss: 2.7464 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,193 [36mtflops: 485.80 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7800 [37mglobal_avg_top_loss: 1.9664
+[titan] 2025-09-09 16:38:52,615 - root - INFO - [34mlr: 7.2318e-06 gnorm: 0.36 [35m[1 day, 23:03:24<1 day, 2:20:25][39m
+[titan] 2025-09-09 16:39:18,125 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:39:24,527 - root - INFO - [31mstep: 25650 [32mloss: 2.8335 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.40 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.8255 [37mglobal_avg_top_loss: 2.0080
+[titan] 2025-09-09 16:39:24,527 - root - INFO - [34mlr: 7.2286e-06 gnorm: 0.40 [35m[1 day, 23:03:56<1 day, 2:19:51][39m
+[titan] 2025-09-09 16:39:56,644 - root - INFO - [31mstep: 25655 [32mloss: 3.2557 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,203 [36mtflops: 486.26 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 1.0758 [37mglobal_avg_top_loss: 2.1799
+[titan] 2025-09-09 16:39:56,645 - root - INFO - [34mlr: 7.2253e-06 gnorm: 0.35 [35m[1 day, 23:04:28<1 day, 2:19:18][39m
+[titan] 2025-09-09 16:40:28,606 - root - INFO - [31mstep: 25660 [32mloss: 2.7666 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.63 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7880 [37mglobal_avg_top_loss: 1.9786
+[titan] 2025-09-09 16:40:28,606 - root - INFO - [34mlr: 7.2221e-06 gnorm: 0.37 [35m[1 day, 23:05:00<1 day, 2:18:44][39m
+[titan] 2025-09-09 16:41:00,483 - root - INFO - [31mstep: 25665 [32mloss: 2.8095 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.92 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.8120 [37mglobal_avg_top_loss: 1.9974
+[titan] 2025-09-09 16:41:00,484 - root - INFO - [34mlr: 7.2189e-06 gnorm: 0.38 [35m[1 day, 23:05:32<1 day, 2:18:10][39m
+[titan] 2025-09-09 16:41:32,557 - root - INFO - [31mstep: 25670 [32mloss: 2.8191 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.93 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.8116 [37mglobal_avg_top_loss: 2.0074
+[titan] 2025-09-09 16:41:32,557 - root - INFO - [34mlr: 7.2156e-06 gnorm: 0.38 [35m[1 day, 23:06:04<1 day, 2:17:37][39m
+[titan] 2025-09-09 16:42:04,691 - root - INFO - [31mstep: 25675 [32mloss: 2.8360 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,198 [36mtflops: 486.01 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.8426 [37mglobal_avg_top_loss: 1.9935
+[titan] 2025-09-09 16:42:04,691 - root - INFO - [34mlr: 7.2124e-06 gnorm: 0.36 [35m[1 day, 23:06:36<1 day, 2:17:03][39m
+[titan] 2025-09-09 16:42:36,428 - root - INFO - [31mstep: 25680 [32mloss: 2.6169 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,325 [36mtflops: 492.08 [35mmfu: 49.76%[39m [37mglobal_avg_ntp_loss: 0.7155 [37mglobal_avg_top_loss: 1.9014
+[titan] 2025-09-09 16:42:36,428 - root - INFO - [34mlr: 7.2091e-06 gnorm: 0.51 [35m[1 day, 23:07:08<1 day, 2:16:30][39m
+[titan] 2025-09-09 16:43:08,422 - root - INFO - [31mstep: 25685 [32mloss: 2.7441 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.14 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7760 [37mglobal_avg_top_loss: 1.9681
+[titan] 2025-09-09 16:43:08,422 - root - INFO - [34mlr: 7.2059e-06 gnorm: 0.36 [35m[1 day, 23:07:40<1 day, 2:15:56][39m
+[titan] 2025-09-09 16:43:40,546 - root - INFO - [31mstep: 25690 [32mloss: 2.7775 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,201 [36mtflops: 486.16 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.7930 [37mglobal_avg_top_loss: 1.9846
+[titan] 2025-09-09 16:43:40,546 - root - INFO - [34mlr: 7.2027e-06 gnorm: 0.37 [35m[1 day, 23:08:12<1 day, 2:15:22][39m
+[titan] 2025-09-09 16:44:12,481 - root - INFO - [31mstep: 25695 [32mloss: 2.6731 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.03 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7430 [37mglobal_avg_top_loss: 1.9301
+[titan] 2025-09-09 16:44:12,481 - root - INFO - [34mlr: 7.1994e-06 gnorm: 0.42 [35m[1 day, 23:08:44<1 day, 2:14:49][39m
+[titan] 2025-09-09 16:44:38,153 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:44:44,521 - root - INFO - [31mstep: 25700 [32mloss: 2.7730 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.44 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7941 [37mglobal_avg_top_loss: 1.9789
+[titan] 2025-09-09 16:44:44,521 - root - INFO - [34mlr: 7.1962e-06 gnorm: 0.38 [35m[1 day, 23:09:16<1 day, 2:14:15][39m
+[titan] 2025-09-09 16:45:16,478 - root - INFO - [31mstep: 25705 [32mloss: 2.7261 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.71 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7730 [37mglobal_avg_top_loss: 1.9531
+[titan] 2025-09-09 16:45:16,478 - root - INFO - [34mlr: 7.1930e-06 gnorm: 0.37 [35m[1 day, 23:09:48<1 day, 2:13:42][39m
+[titan] 2025-09-09 16:45:48,423 - root - INFO - [31mstep: 25710 [32mloss: 2.7508 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.88 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7822 [37mglobal_avg_top_loss: 1.9686
+[titan] 2025-09-09 16:45:48,424 - root - INFO - [34mlr: 7.1897e-06 gnorm: 0.37 [35m[1 day, 23:10:20<1 day, 2:13:08][39m
+[titan] 2025-09-09 16:46:20,281 - root - INFO - [31mstep: 25715 [32mloss: 2.6996 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.23 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7619 [37mglobal_avg_top_loss: 1.9377
+[titan] 2025-09-09 16:46:20,281 - root - INFO - [34mlr: 7.1865e-06 gnorm: 0.36 [35m[1 day, 23:10:51<1 day, 2:12:34][39m
+[titan] 2025-09-09 16:46:52,243 - root - INFO - [31mstep: 25720 [32mloss: 2.7088 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.61 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7621 [37mglobal_avg_top_loss: 1.9468
+[titan] 2025-09-09 16:46:52,244 - root - INFO - [34mlr: 7.1833e-06 gnorm: 3.44 [35m[1 day, 23:11:23<1 day, 2:12:01][39m
+[titan] 2025-09-09 16:47:24,367 - root - INFO - [31mstep: 25725 [32mloss: 2.6939 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,201 [36mtflops: 486.17 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.7550 [37mglobal_avg_top_loss: 1.9389
+[titan] 2025-09-09 16:47:24,367 - root - INFO - [34mlr: 7.1800e-06 gnorm: 0.37 [35m[1 day, 23:11:55<1 day, 2:11:27][39m
+[titan] 2025-09-09 16:47:56,420 - root - INFO - [31mstep: 25730 [32mloss: 2.7190 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.24 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7672 [37mglobal_avg_top_loss: 1.9518
+[titan] 2025-09-09 16:47:56,420 - root - INFO - [34mlr: 7.1768e-06 gnorm: 0.38 [35m[1 day, 23:12:28<1 day, 2:10:54][39m
+[titan] 2025-09-09 16:48:28,327 - root - INFO - [31mstep: 25735 [32mloss: 3.1863 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.46 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 1.0377 [37mglobal_avg_top_loss: 2.1486
+[titan] 2025-09-09 16:48:28,327 - root - INFO - [34mlr: 7.1736e-06 gnorm: 0.37 [35m[1 day, 23:12:59<1 day, 2:10:20][39m
+[titan] 2025-09-09 16:49:00,377 - root - INFO - [31mstep: 25740 [32mloss: 2.7238 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.28 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7691 [37mglobal_avg_top_loss: 1.9547
+[titan] 2025-09-09 16:49:00,377 - root - INFO - [34mlr: 7.1703e-06 gnorm: 0.36 [35m[1 day, 23:13:32<1 day, 2:09:46][39m
+[titan] 2025-09-09 16:49:32,665 - root - INFO - [31mstep: 25745 [32mloss: 2.8013 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,149 [36mtflops: 483.70 [35mmfu: 48.91%[39m [37mglobal_avg_ntp_loss: 0.8027 [37mglobal_avg_top_loss: 1.9986
+[titan] 2025-09-09 16:49:32,665 - root - INFO - [34mlr: 7.1671e-06 gnorm: 0.46 [35m[1 day, 23:14:04<1 day, 2:09:13][39m
+[titan] 2025-09-09 16:49:58,132 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:50:04,487 - root - INFO - [31mstep: 25750 [32mloss: 2.9044 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.77 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.8640 [37mglobal_avg_top_loss: 2.0404
+[titan] 2025-09-09 16:50:04,487 - root - INFO - [34mlr: 7.1639e-06 gnorm: 0.36 [35m[1 day, 23:14:36<1 day, 2:08:39][39m
+[titan] 2025-09-09 16:50:36,447 - root - INFO - [31mstep: 25755 [32mloss: 2.7662 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.65 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7859 [37mglobal_avg_top_loss: 1.9803
+[titan] 2025-09-09 16:50:36,447 - root - INFO - [34mlr: 7.1606e-06 gnorm: 0.35 [35m[1 day, 23:15:08<1 day, 2:08:06][39m
+[titan] 2025-09-09 16:51:08,410 - root - INFO - [31mstep: 25760 [32mloss: 2.6984 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.61 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7640 [37mglobal_avg_top_loss: 1.9344
+[titan] 2025-09-09 16:51:08,410 - root - INFO - [34mlr: 7.1574e-06 gnorm: 0.38 [35m[1 day, 23:15:40<1 day, 2:07:32][39m
+[titan] 2025-09-09 16:51:40,427 - root - INFO - [31mstep: 25765 [32mloss: 2.7768 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.78 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7948 [37mglobal_avg_top_loss: 1.9820
+[titan] 2025-09-09 16:51:40,428 - root - INFO - [34mlr: 7.1542e-06 gnorm: 0.37 [35m[1 day, 23:16:12<1 day, 2:06:58][39m
+[titan] 2025-09-09 16:52:12,494 - root - INFO - [31mstep: 25770 [32mloss: 2.6964 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.03 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7578 [37mglobal_avg_top_loss: 1.9386
+[titan] 2025-09-09 16:52:12,494 - root - INFO - [34mlr: 7.1510e-06 gnorm: 0.36 [35m[1 day, 23:16:44<1 day, 2:06:25][39m
+[titan] 2025-09-09 16:52:44,570 - root - INFO - [31mstep: 25775 [32mloss: 2.8399 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.89 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.8329 [37mglobal_avg_top_loss: 2.0070
+[titan] 2025-09-09 16:52:44,570 - root - INFO - [34mlr: 7.1477e-06 gnorm: 0.42 [35m[1 day, 23:17:16<1 day, 2:05:51][39m
+[titan] 2025-09-09 16:53:16,626 - root - INFO - [31mstep: 25780 [32mloss: 2.7932 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,222 [36mtflops: 487.19 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.8052 [37mglobal_avg_top_loss: 1.9880
+[titan] 2025-09-09 16:53:16,626 - root - INFO - [34mlr: 7.1445e-06 gnorm: 0.42 [35m[1 day, 23:17:48<1 day, 2:05:18][39m
+[titan] 2025-09-09 16:53:48,539 - root - INFO - [31mstep: 25785 [32mloss: 2.7354 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,268 [36mtflops: 489.38 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7747 [37mglobal_avg_top_loss: 1.9607
+[titan] 2025-09-09 16:53:48,539 - root - INFO - [34mlr: 7.1413e-06 gnorm: 0.36 [35m[1 day, 23:18:20<1 day, 2:04:44][39m
+[titan] 2025-09-09 16:54:20,638 - root - INFO - [31mstep: 25790 [32mloss: 2.7536 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.54 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7840 [37mglobal_avg_top_loss: 1.9697
+[titan] 2025-09-09 16:54:20,638 - root - INFO - [34mlr: 7.1381e-06 gnorm: 0.35 [35m[1 day, 23:18:52<1 day, 2:04:11][39m
+[titan] 2025-09-09 16:54:52,828 - root - INFO - [31mstep: 25795 [32mloss: 2.6883 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,180 [36mtflops: 485.15 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7527 [37mglobal_avg_top_loss: 1.9357
+[titan] 2025-09-09 16:54:52,829 - root - INFO - [34mlr: 7.1348e-06 gnorm: 0.40 [35m[1 day, 23:19:24<1 day, 2:03:37][39m
+[titan] 2025-09-09 16:55:18,493 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 16:55:25,011 - root - INFO - [31mstep: 25800 [32mloss: 2.7297 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,182 [36mtflops: 485.28 [35mmfu: 49.07%[39m [37mglobal_avg_ntp_loss: 0.7700 [37mglobal_avg_top_loss: 1.9598
+[titan] 2025-09-09 16:55:25,011 - root - INFO - [34mlr: 7.1316e-06 gnorm: 0.37 [35m[1 day, 23:19:56<1 day, 2:03:04][39m
+[titan] 2025-09-09 16:55:57,111 - root - INFO - [31mstep: 25805 [32mloss: 2.6895 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,208 [36mtflops: 486.51 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7543 [37mglobal_avg_top_loss: 1.9352
+[titan] 2025-09-09 16:55:57,112 - root - INFO - [34mlr: 7.1284e-06 gnorm: 0.39 [35m[1 day, 23:20:28<1 day, 2:02:30][39m
+[titan] 2025-09-09 16:56:29,125 - root - INFO - [31mstep: 25810 [32mloss: 2.7379 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.84 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7756 [37mglobal_avg_top_loss: 1.9623
+[titan] 2025-09-09 16:56:29,125 - root - INFO - [34mlr: 7.1252e-06 gnorm: 0.36 [35m[1 day, 23:21:00<1 day, 2:01:57][39m
+[titan] 2025-09-09 16:57:01,095 - root - INFO - [31mstep: 25815 [32mloss: 2.7877 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.49 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7999 [37mglobal_avg_top_loss: 1.9878
+[titan] 2025-09-09 16:57:01,096 - root - INFO - [34mlr: 7.1219e-06 gnorm: 0.39 [35m[1 day, 23:21:32<1 day, 2:01:23][39m
+[titan] 2025-09-09 16:57:33,368 - root - INFO - [31mstep: 25820 [32mloss: 2.6703 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,154 [36mtflops: 483.93 [35mmfu: 48.93%[39m [37mglobal_avg_ntp_loss: 0.7450 [37mglobal_avg_top_loss: 1.9253
+[titan] 2025-09-09 16:57:33,368 - root - INFO - [34mlr: 7.1187e-06 gnorm: 0.38 [35m[1 day, 23:22:04<1 day, 2:00:50][39m
+[titan] 2025-09-09 16:58:05,416 - root - INFO - [31mstep: 25825 [32mloss: 2.7347 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.31 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7751 [37mglobal_avg_top_loss: 1.9596
+[titan] 2025-09-09 16:58:05,416 - root - INFO - [34mlr: 7.1155e-06 gnorm: 0.36 [35m[1 day, 23:22:37<1 day, 2:00:16][39m
+[titan] 2025-09-09 16:58:37,439 - root - INFO - [31mstep: 25830 [32mloss: 2.5870 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.69 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7084 [37mglobal_avg_top_loss: 1.8786
+[titan] 2025-09-09 16:58:37,439 - root - INFO - [34mlr: 7.1123e-06 gnorm: 0.37 [35m[1 day, 23:23:09<1 day, 1:59:42][39m
+[titan] 2025-09-09 16:59:09,624 - root - INFO - [31mstep: 25835 [32mloss: 2.7255 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,181 [36mtflops: 485.24 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7672 [37mglobal_avg_top_loss: 1.9583
+[titan] 2025-09-09 16:59:09,624 - root - INFO - [34mlr: 7.1091e-06 gnorm: 0.39 [35m[1 day, 23:23:41<1 day, 1:59:09][39m
+[titan] 2025-09-09 16:59:41,635 - root - INFO - [31mstep: 25840 [32mloss: 2.8682 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.87 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.8473 [37mglobal_avg_top_loss: 2.0209
+[titan] 2025-09-09 16:59:41,636 - root - INFO - [34mlr: 7.1058e-06 gnorm: 0.40 [35m[1 day, 23:24:13<1 day, 1:58:35][39m
+[titan] 2025-09-09 17:00:13,784 - root - INFO - [31mstep: 25845 [32mloss: 2.7899 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,193 [36mtflops: 485.79 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7967 [37mglobal_avg_top_loss: 1.9932
+[titan] 2025-09-09 17:00:13,785 - root - INFO - [34mlr: 7.1026e-06 gnorm: 0.38 [35m[1 day, 23:24:45<1 day, 1:58:02][39m
+[titan] 2025-09-09 17:00:39,706 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:00:46,109 - root - INFO - [31mstep: 25850 [32mloss: 2.6656 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,137 [36mtflops: 483.14 [35mmfu: 48.85%[39m [37mglobal_avg_ntp_loss: 0.7446 [37mglobal_avg_top_loss: 1.9209
+[titan] 2025-09-09 17:00:46,110 - root - INFO - [34mlr: 7.0994e-06 gnorm: 0.36 [35m[1 day, 23:25:17<1 day, 1:57:28][39m
+[titan] 2025-09-09 17:01:17,993 - root - INFO - [31mstep: 25855 [32mloss: 2.7729 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,278 [36mtflops: 489.83 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7911 [37mglobal_avg_top_loss: 1.9819
+[titan] 2025-09-09 17:01:17,993 - root - INFO - [34mlr: 7.0962e-06 gnorm: 0.38 [35m[1 day, 23:25:49<1 day, 1:56:55][39m
+[titan] 2025-09-09 17:01:49,974 - root - INFO - [31mstep: 25860 [32mloss: 2.7198 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.34 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7680 [37mglobal_avg_top_loss: 1.9517
+[titan] 2025-09-09 17:01:49,974 - root - INFO - [34mlr: 7.0930e-06 gnorm: 0.37 [35m[1 day, 23:26:21<1 day, 1:56:21][39m
+[titan] 2025-09-09 17:02:21,804 - root - INFO - [31mstep: 25865 [32mloss: 2.6994 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.65 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7595 [37mglobal_avg_top_loss: 1.9399
+[titan] 2025-09-09 17:02:21,804 - root - INFO - [34mlr: 7.0898e-06 gnorm: 0.36 [35m[1 day, 23:26:53<1 day, 1:55:48][39m
+[titan] 2025-09-09 17:02:53,718 - root - INFO - [31mstep: 25870 [32mloss: 2.7622 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,268 [36mtflops: 489.36 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7843 [37mglobal_avg_top_loss: 1.9779
+[titan] 2025-09-09 17:02:53,718 - root - INFO - [34mlr: 7.0865e-06 gnorm: 0.39 [35m[1 day, 23:27:25<1 day, 1:55:14][39m
+[titan] 2025-09-09 17:03:25,968 - root - INFO - [31mstep: 25875 [32mloss: 2.6897 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,161 [36mtflops: 484.27 [35mmfu: 48.97%[39m [37mglobal_avg_ntp_loss: 0.7555 [37mglobal_avg_top_loss: 1.9343
+[titan] 2025-09-09 17:03:25,968 - root - INFO - [34mlr: 7.0833e-06 gnorm: 0.35 [35m[1 day, 23:27:57<1 day, 1:54:40][39m
+[titan] 2025-09-09 17:03:58,066 - root - INFO - [31mstep: 25880 [32mloss: 2.7137 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.54 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7647 [37mglobal_avg_top_loss: 1.9489
+[titan] 2025-09-09 17:03:58,067 - root - INFO - [34mlr: 7.0801e-06 gnorm: 0.36 [35m[1 day, 23:28:29<1 day, 1:54:07][39m
+[titan] 2025-09-09 17:04:30,040 - root - INFO - [31mstep: 25885 [32mloss: 2.6279 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.44 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7299 [37mglobal_avg_top_loss: 1.8980
+[titan] 2025-09-09 17:04:30,041 - root - INFO - [34mlr: 7.0769e-06 gnorm: 0.37 [35m[1 day, 23:29:01<1 day, 1:53:33][39m
+[titan] 2025-09-09 17:05:02,081 - root - INFO - [31mstep: 25890 [32mloss: 2.7596 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.42 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7935 [37mglobal_avg_top_loss: 1.9662
+[titan] 2025-09-09 17:05:02,082 - root - INFO - [34mlr: 7.0737e-06 gnorm: 0.37 [35m[1 day, 23:29:33<1 day, 1:53:00][39m
+[titan] 2025-09-09 17:05:33,879 - root - INFO - [31mstep: 25895 [32mloss: 2.7432 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,305 [36mtflops: 491.14 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7774 [37mglobal_avg_top_loss: 1.9658
+[titan] 2025-09-09 17:05:33,880 - root - INFO - [34mlr: 7.0705e-06 gnorm: 0.37 [35m[1 day, 23:30:05<1 day, 1:52:26][39m
+[titan] 2025-09-09 17:05:59,651 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:06:06,077 - root - INFO - [31mstep: 25900 [32mloss: 2.7325 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.05 [35mmfu: 49.04%[39m [37mglobal_avg_ntp_loss: 0.7737 [37mglobal_avg_top_loss: 1.9588
+[titan] 2025-09-09 17:06:06,077 - root - INFO - [34mlr: 7.0673e-06 gnorm: 0.56 [35m[1 day, 23:30:37<1 day, 1:51:53][39m
+[titan] 2025-09-09 17:06:38,040 - root - INFO - [31mstep: 25905 [32mloss: 2.8397 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.60 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.8264 [37mglobal_avg_top_loss: 2.0133
+[titan] 2025-09-09 17:06:38,041 - root - INFO - [34mlr: 7.0641e-06 gnorm: 0.37 [35m[1 day, 23:31:09<1 day, 1:51:19][39m
+[titan] 2025-09-09 17:07:10,117 - root - INFO - [31mstep: 25910 [32mloss: 2.7564 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.88 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7814 [37mglobal_avg_top_loss: 1.9749
+[titan] 2025-09-09 17:07:10,117 - root - INFO - [34mlr: 7.0608e-06 gnorm: 0.36 [35m[1 day, 23:31:41<1 day, 1:50:46][39m
+[titan] 2025-09-09 17:07:42,188 - root - INFO - [31mstep: 25915 [32mloss: 3.1628 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.96 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 1.0202 [37mglobal_avg_top_loss: 2.1426
+[titan] 2025-09-09 17:07:42,188 - root - INFO - [34mlr: 7.0576e-06 gnorm: 0.36 [35m[1 day, 23:32:13<1 day, 1:50:12][39m
+[titan] 2025-09-09 17:08:14,263 - root - INFO - [31mstep: 25920 [32mloss: 2.8122 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.90 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.8137 [37mglobal_avg_top_loss: 1.9985
+[titan] 2025-09-09 17:08:14,263 - root - INFO - [34mlr: 7.0544e-06 gnorm: 0.36 [35m[1 day, 23:32:45<1 day, 1:49:38][39m
+[titan] 2025-09-09 17:08:46,254 - root - INFO - [31mstep: 25925 [32mloss: 2.8404 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.18 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8278 [37mglobal_avg_top_loss: 2.0126
+[titan] 2025-09-09 17:08:46,254 - root - INFO - [34mlr: 7.0512e-06 gnorm: 0.66 [35m[1 day, 23:33:17<1 day, 1:49:05][39m
+[titan] 2025-09-09 17:09:18,395 - root - INFO - [31mstep: 25930 [32mloss: 2.7268 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,195 [36mtflops: 485.91 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7700 [37mglobal_avg_top_loss: 1.9569
+[titan] 2025-09-09 17:09:18,395 - root - INFO - [34mlr: 7.0480e-06 gnorm: 0.37 [35m[1 day, 23:33:49<1 day, 1:48:31][39m
+[titan] 2025-09-09 17:09:50,352 - root - INFO - [31mstep: 25935 [32mloss: 2.6415 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.69 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7317 [37mglobal_avg_top_loss: 1.9098
+[titan] 2025-09-09 17:09:50,353 - root - INFO - [34mlr: 7.0448e-06 gnorm: 0.36 [35m[1 day, 23:34:21<1 day, 1:47:58][39m
+[titan] 2025-09-09 17:10:22,460 - root - INFO - [31mstep: 25940 [32mloss: 2.7190 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.40 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7676 [37mglobal_avg_top_loss: 1.9514
+[titan] 2025-09-09 17:10:22,461 - root - INFO - [34mlr: 7.0416e-06 gnorm: 0.37 [35m[1 day, 23:34:54<1 day, 1:47:24][39m
+[titan] 2025-09-09 17:10:54,645 - root - INFO - [31mstep: 25945 [32mloss: 2.7431 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,182 [36mtflops: 485.25 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7822 [37mglobal_avg_top_loss: 1.9610
+[titan] 2025-09-09 17:10:54,646 - root - INFO - [34mlr: 7.0384e-06 gnorm: 0.37 [35m[1 day, 23:35:26<1 day, 1:46:51][39m
+[titan] 2025-09-09 17:11:20,180 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:11:26,613 - root - INFO - [31mstep: 25950 [32mloss: 2.6404 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.54 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7345 [37mglobal_avg_top_loss: 1.9060
+[titan] 2025-09-09 17:11:26,613 - root - INFO - [34mlr: 7.0352e-06 gnorm: 0.36 [35m[1 day, 23:35:58<1 day, 1:46:17][39m
+[titan] 2025-09-09 17:11:58,761 - root - INFO - [31mstep: 25955 [32mloss: 2.7990 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,193 [36mtflops: 485.80 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.8003 [37mglobal_avg_top_loss: 1.9987
+[titan] 2025-09-09 17:11:58,761 - root - INFO - [34mlr: 7.0320e-06 gnorm: 0.37 [35m[1 day, 23:36:30<1 day, 1:45:44][39m
+[titan] 2025-09-09 17:12:30,683 - root - INFO - [31mstep: 25960 [32mloss: 2.7329 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.23 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7694 [37mglobal_avg_top_loss: 1.9635
+[titan] 2025-09-09 17:12:30,684 - root - INFO - [34mlr: 7.0288e-06 gnorm: 0.37 [35m[1 day, 23:37:02<1 day, 1:45:10][39m
+[titan] 2025-09-09 17:13:02,662 - root - INFO - [31mstep: 25965 [32mloss: 2.6478 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.38 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7350 [37mglobal_avg_top_loss: 1.9128
+[titan] 2025-09-09 17:13:02,662 - root - INFO - [34mlr: 7.0256e-06 gnorm: 0.38 [35m[1 day, 23:37:34<1 day, 1:44:37][39m
+[titan] 2025-09-09 17:13:34,659 - root - INFO - [31mstep: 25970 [32mloss: 2.6278 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.09 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7235 [37mglobal_avg_top_loss: 1.9042
+[titan] 2025-09-09 17:13:34,659 - root - INFO - [34mlr: 7.0224e-06 gnorm: 0.36 [35m[1 day, 23:38:06<1 day, 1:44:03][39m
+[titan] 2025-09-09 17:14:06,778 - root - INFO - [31mstep: 25975 [32mloss: 3.0415 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,202 [36mtflops: 486.24 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.9439 [37mglobal_avg_top_loss: 2.0976
+[titan] 2025-09-09 17:14:06,778 - root - INFO - [34mlr: 7.0192e-06 gnorm: 0.37 [35m[1 day, 23:38:38<1 day, 1:43:30][39m
+[titan] 2025-09-09 17:14:38,819 - root - INFO - [31mstep: 25980 [32mloss: 2.6871 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.42 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7530 [37mglobal_avg_top_loss: 1.9341
+[titan] 2025-09-09 17:14:38,819 - root - INFO - [34mlr: 7.0160e-06 gnorm: 0.36 [35m[1 day, 23:39:10<1 day, 1:42:56][39m
+[titan] 2025-09-09 17:15:10,777 - root - INFO - [31mstep: 25985 [32mloss: 2.6717 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.69 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7479 [37mglobal_avg_top_loss: 1.9238
+[titan] 2025-09-09 17:15:10,777 - root - INFO - [34mlr: 7.0128e-06 gnorm: 0.36 [35m[1 day, 23:39:42<1 day, 1:42:22][39m
+[titan] 2025-09-09 17:15:42,889 - root - INFO - [31mstep: 25990 [32mloss: 2.7102 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.34 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7640 [37mglobal_avg_top_loss: 1.9462
+[titan] 2025-09-09 17:15:42,889 - root - INFO - [34mlr: 7.0096e-06 gnorm: 0.37 [35m[1 day, 23:40:14<1 day, 1:41:49][39m
+[titan] 2025-09-09 17:16:15,003 - root - INFO - [31mstep: 25995 [32mloss: 3.1320 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,204 [36mtflops: 486.32 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 1.0082 [37mglobal_avg_top_loss: 2.1237
+[titan] 2025-09-09 17:16:15,003 - root - INFO - [34mlr: 7.0064e-06 gnorm: 0.37 [35m[1 day, 23:40:46<1 day, 1:41:15][39m
+[titan] 2025-09-09 17:16:40,779 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:16:47,196 - root - INFO - [31mstep: 26000 [32mloss: 2.6892 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,179 [36mtflops: 485.11 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7537 [37mglobal_avg_top_loss: 1.9355
+[titan] 2025-09-09 17:16:47,196 - root - INFO - [34mlr: 7.0032e-06 gnorm: 0.35 [35m[1 day, 23:41:18<1 day, 1:40:42][39m
+[titan] 2025-09-09 17:17:19,262 - root - INFO - [31mstep: 26005 [32mloss: 2.6976 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.04 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7588 [37mglobal_avg_top_loss: 1.9388
+[titan] 2025-09-09 17:17:19,262 - root - INFO - [34mlr: 7.0000e-06 gnorm: 0.38 [35m[1 day, 23:41:50<1 day, 1:40:08][39m
+[titan] 2025-09-09 17:17:51,312 - root - INFO - [31mstep: 26010 [32mloss: 2.7888 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.28 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7977 [37mglobal_avg_top_loss: 1.9911
+[titan] 2025-09-09 17:17:51,312 - root - INFO - [34mlr: 6.9968e-06 gnorm: 0.37 [35m[1 day, 23:42:22<1 day, 1:39:35][39m
+[titan] 2025-09-09 17:18:23,360 - root - INFO - [31mstep: 26015 [32mloss: 2.7930 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.32 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.8097 [37mglobal_avg_top_loss: 1.9833
+[titan] 2025-09-09 17:18:23,360 - root - INFO - [34mlr: 6.9936e-06 gnorm: 0.48 [35m[1 day, 23:42:54<1 day, 1:39:01][39m
+[titan] 2025-09-09 17:18:55,360 - root - INFO - [31mstep: 26020 [32mloss: 2.7541 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.04 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7820 [37mglobal_avg_top_loss: 1.9721
+[titan] 2025-09-09 17:18:55,360 - root - INFO - [34mlr: 6.9904e-06 gnorm: 0.36 [35m[1 day, 23:43:26<1 day, 1:38:28][39m
+[titan] 2025-09-09 17:19:27,445 - root - INFO - [31mstep: 26025 [32mloss: 2.7449 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.75 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7743 [37mglobal_avg_top_loss: 1.9706
+[titan] 2025-09-09 17:19:27,446 - root - INFO - [34mlr: 6.9872e-06 gnorm: 0.45 [35m[1 day, 23:43:59<1 day, 1:37:54][39m
+[titan] 2025-09-09 17:19:59,479 - root - INFO - [31mstep: 26030 [32mloss: 2.7256 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.53 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7765 [37mglobal_avg_top_loss: 1.9492
+[titan] 2025-09-09 17:19:59,479 - root - INFO - [34mlr: 6.9840e-06 gnorm: 0.38 [35m[1 day, 23:44:31<1 day, 1:37:21][39m
+[titan] 2025-09-09 17:20:31,585 - root - INFO - [31mstep: 26035 [32mloss: 2.7881 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.43 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7991 [37mglobal_avg_top_loss: 1.9889
+[titan] 2025-09-09 17:20:31,586 - root - INFO - [34mlr: 6.9808e-06 gnorm: 0.41 [35m[1 day, 23:45:03<1 day, 1:36:47][39m
+[titan] 2025-09-09 17:21:03,561 - root - INFO - [31mstep: 26040 [32mloss: 2.7484 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.41 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7811 [37mglobal_avg_top_loss: 1.9673
+[titan] 2025-09-09 17:21:03,562 - root - INFO - [34mlr: 6.9776e-06 gnorm: 0.37 [35m[1 day, 23:45:35<1 day, 1:36:14][39m
+[titan] 2025-09-09 17:21:35,665 - root - INFO - [31mstep: 26045 [32mloss: 3.0163 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.46 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.9318 [37mglobal_avg_top_loss: 2.0845
+[titan] 2025-09-09 17:21:35,666 - root - INFO - [34mlr: 6.9744e-06 gnorm: 0.36 [35m[1 day, 23:46:07<1 day, 1:35:40][39m
+[titan] 2025-09-09 17:22:01,268 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:22:07,694 - root - INFO - [31mstep: 26050 [32mloss: 2.7443 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,231 [36mtflops: 487.62 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7822 [37mglobal_avg_top_loss: 1.9621
+[titan] 2025-09-09 17:22:07,694 - root - INFO - [34mlr: 6.9712e-06 gnorm: 0.36 [35m[1 day, 23:46:39<1 day, 1:35:07][39m
+[titan] 2025-09-09 17:22:39,719 - root - INFO - [31mstep: 26055 [32mloss: 2.7208 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.66 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7660 [37mglobal_avg_top_loss: 1.9549
+[titan] 2025-09-09 17:22:39,719 - root - INFO - [34mlr: 6.9680e-06 gnorm: 0.38 [35m[1 day, 23:47:11<1 day, 1:34:33][39m
+[titan] 2025-09-09 17:23:11,858 - root - INFO - [31mstep: 26060 [32mloss: 2.6359 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,196 [36mtflops: 485.93 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7269 [37mglobal_avg_top_loss: 1.9090
+[titan] 2025-09-09 17:23:11,859 - root - INFO - [34mlr: 6.9648e-06 gnorm: 0.50 [35m[1 day, 23:47:43<1 day, 1:34:00][39m
+[titan] 2025-09-09 17:23:43,902 - root - INFO - [31mstep: 26065 [32mloss: 2.7613 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.38 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7847 [37mglobal_avg_top_loss: 1.9766
+[titan] 2025-09-09 17:23:43,902 - root - INFO - [34mlr: 6.9616e-06 gnorm: 0.40 [35m[1 day, 23:48:15<1 day, 1:33:26][39m
+[titan] 2025-09-09 17:24:15,916 - root - INFO - [31mstep: 26070 [32mloss: 2.7550 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.83 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7811 [37mglobal_avg_top_loss: 1.9740
+[titan] 2025-09-09 17:24:15,916 - root - INFO - [34mlr: 6.9585e-06 gnorm: 0.38 [35m[1 day, 23:48:47<1 day, 1:32:52][39m
+[titan] 2025-09-09 17:24:47,927 - root - INFO - [31mstep: 26075 [32mloss: 3.1116 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.88 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.9942 [37mglobal_avg_top_loss: 2.1173
+[titan] 2025-09-09 17:24:47,927 - root - INFO - [34mlr: 6.9553e-06 gnorm: 0.38 [35m[1 day, 23:49:19<1 day, 1:32:19][39m
+[titan] 2025-09-09 17:25:20,262 - root - INFO - [31mstep: 26080 [32mloss: 2.7241 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,134 [36mtflops: 482.98 [35mmfu: 48.84%[39m [37mglobal_avg_ntp_loss: 0.7668 [37mglobal_avg_top_loss: 1.9573
+[titan] 2025-09-09 17:25:20,263 - root - INFO - [34mlr: 6.9521e-06 gnorm: 0.38 [35m[1 day, 23:49:51<1 day, 1:31:46][39m
+[titan] 2025-09-09 17:25:52,316 - root - INFO - [31mstep: 26085 [32mloss: 2.6729 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.23 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7504 [37mglobal_avg_top_loss: 1.9225
+[titan] 2025-09-09 17:25:52,316 - root - INFO - [34mlr: 6.9489e-06 gnorm: 0.37 [35m[1 day, 23:50:23<1 day, 1:31:12][39m
+[titan] 2025-09-09 17:26:24,324 - root - INFO - [31mstep: 26090 [32mloss: 2.7296 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.91 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7711 [37mglobal_avg_top_loss: 1.9585
+[titan] 2025-09-09 17:26:24,325 - root - INFO - [34mlr: 6.9457e-06 gnorm: 0.37 [35m[1 day, 23:50:55<1 day, 1:30:38][39m
+[titan] 2025-09-09 17:26:56,361 - root - INFO - [31mstep: 26095 [32mloss: 2.7303 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.49 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7752 [37mglobal_avg_top_loss: 1.9551
+[titan] 2025-09-09 17:26:56,361 - root - INFO - [34mlr: 6.9425e-06 gnorm: 0.37 [35m[1 day, 23:51:27<1 day, 1:30:05][39m
+[titan] 2025-09-09 17:27:22,101 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:27:28,468 - root - INFO - [31mstep: 26100 [32mloss: 2.7222 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.43 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7676 [37mglobal_avg_top_loss: 1.9546
+[titan] 2025-09-09 17:27:28,468 - root - INFO - [34mlr: 6.9393e-06 gnorm: 0.37 [35m[1 day, 23:52:00<1 day, 1:29:31][39m
+[titan] 2025-09-09 17:28:00,388 - root - INFO - [31mstep: 26105 [32mloss: 2.6318 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.27 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7282 [37mglobal_avg_top_loss: 1.9036
+[titan] 2025-09-09 17:28:00,388 - root - INFO - [34mlr: 6.9361e-06 gnorm: 0.35 [35m[1 day, 23:52:31<1 day, 1:28:58][39m
+[titan] 2025-09-09 17:28:32,441 - root - INFO - [31mstep: 26110 [32mloss: 2.9489 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.24 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.8720 [37mglobal_avg_top_loss: 2.0769
+[titan] 2025-09-09 17:28:32,441 - root - INFO - [34mlr: 6.9330e-06 gnorm: 0.38 [35m[1 day, 23:53:03<1 day, 1:28:24][39m
+[titan] 2025-09-09 17:28:45,506 - root - INFO - Dumping profiler traces at step 26112
+[titan] 2025-09-09 17:28:45,565 - root - INFO - Finished dumping profiler traces in 0.06 seconds
+[titan] 2025-09-09 17:29:04,741 - root - INFO - [31mstep: 26115 [32mloss: 2.6586 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,145 [36mtflops: 483.51 [35mmfu: 48.89%[39m [37mglobal_avg_ntp_loss: 0.7405 [37mglobal_avg_top_loss: 1.9181
+[titan] 2025-09-09 17:29:04,742 - root - INFO - [34mlr: 6.9298e-06 gnorm: 0.41 [35m[1 day, 23:53:36<1 day, 1:27:51][39m
+[titan] 2025-09-09 17:29:36,777 - root - INFO - [31mstep: 26120 [32mloss: 2.7465 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.50 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7785 [37mglobal_avg_top_loss: 1.9680
+[titan] 2025-09-09 17:29:36,777 - root - INFO - [34mlr: 6.9266e-06 gnorm: 0.40 [35m[1 day, 23:54:08<1 day, 1:27:17][39m
+[titan] 2025-09-09 17:30:08,933 - root - INFO - [31mstep: 26125 [32mloss: 3.2059 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,191 [36mtflops: 485.67 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 1.0409 [37mglobal_avg_top_loss: 2.1650
+[titan] 2025-09-09 17:30:08,933 - root - INFO - [34mlr: 6.9234e-06 gnorm: 0.39 [35m[1 day, 23:54:40<1 day, 1:26:44][39m
+[titan] 2025-09-09 17:30:41,084 - root - INFO - [31mstep: 26130 [32mloss: 2.7242 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,192 [36mtflops: 485.75 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7707 [37mglobal_avg_top_loss: 1.9535
+[titan] 2025-09-09 17:30:41,085 - root - INFO - [34mlr: 6.9202e-06 gnorm: 0.39 [35m[1 day, 23:55:12<1 day, 1:26:10][39m
+[titan] 2025-09-09 17:31:12,963 - root - INFO - [31mstep: 26135 [32mloss: 2.6939 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.90 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7583 [37mglobal_avg_top_loss: 1.9356
+[titan] 2025-09-09 17:31:12,963 - root - INFO - [34mlr: 6.9170e-06 gnorm: 0.37 [35m[1 day, 23:55:44<1 day, 1:25:37][39m
+[titan] 2025-09-09 17:31:44,751 - root - INFO - [31mstep: 26140 [32mloss: 2.7265 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,310 [36mtflops: 491.36 [35mmfu: 49.68%[39m [37mglobal_avg_ntp_loss: 0.7773 [37mglobal_avg_top_loss: 1.9492
+[titan] 2025-09-09 17:31:44,751 - root - INFO - [34mlr: 6.9139e-06 gnorm: 0.38 [35m[1 day, 23:56:16<1 day, 1:25:03][39m
+[titan] 2025-09-09 17:32:16,788 - root - INFO - [31mstep: 26145 [32mloss: 2.7751 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.48 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7924 [37mglobal_avg_top_loss: 1.9827
+[titan] 2025-09-09 17:32:16,789 - root - INFO - [34mlr: 6.9107e-06 gnorm: 0.36 [35m[1 day, 23:56:48<1 day, 1:24:30][39m
+[titan] 2025-09-09 17:32:42,415 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:32:48,788 - root - INFO - [31mstep: 26150 [32mloss: 2.7013 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.05 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7583 [37mglobal_avg_top_loss: 1.9429
+[titan] 2025-09-09 17:32:48,789 - root - INFO - [34mlr: 6.9075e-06 gnorm: 0.35 [35m[1 day, 23:57:20<1 day, 1:23:56][39m
+[titan] 2025-09-09 17:33:20,777 - root - INFO - [31mstep: 26155 [32mloss: 3.1825 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.21 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 1.0251 [37mglobal_avg_top_loss: 2.1575
+[titan] 2025-09-09 17:33:20,777 - root - INFO - [34mlr: 6.9043e-06 gnorm: 0.51 [35m[1 day, 23:57:52<1 day, 1:23:23][39m
+[titan] 2025-09-09 17:33:52,528 - root - INFO - [31mstep: 26160 [32mloss: 2.7225 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,321 [36mtflops: 491.87 [35mmfu: 49.73%[39m [37mglobal_avg_ntp_loss: 0.7712 [37mglobal_avg_top_loss: 1.9513
+[titan] 2025-09-09 17:33:52,529 - root - INFO - [34mlr: 6.9011e-06 gnorm: 0.37 [35m[1 day, 23:58:24<1 day, 1:22:49][39m
+[titan] 2025-09-09 17:34:24,606 - root - INFO - [31mstep: 26165 [32mloss: 2.6600 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.86 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7450 [37mglobal_avg_top_loss: 1.9150
+[titan] 2025-09-09 17:34:24,607 - root - INFO - [34mlr: 6.8980e-06 gnorm: 0.37 [35m[1 day, 23:58:56<1 day, 1:22:15][39m
+[titan] 2025-09-09 17:34:56,581 - root - INFO - [31mstep: 26170 [32mloss: 2.6813 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.43 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7488 [37mglobal_avg_top_loss: 1.9324
+[titan] 2025-09-09 17:34:56,581 - root - INFO - [34mlr: 6.8948e-06 gnorm: 0.38 [35m[1 day, 23:59:28<1 day, 1:21:42][39m
+[titan] 2025-09-09 17:35:28,771 - root - INFO - [31mstep: 26175 [32mloss: 2.7274 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,180 [36mtflops: 485.17 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7659 [37mglobal_avg_top_loss: 1.9615
+[titan] 2025-09-09 17:35:28,771 - root - INFO - [34mlr: 6.8916e-06 gnorm: 0.37 [35m[2 days, 0:00:00<1 day, 1:21:08][39m
+[titan] 2025-09-09 17:36:00,718 - root - INFO - [31mstep: 26180 [32mloss: 2.6162 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.84 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7187 [37mglobal_avg_top_loss: 1.8974
+[titan] 2025-09-09 17:36:00,719 - root - INFO - [34mlr: 6.8884e-06 gnorm: 0.37 [35m[2 days, 0:00:32<1 day, 1:20:35][39m
+[titan] 2025-09-09 17:36:32,747 - root - INFO - [31mstep: 26185 [32mloss: 2.7548 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,231 [36mtflops: 487.60 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7824 [37mglobal_avg_top_loss: 1.9723
+[titan] 2025-09-09 17:36:32,748 - root - INFO - [34mlr: 6.8853e-06 gnorm: 0.42 [35m[2 days, 0:01:04<1 day, 1:20:01][39m
+[titan] 2025-09-09 17:37:04,588 - root - INFO - [31mstep: 26190 [32mloss: 2.9840 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.49 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.8906 [37mglobal_avg_top_loss: 2.0935
+[titan] 2025-09-09 17:37:04,588 - root - INFO - [34mlr: 6.8821e-06 gnorm: 0.46 [35m[2 days, 0:01:36<1 day, 1:19:28][39m
+[titan] 2025-09-09 17:37:36,421 - root - INFO - [31mstep: 26195 [32mloss: 2.7330 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.60 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7713 [37mglobal_avg_top_loss: 1.9616
+[titan] 2025-09-09 17:37:36,422 - root - INFO - [34mlr: 6.8789e-06 gnorm: 0.37 [35m[2 days, 0:02:07<1 day, 1:18:54][39m
+[titan] 2025-09-09 17:38:01,898 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:38:08,364 - root - INFO - [31mstep: 26200 [32mloss: 2.8158 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.92 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8180 [37mglobal_avg_top_loss: 1.9978
+[titan] 2025-09-09 17:38:08,365 - root - INFO - [34mlr: 6.8757e-06 gnorm: 0.46 [35m[2 days, 0:02:39<1 day, 1:18:21][39m
+[titan] 2025-09-09 17:38:40,478 - root - INFO - [31mstep: 26205 [32mloss: 3.2506 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,204 [36mtflops: 486.32 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 1.0605 [37mglobal_avg_top_loss: 2.1902
+[titan] 2025-09-09 17:38:40,478 - root - INFO - [34mlr: 6.8726e-06 gnorm: 0.37 [35m[2 days, 0:03:12<1 day, 1:17:47][39m
+[titan] 2025-09-09 17:39:12,409 - root - INFO - [31mstep: 26210 [32mloss: 2.7501 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.09 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7823 [37mglobal_avg_top_loss: 1.9677
+[titan] 2025-09-09 17:39:12,409 - root - INFO - [34mlr: 6.8694e-06 gnorm: 0.40 [35m[2 days, 0:03:43<1 day, 1:17:13][39m
+[titan] 2025-09-09 17:39:44,557 - root - INFO - [31mstep: 26215 [32mloss: 2.7479 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,193 [36mtflops: 485.80 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7814 [37mglobal_avg_top_loss: 1.9665
+[titan] 2025-09-09 17:39:44,557 - root - INFO - [34mlr: 6.8662e-06 gnorm: 0.36 [35m[2 days, 0:04:16<1 day, 1:16:40][39m
+[titan] 2025-09-09 17:40:16,476 - root - INFO - [31mstep: 26220 [32mloss: 2.7353 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.28 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7754 [37mglobal_avg_top_loss: 1.9600
+[titan] 2025-09-09 17:40:16,476 - root - INFO - [34mlr: 6.8631e-06 gnorm: 0.41 [35m[2 days, 0:04:47<1 day, 1:16:06][39m
+[titan] 2025-09-09 17:40:48,348 - root - INFO - [31mstep: 26225 [32mloss: 2.6104 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,281 [36mtflops: 490.01 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7270 [37mglobal_avg_top_loss: 1.8834
+[titan] 2025-09-09 17:40:48,348 - root - INFO - [34mlr: 6.8599e-06 gnorm: 0.40 [35m[2 days, 0:05:19<1 day, 1:15:33][39m
+[titan] 2025-09-09 17:41:20,515 - root - INFO - [31mstep: 26230 [32mloss: 2.7211 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,187 [36mtflops: 485.52 [35mmfu: 49.09%[39m [37mglobal_avg_ntp_loss: 0.7662 [37mglobal_avg_top_loss: 1.9548
+[titan] 2025-09-09 17:41:20,515 - root - INFO - [34mlr: 6.8567e-06 gnorm: 0.37 [35m[2 days, 0:05:52<1 day, 1:14:59][39m
+[titan] 2025-09-09 17:41:52,524 - root - INFO - [31mstep: 26235 [32mloss: 3.1629 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.90 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 1.0223 [37mglobal_avg_top_loss: 2.1405
+[titan] 2025-09-09 17:41:52,525 - root - INFO - [34mlr: 6.8535e-06 gnorm: 0.39 [35m[2 days, 0:06:24<1 day, 1:14:26][39m
+[titan] 2025-09-09 17:42:24,723 - root - INFO - [31mstep: 26240 [32mloss: 2.6700 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,177 [36mtflops: 485.03 [35mmfu: 49.04%[39m [37mglobal_avg_ntp_loss: 0.7476 [37mglobal_avg_top_loss: 1.9224
+[titan] 2025-09-09 17:42:24,724 - root - INFO - [34mlr: 6.8504e-06 gnorm: 0.36 [35m[2 days, 0:06:56<1 day, 1:13:52][39m
+[titan] 2025-09-09 17:42:56,549 - root - INFO - [31mstep: 26245 [32mloss: 2.8124 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.72 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.8050 [37mglobal_avg_top_loss: 2.0074
+[titan] 2025-09-09 17:42:56,549 - root - INFO - [34mlr: 6.8472e-06 gnorm: 0.45 [35m[2 days, 0:07:28<1 day, 1:13:19][39m
+[titan] 2025-09-09 17:43:22,086 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:43:28,510 - root - INFO - [31mstep: 26250 [32mloss: 2.7200 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.64 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7670 [37mglobal_avg_top_loss: 1.9529
+[titan] 2025-09-09 17:43:28,510 - root - INFO - [34mlr: 6.8440e-06 gnorm: 0.39 [35m[2 days, 0:08:00<1 day, 1:12:45][39m
+[titan] 2025-09-09 17:44:00,493 - root - INFO - [31mstep: 26255 [32mloss: 2.7502 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.30 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7798 [37mglobal_avg_top_loss: 1.9704
+[titan] 2025-09-09 17:44:00,494 - root - INFO - [34mlr: 6.8409e-06 gnorm: 0.40 [35m[2 days, 0:08:32<1 day, 1:12:12][39m
+[titan] 2025-09-09 17:44:32,454 - root - INFO - [31mstep: 26260 [32mloss: 2.7866 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.64 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.8035 [37mglobal_avg_top_loss: 1.9832
+[titan] 2025-09-09 17:44:32,455 - root - INFO - [34mlr: 6.8377e-06 gnorm: 0.38 [35m[2 days, 0:09:03<1 day, 1:11:38][39m
+[titan] 2025-09-09 17:45:04,432 - root - INFO - [31mstep: 26265 [32mloss: 2.7524 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.39 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7777 [37mglobal_avg_top_loss: 1.9748
+[titan] 2025-09-09 17:45:04,432 - root - INFO - [34mlr: 6.8345e-06 gnorm: 0.44 [35m[2 days, 0:09:35<1 day, 1:11:05][39m
+[titan] 2025-09-09 17:45:36,550 - root - INFO - [31mstep: 26270 [32mloss: 3.0611 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,203 [36mtflops: 486.26 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.9269 [37mglobal_avg_top_loss: 2.1343
+[titan] 2025-09-09 17:45:36,550 - root - INFO - [34mlr: 6.8314e-06 gnorm: 0.47 [35m[2 days, 0:10:08<1 day, 1:10:31][39m
+[titan] 2025-09-09 17:46:08,719 - root - INFO - [31mstep: 26275 [32mloss: 2.7708 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,186 [36mtflops: 485.47 [35mmfu: 49.09%[39m [37mglobal_avg_ntp_loss: 0.7906 [37mglobal_avg_top_loss: 1.9801
+[titan] 2025-09-09 17:46:08,719 - root - INFO - [34mlr: 6.8282e-06 gnorm: 0.44 [35m[2 days, 0:10:40<1 day, 1:09:58][39m
+[titan] 2025-09-09 17:46:40,775 - root - INFO - [31mstep: 26280 [32mloss: 2.7453 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,222 [36mtflops: 487.20 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7787 [37mglobal_avg_top_loss: 1.9666
+[titan] 2025-09-09 17:46:40,775 - root - INFO - [34mlr: 6.8251e-06 gnorm: 0.38 [35m[2 days, 0:11:12<1 day, 1:09:24][39m
+[titan] 2025-09-09 17:47:12,851 - root - INFO - [31mstep: 26285 [32mloss: 3.2434 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.88 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 1.0574 [37mglobal_avg_top_loss: 2.1860
+[titan] 2025-09-09 17:47:12,851 - root - INFO - [34mlr: 6.8219e-06 gnorm: 0.36 [35m[2 days, 0:11:44<1 day, 1:08:51][39m
+[titan] 2025-09-09 17:47:44,915 - root - INFO - [31mstep: 26290 [32mloss: 2.6759 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,220 [36mtflops: 487.06 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7502 [37mglobal_avg_top_loss: 1.9257
+[titan] 2025-09-09 17:47:44,916 - root - INFO - [34mlr: 6.8187e-06 gnorm: 0.37 [35m[2 days, 0:12:16<1 day, 1:08:17][39m
+[titan] 2025-09-09 17:48:16,760 - root - INFO - [31mstep: 26295 [32mloss: 2.7558 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.43 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7835 [37mglobal_avg_top_loss: 1.9723
+[titan] 2025-09-09 17:48:16,760 - root - INFO - [34mlr: 6.8156e-06 gnorm: 0.37 [35m[2 days, 0:12:48<1 day, 1:07:44][39m
+[titan] 2025-09-09 17:48:42,599 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:48:48,994 - root - INFO - [31mstep: 26300 [32mloss: 2.6701 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,166 [36mtflops: 484.50 [35mmfu: 48.99%[39m [37mglobal_avg_ntp_loss: 0.7485 [37mglobal_avg_top_loss: 1.9216
+[titan] 2025-09-09 17:48:48,994 - root - INFO - [34mlr: 6.8124e-06 gnorm: 0.39 [35m[2 days, 0:13:20<1 day, 1:07:10][39m
+[titan] 2025-09-09 17:49:21,003 - root - INFO - [31mstep: 26305 [32mloss: 2.6700 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.91 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7552 [37mglobal_avg_top_loss: 1.9149
+[titan] 2025-09-09 17:49:21,003 - root - INFO - [34mlr: 6.8093e-06 gnorm: 0.36 [35m[2 days, 0:13:52<1 day, 1:06:37][39m
+[titan] 2025-09-09 17:49:53,125 - root - INFO - [31mstep: 26310 [32mloss: 2.7345 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,201 [36mtflops: 486.19 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.7737 [37mglobal_avg_top_loss: 1.9608
+[titan] 2025-09-09 17:49:53,126 - root - INFO - [34mlr: 6.8061e-06 gnorm: 0.38 [35m[2 days, 0:14:24<1 day, 1:06:03][39m
+[titan] 2025-09-09 17:50:25,099 - root - INFO - [31mstep: 26315 [32mloss: 2.5116 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.45 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.6761 [37mglobal_avg_top_loss: 1.8355
+[titan] 2025-09-09 17:50:25,099 - root - INFO - [34mlr: 6.8029e-06 gnorm: 0.36 [35m[2 days, 0:14:56<1 day, 1:05:30][39m
+[titan] 2025-09-09 17:50:57,141 - root - INFO - [31mstep: 26320 [32mloss: 2.6805 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.40 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7471 [37mglobal_avg_top_loss: 1.9334
+[titan] 2025-09-09 17:50:57,141 - root - INFO - [34mlr: 6.7998e-06 gnorm: 0.36 [35m[2 days, 0:15:28<1 day, 1:04:56][39m
+[titan] 2025-09-09 17:51:28,971 - root - INFO - [31mstep: 26325 [32mloss: 2.7257 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.65 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7710 [37mglobal_avg_top_loss: 1.9547
+[titan] 2025-09-09 17:51:28,972 - root - INFO - [34mlr: 6.7966e-06 gnorm: 0.37 [35m[2 days, 0:16:00<1 day, 1:04:23][39m
+[titan] 2025-09-09 17:52:01,087 - root - INFO - [31mstep: 26330 [32mloss: 2.8199 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,203 [36mtflops: 486.28 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.8124 [37mglobal_avg_top_loss: 2.0075
+[titan] 2025-09-09 17:52:01,088 - root - INFO - [34mlr: 6.7935e-06 gnorm: 0.38 [35m[2 days, 0:16:32<1 day, 1:03:49][39m
+[titan] 2025-09-09 17:52:33,236 - root - INFO - [31mstep: 26335 [32mloss: 2.7084 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,193 [36mtflops: 485.79 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7606 [37mglobal_avg_top_loss: 1.9478
+[titan] 2025-09-09 17:52:33,236 - root - INFO - [34mlr: 6.7903e-06 gnorm: 0.36 [35m[2 days, 0:17:04<1 day, 1:03:16][39m
+[titan] 2025-09-09 17:53:05,068 - root - INFO - [31mstep: 26340 [32mloss: 2.7015 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.62 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7610 [37mglobal_avg_top_loss: 1.9405
+[titan] 2025-09-09 17:53:05,068 - root - INFO - [34mlr: 6.7872e-06 gnorm: 0.36 [35m[2 days, 0:17:36<1 day, 1:02:42][39m
+[titan] 2025-09-09 17:53:37,161 - root - INFO - [31mstep: 26345 [32mloss: 2.6775 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.63 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7480 [37mglobal_avg_top_loss: 1.9296
+[titan] 2025-09-09 17:53:37,161 - root - INFO - [34mlr: 6.7840e-06 gnorm: 0.37 [35m[2 days, 0:18:08<1 day, 1:02:09][39m
+[titan] 2025-09-09 17:54:02,816 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:54:09,188 - root - INFO - [31mstep: 26350 [32mloss: 2.7072 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.63 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7623 [37mglobal_avg_top_loss: 1.9449
+[titan] 2025-09-09 17:54:09,188 - root - INFO - [34mlr: 6.7808e-06 gnorm: 0.37 [35m[2 days, 0:18:40<1 day, 1:01:35][39m
+[titan] 2025-09-09 17:54:41,133 - root - INFO - [31mstep: 26355 [32mloss: 2.7382 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.88 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7775 [37mglobal_avg_top_loss: 1.9607
+[titan] 2025-09-09 17:54:41,134 - root - INFO - [34mlr: 6.7777e-06 gnorm: 0.37 [35m[2 days, 0:19:12<1 day, 1:01:01][39m
+[titan] 2025-09-09 17:55:13,224 - root - INFO - [31mstep: 26360 [32mloss: 2.7167 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.67 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7659 [37mglobal_avg_top_loss: 1.9509
+[titan] 2025-09-09 17:55:13,224 - root - INFO - [34mlr: 6.7745e-06 gnorm: 0.39 [35m[2 days, 0:19:44<1 day, 1:00:28][39m
+[titan] 2025-09-09 17:55:45,334 - root - INFO - [31mstep: 26365 [32mloss: 3.2179 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.37 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 1.0477 [37mglobal_avg_top_loss: 2.1702
+[titan] 2025-09-09 17:55:45,334 - root - INFO - [34mlr: 6.7714e-06 gnorm: 0.36 [35m[2 days, 0:20:16<1 day, 0:59:55][39m
+[titan] 2025-09-09 17:56:17,293 - root - INFO - [31mstep: 26370 [32mloss: 2.7177 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.68 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7681 [37mglobal_avg_top_loss: 1.9496
+[titan] 2025-09-09 17:56:17,293 - root - INFO - [34mlr: 6.7682e-06 gnorm: 0.37 [35m[2 days, 0:20:48<1 day, 0:59:21][39m
+[titan] 2025-09-09 17:56:49,398 - root - INFO - [31mstep: 26375 [32mloss: 2.6157 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.44 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7332 [37mglobal_avg_top_loss: 1.8825
+[titan] 2025-09-09 17:56:49,398 - root - INFO - [34mlr: 6.7651e-06 gnorm: 0.38 [35m[2 days, 0:21:20<1 day, 0:58:48][39m
+[titan] 2025-09-09 17:57:21,626 - root - INFO - [31mstep: 26380 [32mloss: 2.6597 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,168 [36mtflops: 484.60 [35mmfu: 49.00%[39m [37mglobal_avg_ntp_loss: 0.7400 [37mglobal_avg_top_loss: 1.9197
+[titan] 2025-09-09 17:57:21,626 - root - INFO - [34mlr: 6.7619e-06 gnorm: 0.36 [35m[2 days, 0:21:53<1 day, 0:58:14][39m
+[titan] 2025-09-09 17:57:53,624 - root - INFO - [31mstep: 26385 [32mloss: 2.6869 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.07 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7589 [37mglobal_avg_top_loss: 1.9280
+[titan] 2025-09-09 17:57:53,625 - root - INFO - [34mlr: 6.7588e-06 gnorm: 0.37 [35m[2 days, 0:22:25<1 day, 0:57:41][39m
+[titan] 2025-09-09 17:58:25,650 - root - INFO - [31mstep: 26390 [32mloss: 2.8947 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.65 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.8609 [37mglobal_avg_top_loss: 2.0338
+[titan] 2025-09-09 17:58:25,651 - root - INFO - [34mlr: 6.7556e-06 gnorm: 0.36 [35m[2 days, 0:22:57<1 day, 0:57:07][39m
+[titan] 2025-09-09 17:58:57,689 - root - INFO - [31mstep: 26395 [32mloss: 2.5015 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.45 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.6745 [37mglobal_avg_top_loss: 1.8270
+[titan] 2025-09-09 17:58:57,690 - root - INFO - [34mlr: 6.7525e-06 gnorm: 0.36 [35m[2 days, 0:23:29<1 day, 0:56:34][39m
+[titan] 2025-09-09 17:59:23,318 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 17:59:29,796 - root - INFO - [31mstep: 26400 [32mloss: 2.7635 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.42 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7863 [37mglobal_avg_top_loss: 1.9771
+[titan] 2025-09-09 17:59:29,797 - root - INFO - [34mlr: 6.7493e-06 gnorm: 0.38 [35m[2 days, 0:24:01<1 day, 0:56:00][39m
+[titan] 2025-09-09 18:00:01,816 - root - INFO - [31mstep: 26405 [32mloss: 2.6588 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.74 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7360 [37mglobal_avg_top_loss: 1.9228
+[titan] 2025-09-09 18:00:01,817 - root - INFO - [34mlr: 6.7462e-06 gnorm: 0.37 [35m[2 days, 0:24:33<1 day, 0:55:27][39m
+[titan] 2025-09-09 18:00:34,083 - root - INFO - [31mstep: 26410 [32mloss: 2.7292 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,156 [36mtflops: 484.02 [35mmfu: 48.94%[39m [37mglobal_avg_ntp_loss: 0.7729 [37mglobal_avg_top_loss: 1.9563
+[titan] 2025-09-09 18:00:34,083 - root - INFO - [34mlr: 6.7431e-06 gnorm: 0.43 [35m[2 days, 0:25:05<1 day, 0:54:53][39m
+[titan] 2025-09-09 18:01:06,220 - root - INFO - [31mstep: 26415 [32mloss: 2.7316 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,197 [36mtflops: 485.96 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.7731 [37mglobal_avg_top_loss: 1.9586
+[titan] 2025-09-09 18:01:06,220 - root - INFO - [34mlr: 6.7399e-06 gnorm: 0.36 [35m[2 days, 0:25:37<1 day, 0:54:20][39m
+[titan] 2025-09-09 18:01:38,095 - root - INFO - [31mstep: 26420 [32mloss: 2.7657 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.96 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7806 [37mglobal_avg_top_loss: 1.9851
+[titan] 2025-09-09 18:01:38,095 - root - INFO - [34mlr: 6.7368e-06 gnorm: 0.39 [35m[2 days, 0:26:09<1 day, 0:53:46][39m
+[titan] 2025-09-09 18:02:10,202 - root - INFO - [31mstep: 26425 [32mloss: 2.7568 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.42 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7904 [37mglobal_avg_top_loss: 1.9664
+[titan] 2025-09-09 18:02:10,202 - root - INFO - [34mlr: 6.7336e-06 gnorm: 0.37 [35m[2 days, 0:26:41<1 day, 0:53:13][39m
+[titan] 2025-09-09 18:02:42,082 - root - INFO - [31mstep: 26430 [32mloss: 2.8021 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.88 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7979 [37mglobal_avg_top_loss: 2.0042
+[titan] 2025-09-09 18:02:42,082 - root - INFO - [34mlr: 6.7305e-06 gnorm: 1.15 [35m[2 days, 0:27:13<1 day, 0:52:39][39m
+[titan] 2025-09-09 18:03:13,873 - root - INFO - [31mstep: 26435 [32mloss: 2.8537 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,308 [36mtflops: 491.25 [35mmfu: 49.67%[39m [37mglobal_avg_ntp_loss: 0.8392 [37mglobal_avg_top_loss: 2.0145
+[titan] 2025-09-09 18:03:13,873 - root - INFO - [34mlr: 6.7273e-06 gnorm: 0.38 [35m[2 days, 0:27:45<1 day, 0:52:06][39m
+[titan] 2025-09-09 18:03:45,896 - root - INFO - [31mstep: 26440 [32mloss: 2.6853 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.70 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7534 [37mglobal_avg_top_loss: 1.9319
+[titan] 2025-09-09 18:03:45,896 - root - INFO - [34mlr: 6.7242e-06 gnorm: 0.38 [35m[2 days, 0:28:17<1 day, 0:51:32][39m
+[titan] 2025-09-09 18:04:17,896 - root - INFO - [31mstep: 26445 [32mloss: 2.7965 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.03 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.8035 [37mglobal_avg_top_loss: 1.9930
+[titan] 2025-09-09 18:04:17,897 - root - INFO - [34mlr: 6.7211e-06 gnorm: 0.37 [35m[2 days, 0:28:49<1 day, 0:50:59][39m
+[titan] 2025-09-09 18:04:43,414 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:04:49,848 - root - INFO - [31mstep: 26450 [32mloss: 2.7367 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.79 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7727 [37mglobal_avg_top_loss: 1.9640
+[titan] 2025-09-09 18:04:49,848 - root - INFO - [34mlr: 6.7179e-06 gnorm: 0.36 [35m[2 days, 0:29:21<1 day, 0:50:25][39m
+[titan] 2025-09-09 18:05:21,850 - root - INFO - [31mstep: 26455 [32mloss: 2.6576 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.02 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7378 [37mglobal_avg_top_loss: 1.9199
+[titan] 2025-09-09 18:05:21,850 - root - INFO - [34mlr: 6.7148e-06 gnorm: 0.37 [35m[2 days, 0:29:53<1 day, 0:49:52][39m
+[titan] 2025-09-09 18:05:53,703 - root - INFO - [31mstep: 26460 [32mloss: 2.7638 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.29 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7850 [37mglobal_avg_top_loss: 1.9788
+[titan] 2025-09-09 18:05:53,704 - root - INFO - [34mlr: 6.7116e-06 gnorm: 0.79 [35m[2 days, 0:30:25<1 day, 0:49:18][39m
+[titan] 2025-09-09 18:06:25,662 - root - INFO - [31mstep: 26465 [32mloss: 2.7646 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.67 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7900 [37mglobal_avg_top_loss: 1.9745
+[titan] 2025-09-09 18:06:25,662 - root - INFO - [34mlr: 6.7085e-06 gnorm: 0.37 [35m[2 days, 0:30:57<1 day, 0:48:44][39m
+[titan] 2025-09-09 18:06:57,520 - root - INFO - [31mstep: 26470 [32mloss: 2.6511 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.22 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7346 [37mglobal_avg_top_loss: 1.9166
+[titan] 2025-09-09 18:06:57,520 - root - INFO - [34mlr: 6.7054e-06 gnorm: 0.36 [35m[2 days, 0:31:28<1 day, 0:48:11][39m
+[titan] 2025-09-09 18:07:29,449 - root - INFO - [31mstep: 26475 [32mloss: 2.7796 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.13 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7954 [37mglobal_avg_top_loss: 1.9842
+[titan] 2025-09-09 18:07:29,450 - root - INFO - [34mlr: 6.7022e-06 gnorm: 0.39 [35m[2 days, 0:32:00<1 day, 0:47:37][39m
+[titan] 2025-09-09 18:08:01,415 - root - INFO - [31mstep: 26480 [32mloss: 2.6602 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.56 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7415 [37mglobal_avg_top_loss: 1.9188
+[titan] 2025-09-09 18:08:01,416 - root - INFO - [34mlr: 6.6991e-06 gnorm: 0.40 [35m[2 days, 0:32:32<1 day, 0:47:04][39m
+[titan] 2025-09-09 18:08:33,352 - root - INFO - [31mstep: 26485 [32mloss: 2.7112 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.01 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7618 [37mglobal_avg_top_loss: 1.9494
+[titan] 2025-09-09 18:08:33,353 - root - INFO - [34mlr: 6.6959e-06 gnorm: 0.39 [35m[2 days, 0:33:04<1 day, 0:46:30][39m
+[titan] 2025-09-09 18:09:05,385 - root - INFO - [31mstep: 26490 [32mloss: 2.7449 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.55 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7773 [37mglobal_avg_top_loss: 1.9675
+[titan] 2025-09-09 18:09:05,385 - root - INFO - [34mlr: 6.6928e-06 gnorm: 0.36 [35m[2 days, 0:33:36<1 day, 0:45:57][39m
+[titan] 2025-09-09 18:09:37,198 - root - INFO - [31mstep: 26495 [32mloss: 2.7375 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.91 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7756 [37mglobal_avg_top_loss: 1.9619
+[titan] 2025-09-09 18:09:37,199 - root - INFO - [34mlr: 6.6897e-06 gnorm: 0.36 [35m[2 days, 0:34:08<1 day, 0:45:23][39m
+[titan] 2025-09-09 18:10:02,774 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:10:09,221 - root - INFO - [31mstep: 26500 [32mloss: 2.7651 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.70 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7881 [37mglobal_avg_top_loss: 1.9770
+[titan] 2025-09-09 18:10:09,221 - root - INFO - [34mlr: 6.6865e-06 gnorm: 0.37 [35m[2 days, 0:34:40<1 day, 0:44:50][39m
+[titan] 2025-09-09 18:10:41,146 - root - INFO - [31mstep: 26505 [32mloss: 2.7141 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.19 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7685 [37mglobal_avg_top_loss: 1.9456
+[titan] 2025-09-09 18:10:41,147 - root - INFO - [34mlr: 6.6834e-06 gnorm: 0.39 [35m[2 days, 0:35:12<1 day, 0:44:16][39m
+[titan] 2025-09-09 18:11:13,150 - root - INFO - [31mstep: 26510 [32mloss: 2.7420 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.99 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7779 [37mglobal_avg_top_loss: 1.9641
+[titan] 2025-09-09 18:11:13,151 - root - INFO - [34mlr: 6.6803e-06 gnorm: 0.37 [35m[2 days, 0:35:44<1 day, 0:43:43][39m
+[titan] 2025-09-09 18:11:45,192 - root - INFO - [31mstep: 26515 [32mloss: 3.1672 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.41 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 1.0187 [37mglobal_avg_top_loss: 2.1484
+[titan] 2025-09-09 18:11:45,192 - root - INFO - [34mlr: 6.6771e-06 gnorm: 0.41 [35m[2 days, 0:36:16<1 day, 0:43:09][39m
+[titan] 2025-09-09 18:12:17,093 - root - INFO - [31mstep: 26520 [32mloss: 2.6628 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.56 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7401 [37mglobal_avg_top_loss: 1.9227
+[titan] 2025-09-09 18:12:17,093 - root - INFO - [34mlr: 6.6740e-06 gnorm: 0.37 [35m[2 days, 0:36:48<1 day, 0:42:36][39m
+[titan] 2025-09-09 18:12:49,050 - root - INFO - [31mstep: 26525 [32mloss: 2.6877 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.70 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7521 [37mglobal_avg_top_loss: 1.9356
+[titan] 2025-09-09 18:12:49,050 - root - INFO - [34mlr: 6.6709e-06 gnorm: 0.39 [35m[2 days, 0:37:20<1 day, 0:42:02][39m
+[titan] 2025-09-09 18:13:20,965 - root - INFO - [31mstep: 26530 [32mloss: 2.7931 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.34 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.8094 [37mglobal_avg_top_loss: 1.9837
+[titan] 2025-09-09 18:13:20,965 - root - INFO - [34mlr: 6.6678e-06 gnorm: 0.38 [35m[2 days, 0:37:52<1 day, 0:41:29][39m
+[titan] 2025-09-09 18:13:52,774 - root - INFO - [31mstep: 26535 [32mloss: 2.5594 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,302 [36mtflops: 490.97 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.6996 [37mglobal_avg_top_loss: 1.8598
+[titan] 2025-09-09 18:13:52,775 - root - INFO - [34mlr: 6.6646e-06 gnorm: 0.39 [35m[2 days, 0:38:24<1 day, 0:40:55][39m
+[titan] 2025-09-09 18:14:24,706 - root - INFO - [31mstep: 26540 [32mloss: 2.7469 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.09 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7782 [37mglobal_avg_top_loss: 1.9688
+[titan] 2025-09-09 18:14:24,707 - root - INFO - [34mlr: 6.6615e-06 gnorm: 0.36 [35m[2 days, 0:38:56<1 day, 0:40:21][39m
+[titan] 2025-09-09 18:14:56,756 - root - INFO - [31mstep: 26545 [32mloss: 2.7816 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.28 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7984 [37mglobal_avg_top_loss: 1.9832
+[titan] 2025-09-09 18:14:56,757 - root - INFO - [34mlr: 6.6584e-06 gnorm: 0.36 [35m[2 days, 0:39:28<1 day, 0:39:48][39m
+[titan] 2025-09-09 18:15:22,337 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:15:28,768 - root - INFO - [31mstep: 26550 [32mloss: 2.7103 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.87 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7605 [37mglobal_avg_top_loss: 1.9497
+[titan] 2025-09-09 18:15:28,768 - root - INFO - [34mlr: 6.6552e-06 gnorm: 0.39 [35m[2 days, 0:40:00<1 day, 0:39:14][39m
+[titan] 2025-09-09 18:16:00,543 - root - INFO - [31mstep: 26555 [32mloss: 2.7149 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,313 [36mtflops: 491.51 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.7613 [37mglobal_avg_top_loss: 1.9536
+[titan] 2025-09-09 18:16:00,543 - root - INFO - [34mlr: 6.6521e-06 gnorm: 0.37 [35m[2 days, 0:40:31<1 day, 0:38:41][39m
+[titan] 2025-09-09 18:16:32,527 - root - INFO - [31mstep: 26560 [32mloss: 2.6688 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,245 [36mtflops: 488.28 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7442 [37mglobal_avg_top_loss: 1.9246
+[titan] 2025-09-09 18:16:32,528 - root - INFO - [34mlr: 6.6490e-06 gnorm: 0.35 [35m[2 days, 0:41:03<1 day, 0:38:07][39m
+[titan] 2025-09-09 18:17:04,428 - root - INFO - [31mstep: 26565 [32mloss: 3.0258 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.56 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.9468 [37mglobal_avg_top_loss: 2.0790
+[titan] 2025-09-09 18:17:04,429 - root - INFO - [34mlr: 6.6459e-06 gnorm: 0.36 [35m[2 days, 0:41:35<1 day, 0:37:34][39m
+[titan] 2025-09-09 18:17:36,436 - root - INFO - [31mstep: 26570 [32mloss: 2.7843 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.93 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.8151 [37mglobal_avg_top_loss: 1.9691
+[titan] 2025-09-09 18:17:36,436 - root - INFO - [34mlr: 6.6427e-06 gnorm: 0.52 [35m[2 days, 0:42:07<1 day, 0:37:00][39m
+[titan] 2025-09-09 18:18:08,371 - root - INFO - [31mstep: 26575 [32mloss: 2.7543 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.03 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7811 [37mglobal_avg_top_loss: 1.9732
+[titan] 2025-09-09 18:18:08,371 - root - INFO - [34mlr: 6.6396e-06 gnorm: 0.36 [35m[2 days, 0:42:39<1 day, 0:36:27][39m
+[titan] 2025-09-09 18:18:40,351 - root - INFO - [31mstep: 26580 [32mloss: 2.5754 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.35 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7003 [37mglobal_avg_top_loss: 1.8751
+[titan] 2025-09-09 18:18:40,351 - root - INFO - [34mlr: 6.6365e-06 gnorm: 0.38 [35m[2 days, 0:43:11<1 day, 0:35:53][39m
+[titan] 2025-09-09 18:19:12,027 - root - INFO - [31mstep: 26585 [32mloss: 2.7384 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,345 [36mtflops: 493.04 [35mmfu: 49.85%[39m [37mglobal_avg_ntp_loss: 0.7740 [37mglobal_avg_top_loss: 1.9644
+[titan] 2025-09-09 18:19:12,027 - root - INFO - [34mlr: 6.6334e-06 gnorm: 0.37 [35m[2 days, 0:43:43<1 day, 0:35:20][39m
+[titan] 2025-09-09 18:19:43,955 - root - INFO - [31mstep: 26590 [32mloss: 2.6997 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.14 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7598 [37mglobal_avg_top_loss: 1.9399
+[titan] 2025-09-09 18:19:43,955 - root - INFO - [34mlr: 6.6303e-06 gnorm: 0.37 [35m[2 days, 0:44:15<1 day, 0:34:46][39m
+[titan] 2025-09-09 18:20:15,942 - root - INFO - [31mstep: 26595 [32mloss: 2.7426 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,245 [36mtflops: 488.25 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7755 [37mglobal_avg_top_loss: 1.9671
+[titan] 2025-09-09 18:20:15,942 - root - INFO - [34mlr: 6.6271e-06 gnorm: 0.38 [35m[2 days, 0:44:47<1 day, 0:34:13][39m
+[titan] 2025-09-09 18:20:41,522 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:20:47,897 - root - INFO - [31mstep: 26600 [32mloss: 2.7555 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.73 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7882 [37mglobal_avg_top_loss: 1.9673
+[titan] 2025-09-09 18:20:47,897 - root - INFO - [34mlr: 6.6240e-06 gnorm: 0.37 [35m[2 days, 0:45:19<1 day, 0:33:39][39m
+[titan] 2025-09-09 18:21:19,953 - root - INFO - [31mstep: 26605 [32mloss: 2.7358 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,222 [36mtflops: 487.19 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7759 [37mglobal_avg_top_loss: 1.9599
+[titan] 2025-09-09 18:21:19,954 - root - INFO - [34mlr: 6.6209e-06 gnorm: 0.36 [35m[2 days, 0:45:51<1 day, 0:33:06][39m
+[titan] 2025-09-09 18:21:51,968 - root - INFO - [31mstep: 26610 [32mloss: 2.7057 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.82 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7625 [37mglobal_avg_top_loss: 1.9432
+[titan] 2025-09-09 18:21:51,969 - root - INFO - [34mlr: 6.6178e-06 gnorm: 0.36 [35m[2 days, 0:46:23<1 day, 0:32:32][39m
+[titan] 2025-09-09 18:22:23,977 - root - INFO - [31mstep: 26615 [32mloss: 2.6111 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.91 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7177 [37mglobal_avg_top_loss: 1.8934
+[titan] 2025-09-09 18:22:23,977 - root - INFO - [34mlr: 6.6147e-06 gnorm: 0.36 [35m[2 days, 0:46:55<1 day, 0:31:59][39m
+[titan] 2025-09-09 18:22:55,882 - root - INFO - [31mstep: 26620 [32mloss: 2.7573 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.50 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7826 [37mglobal_avg_top_loss: 1.9747
+[titan] 2025-09-09 18:22:55,883 - root - INFO - [34mlr: 6.6115e-06 gnorm: 0.37 [35m[2 days, 0:47:27<1 day, 0:31:25][39m
+[titan] 2025-09-09 18:23:21,666 - root - INFO - Dumping profiler traces at step 26624
+[titan] 2025-09-09 18:23:21,726 - root - INFO - Finished dumping profiler traces in 0.06 seconds
+[titan] 2025-09-09 18:23:28,076 - root - INFO - [31mstep: 26625 [32mloss: 2.6706 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,179 [36mtflops: 485.10 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7474 [37mglobal_avg_top_loss: 1.9232
+[titan] 2025-09-09 18:23:28,077 - root - INFO - [34mlr: 6.6084e-06 gnorm: 0.36 [35m[2 days, 0:47:59<1 day, 0:30:52][39m
+[titan] 2025-09-09 18:23:59,881 - root - INFO - [31mstep: 26630 [32mloss: 2.6770 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.05 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7498 [37mglobal_avg_top_loss: 1.9273
+[titan] 2025-09-09 18:23:59,881 - root - INFO - [34mlr: 6.6053e-06 gnorm: 0.36 [35m[2 days, 0:48:31<1 day, 0:30:18][39m
+[titan] 2025-09-09 18:24:31,904 - root - INFO - [31mstep: 26635 [32mloss: 2.7035 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.69 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7591 [37mglobal_avg_top_loss: 1.9444
+[titan] 2025-09-09 18:24:31,905 - root - INFO - [34mlr: 6.6022e-06 gnorm: 0.35 [35m[2 days, 0:49:03<1 day, 0:29:45][39m
+[titan] 2025-09-09 18:25:03,947 - root - INFO - [31mstep: 26640 [32mloss: 2.7697 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.40 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7919 [37mglobal_avg_top_loss: 1.9778
+[titan] 2025-09-09 18:25:03,947 - root - INFO - [34mlr: 6.5991e-06 gnorm: 0.36 [35m[2 days, 0:49:35<1 day, 0:29:11][39m
+[titan] 2025-09-09 18:25:35,714 - root - INFO - [31mstep: 26645 [32mloss: 2.6643 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,315 [36mtflops: 491.63 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 0.7427 [37mglobal_avg_top_loss: 1.9217
+[titan] 2025-09-09 18:25:35,714 - root - INFO - [34mlr: 6.5960e-06 gnorm: 0.36 [35m[2 days, 0:50:07<1 day, 0:28:38][39m
+[titan] 2025-09-09 18:26:01,148 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:26:07,551 - root - INFO - [31mstep: 26650 [32mloss: 2.7395 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.54 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7801 [37mglobal_avg_top_loss: 1.9594
+[titan] 2025-09-09 18:26:07,552 - root - INFO - [34mlr: 6.5929e-06 gnorm: 0.37 [35m[2 days, 0:50:38<1 day, 0:28:04][39m
+[titan] 2025-09-09 18:26:39,577 - root - INFO - [31mstep: 26655 [32mloss: 2.7005 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.66 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7555 [37mglobal_avg_top_loss: 1.9450
+[titan] 2025-09-09 18:26:39,577 - root - INFO - [34mlr: 6.5897e-06 gnorm: 0.36 [35m[2 days, 0:51:11<1 day, 0:27:30][39m
+[titan] 2025-09-09 18:27:11,661 - root - INFO - [31mstep: 26660 [32mloss: 2.5784 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.76 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7040 [37mglobal_avg_top_loss: 1.8744
+[titan] 2025-09-09 18:27:11,662 - root - INFO - [34mlr: 6.5866e-06 gnorm: 0.57 [35m[2 days, 0:51:43<1 day, 0:26:57][39m
+[titan] 2025-09-09 18:27:43,857 - root - INFO - [31mstep: 26665 [32mloss: 2.7274 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.08 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7693 [37mglobal_avg_top_loss: 1.9581
+[titan] 2025-09-09 18:27:43,857 - root - INFO - [34mlr: 6.5835e-06 gnorm: 0.50 [35m[2 days, 0:52:15<1 day, 0:26:24][39m
+[titan] 2025-09-09 18:28:15,625 - root - INFO - [31mstep: 26670 [32mloss: 2.7489 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,315 [36mtflops: 491.60 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 0.7797 [37mglobal_avg_top_loss: 1.9692
+[titan] 2025-09-09 18:28:15,626 - root - INFO - [34mlr: 6.5804e-06 gnorm: 0.37 [35m[2 days, 0:52:47<1 day, 0:25:50][39m
+[titan] 2025-09-09 18:28:47,736 - root - INFO - [31mstep: 26675 [32mloss: 2.6154 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.36 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7216 [37mglobal_avg_top_loss: 1.8938
+[titan] 2025-09-09 18:28:47,736 - root - INFO - [34mlr: 6.5773e-06 gnorm: 0.37 [35m[2 days, 0:53:19<1 day, 0:25:17][39m
+[titan] 2025-09-09 18:29:19,704 - root - INFO - [31mstep: 26680 [32mloss: 2.6659 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.53 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7406 [37mglobal_avg_top_loss: 1.9253
+[titan] 2025-09-09 18:29:19,705 - root - INFO - [34mlr: 6.5742e-06 gnorm: 0.37 [35m[2 days, 0:53:51<1 day, 0:24:43][39m
+[titan] 2025-09-09 18:29:51,641 - root - INFO - [31mstep: 26685 [32mloss: 2.7422 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.02 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7777 [37mglobal_avg_top_loss: 1.9645
+[titan] 2025-09-09 18:29:51,641 - root - INFO - [34mlr: 6.5711e-06 gnorm: 0.37 [35m[2 days, 0:54:23<1 day, 0:24:10][39m
+[titan] 2025-09-09 18:30:23,887 - root - INFO - [31mstep: 26690 [32mloss: 2.8764 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,162 [36mtflops: 484.32 [35mmfu: 48.97%[39m [37mglobal_avg_ntp_loss: 0.8292 [37mglobal_avg_top_loss: 2.0471
+[titan] 2025-09-09 18:30:23,887 - root - INFO - [34mlr: 6.5680e-06 gnorm: 0.42 [35m[2 days, 0:54:55<1 day, 0:23:36][39m
+[titan] 2025-09-09 18:30:55,929 - root - INFO - [31mstep: 26695 [32mloss: 2.6844 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.40 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7501 [37mglobal_avg_top_loss: 1.9344
+[titan] 2025-09-09 18:30:55,930 - root - INFO - [34mlr: 6.5649e-06 gnorm: 0.40 [35m[2 days, 0:55:27<1 day, 0:23:03][39m
+[titan] 2025-09-09 18:31:21,543 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:31:27,901 - root - INFO - [31mstep: 26700 [32mloss: 2.7301 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.48 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7713 [37mglobal_avg_top_loss: 1.9588
+[titan] 2025-09-09 18:31:27,902 - root - INFO - [34mlr: 6.5618e-06 gnorm: 0.39 [35m[2 days, 0:55:59<1 day, 0:22:29][39m
+[titan] 2025-09-09 18:31:59,878 - root - INFO - [31mstep: 26705 [32mloss: 2.7289 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.40 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7786 [37mglobal_avg_top_loss: 1.9503
+[titan] 2025-09-09 18:31:59,879 - root - INFO - [34mlr: 6.5587e-06 gnorm: 0.39 [35m[2 days, 0:56:31<1 day, 0:21:56][39m
+[titan] 2025-09-09 18:32:31,845 - root - INFO - [31mstep: 26710 [32mloss: 2.7364 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.54 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7807 [37mglobal_avg_top_loss: 1.9557
+[titan] 2025-09-09 18:32:31,846 - root - INFO - [34mlr: 6.5556e-06 gnorm: 0.38 [35m[2 days, 0:57:03<1 day, 0:21:22][39m
+[titan] 2025-09-09 18:33:03,927 - root - INFO - [31mstep: 26715 [32mloss: 2.6721 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,214 [36mtflops: 486.81 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7460 [37mglobal_avg_top_loss: 1.9261
+[titan] 2025-09-09 18:33:03,927 - root - INFO - [34mlr: 6.5525e-06 gnorm: 0.43 [35m[2 days, 0:57:35<1 day, 0:20:49][39m
+[titan] 2025-09-09 18:33:35,894 - root - INFO - [31mstep: 26720 [32mloss: 2.6865 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.54 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7515 [37mglobal_avg_top_loss: 1.9351
+[titan] 2025-09-09 18:33:35,895 - root - INFO - [34mlr: 6.5493e-06 gnorm: 0.39 [35m[2 days, 0:58:07<1 day, 0:20:15][39m
+[titan] 2025-09-09 18:34:07,755 - root - INFO - [31mstep: 26725 [32mloss: 2.6890 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.18 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7545 [37mglobal_avg_top_loss: 1.9345
+[titan] 2025-09-09 18:34:07,755 - root - INFO - [34mlr: 6.5462e-06 gnorm: 0.39 [35m[2 days, 0:58:39<1 day, 0:19:42][39m
+[titan] 2025-09-09 18:34:39,712 - root - INFO - [31mstep: 26730 [32mloss: 2.7492 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.69 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7823 [37mglobal_avg_top_loss: 1.9668
+[titan] 2025-09-09 18:34:39,713 - root - INFO - [34mlr: 6.5431e-06 gnorm: 0.42 [35m[2 days, 0:59:11<1 day, 0:19:08][39m
+[titan] 2025-09-09 18:35:11,503 - root - INFO - [31mstep: 26735 [32mloss: 2.7350 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,308 [36mtflops: 491.26 [35mmfu: 49.67%[39m [37mglobal_avg_ntp_loss: 0.7739 [37mglobal_avg_top_loss: 1.9610
+[titan] 2025-09-09 18:35:11,504 - root - INFO - [34mlr: 6.5400e-06 gnorm: 0.35 [35m[2 days, 0:59:42<1 day, 0:18:35][39m
+[titan] 2025-09-09 18:35:43,530 - root - INFO - [31mstep: 26740 [32mloss: 2.6251 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.65 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7294 [37mglobal_avg_top_loss: 1.8956
+[titan] 2025-09-09 18:35:43,530 - root - INFO - [34mlr: 6.5369e-06 gnorm: 0.51 [35m[2 days, 1:00:14<1 day, 0:18:01][39m
+[titan] 2025-09-09 18:36:15,427 - root - INFO - [31mstep: 26745 [32mloss: 2.7441 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.61 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7772 [37mglobal_avg_top_loss: 1.9669
+[titan] 2025-09-09 18:36:15,428 - root - INFO - [34mlr: 6.5338e-06 gnorm: 0.44 [35m[2 days, 1:00:46<1 day, 0:17:28][39m
+[titan] 2025-09-09 18:36:40,904 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:36:47,249 - root - INFO - [31mstep: 26750 [32mloss: 2.7808 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.78 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.7984 [37mglobal_avg_top_loss: 1.9824
+[titan] 2025-09-09 18:36:47,249 - root - INFO - [34mlr: 6.5307e-06 gnorm: 0.39 [35m[2 days, 1:01:18<1 day, 0:16:54][39m
+[titan] 2025-09-09 18:37:19,394 - root - INFO - [31mstep: 26755 [32mloss: 2.7120 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,194 [36mtflops: 485.84 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7616 [37mglobal_avg_top_loss: 1.9503
+[titan] 2025-09-09 18:37:19,395 - root - INFO - [34mlr: 6.5276e-06 gnorm: 0.37 [35m[2 days, 1:01:50<1 day, 0:16:21][39m
+[titan] 2025-09-09 18:37:51,277 - root - INFO - [31mstep: 26760 [32mloss: 2.7289 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,278 [36mtflops: 489.84 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7720 [37mglobal_avg_top_loss: 1.9569
+[titan] 2025-09-09 18:37:51,278 - root - INFO - [34mlr: 6.5245e-06 gnorm: 0.36 [35m[2 days, 1:02:22<1 day, 0:15:47][39m
+[titan] 2025-09-09 18:38:23,337 - root - INFO - [31mstep: 26765 [32mloss: 2.9793 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.14 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.9055 [37mglobal_avg_top_loss: 2.0738
+[titan] 2025-09-09 18:38:23,337 - root - INFO - [34mlr: 6.5215e-06 gnorm: 0.38 [35m[2 days, 1:02:54<1 day, 0:15:14][39m
+[titan] 2025-09-09 18:38:55,034 - root - INFO - [31mstep: 26770 [32mloss: 2.6750 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,338 [36mtflops: 492.71 [35mmfu: 49.82%[39m [37mglobal_avg_ntp_loss: 0.7462 [37mglobal_avg_top_loss: 1.9288
+[titan] 2025-09-09 18:38:55,034 - root - INFO - [34mlr: 6.5184e-06 gnorm: 0.48 [35m[2 days, 1:03:26<1 day, 0:14:40][39m
+[titan] 2025-09-09 18:39:27,104 - root - INFO - [31mstep: 26775 [32mloss: 2.8126 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.97 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.8093 [37mglobal_avg_top_loss: 2.0033
+[titan] 2025-09-09 18:39:27,105 - root - INFO - [34mlr: 6.5153e-06 gnorm: 0.38 [35m[2 days, 1:03:58<1 day, 0:14:07][39m
+[titan] 2025-09-09 18:39:59,118 - root - INFO - [31mstep: 26780 [32mloss: 2.7232 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.84 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7691 [37mglobal_avg_top_loss: 1.9541
+[titan] 2025-09-09 18:39:59,118 - root - INFO - [34mlr: 6.5122e-06 gnorm: 0.38 [35m[2 days, 1:04:30<1 day, 0:13:33][39m
+[titan] 2025-09-09 18:40:30,999 - root - INFO - [31mstep: 26785 [32mloss: 2.6975 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.87 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7588 [37mglobal_avg_top_loss: 1.9387
+[titan] 2025-09-09 18:40:30,999 - root - INFO - [34mlr: 6.5091e-06 gnorm: 0.37 [35m[2 days, 1:05:02<1 day, 0:13:00][39m
+[titan] 2025-09-09 18:41:02,990 - root - INFO - [31mstep: 26790 [32mloss: 2.8991 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.18 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8509 [37mglobal_avg_top_loss: 2.0482
+[titan] 2025-09-09 18:41:02,990 - root - INFO - [34mlr: 6.5060e-06 gnorm: 0.41 [35m[2 days, 1:05:34<1 day, 0:12:26][39m
+[titan] 2025-09-09 18:41:34,969 - root - INFO - [31mstep: 26795 [32mloss: 2.7226 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.36 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7696 [37mglobal_avg_top_loss: 1.9530
+[titan] 2025-09-09 18:41:34,969 - root - INFO - [34mlr: 6.5029e-06 gnorm: 0.37 [35m[2 days, 1:06:06<1 day, 0:11:53][39m
+[titan] 2025-09-09 18:42:00,507 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:42:06,982 - root - INFO - [31mstep: 26800 [32mloss: 2.7583 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.84 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7973 [37mglobal_avg_top_loss: 1.9610
+[titan] 2025-09-09 18:42:06,982 - root - INFO - [34mlr: 6.4998e-06 gnorm: 0.45 [35m[2 days, 1:06:38<1 day, 0:11:19][39m
+[titan] 2025-09-09 18:42:38,835 - root - INFO - [31mstep: 26805 [32mloss: 2.6549 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.30 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7381 [37mglobal_avg_top_loss: 1.9168
+[titan] 2025-09-09 18:42:38,835 - root - INFO - [34mlr: 6.4967e-06 gnorm: 0.35 [35m[2 days, 1:07:10<1 day, 0:10:46][39m
+[titan] 2025-09-09 18:43:10,862 - root - INFO - [31mstep: 26810 [32mloss: 2.7149 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.64 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7648 [37mglobal_avg_top_loss: 1.9501
+[titan] 2025-09-09 18:43:10,862 - root - INFO - [34mlr: 6.4936e-06 gnorm: 0.35 [35m[2 days, 1:07:42<1 day, 0:10:12][39m
+[titan] 2025-09-09 18:43:43,062 - root - INFO - [31mstep: 26815 [32mloss: 2.7017 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,177 [36mtflops: 485.02 [35mmfu: 49.04%[39m [37mglobal_avg_ntp_loss: 0.7592 [37mglobal_avg_top_loss: 1.9426
+[titan] 2025-09-09 18:43:43,062 - root - INFO - [34mlr: 6.4905e-06 gnorm: 0.36 [35m[2 days, 1:08:14<1 day, 0:09:39][39m
+[titan] 2025-09-09 18:44:14,872 - root - INFO - [31mstep: 26820 [32mloss: 2.6268 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,301 [36mtflops: 490.96 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7222 [37mglobal_avg_top_loss: 1.9046
+[titan] 2025-09-09 18:44:14,872 - root - INFO - [34mlr: 6.4874e-06 gnorm: 0.55 [35m[2 days, 1:08:46<1 day, 0:09:05][39m
+[titan] 2025-09-09 18:44:46,895 - root - INFO - [31mstep: 26825 [32mloss: 2.6928 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.69 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7572 [37mglobal_avg_top_loss: 1.9356
+[titan] 2025-09-09 18:44:46,895 - root - INFO - [34mlr: 6.4843e-06 gnorm: 0.38 [35m[2 days, 1:09:18<1 day, 0:08:32][39m
+[titan] 2025-09-09 18:45:18,819 - root - INFO - [31mstep: 26830 [32mloss: 2.6896 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.21 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7526 [37mglobal_avg_top_loss: 1.9371
+[titan] 2025-09-09 18:45:18,820 - root - INFO - [34mlr: 6.4813e-06 gnorm: 0.37 [35m[2 days, 1:09:50<1 day, 0:07:58][39m
+[titan] 2025-09-09 18:45:50,797 - root - INFO - [31mstep: 26835 [32mloss: 2.6914 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.38 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7555 [37mglobal_avg_top_loss: 1.9359
+[titan] 2025-09-09 18:45:50,797 - root - INFO - [34mlr: 6.4782e-06 gnorm: 0.36 [35m[2 days, 1:10:22<1 day, 0:07:25][39m
+[titan] 2025-09-09 18:46:22,809 - root - INFO - [31mstep: 26840 [32mloss: 2.7312 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.86 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7715 [37mglobal_avg_top_loss: 1.9596
+[titan] 2025-09-09 18:46:22,809 - root - INFO - [34mlr: 6.4751e-06 gnorm: 0.37 [35m[2 days, 1:10:54<1 day, 0:06:51][39m
+[titan] 2025-09-09 18:46:54,845 - root - INFO - [31mstep: 26845 [32mloss: 2.7830 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.50 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7965 [37mglobal_avg_top_loss: 1.9865
+[titan] 2025-09-09 18:46:54,845 - root - INFO - [34mlr: 6.4720e-06 gnorm: 0.37 [35m[2 days, 1:11:26<1 day, 0:06:18][39m
+[titan] 2025-09-09 18:47:20,438 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:47:26,953 - root - INFO - [31mstep: 26850 [32mloss: 2.8378 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.41 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.8144 [37mglobal_avg_top_loss: 2.0234
+[titan] 2025-09-09 18:47:26,953 - root - INFO - [34mlr: 6.4689e-06 gnorm: 1.00 [35m[2 days, 1:11:58<1 day, 0:05:45][39m
+[titan] 2025-09-09 18:47:58,978 - root - INFO - [31mstep: 26855 [32mloss: 3.2085 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.66 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 1.0421 [37mglobal_avg_top_loss: 2.1664
+[titan] 2025-09-09 18:47:58,978 - root - INFO - [34mlr: 6.4658e-06 gnorm: 0.42 [35m[2 days, 1:12:30<1 day, 0:05:11][39m
+[titan] 2025-09-09 18:48:30,675 - root - INFO - [31mstep: 26860 [32mloss: 2.6858 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,338 [36mtflops: 492.71 [35mmfu: 49.82%[39m [37mglobal_avg_ntp_loss: 0.7521 [37mglobal_avg_top_loss: 1.9337
+[titan] 2025-09-09 18:48:30,676 - root - INFO - [34mlr: 6.4627e-06 gnorm: 0.36 [35m[2 days, 1:13:02<1 day, 0:04:38][39m
+[titan] 2025-09-09 18:49:02,689 - root - INFO - [31mstep: 26865 [32mloss: 2.5476 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.83 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.6916 [37mglobal_avg_top_loss: 1.8560
+[titan] 2025-09-09 18:49:02,690 - root - INFO - [34mlr: 6.4597e-06 gnorm: 0.35 [35m[2 days, 1:13:34<1 day, 0:04:04][39m
+[titan] 2025-09-09 18:49:34,520 - root - INFO - [31mstep: 26870 [32mloss: 2.6289 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.65 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7313 [37mglobal_avg_top_loss: 1.8976
+[titan] 2025-09-09 18:49:34,520 - root - INFO - [34mlr: 6.4566e-06 gnorm: 0.36 [35m[2 days, 1:14:05<1 day, 0:03:31][39m
+[titan] 2025-09-09 18:50:06,556 - root - INFO - [31mstep: 26875 [32mloss: 2.7057 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.50 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7613 [37mglobal_avg_top_loss: 1.9444
+[titan] 2025-09-09 18:50:06,556 - root - INFO - [34mlr: 6.4535e-06 gnorm: 0.38 [35m[2 days, 1:14:37<1 day, 0:02:57][39m
+[titan] 2025-09-09 18:50:38,478 - root - INFO - [31mstep: 26880 [32mloss: 2.8484 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.23 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.8308 [37mglobal_avg_top_loss: 2.0176
+[titan] 2025-09-09 18:50:38,479 - root - INFO - [34mlr: 6.4504e-06 gnorm: 0.37 [35m[2 days, 1:15:09<1 day, 0:02:24][39m
+[titan] 2025-09-09 18:51:10,479 - root - INFO - [31mstep: 26885 [32mloss: 2.6820 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.03 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7489 [37mglobal_avg_top_loss: 1.9331
+[titan] 2025-09-09 18:51:10,480 - root - INFO - [34mlr: 6.4473e-06 gnorm: 0.38 [35m[2 days, 1:15:41<1 day, 0:01:50][39m
+[titan] 2025-09-09 18:51:42,515 - root - INFO - [31mstep: 26890 [32mloss: 2.7534 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.51 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7844 [37mglobal_avg_top_loss: 1.9691
+[titan] 2025-09-09 18:51:42,515 - root - INFO - [34mlr: 6.4443e-06 gnorm: 0.37 [35m[2 days, 1:16:13<1 day, 0:01:17][39m
+[titan] 2025-09-09 18:52:14,305 - root - INFO - [31mstep: 26895 [32mloss: 2.6009 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,308 [36mtflops: 491.26 [35mmfu: 49.67%[39m [37mglobal_avg_ntp_loss: 0.7154 [37mglobal_avg_top_loss: 1.8856
+[titan] 2025-09-09 18:52:14,306 - root - INFO - [34mlr: 6.4412e-06 gnorm: 0.35 [35m[2 days, 1:16:45<1 day, 0:00:43][39m
+[titan] 2025-09-09 18:52:40,051 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:52:46,410 - root - INFO - [31mstep: 26900 [32mloss: 2.8136 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.45 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.8116 [37mglobal_avg_top_loss: 2.0020
+[titan] 2025-09-09 18:52:46,411 - root - INFO - [34mlr: 6.4381e-06 gnorm: 0.38 [35m[2 days, 1:17:17<1 day, 0:00:10][39m
+[titan] 2025-09-09 18:53:18,400 - root - INFO - [31mstep: 26905 [32mloss: 2.7392 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.20 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7787 [37mglobal_avg_top_loss: 1.9605
+[titan] 2025-09-09 18:53:18,400 - root - INFO - [34mlr: 6.4350e-06 gnorm: 0.38 [35m[2 days, 1:17:49<23:59:36][39m
+[titan] 2025-09-09 18:53:50,345 - root - INFO - [31mstep: 26910 [32mloss: 2.7703 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.89 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7903 [37mglobal_avg_top_loss: 1.9800
+[titan] 2025-09-09 18:53:50,345 - root - INFO - [34mlr: 6.4320e-06 gnorm: 0.37 [35m[2 days, 1:18:21<23:59:03][39m
+[titan] 2025-09-09 18:54:22,055 - root - INFO - [31mstep: 26915 [32mloss: 2.6668 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,334 [36mtflops: 492.50 [35mmfu: 49.80%[39m [37mglobal_avg_ntp_loss: 0.7436 [37mglobal_avg_top_loss: 1.9232
+[titan] 2025-09-09 18:54:22,056 - root - INFO - [34mlr: 6.4289e-06 gnorm: 0.39 [35m[2 days, 1:18:53<23:58:29][39m
+[titan] 2025-09-09 18:54:53,974 - root - INFO - [31mstep: 26920 [32mloss: 2.7720 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.28 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7904 [37mglobal_avg_top_loss: 1.9816
+[titan] 2025-09-09 18:54:53,975 - root - INFO - [34mlr: 6.4258e-06 gnorm: 0.37 [35m[2 days, 1:19:25<23:57:56][39m
+[titan] 2025-09-09 18:55:26,066 - root - INFO - [31mstep: 26925 [32mloss: 2.7029 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.66 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7597 [37mglobal_avg_top_loss: 1.9431
+[titan] 2025-09-09 18:55:26,066 - root - INFO - [34mlr: 6.4227e-06 gnorm: 0.40 [35m[2 days, 1:19:57<23:57:22][39m
+[titan] 2025-09-09 18:55:58,132 - root - INFO - [31mstep: 26930 [32mloss: 2.7287 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.04 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7702 [37mglobal_avg_top_loss: 1.9585
+[titan] 2025-09-09 18:55:58,132 - root - INFO - [34mlr: 6.4197e-06 gnorm: 0.39 [35m[2 days, 1:20:29<23:56:49][39m
+[titan] 2025-09-09 18:56:30,182 - root - INFO - [31mstep: 26935 [32mloss: 2.8694 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.27 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.8479 [37mglobal_avg_top_loss: 2.0215
+[titan] 2025-09-09 18:56:30,183 - root - INFO - [34mlr: 6.4166e-06 gnorm: 0.38 [35m[2 days, 1:21:01<23:56:15][39m
+[titan] 2025-09-09 18:57:02,151 - root - INFO - [31mstep: 26940 [32mloss: 2.7370 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.52 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7747 [37mglobal_avg_top_loss: 1.9623
+[titan] 2025-09-09 18:57:02,152 - root - INFO - [34mlr: 6.4135e-06 gnorm: 0.37 [35m[2 days, 1:21:33<23:55:42][39m
+[titan] 2025-09-09 18:57:34,348 - root - INFO - [31mstep: 26945 [32mloss: 2.7522 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.05 [35mmfu: 49.04%[39m [37mglobal_avg_ntp_loss: 0.7980 [37mglobal_avg_top_loss: 1.9542
+[titan] 2025-09-09 18:57:34,349 - root - INFO - [34mlr: 6.4104e-06 gnorm: 0.38 [35m[2 days, 1:22:05<23:55:09][39m
+[titan] 2025-09-09 18:57:59,826 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 18:58:06,251 - root - INFO - [31mstep: 26950 [32mloss: 3.0763 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.54 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.9866 [37mglobal_avg_top_loss: 2.0897
+[titan] 2025-09-09 18:58:06,252 - root - INFO - [34mlr: 6.4074e-06 gnorm: 0.43 [35m[2 days, 1:22:37<23:54:35][39m
+[titan] 2025-09-09 18:58:38,209 - root - INFO - [31mstep: 26955 [32mloss: 2.6407 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.69 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7312 [37mglobal_avg_top_loss: 1.9095
+[titan] 2025-09-09 18:58:38,210 - root - INFO - [34mlr: 6.4043e-06 gnorm: 0.36 [35m[2 days, 1:23:09<23:54:02][39m
+[titan] 2025-09-09 18:59:10,199 - root - INFO - [31mstep: 26960 [32mloss: 2.6080 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.20 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7167 [37mglobal_avg_top_loss: 1.8912
+[titan] 2025-09-09 18:59:10,200 - root - INFO - [34mlr: 6.4012e-06 gnorm: 0.37 [35m[2 days, 1:23:41<23:53:28][39m
+[titan] 2025-09-09 18:59:42,158 - root - INFO - [31mstep: 26965 [32mloss: 2.6799 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.68 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7459 [37mglobal_avg_top_loss: 1.9340
+[titan] 2025-09-09 18:59:42,158 - root - INFO - [34mlr: 6.3982e-06 gnorm: 0.36 [35m[2 days, 1:24:13<23:52:55][39m
+[titan] 2025-09-09 19:00:14,177 - root - INFO - [31mstep: 26970 [32mloss: 2.6982 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.76 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7540 [37mglobal_avg_top_loss: 1.9442
+[titan] 2025-09-09 19:00:14,177 - root - INFO - [34mlr: 6.3951e-06 gnorm: 0.36 [35m[2 days, 1:24:45<23:52:21][39m
+[titan] 2025-09-09 19:00:46,238 - root - INFO - [31mstep: 26975 [32mloss: 2.6967 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.12 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7569 [37mglobal_avg_top_loss: 1.9398
+[titan] 2025-09-09 19:00:46,238 - root - INFO - [34mlr: 6.3920e-06 gnorm: 0.37 [35m[2 days, 1:25:17<23:51:48][39m
+[titan] 2025-09-09 19:01:18,133 - root - INFO - [31mstep: 26980 [32mloss: 2.8244 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.65 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.8155 [37mglobal_avg_top_loss: 2.0090
+[titan] 2025-09-09 19:01:18,133 - root - INFO - [34mlr: 6.3890e-06 gnorm: 0.37 [35m[2 days, 1:25:49<23:51:14][39m
+[titan] 2025-09-09 19:01:49,981 - root - INFO - [31mstep: 26985 [32mloss: 2.7032 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,289 [36mtflops: 490.38 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7608 [37mglobal_avg_top_loss: 1.9424
+[titan] 2025-09-09 19:01:49,981 - root - INFO - [34mlr: 6.3859e-06 gnorm: 0.37 [35m[2 days, 1:26:21<23:50:41][39m
+[titan] 2025-09-09 19:02:22,023 - root - INFO - [31mstep: 26990 [32mloss: 2.7053 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.40 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7472 [37mglobal_avg_top_loss: 1.9582
+[titan] 2025-09-09 19:02:22,024 - root - INFO - [34mlr: 6.3828e-06 gnorm: 1.18 [35m[2 days, 1:26:53<23:50:07][39m
+[titan] 2025-09-09 19:02:54,069 - root - INFO - [31mstep: 26995 [32mloss: 2.6801 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.34 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7467 [37mglobal_avg_top_loss: 1.9334
+[titan] 2025-09-09 19:02:54,070 - root - INFO - [34mlr: 6.3798e-06 gnorm: 0.35 [35m[2 days, 1:27:25<23:49:34][39m
+[titan] 2025-09-09 19:03:19,665 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 19:03:26,079 - root - INFO - [31mstep: 27000 [32mloss: 2.6747 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.89 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7454 [37mglobal_avg_top_loss: 1.9293
+[titan] 2025-09-09 19:03:26,080 - root - INFO - [34mlr: 6.3767e-06 gnorm: 0.36 [35m[2 days, 1:27:57<23:49:00][39m
+[titan] 2025-09-09 19:03:57,995 - root - INFO - [31mstep: 27005 [32mloss: 3.0726 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.33 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.9287 [37mglobal_avg_top_loss: 2.1439
+[titan] 2025-09-09 19:03:57,996 - root - INFO - [34mlr: 6.3736e-06 gnorm: 0.39 [35m[2 days, 1:28:29<23:48:27][39m
+[titan] 2025-09-09 19:04:30,067 - root - INFO - [31mstep: 27010 [32mloss: 2.7375 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.95 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7730 [37mglobal_avg_top_loss: 1.9645
+[titan] 2025-09-09 19:04:30,068 - root - INFO - [34mlr: 6.3706e-06 gnorm: 0.37 [35m[2 days, 1:29:01<23:47:54][39m
+[titan] 2025-09-09 19:05:02,102 - root - INFO - [31mstep: 27015 [32mloss: 2.8582 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.51 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.8349 [37mglobal_avg_top_loss: 2.0232
+[titan] 2025-09-09 19:05:02,103 - root - INFO - [34mlr: 6.3675e-06 gnorm: 0.38 [35m[2 days, 1:29:33<23:47:20][39m
+[titan] 2025-09-09 19:05:34,130 - root - INFO - [31mstep: 27020 [32mloss: 2.6838 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,231 [36mtflops: 487.62 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7519 [37mglobal_avg_top_loss: 1.9319
+[titan] 2025-09-09 19:05:34,131 - root - INFO - [34mlr: 6.3645e-06 gnorm: 0.36 [35m[2 days, 1:30:05<23:46:47][39m
+[titan] 2025-09-09 19:06:06,082 - root - INFO - [31mstep: 27025 [32mloss: 2.6730 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.78 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7428 [37mglobal_avg_top_loss: 1.9302
+[titan] 2025-09-09 19:06:06,083 - root - INFO - [34mlr: 6.3614e-06 gnorm: 0.38 [35m[2 days, 1:30:37<23:46:13][39m
+[titan] 2025-09-09 19:06:37,883 - root - INFO - [31mstep: 27030 [32mloss: 3.1885 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,304 [36mtflops: 491.10 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 1.0375 [37mglobal_avg_top_loss: 2.1509
+[titan] 2025-09-09 19:06:37,884 - root - INFO - [34mlr: 6.3583e-06 gnorm: 0.36 [35m[2 days, 1:31:09<23:45:40][39m
+[titan] 2025-09-09 19:07:09,881 - root - INFO - [31mstep: 27035 [32mloss: 2.7117 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.08 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7632 [37mglobal_avg_top_loss: 1.9485
+[titan] 2025-09-09 19:07:09,882 - root - INFO - [34mlr: 6.3553e-06 gnorm: 0.38 [35m[2 days, 1:31:41<23:45:06][39m
+[titan] 2025-09-09 19:07:41,905 - root - INFO - [31mstep: 27040 [32mloss: 2.6774 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.68 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7484 [37mglobal_avg_top_loss: 1.9290
+[titan] 2025-09-09 19:07:41,906 - root - INFO - [34mlr: 6.3522e-06 gnorm: 0.37 [35m[2 days, 1:32:13<23:44:33][39m
+[titan] 2025-09-09 19:08:13,946 - root - INFO - [31mstep: 27045 [32mloss: 2.7450 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.43 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7756 [37mglobal_avg_top_loss: 1.9694
+[titan] 2025-09-09 19:08:13,946 - root - INFO - [34mlr: 6.3492e-06 gnorm: 0.40 [35m[2 days, 1:32:45<23:43:59][39m
+[titan] 2025-09-09 19:08:39,584 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 19:08:46,031 - root - INFO - [31mstep: 27050 [32mloss: 2.6478 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.75 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7343 [37mglobal_avg_top_loss: 1.9134
+[titan] 2025-09-09 19:08:46,031 - root - INFO - [34mlr: 6.3461e-06 gnorm: 0.42 [35m[2 days, 1:33:17<23:43:26][39m
+[titan] 2025-09-09 19:09:17,964 - root - INFO - [31mstep: 27055 [32mloss: 2.7008 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.06 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7598 [37mglobal_avg_top_loss: 1.9409
+[titan] 2025-09-09 19:09:17,965 - root - INFO - [34mlr: 6.3431e-06 gnorm: 0.39 [35m[2 days, 1:33:49<23:42:53][39m
+[titan] 2025-09-09 19:09:50,017 - root - INFO - [31mstep: 27060 [32mloss: 2.7380 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.23 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7725 [37mglobal_avg_top_loss: 1.9655
+[titan] 2025-09-09 19:09:50,018 - root - INFO - [34mlr: 6.3400e-06 gnorm: 0.38 [35m[2 days, 1:34:21<23:42:19][39m
+[titan] 2025-09-09 19:10:22,223 - root - INFO - [31mstep: 27065 [32mloss: 2.6734 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,175 [36mtflops: 484.93 [35mmfu: 49.03%[39m [37mglobal_avg_ntp_loss: 0.7467 [37mglobal_avg_top_loss: 1.9267
+[titan] 2025-09-09 19:10:22,223 - root - INFO - [34mlr: 6.3369e-06 gnorm: 0.39 [35m[2 days, 1:34:53<23:41:46][39m
+[titan] 2025-09-09 19:10:54,268 - root - INFO - [31mstep: 27070 [32mloss: 2.6952 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.36 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7551 [37mglobal_avg_top_loss: 1.9402
+[titan] 2025-09-09 19:10:54,268 - root - INFO - [34mlr: 6.3339e-06 gnorm: 0.36 [35m[2 days, 1:35:25<23:41:12][39m
+[titan] 2025-09-09 19:11:26,435 - root - INFO - [31mstep: 27075 [32mloss: 2.7463 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,187 [36mtflops: 485.50 [35mmfu: 49.09%[39m [37mglobal_avg_ntp_loss: 0.7781 [37mglobal_avg_top_loss: 1.9681
+[titan] 2025-09-09 19:11:26,436 - root - INFO - [34mlr: 6.3308e-06 gnorm: 0.42 [35m[2 days, 1:35:57<23:40:39][39m
+[titan] 2025-09-09 19:11:58,375 - root - INFO - [31mstep: 27080 [32mloss: 2.7435 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,260 [36mtflops: 488.97 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7763 [37mglobal_avg_top_loss: 1.9671
+[titan] 2025-09-09 19:11:58,375 - root - INFO - [34mlr: 6.3278e-06 gnorm: 0.39 [35m[2 days, 1:36:29<23:40:06][39m
+[titan] 2025-09-09 19:12:30,534 - root - INFO - [31mstep: 27085 [32mloss: 2.6053 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,190 [36mtflops: 485.64 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.7191 [37mglobal_avg_top_loss: 1.8862
+[titan] 2025-09-09 19:12:30,534 - root - INFO - [34mlr: 6.3247e-06 gnorm: 0.37 [35m[2 days, 1:37:01<23:39:32][39m
+[titan] 2025-09-09 19:13:02,558 - root - INFO - [31mstep: 27090 [32mloss: 2.7293 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.67 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7670 [37mglobal_avg_top_loss: 1.9623
+[titan] 2025-09-09 19:13:02,559 - root - INFO - [34mlr: 6.3217e-06 gnorm: 0.37 [35m[2 days, 1:37:33<23:38:59][39m
+[titan] 2025-09-09 19:13:34,565 - root - INFO - [31mstep: 27095 [32mloss: 3.6952 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.95 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 1.3170 [37mglobal_avg_top_loss: 2.3782
+[titan] 2025-09-09 19:13:34,565 - root - INFO - [34mlr: 6.3186e-06 gnorm: 0.40 [35m[2 days, 1:38:05<23:38:25][39m
+[titan] 2025-09-09 19:14:00,107 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 19:14:06,437 - root - INFO - [31mstep: 27100 [32mloss: 2.7371 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,281 [36mtflops: 490.00 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7732 [37mglobal_avg_top_loss: 1.9639
+[titan] 2025-09-09 19:14:06,437 - root - INFO - [34mlr: 6.3156e-06 gnorm: 0.39 [35m[2 days, 1:38:37<23:37:52][39m
+[titan] 2025-09-09 19:14:38,535 - root - INFO - [31mstep: 27105 [32mloss: 2.7704 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.56 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7909 [37mglobal_avg_top_loss: 1.9795
+[titan] 2025-09-09 19:14:38,535 - root - INFO - [34mlr: 6.3125e-06 gnorm: 0.40 [35m[2 days, 1:39:09<23:37:18][39m
+[titan] 2025-09-09 19:15:10,418 - root - INFO - [31mstep: 27110 [32mloss: 3.1899 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,278 [36mtflops: 489.82 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 1.0342 [37mglobal_avg_top_loss: 2.1557
+[titan] 2025-09-09 19:15:10,419 - root - INFO - [34mlr: 6.3095e-06 gnorm: 0.37 [35m[2 days, 1:39:41<23:36:45][39m
+[titan] 2025-09-09 19:15:42,484 - root - INFO - [31mstep: 27115 [32mloss: 2.9980 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.05 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.9298 [37mglobal_avg_top_loss: 2.0682
+[titan] 2025-09-09 19:15:42,484 - root - INFO - [34mlr: 6.3064e-06 gnorm: 0.36 [35m[2 days, 1:40:13<23:36:12][39m
+[titan] 2025-09-09 19:16:14,529 - root - INFO - [31mstep: 27120 [32mloss: 2.7013 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.36 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7600 [37mglobal_avg_top_loss: 1.9413
+[titan] 2025-09-09 19:16:14,529 - root - INFO - [34mlr: 6.3034e-06 gnorm: 0.36 [35m[2 days, 1:40:45<23:35:38][39m
+[titan] 2025-09-09 19:16:46,604 - root - INFO - [31mstep: 27125 [32mloss: 2.7012 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.89 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7581 [37mglobal_avg_top_loss: 1.9430
+[titan] 2025-09-09 19:16:46,605 - root - INFO - [34mlr: 6.3004e-06 gnorm: 0.36 [35m[2 days, 1:41:17<23:35:05][39m
+[titan] 2025-09-09 19:17:18,649 - root - INFO - [31mstep: 27130 [32mloss: 2.7045 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.37 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7655 [37mglobal_avg_top_loss: 1.9391
+[titan] 2025-09-09 19:17:18,649 - root - INFO - [34mlr: 6.2973e-06 gnorm: 0.38 [35m[2 days, 1:41:49<23:34:31][39m
+[titan] 2025-09-09 19:17:50,655 - root - INFO - [31mstep: 27135 [32mloss: 2.9253 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.95 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.8866 [37mglobal_avg_top_loss: 2.0387
+[titan] 2025-09-09 19:17:50,655 - root - INFO - [34mlr: 6.2943e-06 gnorm: 0.37 [35m[2 days, 1:42:21<23:33:58][39m
+[titan] 2025-09-09 19:17:57,331 - root - INFO - Dumping profiler traces at step 27136
+[titan] 2025-09-09 19:17:57,401 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 19:18:22,920 - root - INFO - [31mstep: 27140 [32mloss: 2.6909 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,156 [36mtflops: 484.04 [35mmfu: 48.94%[39m [37mglobal_avg_ntp_loss: 0.7540 [37mglobal_avg_top_loss: 1.9369
+[titan] 2025-09-09 19:18:22,920 - root - INFO - [34mlr: 6.2912e-06 gnorm: 0.43 [35m[2 days, 1:42:54<23:33:25][39m
+[titan] 2025-09-09 19:18:55,015 - root - INFO - [31mstep: 27145 [32mloss: 2.6698 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,210 [36mtflops: 486.59 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7440 [37mglobal_avg_top_loss: 1.9258
+[titan] 2025-09-09 19:18:55,015 - root - INFO - [34mlr: 6.2882e-06 gnorm: 0.40 [35m[2 days, 1:43:26<23:32:51][39m
+[titan] 2025-09-09 19:19:20,804 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 19:19:27,192 - root - INFO - [31mstep: 27150 [32mloss: 2.7734 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,184 [36mtflops: 485.37 [35mmfu: 49.08%[39m [37mglobal_avg_ntp_loss: 0.7875 [37mglobal_avg_top_loss: 1.9859
+[titan] 2025-09-09 19:19:27,192 - root - INFO - [34mlr: 6.2851e-06 gnorm: 0.38 [35m[2 days, 1:43:58<23:32:18][39m
+[titan] 2025-09-09 19:19:59,034 - root - INFO - [31mstep: 27155 [32mloss: 2.6800 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.47 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7478 [37mglobal_avg_top_loss: 1.9323
+[titan] 2025-09-09 19:19:59,034 - root - INFO - [34mlr: 6.2821e-06 gnorm: 0.40 [35m[2 days, 1:44:30<23:31:44][39m
+[titan] 2025-09-09 19:20:31,023 - root - INFO - [31mstep: 27160 [32mloss: 2.7391 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.20 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7746 [37mglobal_avg_top_loss: 1.9645
+[titan] 2025-09-09 19:20:31,024 - root - INFO - [34mlr: 6.2791e-06 gnorm: 0.36 [35m[2 days, 1:45:02<23:31:11][39m
+[titan] 2025-09-09 19:21:02,982 - root - INFO - [31mstep: 27165 [32mloss: 3.0957 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.68 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.9706 [37mglobal_avg_top_loss: 2.1251
+[titan] 2025-09-09 19:21:02,982 - root - INFO - [34mlr: 6.2760e-06 gnorm: 0.37 [35m[2 days, 1:45:34<23:30:37][39m
+[titan] 2025-09-09 19:21:34,913 - root - INFO - [31mstep: 27170 [32mloss: 2.6701 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.10 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7452 [37mglobal_avg_top_loss: 1.9249
+[titan] 2025-09-09 19:21:34,913 - root - INFO - [34mlr: 6.2730e-06 gnorm: 0.37 [35m[2 days, 1:46:06<23:30:04][39m
+[titan] 2025-09-09 19:22:06,814 - root - INFO - [31mstep: 27175 [32mloss: 3.6108 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.56 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 1.2809 [37mglobal_avg_top_loss: 2.3299
+[titan] 2025-09-09 19:22:06,814 - root - INFO - [34mlr: 6.2700e-06 gnorm: 0.44 [35m[2 days, 1:46:38<23:29:30][39m
+[titan] 2025-09-09 19:22:38,918 - root - INFO - [31mstep: 27180 [32mloss: 2.7138 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.46 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7626 [37mglobal_avg_top_loss: 1.9512
+[titan] 2025-09-09 19:22:38,918 - root - INFO - [34mlr: 6.2669e-06 gnorm: 0.40 [35m[2 days, 1:47:10<23:28:57][39m
+[titan] 2025-09-09 19:23:11,081 - root - INFO - [31mstep: 27185 [32mloss: 2.7029 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,188 [36mtflops: 485.57 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.7599 [37mglobal_avg_top_loss: 1.9430
+[titan] 2025-09-09 19:23:11,082 - root - INFO - [34mlr: 6.2639e-06 gnorm: 0.39 [35m[2 days, 1:47:42<23:28:24][39m
+[titan] 2025-09-09 19:23:43,028 - root - INFO - [31mstep: 27190 [32mloss: 2.7070 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.85 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7610 [37mglobal_avg_top_loss: 1.9460
+[titan] 2025-09-09 19:23:43,029 - root - INFO - [34mlr: 6.2608e-06 gnorm: 0.37 [35m[2 days, 1:48:14<23:27:50][39m
+[titan] 2025-09-09 19:24:15,006 - root - INFO - [31mstep: 27195 [32mloss: 2.6947 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.39 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7549 [37mglobal_avg_top_loss: 1.9398
+[titan] 2025-09-09 19:24:15,006 - root - INFO - [34mlr: 6.2578e-06 gnorm: 0.37 [35m[2 days, 1:48:46<23:27:17][39m
+[titan] 2025-09-09 19:24:40,596 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 19:24:47,045 - root - INFO - [31mstep: 27200 [32mloss: 2.7372 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.45 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7747 [37mglobal_avg_top_loss: 1.9625
+[titan] 2025-09-09 19:24:47,046 - root - INFO - [34mlr: 6.2548e-06 gnorm: 0.41 [35m[2 days, 1:49:18<23:26:43][39m
+[titan] 2025-09-09 19:25:19,281 - root - INFO - [31mstep: 27205 [32mloss: 2.6635 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,166 [36mtflops: 484.48 [35mmfu: 48.99%[39m [37mglobal_avg_ntp_loss: 0.7423 [37mglobal_avg_top_loss: 1.9211
+[titan] 2025-09-09 19:25:19,281 - root - INFO - [34mlr: 6.2517e-06 gnorm: 0.37 [35m[2 days, 1:49:50<23:26:10][39m
+[titan] 2025-09-09 19:25:51,302 - root - INFO - [31mstep: 27210 [32mloss: 2.7562 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.72 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7837 [37mglobal_avg_top_loss: 1.9725
+[titan] 2025-09-09 19:25:51,302 - root - INFO - [34mlr: 6.2487e-06 gnorm: 0.45 [35m[2 days, 1:50:22<23:25:37][39m
+[titan] 2025-09-09 19:26:23,452 - root - INFO - [31mstep: 27215 [32mloss: 2.6512 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,193 [36mtflops: 485.77 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7339 [37mglobal_avg_top_loss: 1.9173
+[titan] 2025-09-09 19:26:23,452 - root - INFO - [34mlr: 6.2457e-06 gnorm: 0.36 [35m[2 days, 1:50:54<23:25:03][39m
+[titan] 2025-09-09 19:26:55,409 - root - INFO - [31mstep: 27220 [32mloss: 2.6973 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.70 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7558 [37mglobal_avg_top_loss: 1.9415
+[titan] 2025-09-09 19:26:55,409 - root - INFO - [34mlr: 6.2426e-06 gnorm: 0.36 [35m[2 days, 1:51:26<23:24:30][39m
+[titan] 2025-09-09 19:27:27,275 - root - INFO - [31mstep: 27225 [32mloss: 2.6883 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.09 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7520 [37mglobal_avg_top_loss: 1.9364
+[titan] 2025-09-09 19:27:27,276 - root - INFO - [34mlr: 6.2396e-06 gnorm: 0.41 [35m[2 days, 1:51:58<23:23:56][39m
+[titan] 2025-09-09 19:27:59,387 - root - INFO - [31mstep: 27230 [32mloss: 2.7478 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.35 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7774 [37mglobal_avg_top_loss: 1.9704
+[titan] 2025-09-09 19:27:59,387 - root - INFO - [34mlr: 6.2366e-06 gnorm: 0.38 [35m[2 days, 1:52:30<23:23:23][39m
+[titan] 2025-09-09 19:28:31,434 - root - INFO - [31mstep: 27235 [32mloss: 2.6302 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.33 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7288 [37mglobal_avg_top_loss: 1.9014
+[titan] 2025-09-09 19:28:31,434 - root - INFO - [34mlr: 6.2336e-06 gnorm: 0.40 [35m[2 days, 1:53:02<23:22:50][39m
+[titan] 2025-09-09 19:29:03,280 - root - INFO - [31mstep: 27240 [32mloss: 2.6055 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.41 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7136 [37mglobal_avg_top_loss: 1.8919
+[titan] 2025-09-09 19:29:03,280 - root - INFO - [34mlr: 6.2305e-06 gnorm: 0.38 [35m[2 days, 1:53:34<23:22:16][39m
+[titan] 2025-09-09 19:29:35,315 - root - INFO - [31mstep: 27245 [32mloss: 2.7772 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.52 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7943 [37mglobal_avg_top_loss: 1.9829
+[titan] 2025-09-09 19:29:35,315 - root - INFO - [34mlr: 6.2275e-06 gnorm: 0.38 [35m[2 days, 1:54:06<23:21:43][39m
+[titan] 2025-09-09 19:30:00,910 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 19:30:07,285 - root - INFO - [31mstep: 27250 [32mloss: 2.7578 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.50 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7853 [37mglobal_avg_top_loss: 1.9726
+[titan] 2025-09-09 19:30:07,285 - root - INFO - [34mlr: 6.2245e-06 gnorm: 0.40 [35m[2 days, 1:54:38<23:21:09][39m
+[titan] 2025-09-09 19:30:39,306 - root - INFO - [31mstep: 27255 [32mloss: 3.7046 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.72 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 1.3229 [37mglobal_avg_top_loss: 2.3816
+[titan] 2025-09-09 19:30:39,307 - root - INFO - [34mlr: 6.2215e-06 gnorm: 0.37 [35m[2 days, 1:55:10<23:20:36][39m
+[titan] 2025-09-09 19:31:11,557 - root - INFO - [31mstep: 27260 [32mloss: 2.6905 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,161 [36mtflops: 484.26 [35mmfu: 48.96%[39m [37mglobal_avg_ntp_loss: 0.7532 [37mglobal_avg_top_loss: 1.9373
+[titan] 2025-09-09 19:31:11,557 - root - INFO - [34mlr: 6.2184e-06 gnorm: 0.40 [35m[2 days, 1:55:42<23:20:03][39m
+[titan] 2025-09-09 19:31:43,626 - root - INFO - [31mstep: 27265 [32mloss: 2.7020 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,218 [36mtflops: 487.00 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7563 [37mglobal_avg_top_loss: 1.9457
+[titan] 2025-09-09 19:31:43,626 - root - INFO - [34mlr: 6.2154e-06 gnorm: 0.38 [35m[2 days, 1:56:14<23:19:29][39m
+[titan] 2025-09-09 19:32:15,717 - root - INFO - [31mstep: 27270 [32mloss: 3.1301 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.66 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 1.0078 [37mglobal_avg_top_loss: 2.1223
+[titan] 2025-09-09 19:32:15,717 - root - INFO - [34mlr: 6.2124e-06 gnorm: 0.39 [35m[2 days, 1:56:47<23:18:56][39m
+[titan] 2025-09-09 19:32:47,847 - root - INFO - [31mstep: 27275 [32mloss: 2.7305 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,199 [36mtflops: 486.06 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7718 [37mglobal_avg_top_loss: 1.9586
+[titan] 2025-09-09 19:32:47,848 - root - INFO - [34mlr: 6.2094e-06 gnorm: 0.37 [35m[2 days, 1:57:19<23:18:22][39m
+[titan] 2025-09-09 19:33:19,745 - root - INFO - [31mstep: 27280 [32mloss: 2.7766 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.60 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7970 [37mglobal_avg_top_loss: 1.9796
+[titan] 2025-09-09 19:33:19,746 - root - INFO - [34mlr: 6.2063e-06 gnorm: 0.43 [35m[2 days, 1:57:51<23:17:49][39m
+[titan] 2025-09-09 19:33:51,888 - root - INFO - [31mstep: 27285 [32mloss: 2.7533 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,195 [36mtflops: 485.88 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7798 [37mglobal_avg_top_loss: 1.9734
+[titan] 2025-09-09 19:33:51,888 - root - INFO - [34mlr: 6.2033e-06 gnorm: 0.37 [35m[2 days, 1:58:23<23:17:16][39m
+[titan] 2025-09-09 19:34:23,928 - root - INFO - [31mstep: 27290 [32mloss: 2.9562 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.43 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.9042 [37mglobal_avg_top_loss: 2.0520
+[titan] 2025-09-09 19:34:23,928 - root - INFO - [34mlr: 6.2003e-06 gnorm: 0.37 [35m[2 days, 1:58:55<23:16:42][39m
+[titan] 2025-09-09 19:34:55,792 - root - INFO - [31mstep: 27295 [32mloss: 2.6356 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,284 [36mtflops: 490.13 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7307 [37mglobal_avg_top_loss: 1.9048
+[titan] 2025-09-09 19:34:55,792 - root - INFO - [34mlr: 6.1973e-06 gnorm: 0.41 [35m[2 days, 1:59:27<23:16:09][39m
+[titan] 2025-09-09 19:35:21,453 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 19:35:27,806 - root - INFO - [31mstep: 27300 [32mloss: 2.6964 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.83 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7595 [37mglobal_avg_top_loss: 1.9370
+[titan] 2025-09-09 19:35:27,806 - root - INFO - [34mlr: 6.1943e-06 gnorm: 0.37 [35m[2 days, 1:59:59<23:15:35][39m
+[titan] 2025-09-09 19:35:59,660 - root - INFO - [31mstep: 27305 [32mloss: 2.7542 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.27 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7859 [37mglobal_avg_top_loss: 1.9683
+[titan] 2025-09-09 19:35:59,661 - root - INFO - [34mlr: 6.1912e-06 gnorm: 0.37 [35m[2 days, 2:00:30<23:15:02][39m
+[titan] 2025-09-09 19:36:31,539 - root - INFO - [31mstep: 27310 [32mloss: 2.7520 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.91 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7791 [37mglobal_avg_top_loss: 1.9729
+[titan] 2025-09-09 19:36:31,539 - root - INFO - [34mlr: 6.1882e-06 gnorm: 0.38 [35m[2 days, 2:01:02<23:14:28][39m
+[titan] 2025-09-09 19:37:03,781 - root - INFO - [31mstep: 27315 [32mloss: 2.6413 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,163 [36mtflops: 484.37 [35mmfu: 48.98%[39m [37mglobal_avg_ntp_loss: 0.7354 [37mglobal_avg_top_loss: 1.9059
+[titan] 2025-09-09 19:37:03,782 - root - INFO - [34mlr: 6.1852e-06 gnorm: 0.38 [35m[2 days, 2:01:35<23:13:55][39m
+[titan] 2025-09-09 19:37:35,889 - root - INFO - [31mstep: 27320 [32mloss: 2.7525 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.41 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7833 [37mglobal_avg_top_loss: 1.9692
+[titan] 2025-09-09 19:37:35,889 - root - INFO - [34mlr: 6.1822e-06 gnorm: 0.38 [35m[2 days, 2:02:07<23:13:22][39m
+[titan] 2025-09-09 19:38:08,291 - root - INFO - [31mstep: 27325 [32mloss: 2.7362 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,113 [36mtflops: 481.98 [35mmfu: 48.73%[39m [37mglobal_avg_ntp_loss: 0.7729 [37mglobal_avg_top_loss: 1.9632
+[titan] 2025-09-09 19:38:08,292 - root - INFO - [34mlr: 6.1792e-06 gnorm: 0.37 [35m[2 days, 2:02:39<23:12:48][39m
+[titan] 2025-09-09 19:38:40,424 - root - INFO - [31mstep: 27330 [32mloss: 2.7705 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,198 [36mtflops: 486.03 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.7971 [37mglobal_avg_top_loss: 1.9734
+[titan] 2025-09-09 19:38:40,424 - root - INFO - [34mlr: 6.1762e-06 gnorm: 0.37 [35m[2 days, 2:03:11<23:12:15][39m
+[titan] 2025-09-09 19:39:12,592 - root - INFO - [31mstep: 27335 [32mloss: 3.6722 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,187 [36mtflops: 485.49 [35mmfu: 49.09%[39m [37mglobal_avg_ntp_loss: 1.3130 [37mglobal_avg_top_loss: 2.3592
+[titan] 2025-09-09 19:39:12,593 - root - INFO - [34mlr: 6.1731e-06 gnorm: 0.42 [35m[2 days, 2:03:43<23:11:42][39m
+[titan] 2025-09-09 19:39:44,432 - root - INFO - [31mstep: 27340 [32mloss: 2.6653 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,292 [36mtflops: 490.49 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7394 [37mglobal_avg_top_loss: 1.9259
+[titan] 2025-09-09 19:39:44,433 - root - INFO - [34mlr: 6.1701e-06 gnorm: 0.38 [35m[2 days, 2:04:15<23:11:08][39m
+[titan] 2025-09-09 19:40:16,442 - root - INFO - [31mstep: 27345 [32mloss: 2.7383 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.90 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7746 [37mglobal_avg_top_loss: 1.9637
+[titan] 2025-09-09 19:40:16,443 - root - INFO - [34mlr: 6.1671e-06 gnorm: 0.37 [35m[2 days, 2:04:47<23:10:35][39m
+[titan] 2025-09-09 19:40:42,013 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 19:40:48,530 - root - INFO - [31mstep: 27350 [32mloss: 3.2440 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.71 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 1.0583 [37mglobal_avg_top_loss: 2.1858
+[titan] 2025-09-09 19:40:48,531 - root - INFO - [34mlr: 6.1641e-06 gnorm: 0.38 [35m[2 days, 2:05:19<23:10:02][39m
+[titan] 2025-09-09 19:41:20,522 - root - INFO - [31mstep: 27355 [32mloss: 3.2478 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.16 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 1.0625 [37mglobal_avg_top_loss: 2.1853
+[titan] 2025-09-09 19:41:20,523 - root - INFO - [34mlr: 6.1611e-06 gnorm: 0.39 [35m[2 days, 2:05:51<23:09:28][39m
+[titan] 2025-09-09 19:41:52,521 - root - INFO - [31mstep: 27360 [32mloss: 2.7529 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.06 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7827 [37mglobal_avg_top_loss: 1.9702
+[titan] 2025-09-09 19:41:52,522 - root - INFO - [34mlr: 6.1581e-06 gnorm: 0.39 [35m[2 days, 2:06:23<23:08:55][39m
+[titan] 2025-09-09 19:42:24,369 - root - INFO - [31mstep: 27365 [32mloss: 2.6660 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,289 [36mtflops: 490.38 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7412 [37mglobal_avg_top_loss: 1.9247
+[titan] 2025-09-09 19:42:24,370 - root - INFO - [34mlr: 6.1551e-06 gnorm: 0.37 [35m[2 days, 2:06:55<23:08:21][39m
+[titan] 2025-09-09 19:42:56,638 - root - INFO - [31mstep: 27370 [32mloss: 2.6101 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,155 [36mtflops: 483.99 [35mmfu: 48.94%[39m [37mglobal_avg_ntp_loss: 0.7181 [37mglobal_avg_top_loss: 1.8921
+[titan] 2025-09-09 19:42:56,638 - root - INFO - [34mlr: 6.1521e-06 gnorm: 0.36 [35m[2 days, 2:07:27<23:07:48][39m
+[titan] 2025-09-09 19:43:28,559 - root - INFO - [31mstep: 27375 [32mloss: 2.7820 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.25 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7932 [37mglobal_avg_top_loss: 1.9888
+[titan] 2025-09-09 19:43:28,560 - root - INFO - [34mlr: 6.1491e-06 gnorm: 0.55 [35m[2 days, 2:07:59<23:07:14][39m
+[titan] 2025-09-09 19:44:00,626 - root - INFO - [31mstep: 27380 [32mloss: 2.7627 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.03 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7883 [37mglobal_avg_top_loss: 1.9744
+[titan] 2025-09-09 19:44:00,627 - root - INFO - [34mlr: 6.1461e-06 gnorm: 0.39 [35m[2 days, 2:08:31<23:06:41][39m
+[titan] 2025-09-09 19:44:32,575 - root - INFO - [31mstep: 27385 [32mloss: 2.6749 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.82 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7450 [37mglobal_avg_top_loss: 1.9299
+[titan] 2025-09-09 19:44:32,576 - root - INFO - [34mlr: 6.1431e-06 gnorm: 0.38 [35m[2 days, 2:09:03<23:06:08][39m
+[titan] 2025-09-09 19:45:04,823 - root - INFO - [31mstep: 27390 [32mloss: 2.7189 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,162 [36mtflops: 484.31 [35mmfu: 48.97%[39m [37mglobal_avg_ntp_loss: 0.7633 [37mglobal_avg_top_loss: 1.9556
+[titan] 2025-09-09 19:45:04,823 - root - INFO - [34mlr: 6.1400e-06 gnorm: 0.40 [35m[2 days, 2:09:36<23:05:34][39m
+[titan] 2025-09-09 19:45:36,726 - root - INFO - [31mstep: 27395 [32mloss: 2.6272 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.52 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7262 [37mglobal_avg_top_loss: 1.9010
+[titan] 2025-09-09 19:45:36,727 - root - INFO - [34mlr: 6.1370e-06 gnorm: 0.38 [35m[2 days, 2:10:08<23:05:01][39m
+[titan] 2025-09-09 19:46:02,254 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 19:46:08,670 - root - INFO - [31mstep: 27400 [32mloss: 3.2022 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.90 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 1.0387 [37mglobal_avg_top_loss: 2.1635
+[titan] 2025-09-09 19:46:08,671 - root - INFO - [34mlr: 6.1340e-06 gnorm: 0.39 [35m[2 days, 2:10:39<23:04:28][39m
+[titan] 2025-09-09 19:46:40,734 - root - INFO - [31mstep: 27405 [32mloss: 2.7130 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,220 [36mtflops: 487.08 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7644 [37mglobal_avg_top_loss: 1.9487
+[titan] 2025-09-09 19:46:40,734 - root - INFO - [34mlr: 6.1310e-06 gnorm: 0.37 [35m[2 days, 2:11:12<23:03:54][39m
+[titan] 2025-09-09 19:47:12,686 - root - INFO - [31mstep: 27410 [32mloss: 2.7198 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.77 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7647 [37mglobal_avg_top_loss: 1.9552
+[titan] 2025-09-09 19:47:12,686 - root - INFO - [34mlr: 6.1280e-06 gnorm: 0.37 [35m[2 days, 2:11:43<23:03:21][39m
+[titan] 2025-09-09 19:47:44,812 - root - INFO - [31mstep: 27415 [32mloss: 3.0966 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,200 [36mtflops: 486.13 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.9709 [37mglobal_avg_top_loss: 2.1257
+[titan] 2025-09-09 19:47:44,812 - root - INFO - [34mlr: 6.1250e-06 gnorm: 0.38 [35m[2 days, 2:12:16<23:02:47][39m
+[titan] 2025-09-09 19:48:16,814 - root - INFO - [31mstep: 27420 [32mloss: 2.7303 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.01 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7730 [37mglobal_avg_top_loss: 1.9573
+[titan] 2025-09-09 19:48:16,815 - root - INFO - [34mlr: 6.1220e-06 gnorm: 0.37 [35m[2 days, 2:12:48<23:02:14][39m
+[titan] 2025-09-09 19:48:48,824 - root - INFO - [31mstep: 27425 [32mloss: 2.6604 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.89 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7384 [37mglobal_avg_top_loss: 1.9220
+[titan] 2025-09-09 19:48:48,825 - root - INFO - [34mlr: 6.1190e-06 gnorm: 0.36 [35m[2 days, 2:13:20<23:01:41][39m
+[titan] 2025-09-09 19:49:20,980 - root - INFO - [31mstep: 27430 [32mloss: 3.2045 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,191 [36mtflops: 485.68 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 1.0378 [37mglobal_avg_top_loss: 2.1667
+[titan] 2025-09-09 19:49:20,980 - root - INFO - [34mlr: 6.1160e-06 gnorm: 0.39 [35m[2 days, 2:13:52<23:01:07][39m
+[titan] 2025-09-09 19:49:53,147 - root - INFO - [31mstep: 27435 [32mloss: 2.6938 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,187 [36mtflops: 485.51 [35mmfu: 49.09%[39m [37mglobal_avg_ntp_loss: 0.7552 [37mglobal_avg_top_loss: 1.9387
+[titan] 2025-09-09 19:49:53,148 - root - INFO - [34mlr: 6.1130e-06 gnorm: 0.37 [35m[2 days, 2:14:24<23:00:34][39m
+[titan] 2025-09-09 19:50:25,231 - root - INFO - [31mstep: 27440 [32mloss: 2.7039 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,214 [36mtflops: 486.77 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7604 [37mglobal_avg_top_loss: 1.9435
+[titan] 2025-09-09 19:50:25,231 - root - INFO - [34mlr: 6.1100e-06 gnorm: 0.38 [35m[2 days, 2:14:56<23:00:01][39m
+[titan] 2025-09-09 19:50:57,335 - root - INFO - [31mstep: 27445 [32mloss: 2.6224 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.46 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7239 [37mglobal_avg_top_loss: 1.8986
+[titan] 2025-09-09 19:50:57,335 - root - INFO - [34mlr: 6.1070e-06 gnorm: 0.36 [35m[2 days, 2:15:28<22:59:27][39m
+[titan] 2025-09-09 19:51:23,014 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 19:51:29,568 - root - INFO - [31mstep: 27450 [32mloss: 2.7362 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,166 [36mtflops: 484.52 [35mmfu: 48.99%[39m [37mglobal_avg_ntp_loss: 0.7747 [37mglobal_avg_top_loss: 1.9615
+[titan] 2025-09-09 19:51:29,568 - root - INFO - [34mlr: 6.1040e-06 gnorm: 0.38 [35m[2 days, 2:16:00<22:58:54][39m
+[titan] 2025-09-09 19:52:01,614 - root - INFO - [31mstep: 27455 [32mloss: 2.7540 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.35 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7794 [37mglobal_avg_top_loss: 1.9747
+[titan] 2025-09-09 19:52:01,614 - root - INFO - [34mlr: 6.1011e-06 gnorm: 0.50 [35m[2 days, 2:16:32<22:58:20][39m
+[titan] 2025-09-09 19:52:33,610 - root - INFO - [31mstep: 27460 [32mloss: 2.7341 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.09 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7713 [37mglobal_avg_top_loss: 1.9628
+[titan] 2025-09-09 19:52:33,611 - root - INFO - [34mlr: 6.0981e-06 gnorm: 0.39 [35m[2 days, 2:17:04<22:57:47][39m
+[titan] 2025-09-09 19:53:05,588 - root - INFO - [31mstep: 27465 [32mloss: 2.6968 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.39 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7580 [37mglobal_avg_top_loss: 1.9388
+[titan] 2025-09-09 19:53:05,588 - root - INFO - [34mlr: 6.0951e-06 gnorm: 0.39 [35m[2 days, 2:17:36<22:57:14][39m
+[titan] 2025-09-09 19:53:37,733 - root - INFO - [31mstep: 27470 [32mloss: 2.7709 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,194 [36mtflops: 485.84 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7893 [37mglobal_avg_top_loss: 1.9816
+[titan] 2025-09-09 19:53:37,734 - root - INFO - [34mlr: 6.0921e-06 gnorm: 0.40 [35m[2 days, 2:18:08<22:56:40][39m
+[titan] 2025-09-09 19:54:09,955 - root - INFO - [31mstep: 27475 [32mloss: 2.6824 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,170 [36mtflops: 484.68 [35mmfu: 49.01%[39m [37mglobal_avg_ntp_loss: 0.7459 [37mglobal_avg_top_loss: 1.9365
+[titan] 2025-09-09 19:54:09,956 - root - INFO - [34mlr: 6.0891e-06 gnorm: 0.37 [35m[2 days, 2:18:41<22:56:07][39m
+[titan] 2025-09-09 19:54:42,002 - root - INFO - [31mstep: 27480 [32mloss: 2.7159 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.34 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7651 [37mglobal_avg_top_loss: 1.9508
+[titan] 2025-09-09 19:54:42,002 - root - INFO - [34mlr: 6.0861e-06 gnorm: 0.37 [35m[2 days, 2:19:13<22:55:34][39m
+[titan] 2025-09-09 19:55:14,137 - root - INFO - [31mstep: 27485 [32mloss: 2.6776 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,197 [36mtflops: 486.00 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.7357 [37mglobal_avg_top_loss: 1.9419
+[titan] 2025-09-09 19:55:14,137 - root - INFO - [34mlr: 6.0831e-06 gnorm: 0.66 [35m[2 days, 2:19:45<22:55:00][39m
+[titan] 2025-09-09 19:55:46,147 - root - INFO - [31mstep: 27490 [32mloss: 2.7540 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.89 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7806 [37mglobal_avg_top_loss: 1.9734
+[titan] 2025-09-09 19:55:46,148 - root - INFO - [34mlr: 6.0801e-06 gnorm: 0.37 [35m[2 days, 2:20:17<22:54:27][39m
+[titan] 2025-09-09 19:56:18,093 - root - INFO - [31mstep: 27495 [32mloss: 2.6953 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.88 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7540 [37mglobal_avg_top_loss: 1.9413
+[titan] 2025-09-09 19:56:18,093 - root - INFO - [34mlr: 6.0771e-06 gnorm: 0.38 [35m[2 days, 2:20:49<22:53:54][39m
+[titan] 2025-09-09 19:56:43,689 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
+[titan] 2025-09-09 19:56:50,109 - root - INFO - [31mstep: 27500 [32mloss: 2.6832 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.80 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7485 [37mglobal_avg_top_loss: 1.9347
+[titan] 2025-09-09 19:56:50,110 - root - INFO - [34mlr: 6.0741e-06 gnorm: 0.37 [35m[2 days, 2:21:21<22:53:20][39m
+[titan] 2025-09-09 19:57:22,269 - root - INFO - [31mstep: 27505 [32mloss: 2.6244 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,189 [36mtflops: 485.61 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.7235 [37mglobal_avg_top_loss: 1.9009
+[titan] 2025-09-09 19:57:22,270 - root - INFO - [34mlr: 6.0711e-06 gnorm: 0.36 [35m[2 days, 2:21:53<22:52:47][39m
+[titan] 2025-09-09 19:57:54,296 - root - INFO - [31mstep: 27510 [32mloss: 3.1628 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.64 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 1.0215 [37mglobal_avg_top_loss: 2.1413
+[titan] 2025-09-09 19:57:54,296 - root - INFO - [34mlr: 6.0682e-06 gnorm: 0.37 [35m[2 days, 2:22:25<22:52:13][39m
+[titan] 2025-09-09 19:58:26,527 - root - INFO - [31mstep: 27515 [32mloss: 2.7030 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,167 [36mtflops: 484.55 [35mmfu: 48.99%[39m [37mglobal_avg_ntp_loss: 0.7594 [37mglobal_avg_top_loss: 1.9436
+[titan] 2025-09-09 19:58:26,527 - root - INFO - [34mlr: 6.0652e-06 gnorm: 0.44 [35m[2 days, 2:22:57<22:51:40][39m
+[titan] 2025-09-09 19:58:58,587 - root - INFO - [31mstep: 27520 [32mloss: 2.6953 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.13 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7537 [37mglobal_avg_top_loss: 1.9416
+[titan] 2025-09-09 19:58:58,587 - root - INFO - [34mlr: 6.0622e-06 gnorm: 0.41 [35m[2 days, 2:23:29<22:51:07][39m
+[titan] 2025-09-09 19:59:30,617 - root - INFO - [31mstep: 27525 [32mloss: 2.6840 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,231 [36mtflops: 487.59 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7492 [37mglobal_avg_top_loss: 1.9348
+[titan] 2025-09-09 19:59:30,617 - root - INFO - [34mlr: 6.0592e-06 gnorm: 0.38 [35m[2 days, 2:24:01<22:50:33][39m
+[titan] 2025-09-09 20:00:02,763 - root - INFO - [31mstep: 27530 [32mloss: 2.7763 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,194 [36mtflops: 485.83 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7919 [37mglobal_avg_top_loss: 1.9844
+[titan] 2025-09-09 20:00:02,763 - root - INFO - [34mlr: 6.0562e-06 gnorm: 0.38 [35m[2 days, 2:24:34<22:50:00][39m
+[titan] 2025-09-09 20:00:34,732 - root - INFO - [31mstep: 27535 [32mloss: 2.7301 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.51 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7703 [37mglobal_avg_top_loss: 1.9598
+[titan] 2025-09-09 20:00:34,733 - root - INFO - [34mlr: 6.0532e-06 gnorm: 0.42 [35m[2 days, 2:25:05<22:49:27][39m
+[titan] 2025-09-09 20:01:06,792 - root - INFO - [31mstep: 27540 [32mloss: 2.7005 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.14 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7577 [37mglobal_avg_top_loss: 1.9428
+[titan] 2025-09-09 20:01:06,792 - root - INFO - [34mlr: 6.0503e-06 gnorm: 0.38 [35m[2 days, 2:25:38<22:48:53][39m
+[titan] 2025-09-09 20:01:38,813 - root - INFO - [31mstep: 27545 [32mloss: 3.2007 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.72 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 1.0387 [37mglobal_avg_top_loss: 2.1619
+[titan] 2025-09-09 20:01:38,813 - root - INFO - [34mlr: 6.0473e-06 gnorm: 0.37 [35m[2 days, 2:26:10<22:48:20][39m
+[titan] 2025-09-09 20:02:04,423 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
+[titan] 2025-09-09 20:02:10,844 - root - INFO - [31mstep: 27550 [32mloss: 2.7095 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.57 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7634 [37mglobal_avg_top_loss: 1.9461
+[titan] 2025-09-09 20:02:10,845 - root - INFO - [34mlr: 6.0443e-06 gnorm: 0.36 [35m[2 days, 2:26:42<22:47:47][39m
+[titan] 2025-09-09 20:02:43,093 - root - INFO - [31mstep: 27555 [32mloss: 2.6413 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,161 [36mtflops: 484.28 [35mmfu: 48.97%[39m [37mglobal_avg_ntp_loss: 0.7298 [37mglobal_avg_top_loss: 1.9114
+[titan] 2025-09-09 20:02:43,094 - root - INFO - [34mlr: 6.0413e-06 gnorm: 0.38 [35m[2 days, 2:27:14<22:47:13][39m
+[titan] 2025-09-09 20:03:15,158 - root - INFO - [31mstep: 27560 [32mloss: 2.7785 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.05 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7945 [37mglobal_avg_top_loss: 1.9840
+[titan] 2025-09-09 20:03:15,159 - root - INFO - [34mlr: 6.0383e-06 gnorm: 0.38 [35m[2 days, 2:27:46<22:46:40][39m
+[titan] 2025-09-09 20:03:47,318 - root - INFO - [31mstep: 27565 [32mloss: 2.6802 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,190 [36mtflops: 485.63 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.7481 [37mglobal_avg_top_loss: 1.9320
+[titan] 2025-09-09 20:03:47,318 - root - INFO - [34mlr: 6.0354e-06 gnorm: 0.36 [35m[2 days, 2:28:18<22:46:07][39m
+[titan] 2025-09-09 20:04:19,472 - root - INFO - [31mstep: 27570 [32mloss: 2.7005 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,191 [36mtflops: 485.71 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 0.7580 [37mglobal_avg_top_loss: 1.9425
+[titan] 2025-09-09 20:04:19,472 - root - INFO - [34mlr: 6.0324e-06 gnorm: 0.62 [35m[2 days, 2:28:50<22:45:33][39m
+[titan] 2025-09-09 20:04:51,441 - root - INFO - [31mstep: 27575 [32mloss: 2.7155 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.52 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7669 [37mglobal_avg_top_loss: 1.9486
+[titan] 2025-09-09 20:04:51,441 - root - INFO - [34mlr: 6.0294e-06 gnorm: 0.37 [35m[2 days, 2:29:22<22:45:00][39m
+[titan] 2025-09-09 20:05:23,276 - root - INFO - [31mstep: 27580 [32mloss: 2.7277 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.57 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7699 [37mglobal_avg_top_loss: 1.9577
+[titan] 2025-09-09 20:05:23,277 - root - INFO - [34mlr: 6.0264e-06 gnorm: 0.36 [35m[2 days, 2:29:54<22:44:26][39m
+[titan] 2025-09-09 20:05:55,367 - root - INFO - [31mstep: 27585 [32mloss: 2.7860 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.66 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.8004 [37mglobal_avg_top_loss: 1.9856
+[titan] 2025-09-09 20:05:55,368 - root - INFO - [34mlr: 6.0235e-06 gnorm: 0.42 [35m[2 days, 2:30:26<22:43:53][39m
+[titan] 2025-09-09 20:06:27,596 - root - INFO - [31mstep: 27590 [32mloss: 2.6917 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,168 [36mtflops: 484.58 [35mmfu: 49.00%[39m [37mglobal_avg_ntp_loss: 0.7537 [37mglobal_avg_top_loss: 1.9379
+[titan] 2025-09-09 20:06:27,597 - root - INFO - [34mlr: 6.0205e-06 gnorm: 0.38 [35m[2 days, 2:30:58<22:43:20][39m
+[titan] 2025-09-09 20:06:59,531 - root - INFO - [31mstep: 27595 [32mloss: 2.7512 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.04 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7786 [37mglobal_avg_top_loss: 1.9726
+[titan] 2025-09-09 20:06:59,532 - root - INFO - [34mlr: 6.0175e-06 gnorm: 0.38 [35m[2 days, 2:31:30<22:42:46][39m
+[titan] 2025-09-09 20:07:25,377 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 20:07:31,769 - root - INFO - [31mstep: 27600 [32mloss: 2.6837 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,165 [36mtflops: 484.45 [35mmfu: 48.98%[39m [37mglobal_avg_ntp_loss: 0.7489 [37mglobal_avg_top_loss: 1.9348
+[titan] 2025-09-09 20:07:31,769 - root - INFO - [34mlr: 6.0145e-06 gnorm: 0.38 [35m[2 days, 2:32:03<22:42:13][39m
+[titan] 2025-09-09 20:08:03,866 - root - INFO - [31mstep: 27605 [32mloss: 2.7257 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.57 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7721 [37mglobal_avg_top_loss: 1.9536
+[titan] 2025-09-09 20:08:03,867 - root - INFO - [34mlr: 6.0116e-06 gnorm: 0.37 [35m[2 days, 2:32:35<22:41:40][39m
+[titan] 2025-09-09 20:08:35,733 - root - INFO - [31mstep: 27610 [32mloss: 2.7324 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.09 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7754 [37mglobal_avg_top_loss: 1.9570
+[titan] 2025-09-09 20:08:35,733 - root - INFO - [34mlr: 6.0086e-06 gnorm: 0.39 [35m[2 days, 2:33:06<22:41:06][39m
+[titan] 2025-09-09 20:09:07,693 - root - INFO - [31mstep: 27615 [32mloss: 2.7002 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.65 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7548 [37mglobal_avg_top_loss: 1.9454
+[titan] 2025-09-09 20:09:07,694 - root - INFO - [34mlr: 6.0056e-06 gnorm: 0.45 [35m[2 days, 2:33:38<22:40:33][39m
+[titan] 2025-09-09 20:09:39,612 - root - INFO - [31mstep: 27620 [32mloss: 2.7725 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.28 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7925 [37mglobal_avg_top_loss: 1.9799
+[titan] 2025-09-09 20:09:39,613 - root - INFO - [34mlr: 6.0027e-06 gnorm: 0.38 [35m[2 days, 2:34:10<22:39:59][39m
+[titan] 2025-09-09 20:10:11,764 - root - INFO - [31mstep: 27625 [32mloss: 3.2329 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,192 [36mtflops: 485.75 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 1.0521 [37mglobal_avg_top_loss: 2.1808
+[titan] 2025-09-09 20:10:11,765 - root - INFO - [34mlr: 5.9997e-06 gnorm: 0.37 [35m[2 days, 2:34:42<22:39:26][39m
+[titan] 2025-09-09 20:10:43,627 - root - INFO - [31mstep: 27630 [32mloss: 2.7781 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,284 [36mtflops: 490.14 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7998 [37mglobal_avg_top_loss: 1.9783
+[titan] 2025-09-09 20:10:43,628 - root - INFO - [34mlr: 5.9967e-06 gnorm: 0.38 [35m[2 days, 2:35:14<22:38:53][39m
+[titan] 2025-09-09 20:11:15,602 - root - INFO - [31mstep: 27635 [32mloss: 2.6257 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.43 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7219 [37mglobal_avg_top_loss: 1.9037
+[titan] 2025-09-09 20:11:15,602 - root - INFO - [34mlr: 5.9937e-06 gnorm: 0.36 [35m[2 days, 2:35:46<22:38:19][39m
+[titan] 2025-09-09 20:11:47,610 - root - INFO - [31mstep: 27640 [32mloss: 2.6846 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.93 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7516 [37mglobal_avg_top_loss: 1.9330
+[titan] 2025-09-09 20:11:47,610 - root - INFO - [34mlr: 5.9908e-06 gnorm: 0.37 [35m[2 days, 2:36:18<22:37:46][39m
+[titan] 2025-09-09 20:12:19,839 - root - INFO - [31mstep: 27645 [32mloss: 2.6790 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,168 [36mtflops: 484.58 [35mmfu: 49.00%[39m [37mglobal_avg_ntp_loss: 0.7468 [37mglobal_avg_top_loss: 1.9322
+[titan] 2025-09-09 20:12:19,839 - root - INFO - [34mlr: 5.9878e-06 gnorm: 0.37 [35m[2 days, 2:36:51<22:37:13][39m
+[titan] 2025-09-09 20:12:39,278 - root - INFO - Dumping profiler traces at step 27648
+[titan] 2025-09-09 20:12:39,335 - root - INFO - Finished dumping profiler traces in 0.06 seconds
+[titan] 2025-09-09 20:12:45,702 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 20:12:52,061 - root - INFO - [31mstep: 27650 [32mloss: 2.6148 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,170 [36mtflops: 484.68 [35mmfu: 49.01%[39m [37mglobal_avg_ntp_loss: 0.7215 [37mglobal_avg_top_loss: 1.8933
+[titan] 2025-09-09 20:12:52,061 - root - INFO - [34mlr: 5.9849e-06 gnorm: 0.36 [35m[2 days, 2:37:23<22:36:39][39m
+[titan] 2025-09-09 20:13:24,071 - root - INFO - [31mstep: 27655 [32mloss: 2.6972 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.89 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7571 [37mglobal_avg_top_loss: 1.9401
+[titan] 2025-09-09 20:13:24,071 - root - INFO - [34mlr: 5.9819e-06 gnorm: 0.37 [35m[2 days, 2:37:55<22:36:06][39m
+[titan] 2025-09-09 20:13:56,082 - root - INFO - [31mstep: 27660 [32mloss: 2.6261 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.88 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7168 [37mglobal_avg_top_loss: 1.9093
+[titan] 2025-09-09 20:13:56,082 - root - INFO - [34mlr: 5.9789e-06 gnorm: 0.45 [35m[2 days, 2:38:27<22:35:33][39m
+[titan] 2025-09-09 20:14:28,102 - root - INFO - [31mstep: 27665 [32mloss: 2.7333 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.73 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7723 [37mglobal_avg_top_loss: 1.9610
+[titan] 2025-09-09 20:14:28,103 - root - INFO - [34mlr: 5.9760e-06 gnorm: 0.38 [35m[2 days, 2:38:59<22:34:59][39m
+[titan] 2025-09-09 20:15:00,192 - root - INFO - [31mstep: 27670 [32mloss: 2.6630 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.68 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7456 [37mglobal_avg_top_loss: 1.9174
+[titan] 2025-09-09 20:15:00,192 - root - INFO - [34mlr: 5.9730e-06 gnorm: 0.37 [35m[2 days, 2:39:31<22:34:26][39m
+[titan] 2025-09-09 20:15:31,995 - root - INFO - [31mstep: 27675 [32mloss: 2.6408 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,304 [36mtflops: 491.06 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7313 [37mglobal_avg_top_loss: 1.9095
+[titan] 2025-09-09 20:15:31,996 - root - INFO - [34mlr: 5.9700e-06 gnorm: 0.37 [35m[2 days, 2:40:03<22:33:52][39m
+[titan] 2025-09-09 20:16:04,045 - root - INFO - [31mstep: 27680 [32mloss: 2.6419 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.28 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7316 [37mglobal_avg_top_loss: 1.9104
+[titan] 2025-09-09 20:16:04,046 - root - INFO - [34mlr: 5.9671e-06 gnorm: 0.36 [35m[2 days, 2:40:35<22:33:19][39m
+[titan] 2025-09-09 20:16:36,103 - root - INFO - [31mstep: 27685 [32mloss: 2.7632 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,222 [36mtflops: 487.16 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7870 [37mglobal_avg_top_loss: 1.9761
+[titan] 2025-09-09 20:16:36,104 - root - INFO - [34mlr: 5.9641e-06 gnorm: 0.38 [35m[2 days, 2:41:07<22:32:46][39m
+[titan] 2025-09-09 20:17:08,384 - root - INFO - [31mstep: 27690 [32mloss: 2.7392 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,151 [36mtflops: 483.80 [35mmfu: 48.92%[39m [37mglobal_avg_ntp_loss: 0.7735 [37mglobal_avg_top_loss: 1.9657
+[titan] 2025-09-09 20:17:08,384 - root - INFO - [34mlr: 5.9612e-06 gnorm: 0.38 [35m[2 days, 2:41:39<22:32:12][39m
+[titan] 2025-09-09 20:17:40,218 - root - INFO - [31mstep: 27695 [32mloss: 2.7230 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,294 [36mtflops: 490.60 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7709 [37mglobal_avg_top_loss: 1.9521
+[titan] 2025-09-09 20:17:40,218 - root - INFO - [34mlr: 5.9582e-06 gnorm: 0.37 [35m[2 days, 2:42:11<22:31:39][39m
+[titan] 2025-09-09 20:18:05,935 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 20:18:12,389 - root - INFO - [31mstep: 27700 [32mloss: 2.6947 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,186 [36mtflops: 485.45 [35mmfu: 49.08%[39m [37mglobal_avg_ntp_loss: 0.7504 [37mglobal_avg_top_loss: 1.9443
+[titan] 2025-09-09 20:18:12,389 - root - INFO - [34mlr: 5.9552e-06 gnorm: 0.38 [35m[2 days, 2:42:43<22:31:06][39m
+[titan] 2025-09-09 20:18:44,121 - root - INFO - [31mstep: 27705 [32mloss: 3.2882 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,327 [36mtflops: 492.16 [35mmfu: 49.76%[39m [37mglobal_avg_ntp_loss: 1.0857 [37mglobal_avg_top_loss: 2.2025
+[titan] 2025-09-09 20:18:44,122 - root - INFO - [34mlr: 5.9523e-06 gnorm: 0.36 [35m[2 days, 2:43:15<22:30:32][39m
+[titan] 2025-09-09 20:19:16,187 - root - INFO - [31mstep: 27710 [32mloss: 2.6179 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.12 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7210 [37mglobal_avg_top_loss: 1.8969
+[titan] 2025-09-09 20:19:16,187 - root - INFO - [34mlr: 5.9493e-06 gnorm: 0.36 [35m[2 days, 2:43:47<22:29:59][39m
+[titan] 2025-09-09 20:19:48,197 - root - INFO - [31mstep: 27715 [32mloss: 2.7285 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.89 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7739 [37mglobal_avg_top_loss: 1.9546
+[titan] 2025-09-09 20:19:48,198 - root - INFO - [34mlr: 5.9464e-06 gnorm: 0.38 [35m[2 days, 2:44:19<22:29:25][39m
+[titan] 2025-09-09 20:20:20,146 - root - INFO - [31mstep: 27720 [32mloss: 2.6512 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.82 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7383 [37mglobal_avg_top_loss: 1.9129
+[titan] 2025-09-09 20:20:20,147 - root - INFO - [34mlr: 5.9434e-06 gnorm: 0.38 [35m[2 days, 2:44:51<22:28:52][39m
+[titan] 2025-09-09 20:20:52,030 - root - INFO - [31mstep: 27725 [32mloss: 2.7467 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,278 [36mtflops: 489.83 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7770 [37mglobal_avg_top_loss: 1.9697
+[titan] 2025-09-09 20:20:52,030 - root - INFO - [34mlr: 5.9405e-06 gnorm: 0.38 [35m[2 days, 2:45:23<22:28:19][39m
+[titan] 2025-09-09 20:21:23,985 - root - INFO - [31mstep: 27730 [32mloss: 2.6245 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.74 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7239 [37mglobal_avg_top_loss: 1.9006
+[titan] 2025-09-09 20:21:23,985 - root - INFO - [34mlr: 5.9375e-06 gnorm: 0.40 [35m[2 days, 2:45:55<22:27:45][39m
+[titan] 2025-09-09 20:21:56,412 - root - INFO - [31mstep: 27735 [32mloss: 2.7080 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,105 [36mtflops: 481.62 [35mmfu: 48.70%[39m [37mglobal_avg_ntp_loss: 0.7666 [37mglobal_avg_top_loss: 1.9414
+[titan] 2025-09-09 20:21:56,412 - root - INFO - [34mlr: 5.9346e-06 gnorm: 0.38 [35m[2 days, 2:46:27<22:27:12][39m
+[titan] 2025-09-09 20:22:28,310 - root - INFO - [31mstep: 27740 [32mloss: 2.6339 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.60 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7294 [37mglobal_avg_top_loss: 1.9045
+[titan] 2025-09-09 20:22:28,310 - root - INFO - [34mlr: 5.9316e-06 gnorm: 0.39 [35m[2 days, 2:46:59<22:26:39][39m
+[titan] 2025-09-09 20:23:00,336 - root - INFO - [31mstep: 27745 [32mloss: 2.7238 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.65 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7675 [37mglobal_avg_top_loss: 1.9563
+[titan] 2025-09-09 20:23:00,336 - root - INFO - [34mlr: 5.9287e-06 gnorm: 0.39 [35m[2 days, 2:47:31<22:26:05][39m
+[titan] 2025-09-09 20:23:25,737 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 20:23:32,139 - root - INFO - [31mstep: 27750 [32mloss: 3.1013 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,304 [36mtflops: 491.07 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.9949 [37mglobal_avg_top_loss: 2.1064
+[titan] 2025-09-09 20:23:32,139 - root - INFO - [34mlr: 5.9257e-06 gnorm: 0.37 [35m[2 days, 2:48:03<22:25:32][39m
+[titan] 2025-09-09 20:24:04,277 - root - INFO - [31mstep: 27755 [32mloss: 2.7528 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,196 [36mtflops: 485.94 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7814 [37mglobal_avg_top_loss: 1.9714
+[titan] 2025-09-09 20:24:04,278 - root - INFO - [34mlr: 5.9228e-06 gnorm: 0.38 [35m[2 days, 2:48:35<22:24:58][39m
+[titan] 2025-09-09 20:24:36,239 - root - INFO - [31mstep: 27760 [32mloss: 2.6562 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.64 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7350 [37mglobal_avg_top_loss: 1.9213
+[titan] 2025-09-09 20:24:36,239 - root - INFO - [34mlr: 5.9198e-06 gnorm: 0.42 [35m[2 days, 2:49:07<22:24:25][39m
+[titan] 2025-09-09 20:25:08,110 - root - INFO - [31mstep: 27765 [32mloss: 2.6901 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,282 [36mtflops: 490.01 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7548 [37mglobal_avg_top_loss: 1.9353
+[titan] 2025-09-09 20:25:08,110 - root - INFO - [34mlr: 5.9169e-06 gnorm: 0.38 [35m[2 days, 2:49:39<22:23:52][39m
+[titan] 2025-09-09 20:25:39,934 - root - INFO - [31mstep: 27770 [32mloss: 2.7141 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,297 [36mtflops: 490.74 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.7625 [37mglobal_avg_top_loss: 1.9516
+[titan] 2025-09-09 20:25:39,935 - root - INFO - [34mlr: 5.9139e-06 gnorm: 0.39 [35m[2 days, 2:50:11<22:23:18][39m
+[titan] 2025-09-09 20:26:11,990 - root - INFO - [31mstep: 27775 [32mloss: 2.5712 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.20 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7013 [37mglobal_avg_top_loss: 1.8698
+[titan] 2025-09-09 20:26:11,990 - root - INFO - [34mlr: 5.9110e-06 gnorm: 0.37 [35m[2 days, 2:50:43<22:22:45][39m
+[titan] 2025-09-09 20:26:43,981 - root - INFO - [31mstep: 27780 [32mloss: 2.6533 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.17 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7368 [37mglobal_avg_top_loss: 1.9165
+[titan] 2025-09-09 20:26:43,982 - root - INFO - [34mlr: 5.9080e-06 gnorm: 0.37 [35m[2 days, 2:51:15<22:22:11][39m
+[titan] 2025-09-09 20:27:15,992 - root - INFO - [31mstep: 27785 [32mloss: 3.1062 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.88 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.9976 [37mglobal_avg_top_loss: 2.1086
+[titan] 2025-09-09 20:27:15,992 - root - INFO - [34mlr: 5.9051e-06 gnorm: 0.35 [35m[2 days, 2:51:47<22:21:38][39m
+[titan] 2025-09-09 20:27:48,058 - root - INFO - [31mstep: 27790 [32mloss: 2.6567 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.04 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7385 [37mglobal_avg_top_loss: 1.9182
+[titan] 2025-09-09 20:27:48,058 - root - INFO - [34mlr: 5.9022e-06 gnorm: 0.37 [35m[2 days, 2:52:19<22:21:05][39m
+[titan] 2025-09-09 20:28:20,124 - root - INFO - [31mstep: 27795 [32mloss: 2.7361 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.04 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7737 [37mglobal_avg_top_loss: 1.9624
+[titan] 2025-09-09 20:28:20,124 - root - INFO - [34mlr: 5.8992e-06 gnorm: 0.40 [35m[2 days, 2:52:51<22:20:31][39m
+[titan] 2025-09-09 20:28:45,736 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 20:28:52,137 - root - INFO - [31mstep: 27800 [32mloss: 2.7212 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.84 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7679 [37mglobal_avg_top_loss: 1.9533
+[titan] 2025-09-09 20:28:52,138 - root - INFO - [34mlr: 5.8963e-06 gnorm: 0.38 [35m[2 days, 2:53:23<22:19:58][39m
+[titan] 2025-09-09 20:29:24,069 - root - INFO - [31mstep: 27805 [32mloss: 2.7507 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.09 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7806 [37mglobal_avg_top_loss: 1.9700
+[titan] 2025-09-09 20:29:24,069 - root - INFO - [34mlr: 5.8933e-06 gnorm: 0.41 [35m[2 days, 2:53:55<22:19:25][39m
+[titan] 2025-09-09 20:29:56,045 - root - INFO - [31mstep: 27810 [32mloss: 2.6492 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.42 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7346 [37mglobal_avg_top_loss: 1.9146
+[titan] 2025-09-09 20:29:56,045 - root - INFO - [34mlr: 5.8904e-06 gnorm: 0.39 [35m[2 days, 2:54:27<22:18:51][39m
+[titan] 2025-09-09 20:30:27,920 - root - INFO - [31mstep: 27815 [32mloss: 2.7739 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.95 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7886 [37mglobal_avg_top_loss: 1.9853
+[titan] 2025-09-09 20:30:27,920 - root - INFO - [34mlr: 5.8875e-06 gnorm: 0.43 [35m[2 days, 2:54:59<22:18:18][39m
+[titan] 2025-09-09 20:30:59,941 - root - INFO - [31mstep: 27820 [32mloss: 2.7760 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.73 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7907 [37mglobal_avg_top_loss: 1.9852
+[titan] 2025-09-09 20:30:59,941 - root - INFO - [34mlr: 5.8845e-06 gnorm: 0.47 [35m[2 days, 2:55:31<22:17:45][39m
+[titan] 2025-09-09 20:31:31,956 - root - INFO - [31mstep: 27825 [32mloss: 2.6948 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.81 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7546 [37mglobal_avg_top_loss: 1.9402
+[titan] 2025-09-09 20:31:31,956 - root - INFO - [34mlr: 5.8816e-06 gnorm: 0.40 [35m[2 days, 2:56:03<22:17:11][39m
+[titan] 2025-09-09 20:32:04,192 - root - INFO - [31mstep: 27830 [32mloss: 2.7136 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,165 [36mtflops: 484.48 [35mmfu: 48.99%[39m [37mglobal_avg_ntp_loss: 0.7647 [37mglobal_avg_top_loss: 1.9489
+[titan] 2025-09-09 20:32:04,192 - root - INFO - [34mlr: 5.8787e-06 gnorm: 0.38 [35m[2 days, 2:56:35<22:16:38][39m
+[titan] 2025-09-09 20:32:36,176 - root - INFO - [31mstep: 27835 [32mloss: 2.6755 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,245 [36mtflops: 488.28 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7463 [37mglobal_avg_top_loss: 1.9292
+[titan] 2025-09-09 20:32:36,177 - root - INFO - [34mlr: 5.8757e-06 gnorm: 0.37 [35m[2 days, 2:57:07<22:16:05][39m
+[titan] 2025-09-09 20:33:08,157 - root - INFO - [31mstep: 27840 [32mloss: 2.7392 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.34 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7754 [37mglobal_avg_top_loss: 1.9638
+[titan] 2025-09-09 20:33:08,158 - root - INFO - [34mlr: 5.8728e-06 gnorm: 0.41 [35m[2 days, 2:57:39<22:15:31][39m
+[titan] 2025-09-09 20:33:39,972 - root - INFO - [31mstep: 27845 [32mloss: 2.7575 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.89 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7808 [37mglobal_avg_top_loss: 1.9766
+[titan] 2025-09-09 20:33:39,972 - root - INFO - [34mlr: 5.8698e-06 gnorm: 0.41 [35m[2 days, 2:58:11<22:14:58][39m
+[titan] 2025-09-09 20:34:05,443 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 20:34:11,851 - root - INFO - [31mstep: 27850 [32mloss: 2.6612 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.89 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7407 [37mglobal_avg_top_loss: 1.9204
+[titan] 2025-09-09 20:34:11,852 - root - INFO - [34mlr: 5.8669e-06 gnorm: 0.38 [35m[2 days, 2:58:43<22:14:24][39m
+[titan] 2025-09-09 20:34:43,833 - root - INFO - [31mstep: 27855 [32mloss: 2.7084 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.33 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7628 [37mglobal_avg_top_loss: 1.9456
+[titan] 2025-09-09 20:34:43,833 - root - INFO - [34mlr: 5.8640e-06 gnorm: 0.37 [35m[2 days, 2:59:15<22:13:51][39m
+[titan] 2025-09-09 20:35:15,730 - root - INFO - [31mstep: 27860 [32mloss: 2.6756 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.62 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7448 [37mglobal_avg_top_loss: 1.9307
+[titan] 2025-09-09 20:35:15,730 - root - INFO - [34mlr: 5.8611e-06 gnorm: 0.38 [35m[2 days, 2:59:46<22:13:18][39m
+[titan] 2025-09-09 20:35:47,711 - root - INFO - [31mstep: 27865 [32mloss: 2.6445 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.33 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7335 [37mglobal_avg_top_loss: 1.9110
+[titan] 2025-09-09 20:35:47,711 - root - INFO - [34mlr: 5.8581e-06 gnorm: 0.36 [35m[2 days, 3:00:18<22:12:44][39m
+[titan] 2025-09-09 20:36:19,554 - root - INFO - [31mstep: 27870 [32mloss: 2.6463 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.46 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7369 [37mglobal_avg_top_loss: 1.9094
+[titan] 2025-09-09 20:36:19,554 - root - INFO - [34mlr: 5.8552e-06 gnorm: 0.37 [35m[2 days, 3:00:50<22:12:11][39m
+[titan] 2025-09-09 20:36:51,375 - root - INFO - [31mstep: 27875 [32mloss: 2.5530 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.78 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.6897 [37mglobal_avg_top_loss: 1.8633
+[titan] 2025-09-09 20:36:51,376 - root - INFO - [34mlr: 5.8523e-06 gnorm: 0.38 [35m[2 days, 3:01:22<22:11:37][39m
+[titan] 2025-09-09 20:37:23,101 - root - INFO - [31mstep: 27880 [32mloss: 2.7208 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,329 [36mtflops: 492.27 [35mmfu: 49.77%[39m [37mglobal_avg_ntp_loss: 0.7674 [37mglobal_avg_top_loss: 1.9535
+[titan] 2025-09-09 20:37:23,101 - root - INFO - [34mlr: 5.8493e-06 gnorm: 0.38 [35m[2 days, 3:01:54<22:11:04][39m
+[titan] 2025-09-09 20:37:55,021 - root - INFO - [31mstep: 27885 [32mloss: 2.7644 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.27 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7860 [37mglobal_avg_top_loss: 1.9784
+[titan] 2025-09-09 20:37:55,021 - root - INFO - [34mlr: 5.8464e-06 gnorm: 0.37 [35m[2 days, 3:02:26<22:10:30][39m
+[titan] 2025-09-09 20:38:27,082 - root - INFO - [31mstep: 27890 [32mloss: 2.7070 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.12 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7628 [37mglobal_avg_top_loss: 1.9442
+[titan] 2025-09-09 20:38:27,082 - root - INFO - [34mlr: 5.8435e-06 gnorm: 0.39 [35m[2 days, 3:02:58<22:09:57][39m
+[titan] 2025-09-09 20:38:58,877 - root - INFO - [31mstep: 27895 [32mloss: 2.6664 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,306 [36mtflops: 491.18 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7416 [37mglobal_avg_top_loss: 1.9248
+[titan] 2025-09-09 20:38:58,878 - root - INFO - [34mlr: 5.8406e-06 gnorm: 0.40 [35m[2 days, 3:03:30<22:09:24][39m
+[titan] 2025-09-09 20:39:24,355 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 20:39:30,830 - root - INFO - [31mstep: 27900 [32mloss: 2.7641 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.77 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7852 [37mglobal_avg_top_loss: 1.9788
+[titan] 2025-09-09 20:39:30,831 - root - INFO - [34mlr: 5.8376e-06 gnorm: 0.40 [35m[2 days, 3:04:01<22:08:50][39m
+[titan] 2025-09-09 20:40:02,573 - root - INFO - [31mstep: 27905 [32mloss: 2.6812 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,323 [36mtflops: 491.99 [35mmfu: 49.75%[39m [37mglobal_avg_ntp_loss: 0.7510 [37mglobal_avg_top_loss: 1.9302
+[titan] 2025-09-09 20:40:02,574 - root - INFO - [34mlr: 5.8347e-06 gnorm: 0.39 [35m[2 days, 3:04:33<22:08:17][39m
+[titan] 2025-09-09 20:40:34,425 - root - INFO - [31mstep: 27910 [32mloss: 2.7515 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.32 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7792 [37mglobal_avg_top_loss: 1.9723
+[titan] 2025-09-09 20:40:34,426 - root - INFO - [34mlr: 5.8318e-06 gnorm: 0.38 [35m[2 days, 3:05:05<22:07:43][39m
+[titan] 2025-09-09 20:41:06,221 - root - INFO - [31mstep: 27915 [32mloss: 2.6838 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,306 [36mtflops: 491.19 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7545 [37mglobal_avg_top_loss: 1.9292
+[titan] 2025-09-09 20:41:06,221 - root - INFO - [34mlr: 5.8289e-06 gnorm: 0.37 [35m[2 days, 3:05:37<22:07:10][39m
+[titan] 2025-09-09 20:41:38,370 - root - INFO - [31mstep: 27920 [32mloss: 2.8344 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,193 [36mtflops: 485.78 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.8100 [37mglobal_avg_top_loss: 2.0244
+[titan] 2025-09-09 20:41:38,370 - root - INFO - [34mlr: 5.8259e-06 gnorm: 0.46 [35m[2 days, 3:06:09<22:06:37][39m
+[titan] 2025-09-09 20:42:10,335 - root - INFO - [31mstep: 27925 [32mloss: 2.6726 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.57 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7456 [37mglobal_avg_top_loss: 1.9271
+[titan] 2025-09-09 20:42:10,335 - root - INFO - [34mlr: 5.8230e-06 gnorm: 0.38 [35m[2 days, 3:06:41<22:06:03][39m
+[titan] 2025-09-09 20:42:42,202 - root - INFO - [31mstep: 27930 [32mloss: 2.7168 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.08 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7630 [37mglobal_avg_top_loss: 1.9538
+[titan] 2025-09-09 20:42:42,203 - root - INFO - [34mlr: 5.8201e-06 gnorm: 0.37 [35m[2 days, 3:07:13<22:05:30][39m
+[titan] 2025-09-09 20:43:14,355 - root - INFO - [31mstep: 27935 [32mloss: 2.7060 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,192 [36mtflops: 485.73 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 0.7617 [37mglobal_avg_top_loss: 1.9442
+[titan] 2025-09-09 20:43:14,355 - root - INFO - [34mlr: 5.8172e-06 gnorm: 0.44 [35m[2 days, 3:07:45<22:04:57][39m
+[titan] 2025-09-09 20:43:46,305 - root - INFO - [31mstep: 27940 [32mloss: 2.7714 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.81 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7948 [37mglobal_avg_top_loss: 1.9766
+[titan] 2025-09-09 20:43:46,305 - root - INFO - [34mlr: 5.8143e-06 gnorm: 0.40 [35m[2 days, 3:08:17<22:04:23][39m
+[titan] 2025-09-09 20:44:18,549 - root - INFO - [31mstep: 27945 [32mloss: 2.6450 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,163 [36mtflops: 484.35 [35mmfu: 48.97%[39m [37mglobal_avg_ntp_loss: 0.7317 [37mglobal_avg_top_loss: 1.9133
+[titan] 2025-09-09 20:44:18,549 - root - INFO - [34mlr: 5.8113e-06 gnorm: 0.40 [35m[2 days, 3:08:49<22:03:50][39m
+[titan] 2025-09-09 20:44:44,211 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 20:44:50,578 - root - INFO - [31mstep: 27950 [32mloss: 2.5807 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,231 [36mtflops: 487.60 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7058 [37mglobal_avg_top_loss: 1.8749
+[titan] 2025-09-09 20:44:50,578 - root - INFO - [34mlr: 5.8084e-06 gnorm: 0.36 [35m[2 days, 3:09:21<22:03:17][39m
+[titan] 2025-09-09 20:45:22,546 - root - INFO - [31mstep: 27955 [32mloss: 2.5778 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.53 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7009 [37mglobal_avg_top_loss: 1.8769
+[titan] 2025-09-09 20:45:22,547 - root - INFO - [34mlr: 5.8055e-06 gnorm: 0.38 [35m[2 days, 3:09:53<22:02:43][39m
+[titan] 2025-09-09 20:45:54,501 - root - INFO - [31mstep: 27960 [32mloss: 2.7177 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.74 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7674 [37mglobal_avg_top_loss: 1.9503
+[titan] 2025-09-09 20:45:54,502 - root - INFO - [34mlr: 5.8026e-06 gnorm: 0.39 [35m[2 days, 3:10:25<22:02:10][39m
+[titan] 2025-09-09 20:46:26,433 - root - INFO - [31mstep: 27965 [32mloss: 2.7119 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.09 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7644 [37mglobal_avg_top_loss: 1.9475
+[titan] 2025-09-09 20:46:26,433 - root - INFO - [34mlr: 5.7997e-06 gnorm: 0.38 [35m[2 days, 3:10:57<22:01:36][39m
+[titan] 2025-09-09 20:46:58,521 - root - INFO - [31mstep: 27970 [32mloss: 2.6835 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.71 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7488 [37mglobal_avg_top_loss: 1.9347
+[titan] 2025-09-09 20:46:58,521 - root - INFO - [34mlr: 5.7968e-06 gnorm: 0.37 [35m[2 days, 3:11:29<22:01:03][39m
+[titan] 2025-09-09 20:47:30,572 - root - INFO - [31mstep: 27975 [32mloss: 2.7375 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.27 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7756 [37mglobal_avg_top_loss: 1.9619
+[titan] 2025-09-09 20:47:30,572 - root - INFO - [34mlr: 5.7939e-06 gnorm: 0.37 [35m[2 days, 3:12:01<22:00:30][39m
+[titan] 2025-09-09 20:48:02,632 - root - INFO - [31mstep: 27980 [32mloss: 2.6951 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.14 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7562 [37mglobal_avg_top_loss: 1.9389
+[titan] 2025-09-09 20:48:02,632 - root - INFO - [34mlr: 5.7909e-06 gnorm: 0.36 [35m[2 days, 3:12:33<21:59:57][39m
+[titan] 2025-09-09 20:48:34,549 - root - INFO - [31mstep: 27985 [32mloss: 2.7530 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.30 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7831 [37mglobal_avg_top_loss: 1.9699
+[titan] 2025-09-09 20:48:34,550 - root - INFO - [34mlr: 5.7880e-06 gnorm: 0.39 [35m[2 days, 3:13:05<21:59:23][39m
+[titan] 2025-09-09 20:49:06,345 - root - INFO - [31mstep: 27990 [32mloss: 2.7218 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,306 [36mtflops: 491.18 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7642 [37mglobal_avg_top_loss: 1.9576
+[titan] 2025-09-09 20:49:06,346 - root - INFO - [34mlr: 5.7851e-06 gnorm: 0.39 [35m[2 days, 3:13:37<21:58:50][39m
+[titan] 2025-09-09 20:49:38,332 - root - INFO - [31mstep: 27995 [32mloss: 2.6885 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.24 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7592 [37mglobal_avg_top_loss: 1.9293
+[titan] 2025-09-09 20:49:38,333 - root - INFO - [34mlr: 5.7822e-06 gnorm: 0.39 [35m[2 days, 3:14:09<21:58:16][39m
+[titan] 2025-09-09 20:50:03,918 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 20:50:10,289 - root - INFO - [31mstep: 28000 [32mloss: 3.0002 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.71 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.9230 [37mglobal_avg_top_loss: 2.0772
+[titan] 2025-09-09 20:50:10,289 - root - INFO - [34mlr: 5.7793e-06 gnorm: 0.37 [35m[2 days, 3:14:41<21:57:43][39m
+[titan] 2025-09-09 20:50:42,195 - root - INFO - [31mstep: 28005 [32mloss: 2.7525 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.49 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7789 [37mglobal_avg_top_loss: 1.9736
+[titan] 2025-09-09 20:50:42,195 - root - INFO - [34mlr: 5.7764e-06 gnorm: 0.38 [35m[2 days, 3:15:13<21:57:10][39m
+[titan] 2025-09-09 20:51:14,076 - root - INFO - [31mstep: 28010 [32mloss: 2.6766 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.87 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7462 [37mglobal_avg_top_loss: 1.9304
+[titan] 2025-09-09 20:51:14,076 - root - INFO - [34mlr: 5.7735e-06 gnorm: 0.36 [35m[2 days, 3:15:45<21:56:36][39m
+[titan] 2025-09-09 20:51:46,090 - root - INFO - [31mstep: 28015 [32mloss: 2.7187 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.82 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7629 [37mglobal_avg_top_loss: 1.9557
+[titan] 2025-09-09 20:51:46,091 - root - INFO - [34mlr: 5.7706e-06 gnorm: 0.37 [35m[2 days, 3:16:17<21:56:03][39m
+[titan] 2025-09-09 20:52:18,012 - root - INFO - [31mstep: 28020 [32mloss: 2.7724 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.24 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.8058 [37mglobal_avg_top_loss: 1.9666
+[titan] 2025-09-09 20:52:18,012 - root - INFO - [34mlr: 5.7677e-06 gnorm: 0.41 [35m[2 days, 3:16:49<21:55:29][39m
+[titan] 2025-09-09 20:52:49,997 - root - INFO - [31mstep: 28025 [32mloss: 2.6849 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,245 [36mtflops: 488.28 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7529 [37mglobal_avg_top_loss: 1.9319
+[titan] 2025-09-09 20:52:49,997 - root - INFO - [34mlr: 5.7648e-06 gnorm: 0.44 [35m[2 days, 3:17:21<21:54:56][39m
+[titan] 2025-09-09 20:53:21,970 - root - INFO - [31mstep: 28030 [32mloss: 2.6014 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.45 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7126 [37mglobal_avg_top_loss: 1.8888
+[titan] 2025-09-09 20:53:21,970 - root - INFO - [34mlr: 5.7619e-06 gnorm: 0.36 [35m[2 days, 3:17:53<21:54:23][39m
+[titan] 2025-09-09 20:53:54,070 - root - INFO - [31mstep: 28035 [32mloss: 2.7625 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,208 [36mtflops: 486.53 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.8031 [37mglobal_avg_top_loss: 1.9594
+[titan] 2025-09-09 20:53:54,070 - root - INFO - [34mlr: 5.7590e-06 gnorm: 0.38 [35m[2 days, 3:18:25<21:53:49][39m
+[titan] 2025-09-09 20:54:26,026 - root - INFO - [31mstep: 28040 [32mloss: 2.6993 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.72 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7587 [37mglobal_avg_top_loss: 1.9407
+[titan] 2025-09-09 20:54:26,026 - root - INFO - [34mlr: 5.7561e-06 gnorm: 0.40 [35m[2 days, 3:18:57<21:53:16][39m
+[titan] 2025-09-09 20:54:58,026 - root - INFO - [31mstep: 28045 [32mloss: 2.6724 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.04 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7497 [37mglobal_avg_top_loss: 1.9227
+[titan] 2025-09-09 20:54:58,026 - root - INFO - [34mlr: 5.7532e-06 gnorm: 0.38 [35m[2 days, 3:19:29<21:52:43][39m
+[titan] 2025-09-09 20:55:23,613 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 20:55:30,087 - root - INFO - [31mstep: 28050 [32mloss: 2.7747 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.11 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7913 [37mglobal_avg_top_loss: 1.9834
+[titan] 2025-09-09 20:55:30,088 - root - INFO - [34mlr: 5.7503e-06 gnorm: 0.37 [35m[2 days, 3:20:01<21:52:09][39m
+[titan] 2025-09-09 20:56:01,944 - root - INFO - [31mstep: 28055 [32mloss: 3.0967 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.24 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.9778 [37mglobal_avg_top_loss: 2.1190
+[titan] 2025-09-09 20:56:01,945 - root - INFO - [34mlr: 5.7474e-06 gnorm: 0.44 [35m[2 days, 3:20:33<21:51:36][39m
+[titan] 2025-09-09 20:56:33,948 - root - INFO - [31mstep: 28060 [32mloss: 2.6766 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.99 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7476 [37mglobal_avg_top_loss: 1.9290
+[titan] 2025-09-09 20:56:33,948 - root - INFO - [34mlr: 5.7445e-06 gnorm: 0.38 [35m[2 days, 3:21:05<21:51:03][39m
+[titan] 2025-09-09 20:57:06,122 - root - INFO - [31mstep: 28065 [32mloss: 2.6681 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,185 [36mtflops: 485.39 [35mmfu: 49.08%[39m [37mglobal_avg_ntp_loss: 0.7432 [37mglobal_avg_top_loss: 1.9250
+[titan] 2025-09-09 20:57:06,123 - root - INFO - [34mlr: 5.7416e-06 gnorm: 0.37 [35m[2 days, 3:21:37<21:50:29][39m
+[titan] 2025-09-09 20:57:38,034 - root - INFO - [31mstep: 28070 [32mloss: 2.6305 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.40 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7281 [37mglobal_avg_top_loss: 1.9024
+[titan] 2025-09-09 20:57:38,034 - root - INFO - [34mlr: 5.7387e-06 gnorm: 0.36 [35m[2 days, 3:22:09<21:49:56][39m
+[titan] 2025-09-09 20:58:10,127 - root - INFO - [31mstep: 28075 [32mloss: 2.7710 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.63 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7932 [37mglobal_avg_top_loss: 1.9779
+[titan] 2025-09-09 20:58:10,128 - root - INFO - [34mlr: 5.7358e-06 gnorm: 0.38 [35m[2 days, 3:22:41<21:49:23][39m
+[titan] 2025-09-09 20:58:42,068 - root - INFO - [31mstep: 28080 [32mloss: 2.7469 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.95 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7770 [37mglobal_avg_top_loss: 1.9699
+[titan] 2025-09-09 20:58:42,069 - root - INFO - [34mlr: 5.7329e-06 gnorm: 0.39 [35m[2 days, 3:23:13<21:48:49][39m
+[titan] 2025-09-09 20:59:13,973 - root - INFO - [31mstep: 28085 [32mloss: 2.6655 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.50 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7405 [37mglobal_avg_top_loss: 1.9250
+[titan] 2025-09-09 20:59:13,973 - root - INFO - [34mlr: 5.7300e-06 gnorm: 0.38 [35m[2 days, 3:23:45<21:48:16][39m
+[titan] 2025-09-09 20:59:45,793 - root - INFO - [31mstep: 28090 [32mloss: 2.8909 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.81 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.8568 [37mglobal_avg_top_loss: 2.0341
+[titan] 2025-09-09 20:59:45,793 - root - INFO - [34mlr: 5.7271e-06 gnorm: 0.37 [35m[2 days, 3:24:16<21:47:43][39m
+[titan] 2025-09-09 21:00:17,735 - root - INFO - [31mstep: 28095 [32mloss: 2.7080 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.93 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7599 [37mglobal_avg_top_loss: 1.9481
+[titan] 2025-09-09 21:00:17,735 - root - INFO - [34mlr: 5.7242e-06 gnorm: 0.36 [35m[2 days, 3:24:48<21:47:09][39m
+[titan] 2025-09-09 21:00:43,422 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:00:50,019 - root - INFO - [31mstep: 28100 [32mloss: 3.1012 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,150 [36mtflops: 483.75 [35mmfu: 48.91%[39m [37mglobal_avg_ntp_loss: 0.9895 [37mglobal_avg_top_loss: 2.1117
+[titan] 2025-09-09 21:00:50,020 - root - INFO - [34mlr: 5.7213e-06 gnorm: 0.46 [35m[2 days, 3:25:21<21:46:36][39m
+[titan] 2025-09-09 21:01:21,923 - root - INFO - [31mstep: 28105 [32mloss: 2.6933 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.51 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7546 [37mglobal_avg_top_loss: 1.9386
+[titan] 2025-09-09 21:01:21,924 - root - INFO - [34mlr: 5.7184e-06 gnorm: 0.38 [35m[2 days, 3:25:53<21:46:03][39m
+[titan] 2025-09-09 21:01:53,925 - root - INFO - [31mstep: 28110 [32mloss: 2.6600 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.03 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7388 [37mglobal_avg_top_loss: 1.9212
+[titan] 2025-09-09 21:01:53,925 - root - INFO - [34mlr: 5.7155e-06 gnorm: 0.39 [35m[2 days, 3:26:25<21:45:29][39m
+[titan] 2025-09-09 21:02:25,994 - root - INFO - [31mstep: 28115 [32mloss: 2.7463 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,218 [36mtflops: 487.00 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7790 [37mglobal_avg_top_loss: 1.9673
+[titan] 2025-09-09 21:02:25,994 - root - INFO - [34mlr: 5.7126e-06 gnorm: 0.38 [35m[2 days, 3:26:57<21:44:56][39m
+[titan] 2025-09-09 21:02:57,893 - root - INFO - [31mstep: 28120 [32mloss: 2.7644 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.59 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7905 [37mglobal_avg_top_loss: 1.9739
+[titan] 2025-09-09 21:02:57,893 - root - INFO - [34mlr: 5.7098e-06 gnorm: 0.38 [35m[2 days, 3:27:29<21:44:23][39m
+[titan] 2025-09-09 21:03:29,958 - root - INFO - [31mstep: 28125 [32mloss: 2.7378 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,220 [36mtflops: 487.06 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7735 [37mglobal_avg_top_loss: 1.9643
+[titan] 2025-09-09 21:03:29,958 - root - INFO - [34mlr: 5.7069e-06 gnorm: 0.38 [35m[2 days, 3:28:01<21:43:49][39m
+[titan] 2025-09-09 21:04:02,044 - root - INFO - [31mstep: 28130 [32mloss: 2.7900 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.72 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7982 [37mglobal_avg_top_loss: 1.9918
+[titan] 2025-09-09 21:04:02,045 - root - INFO - [34mlr: 5.7040e-06 gnorm: 0.38 [35m[2 days, 3:28:33<21:43:16][39m
+[titan] 2025-09-09 21:04:33,945 - root - INFO - [31mstep: 28135 [32mloss: 2.6533 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.57 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7391 [37mglobal_avg_top_loss: 1.9142
+[titan] 2025-09-09 21:04:33,946 - root - INFO - [34mlr: 5.7011e-06 gnorm: 0.38 [35m[2 days, 3:29:05<21:42:43][39m
+[titan] 2025-09-09 21:05:05,868 - root - INFO - [31mstep: 28140 [32mloss: 2.7795 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.22 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7961 [37mglobal_avg_top_loss: 1.9834
+[titan] 2025-09-09 21:05:05,869 - root - INFO - [34mlr: 5.6982e-06 gnorm: 0.39 [35m[2 days, 3:29:36<21:42:09][39m
+[titan] 2025-09-09 21:05:37,771 - root - INFO - [31mstep: 28145 [32mloss: 2.7587 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.54 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7843 [37mglobal_avg_top_loss: 1.9745
+[titan] 2025-09-09 21:05:37,771 - root - INFO - [34mlr: 5.6953e-06 gnorm: 0.40 [35m[2 days, 3:30:08<21:41:36][39m
+[titan] 2025-09-09 21:06:03,272 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:06:09,730 - root - INFO - [31mstep: 28150 [32mloss: 2.7301 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.66 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7694 [37mglobal_avg_top_loss: 1.9607
+[titan] 2025-09-09 21:06:09,731 - root - INFO - [34mlr: 5.6924e-06 gnorm: 0.42 [35m[2 days, 3:30:40<21:41:03][39m
+[titan] 2025-09-09 21:06:41,503 - root - INFO - [31mstep: 28155 [32mloss: 2.6656 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,313 [36mtflops: 491.53 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.7436 [37mglobal_avg_top_loss: 1.9220
+[titan] 2025-09-09 21:06:41,504 - root - INFO - [34mlr: 5.6896e-06 gnorm: 0.38 [35m[2 days, 3:31:12<21:40:29][39m
+[titan] 2025-09-09 21:07:13,681 - root - INFO - [31mstep: 28160 [32mloss: 2.5493 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,184 [36mtflops: 485.35 [35mmfu: 49.08%[39m [37mglobal_avg_ntp_loss: 0.6903 [37mglobal_avg_top_loss: 1.8590
+[titan] 2025-09-09 21:07:13,681 - root - INFO - [34mlr: 5.6867e-06 gnorm: 0.36 [35m[2 days, 3:31:44<21:39:56][39m
+[titan] 2025-09-09 21:07:13,988 - root - INFO - Dumping profiler traces at step 28160
+[titan] 2025-09-09 21:07:14,059 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 21:07:45,964 - root - INFO - [31mstep: 28165 [32mloss: 2.7377 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,151 [36mtflops: 483.77 [35mmfu: 48.91%[39m [37mglobal_avg_ntp_loss: 0.7750 [37mglobal_avg_top_loss: 1.9627
+[titan] 2025-09-09 21:07:45,964 - root - INFO - [34mlr: 5.6838e-06 gnorm: 0.39 [35m[2 days, 3:32:17<21:39:23][39m
+[titan] 2025-09-09 21:08:17,852 - root - INFO - [31mstep: 28170 [32mloss: 2.7077 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.75 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7605 [37mglobal_avg_top_loss: 1.9472
+[titan] 2025-09-09 21:08:17,853 - root - INFO - [34mlr: 5.6809e-06 gnorm: 0.37 [35m[2 days, 3:32:48<21:38:49][39m
+[titan] 2025-09-09 21:08:49,947 - root - INFO - [31mstep: 28175 [32mloss: 2.6875 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,210 [36mtflops: 486.61 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7510 [37mglobal_avg_top_loss: 1.9366
+[titan] 2025-09-09 21:08:49,947 - root - INFO - [34mlr: 5.6780e-06 gnorm: 0.37 [35m[2 days, 3:33:21<21:38:16][39m
+[titan] 2025-09-09 21:09:21,907 - root - INFO - [31mstep: 28180 [32mloss: 2.7553 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.65 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7859 [37mglobal_avg_top_loss: 1.9693
+[titan] 2025-09-09 21:09:21,908 - root - INFO - [34mlr: 5.6752e-06 gnorm: 0.37 [35m[2 days, 3:33:53<21:37:43][39m
+[titan] 2025-09-09 21:09:53,779 - root - INFO - [31mstep: 28185 [32mloss: 2.7911 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,281 [36mtflops: 490.00 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.8002 [37mglobal_avg_top_loss: 1.9909
+[titan] 2025-09-09 21:09:53,780 - root - INFO - [34mlr: 5.6723e-06 gnorm: 0.41 [35m[2 days, 3:34:24<21:37:09][39m
+[titan] 2025-09-09 21:10:25,646 - root - INFO - [31mstep: 28190 [32mloss: 2.6504 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,283 [36mtflops: 490.08 [35mmfu: 49.55%[39m [37mglobal_avg_ntp_loss: 0.7356 [37mglobal_avg_top_loss: 1.9148
+[titan] 2025-09-09 21:10:25,647 - root - INFO - [34mlr: 5.6694e-06 gnorm: 0.37 [35m[2 days, 3:34:56<21:36:36][39m
+[titan] 2025-09-09 21:10:57,553 - root - INFO - [31mstep: 28195 [32mloss: 2.6239 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.48 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7222 [37mglobal_avg_top_loss: 1.9017
+[titan] 2025-09-09 21:10:57,553 - root - INFO - [34mlr: 5.6665e-06 gnorm: 0.40 [35m[2 days, 3:35:28<21:36:02][39m
+[titan] 2025-09-09 21:11:23,102 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:11:29,443 - root - INFO - [31mstep: 28200 [32mloss: 2.8591 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.72 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.8402 [37mglobal_avg_top_loss: 2.0190
+[titan] 2025-09-09 21:11:29,443 - root - INFO - [34mlr: 5.6637e-06 gnorm: 0.38 [35m[2 days, 3:36:00<21:35:29][39m
+[titan] 2025-09-09 21:12:01,378 - root - INFO - [31mstep: 28205 [32mloss: 2.6971 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.04 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7600 [37mglobal_avg_top_loss: 1.9371
+[titan] 2025-09-09 21:12:01,379 - root - INFO - [34mlr: 5.6608e-06 gnorm: 0.39 [35m[2 days, 3:36:32<21:34:56][39m
+[titan] 2025-09-09 21:12:33,383 - root - INFO - [31mstep: 28210 [32mloss: 2.6892 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.98 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7563 [37mglobal_avg_top_loss: 1.9330
+[titan] 2025-09-09 21:12:33,383 - root - INFO - [34mlr: 5.6579e-06 gnorm: 0.38 [35m[2 days, 3:37:04<21:34:22][39m
+[titan] 2025-09-09 21:13:05,227 - root - INFO - [31mstep: 28215 [32mloss: 2.6069 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.44 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7227 [37mglobal_avg_top_loss: 1.8842
+[titan] 2025-09-09 21:13:05,227 - root - INFO - [34mlr: 5.6550e-06 gnorm: 0.39 [35m[2 days, 3:37:36<21:33:49][39m
+[titan] 2025-09-09 21:13:37,316 - root - INFO - [31mstep: 28220 [32mloss: 2.7630 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.69 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7858 [37mglobal_avg_top_loss: 1.9773
+[titan] 2025-09-09 21:13:37,316 - root - INFO - [34mlr: 5.6522e-06 gnorm: 0.38 [35m[2 days, 3:38:08<21:33:16][39m
+[titan] 2025-09-09 21:14:09,293 - root - INFO - [31mstep: 28225 [32mloss: 2.7865 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.38 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.8006 [37mglobal_avg_top_loss: 1.9858
+[titan] 2025-09-09 21:14:09,294 - root - INFO - [34mlr: 5.6493e-06 gnorm: 0.38 [35m[2 days, 3:38:40<21:32:42][39m
+[titan] 2025-09-09 21:14:41,201 - root - INFO - [31mstep: 28230 [32mloss: 2.6685 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.45 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7415 [37mglobal_avg_top_loss: 1.9270
+[titan] 2025-09-09 21:14:41,202 - root - INFO - [34mlr: 5.6464e-06 gnorm: 0.37 [35m[2 days, 3:39:12<21:32:09][39m
+[titan] 2025-09-09 21:15:13,120 - root - INFO - [31mstep: 28235 [32mloss: 2.7414 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.30 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7739 [37mglobal_avg_top_loss: 1.9676
+[titan] 2025-09-09 21:15:13,120 - root - INFO - [34mlr: 5.6436e-06 gnorm: 0.37 [35m[2 days, 3:39:44<21:31:36][39m
+[titan] 2025-09-09 21:15:45,140 - root - INFO - [31mstep: 28240 [32mloss: 2.7126 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.74 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7607 [37mglobal_avg_top_loss: 1.9519
+[titan] 2025-09-09 21:15:45,140 - root - INFO - [34mlr: 5.6407e-06 gnorm: 0.37 [35m[2 days, 3:40:16<21:31:02][39m
+[titan] 2025-09-09 21:16:17,076 - root - INFO - [31mstep: 28245 [32mloss: 2.7356 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.02 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7783 [37mglobal_avg_top_loss: 1.9573
+[titan] 2025-09-09 21:16:17,076 - root - INFO - [34mlr: 5.6378e-06 gnorm: 0.37 [35m[2 days, 3:40:48<21:30:29][39m
+[titan] 2025-09-09 21:16:42,585 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:16:49,094 - root - INFO - [31mstep: 28250 [32mloss: 2.6597 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.77 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7406 [37mglobal_avg_top_loss: 1.9191
+[titan] 2025-09-09 21:16:49,095 - root - INFO - [34mlr: 5.6350e-06 gnorm: 0.38 [35m[2 days, 3:41:20<21:29:56][39m
+[titan] 2025-09-09 21:17:21,154 - root - INFO - [31mstep: 28255 [32mloss: 2.8276 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.14 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.8282 [37mglobal_avg_top_loss: 1.9994
+[titan] 2025-09-09 21:17:21,154 - root - INFO - [34mlr: 5.6321e-06 gnorm: 0.39 [35m[2 days, 3:41:52<21:29:22][39m
+[titan] 2025-09-09 21:17:52,902 - root - INFO - [31mstep: 28260 [32mloss: 2.6914 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,322 [36mtflops: 491.92 [35mmfu: 49.74%[39m [37mglobal_avg_ntp_loss: 0.7553 [37mglobal_avg_top_loss: 1.9361
+[titan] 2025-09-09 21:17:52,902 - root - INFO - [34mlr: 5.6292e-06 gnorm: 0.38 [35m[2 days, 3:42:23<21:28:49][39m
+[titan] 2025-09-09 21:18:24,838 - root - INFO - [31mstep: 28265 [32mloss: 2.8613 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.01 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8460 [37mglobal_avg_top_loss: 2.0153
+[titan] 2025-09-09 21:18:24,839 - root - INFO - [34mlr: 5.6264e-06 gnorm: 0.38 [35m[2 days, 3:42:55<21:28:16][39m
+[titan] 2025-09-09 21:18:56,742 - root - INFO - [31mstep: 28270 [32mloss: 2.5910 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.52 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7136 [37mglobal_avg_top_loss: 1.8774
+[titan] 2025-09-09 21:18:56,742 - root - INFO - [34mlr: 5.6235e-06 gnorm: 0.37 [35m[2 days, 3:43:27<21:27:42][39m
+[titan] 2025-09-09 21:19:28,821 - root - INFO - [31mstep: 28275 [32mloss: 2.6543 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,215 [36mtflops: 486.83 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7392 [37mglobal_avg_top_loss: 1.9151
+[titan] 2025-09-09 21:19:28,822 - root - INFO - [34mlr: 5.6206e-06 gnorm: 0.38 [35m[2 days, 3:43:59<21:27:09][39m
+[titan] 2025-09-09 21:20:00,561 - root - INFO - [31mstep: 28280 [32mloss: 3.3227 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,324 [36mtflops: 492.04 [35mmfu: 49.75%[39m [37mglobal_avg_ntp_loss: 1.1079 [37mglobal_avg_top_loss: 2.2148
+[titan] 2025-09-09 21:20:00,562 - root - INFO - [34mlr: 5.6178e-06 gnorm: 0.40 [35m[2 days, 3:44:31<21:26:36][39m
+[titan] 2025-09-09 21:20:32,501 - root - INFO - [31mstep: 28285 [32mloss: 2.7512 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,260 [36mtflops: 488.97 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7806 [37mglobal_avg_top_loss: 1.9706
+[titan] 2025-09-09 21:20:32,501 - root - INFO - [34mlr: 5.6149e-06 gnorm: 0.37 [35m[2 days, 3:45:03<21:26:02][39m
+[titan] 2025-09-09 21:21:04,572 - root - INFO - [31mstep: 28290 [32mloss: 2.5216 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.96 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.6776 [37mglobal_avg_top_loss: 1.8440
+[titan] 2025-09-09 21:21:04,573 - root - INFO - [34mlr: 5.6120e-06 gnorm: 0.36 [35m[2 days, 3:45:35<21:25:29][39m
+[titan] 2025-09-09 21:21:36,332 - root - INFO - [31mstep: 28295 [32mloss: 2.6873 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,318 [36mtflops: 491.74 [35mmfu: 49.72%[39m [37mglobal_avg_ntp_loss: 0.7580 [37mglobal_avg_top_loss: 1.9293
+[titan] 2025-09-09 21:21:36,332 - root - INFO - [34mlr: 5.6092e-06 gnorm: 0.40 [35m[2 days, 3:46:07<21:24:55][39m
+[titan] 2025-09-09 21:22:02,062 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:22:08,496 - root - INFO - [31mstep: 28300 [32mloss: 2.6765 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,188 [36mtflops: 485.56 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.7471 [37mglobal_avg_top_loss: 1.9293
+[titan] 2025-09-09 21:22:08,496 - root - INFO - [34mlr: 5.6063e-06 gnorm: 0.37 [35m[2 days, 3:46:39<21:24:22][39m
+[titan] 2025-09-09 21:22:40,484 - root - INFO - [31mstep: 28305 [32mloss: 2.7023 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.22 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7591 [37mglobal_avg_top_loss: 1.9433
+[titan] 2025-09-09 21:22:40,485 - root - INFO - [34mlr: 5.6035e-06 gnorm: 0.38 [35m[2 days, 3:47:11<21:23:49][39m
+[titan] 2025-09-09 21:23:12,475 - root - INFO - [31mstep: 28310 [32mloss: 2.8013 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.18 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8024 [37mglobal_avg_top_loss: 1.9988
+[titan] 2025-09-09 21:23:12,476 - root - INFO - [34mlr: 5.6006e-06 gnorm: 0.39 [35m[2 days, 3:47:43<21:23:16][39m
+[titan] 2025-09-09 21:23:44,550 - root - INFO - [31mstep: 28315 [32mloss: 2.7410 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.91 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7780 [37mglobal_avg_top_loss: 1.9630
+[titan] 2025-09-09 21:23:44,550 - root - INFO - [34mlr: 5.5978e-06 gnorm: 0.38 [35m[2 days, 3:48:15<21:22:42][39m
+[titan] 2025-09-09 21:24:16,615 - root - INFO - [31mstep: 28320 [32mloss: 2.6325 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,220 [36mtflops: 487.06 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7306 [37mglobal_avg_top_loss: 1.9019
+[titan] 2025-09-09 21:24:16,615 - root - INFO - [34mlr: 5.5949e-06 gnorm: 0.36 [35m[2 days, 3:48:47<21:22:09][39m
+[titan] 2025-09-09 21:24:48,575 - root - INFO - [31mstep: 28325 [32mloss: 2.7379 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.65 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7735 [37mglobal_avg_top_loss: 1.9644
+[titan] 2025-09-09 21:24:48,575 - root - INFO - [34mlr: 5.5921e-06 gnorm: 0.38 [35m[2 days, 3:49:19<21:21:36][39m
+[titan] 2025-09-09 21:25:20,513 - root - INFO - [31mstep: 28330 [32mloss: 2.7198 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,260 [36mtflops: 488.99 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7656 [37mglobal_avg_top_loss: 1.9542
+[titan] 2025-09-09 21:25:20,514 - root - INFO - [34mlr: 5.5892e-06 gnorm: 0.37 [35m[2 days, 3:49:51<21:21:02][39m
+[titan] 2025-09-09 21:25:52,438 - root - INFO - [31mstep: 28335 [32mloss: 2.7184 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.20 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7665 [37mglobal_avg_top_loss: 1.9519
+[titan] 2025-09-09 21:25:52,438 - root - INFO - [34mlr: 5.5863e-06 gnorm: 0.37 [35m[2 days, 3:50:23<21:20:29][39m
+[titan] 2025-09-09 21:26:24,421 - root - INFO - [31mstep: 28340 [32mloss: 2.7392 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.30 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7835 [37mglobal_avg_top_loss: 1.9557
+[titan] 2025-09-09 21:26:24,422 - root - INFO - [34mlr: 5.5835e-06 gnorm: 0.39 [35m[2 days, 3:50:55<21:19:56][39m
+[titan] 2025-09-09 21:26:56,483 - root - INFO - [31mstep: 28345 [32mloss: 2.7355 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,221 [36mtflops: 487.10 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7774 [37mglobal_avg_top_loss: 1.9581
+[titan] 2025-09-09 21:26:56,483 - root - INFO - [34mlr: 5.5806e-06 gnorm: 0.41 [35m[2 days, 3:51:27<21:19:22][39m
+[titan] 2025-09-09 21:27:21,809 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:27:28,258 - root - INFO - [31mstep: 28350 [32mloss: 2.6791 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,313 [36mtflops: 491.51 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.7472 [37mglobal_avg_top_loss: 1.9319
+[titan] 2025-09-09 21:27:28,258 - root - INFO - [34mlr: 5.5778e-06 gnorm: 0.38 [35m[2 days, 3:51:59<21:18:49][39m
+[titan] 2025-09-09 21:28:00,207 - root - INFO - [31mstep: 28355 [32mloss: 2.7238 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.82 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7695 [37mglobal_avg_top_loss: 1.9543
+[titan] 2025-09-09 21:28:00,208 - root - INFO - [34mlr: 5.5749e-06 gnorm: 0.38 [35m[2 days, 3:52:31<21:18:16][39m
+[titan] 2025-09-09 21:28:32,171 - root - INFO - [31mstep: 28360 [32mloss: 3.3545 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.60 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 1.1257 [37mglobal_avg_top_loss: 2.2288
+[titan] 2025-09-09 21:28:32,171 - root - INFO - [34mlr: 5.5721e-06 gnorm: 0.37 [35m[2 days, 3:53:03<21:17:42][39m
+[titan] 2025-09-09 21:29:04,364 - root - INFO - [31mstep: 28365 [32mloss: 2.6643 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,179 [36mtflops: 485.11 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7408 [37mglobal_avg_top_loss: 1.9235
+[titan] 2025-09-09 21:29:04,365 - root - INFO - [34mlr: 5.5693e-06 gnorm: 0.37 [35m[2 days, 3:53:35<21:17:09][39m
+[titan] 2025-09-09 21:29:36,264 - root - INFO - [31mstep: 28370 [32mloss: 2.7348 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,273 [36mtflops: 489.59 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7726 [37mglobal_avg_top_loss: 1.9622
+[titan] 2025-09-09 21:29:36,264 - root - INFO - [34mlr: 5.5664e-06 gnorm: 0.44 [35m[2 days, 3:54:07<21:16:36][39m
+[titan] 2025-09-09 21:30:08,276 - root - INFO - [31mstep: 28375 [32mloss: 2.5622 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.86 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7009 [37mglobal_avg_top_loss: 1.8613
+[titan] 2025-09-09 21:30:08,277 - root - INFO - [34mlr: 5.5636e-06 gnorm: 0.44 [35m[2 days, 3:54:39<21:16:02][39m
+[titan] 2025-09-09 21:30:40,408 - root - INFO - [31mstep: 28380 [32mloss: 2.7838 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,198 [36mtflops: 486.05 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.8067 [37mglobal_avg_top_loss: 1.9770
+[titan] 2025-09-09 21:30:40,408 - root - INFO - [34mlr: 5.5607e-06 gnorm: 0.40 [35m[2 days, 3:55:11<21:15:29][39m
+[titan] 2025-09-09 21:31:12,132 - root - INFO - [31mstep: 28385 [32mloss: 2.6302 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,329 [36mtflops: 492.28 [35mmfu: 49.78%[39m [37mglobal_avg_ntp_loss: 0.7225 [37mglobal_avg_top_loss: 1.9078
+[titan] 2025-09-09 21:31:12,133 - root - INFO - [34mlr: 5.5579e-06 gnorm: 0.42 [35m[2 days, 3:55:43<21:14:56][39m
+[titan] 2025-09-09 21:31:44,056 - root - INFO - [31mstep: 28390 [32mloss: 2.8550 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.22 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.8501 [37mglobal_avg_top_loss: 2.0049
+[titan] 2025-09-09 21:31:44,056 - root - INFO - [34mlr: 5.5550e-06 gnorm: 0.39 [35m[2 days, 3:56:15<21:14:22][39m
+[titan] 2025-09-09 21:32:15,880 - root - INFO - [31mstep: 28395 [32mloss: 2.6919 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,297 [36mtflops: 490.75 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.7558 [37mglobal_avg_top_loss: 1.9360
+[titan] 2025-09-09 21:32:15,880 - root - INFO - [34mlr: 5.5522e-06 gnorm: 0.43 [35m[2 days, 3:56:46<21:13:49][39m
+[titan] 2025-09-09 21:32:41,421 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:32:47,854 - root - INFO - [31mstep: 28400 [32mloss: 2.5807 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.43 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7034 [37mglobal_avg_top_loss: 1.8773
+[titan] 2025-09-09 21:32:47,855 - root - INFO - [34mlr: 5.5493e-06 gnorm: 0.42 [35m[2 days, 3:57:18<21:13:16][39m
+[titan] 2025-09-09 21:33:19,784 - root - INFO - [31mstep: 28405 [32mloss: 2.7164 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.12 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7639 [37mglobal_avg_top_loss: 1.9526
+[titan] 2025-09-09 21:33:19,785 - root - INFO - [34mlr: 5.5465e-06 gnorm: 0.37 [35m[2 days, 3:57:50<21:12:42][39m
+[titan] 2025-09-09 21:33:51,752 - root - INFO - [31mstep: 28410 [32mloss: 3.0325 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.54 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.9399 [37mglobal_avg_top_loss: 2.0926
+[titan] 2025-09-09 21:33:51,753 - root - INFO - [34mlr: 5.5437e-06 gnorm: 0.40 [35m[2 days, 3:58:22<21:12:09][39m
+[titan] 2025-09-09 21:34:23,754 - root - INFO - [31mstep: 28415 [32mloss: 2.7711 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.02 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7887 [37mglobal_avg_top_loss: 1.9824
+[titan] 2025-09-09 21:34:23,755 - root - INFO - [34mlr: 5.5408e-06 gnorm: 0.38 [35m[2 days, 3:58:54<21:11:36][39m
+[titan] 2025-09-09 21:34:55,955 - root - INFO - [31mstep: 28420 [32mloss: 2.7472 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,176 [36mtflops: 485.00 [35mmfu: 49.04%[39m [37mglobal_avg_ntp_loss: 0.7821 [37mglobal_avg_top_loss: 1.9651
+[titan] 2025-09-09 21:34:55,956 - root - INFO - [34mlr: 5.5380e-06 gnorm: 0.37 [35m[2 days, 3:59:27<21:11:02][39m
+[titan] 2025-09-09 21:35:27,770 - root - INFO - [31mstep: 28425 [32mloss: 2.8168 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.90 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.8165 [37mglobal_avg_top_loss: 2.0003
+[titan] 2025-09-09 21:35:27,770 - root - INFO - [34mlr: 5.5352e-06 gnorm: 0.39 [35m[2 days, 3:59:58<21:10:29][39m
+[titan] 2025-09-09 21:35:59,676 - root - INFO - [31mstep: 28430 [32mloss: 2.8166 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.47 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.8164 [37mglobal_avg_top_loss: 2.0003
+[titan] 2025-09-09 21:35:59,677 - root - INFO - [34mlr: 5.5323e-06 gnorm: 0.38 [35m[2 days, 4:00:30<21:09:56][39m
+[titan] 2025-09-09 21:36:31,636 - root - INFO - [31mstep: 28435 [32mloss: 2.6888 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.67 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7517 [37mglobal_avg_top_loss: 1.9371
+[titan] 2025-09-09 21:36:31,636 - root - INFO - [34mlr: 5.5295e-06 gnorm: 0.37 [35m[2 days, 4:01:02<21:09:22][39m
+[titan] 2025-09-09 21:37:03,607 - root - INFO - [31mstep: 28440 [32mloss: 3.1246 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.49 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 1.0049 [37mglobal_avg_top_loss: 2.1197
+[titan] 2025-09-09 21:37:03,607 - root - INFO - [34mlr: 5.5266e-06 gnorm: 0.42 [35m[2 days, 4:01:34<21:08:49][39m
+[titan] 2025-09-09 21:37:35,461 - root - INFO - [31mstep: 28445 [32mloss: 2.7246 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.28 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7677 [37mglobal_avg_top_loss: 1.9569
+[titan] 2025-09-09 21:37:35,461 - root - INFO - [34mlr: 5.5238e-06 gnorm: 0.38 [35m[2 days, 4:02:06<21:08:16][39m
+[titan] 2025-09-09 21:38:01,194 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:38:07,556 - root - INFO - [31mstep: 28450 [32mloss: 2.9222 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,210 [36mtflops: 486.60 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.8822 [37mglobal_avg_top_loss: 2.0400
+[titan] 2025-09-09 21:38:07,556 - root - INFO - [34mlr: 5.5210e-06 gnorm: 0.39 [35m[2 days, 4:02:38<21:07:42][39m
+[titan] 2025-09-09 21:38:39,331 - root - INFO - [31mstep: 28455 [32mloss: 2.6796 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,313 [36mtflops: 491.51 [35mmfu: 49.70%[39m [37mglobal_avg_ntp_loss: 0.7506 [37mglobal_avg_top_loss: 1.9290
+[titan] 2025-09-09 21:38:39,331 - root - INFO - [34mlr: 5.5182e-06 gnorm: 0.43 [35m[2 days, 4:03:10<21:07:09][39m
+[titan] 2025-09-09 21:39:11,176 - root - INFO - [31mstep: 28460 [32mloss: 2.7121 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.42 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7631 [37mglobal_avg_top_loss: 1.9491
+[titan] 2025-09-09 21:39:11,176 - root - INFO - [34mlr: 5.5153e-06 gnorm: 0.40 [35m[2 days, 4:03:42<21:06:36][39m
+[titan] 2025-09-09 21:39:43,200 - root - INFO - [31mstep: 28465 [32mloss: 2.6685 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.68 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7458 [37mglobal_avg_top_loss: 1.9226
+[titan] 2025-09-09 21:39:43,200 - root - INFO - [34mlr: 5.5125e-06 gnorm: 0.37 [35m[2 days, 4:04:14<21:06:02][39m
+[titan] 2025-09-09 21:40:15,359 - root - INFO - [31mstep: 28470 [32mloss: 2.7464 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,190 [36mtflops: 485.63 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.7767 [37mglobal_avg_top_loss: 1.9698
+[titan] 2025-09-09 21:40:15,359 - root - INFO - [34mlr: 5.5097e-06 gnorm: 0.38 [35m[2 days, 4:04:46<21:05:29][39m
+[titan] 2025-09-09 21:40:47,551 - root - INFO - [31mstep: 28475 [32mloss: 2.6765 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,179 [36mtflops: 485.13 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7455 [37mglobal_avg_top_loss: 1.9310
+[titan] 2025-09-09 21:40:47,552 - root - INFO - [34mlr: 5.5068e-06 gnorm: 0.38 [35m[2 days, 4:05:18<21:04:56][39m
+[titan] 2025-09-09 21:41:19,545 - root - INFO - [31mstep: 28480 [32mloss: 2.7187 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.14 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7676 [37mglobal_avg_top_loss: 1.9511
+[titan] 2025-09-09 21:41:19,546 - root - INFO - [34mlr: 5.5040e-06 gnorm: 0.38 [35m[2 days, 4:05:50<21:04:23][39m
+[titan] 2025-09-09 21:41:51,389 - root - INFO - [31mstep: 28485 [32mloss: 2.6657 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.44 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7411 [37mglobal_avg_top_loss: 1.9245
+[titan] 2025-09-09 21:41:51,389 - root - INFO - [34mlr: 5.5012e-06 gnorm: 0.37 [35m[2 days, 4:06:22<21:03:49][39m
+[titan] 2025-09-09 21:42:23,178 - root - INFO - [31mstep: 28490 [32mloss: 3.2187 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,308 [36mtflops: 491.28 [35mmfu: 49.67%[39m [37mglobal_avg_ntp_loss: 1.0470 [37mglobal_avg_top_loss: 2.1716
+[titan] 2025-09-09 21:42:23,179 - root - INFO - [34mlr: 5.4984e-06 gnorm: 0.38 [35m[2 days, 4:06:54<21:03:16][39m
+[titan] 2025-09-09 21:42:55,118 - root - INFO - [31mstep: 28495 [32mloss: 2.6664 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.96 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7416 [37mglobal_avg_top_loss: 1.9248
+[titan] 2025-09-09 21:42:55,119 - root - INFO - [34mlr: 5.4955e-06 gnorm: 0.37 [35m[2 days, 4:07:26<21:02:43][39m
+[titan] 2025-09-09 21:43:20,653 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:43:27,078 - root - INFO - [31mstep: 28500 [32mloss: 2.7225 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.66 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7655 [37mglobal_avg_top_loss: 1.9571
+[titan] 2025-09-09 21:43:27,078 - root - INFO - [34mlr: 5.4927e-06 gnorm: 0.38 [35m[2 days, 4:07:58<21:02:09][39m
+[titan] 2025-09-09 21:43:59,034 - root - INFO - [31mstep: 28505 [32mloss: 2.6893 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.72 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7535 [37mglobal_avg_top_loss: 1.9358
+[titan] 2025-09-09 21:43:59,034 - root - INFO - [34mlr: 5.4899e-06 gnorm: 0.38 [35m[2 days, 4:08:30<21:01:36][39m
+[titan] 2025-09-09 21:44:30,839 - root - INFO - [31mstep: 28510 [32mloss: 2.7577 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,303 [36mtflops: 491.03 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7871 [37mglobal_avg_top_loss: 1.9706
+[titan] 2025-09-09 21:44:30,840 - root - INFO - [34mlr: 5.4871e-06 gnorm: 0.38 [35m[2 days, 4:09:01<21:01:03][39m
+[titan] 2025-09-09 21:45:03,153 - root - INFO - [31mstep: 28515 [32mloss: 2.6288 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,141 [36mtflops: 483.30 [35mmfu: 48.87%[39m [37mglobal_avg_ntp_loss: 0.7264 [37mglobal_avg_top_loss: 1.9024
+[titan] 2025-09-09 21:45:03,154 - root - INFO - [34mlr: 5.4842e-06 gnorm: 0.39 [35m[2 days, 4:09:34<21:00:29][39m
+[titan] 2025-09-09 21:45:35,081 - root - INFO - [31mstep: 28520 [32mloss: 3.2211 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.15 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 1.0502 [37mglobal_avg_top_loss: 2.1709
+[titan] 2025-09-09 21:45:35,081 - root - INFO - [34mlr: 5.4814e-06 gnorm: 0.39 [35m[2 days, 4:10:06<20:59:56][39m
+[titan] 2025-09-09 21:46:06,973 - root - INFO - [31mstep: 28525 [32mloss: 2.8803 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.69 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.8427 [37mglobal_avg_top_loss: 2.0376
+[titan] 2025-09-09 21:46:06,974 - root - INFO - [34mlr: 5.4786e-06 gnorm: 0.39 [35m[2 days, 4:10:38<20:59:23][39m
+[titan] 2025-09-09 21:46:38,919 - root - INFO - [31mstep: 28530 [32mloss: 2.5020 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.87 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.6680 [37mglobal_avg_top_loss: 1.8340
+[titan] 2025-09-09 21:46:38,920 - root - INFO - [34mlr: 5.4758e-06 gnorm: 0.36 [35m[2 days, 4:11:09<20:58:49][39m
+[titan] 2025-09-09 21:47:10,824 - root - INFO - [31mstep: 28535 [32mloss: 3.1806 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.51 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 1.0301 [37mglobal_avg_top_loss: 2.1505
+[titan] 2025-09-09 21:47:10,824 - root - INFO - [34mlr: 5.4730e-06 gnorm: 0.38 [35m[2 days, 4:11:41<20:58:16][39m
+[titan] 2025-09-09 21:47:42,767 - root - INFO - [31mstep: 28540 [32mloss: 2.7551 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.91 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7848 [37mglobal_avg_top_loss: 1.9703
+[titan] 2025-09-09 21:47:42,767 - root - INFO - [34mlr: 5.4701e-06 gnorm: 0.38 [35m[2 days, 4:12:13<20:57:43][39m
+[titan] 2025-09-09 21:48:14,746 - root - INFO - [31mstep: 28545 [32mloss: 2.6349 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.37 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7264 [37mglobal_avg_top_loss: 1.9085
+[titan] 2025-09-09 21:48:14,746 - root - INFO - [34mlr: 5.4673e-06 gnorm: 0.38 [35m[2 days, 4:12:45<20:57:09][39m
+[titan] 2025-09-09 21:48:40,188 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:48:46,628 - root - INFO - [31mstep: 28550 [32mloss: 2.7167 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,278 [36mtflops: 489.85 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7678 [37mglobal_avg_top_loss: 1.9489
+[titan] 2025-09-09 21:48:46,628 - root - INFO - [34mlr: 5.4645e-06 gnorm: 0.39 [35m[2 days, 4:13:17<20:56:36][39m
+[titan] 2025-09-09 21:49:18,585 - root - INFO - [31mstep: 28555 [32mloss: 2.6226 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.69 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7200 [37mglobal_avg_top_loss: 1.9026
+[titan] 2025-09-09 21:49:18,586 - root - INFO - [34mlr: 5.4617e-06 gnorm: 0.39 [35m[2 days, 4:13:49<20:56:03][39m
+[titan] 2025-09-09 21:49:50,577 - root - INFO - [31mstep: 28560 [32mloss: 2.8661 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.17 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.8341 [37mglobal_avg_top_loss: 2.0321
+[titan] 2025-09-09 21:49:50,578 - root - INFO - [34mlr: 5.4589e-06 gnorm: 0.41 [35m[2 days, 4:14:21<20:55:29][39m
+[titan] 2025-09-09 21:50:22,525 - root - INFO - [31mstep: 28565 [32mloss: 2.5915 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.84 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7055 [37mglobal_avg_top_loss: 1.8859
+[titan] 2025-09-09 21:50:22,526 - root - INFO - [34mlr: 5.4561e-06 gnorm: 0.39 [35m[2 days, 4:14:53<20:54:56][39m
+[titan] 2025-09-09 21:50:54,501 - root - INFO - [31mstep: 28570 [32mloss: 3.1971 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.41 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 1.0341 [37mglobal_avg_top_loss: 2.1630
+[titan] 2025-09-09 21:50:54,502 - root - INFO - [34mlr: 5.4533e-06 gnorm: 0.51 [35m[2 days, 4:15:25<20:54:23][39m
+[titan] 2025-09-09 21:51:26,697 - root - INFO - [31mstep: 28575 [32mloss: 2.6941 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.08 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7557 [37mglobal_avg_top_loss: 1.9385
+[titan] 2025-09-09 21:51:26,697 - root - INFO - [34mlr: 5.4504e-06 gnorm: 0.38 [35m[2 days, 4:15:57<20:53:50][39m
+[titan] 2025-09-09 21:51:58,672 - root - INFO - [31mstep: 28580 [32mloss: 2.6588 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.42 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7415 [37mglobal_avg_top_loss: 1.9174
+[titan] 2025-09-09 21:51:58,673 - root - INFO - [34mlr: 5.4476e-06 gnorm: 0.40 [35m[2 days, 4:16:29<20:53:16][39m
+[titan] 2025-09-09 21:52:30,508 - root - INFO - [31mstep: 28585 [32mloss: 3.1974 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.56 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 1.0373 [37mglobal_avg_top_loss: 2.1600
+[titan] 2025-09-09 21:52:30,509 - root - INFO - [34mlr: 5.4448e-06 gnorm: 0.39 [35m[2 days, 4:17:01<20:52:43][39m
+[titan] 2025-09-09 21:53:02,463 - root - INFO - [31mstep: 28590 [32mloss: 2.7603 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.73 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7827 [37mglobal_avg_top_loss: 1.9776
+[titan] 2025-09-09 21:53:02,464 - root - INFO - [34mlr: 5.4420e-06 gnorm: 0.40 [35m[2 days, 4:17:33<20:52:10][39m
+[titan] 2025-09-09 21:53:34,353 - root - INFO - [31mstep: 28595 [32mloss: 2.7405 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.73 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7735 [37mglobal_avg_top_loss: 1.9670
+[titan] 2025-09-09 21:53:34,354 - root - INFO - [34mlr: 5.4392e-06 gnorm: 0.40 [35m[2 days, 4:18:05<20:51:36][39m
+[titan] 2025-09-09 21:54:00,249 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:54:06,571 - root - INFO - [31mstep: 28600 [32mloss: 3.0858 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,171 [36mtflops: 484.75 [35mmfu: 49.01%[39m [37mglobal_avg_ntp_loss: 0.9849 [37mglobal_avg_top_loss: 2.1009
+[titan] 2025-09-09 21:54:06,571 - root - INFO - [34mlr: 5.4364e-06 gnorm: 0.38 [35m[2 days, 4:18:37<20:51:03][39m
+[titan] 2025-09-09 21:54:38,498 - root - INFO - [31mstep: 28605 [32mloss: 2.6908 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.16 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7542 [37mglobal_avg_top_loss: 1.9367
+[titan] 2025-09-09 21:54:38,498 - root - INFO - [34mlr: 5.4336e-06 gnorm: 0.38 [35m[2 days, 4:19:09<20:50:30][39m
+[titan] 2025-09-09 21:55:10,493 - root - INFO - [31mstep: 28610 [32mloss: 2.7055 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.12 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7618 [37mglobal_avg_top_loss: 1.9437
+[titan] 2025-09-09 21:55:10,493 - root - INFO - [34mlr: 5.4308e-06 gnorm: 0.38 [35m[2 days, 4:19:41<20:49:57][39m
+[titan] 2025-09-09 21:55:42,512 - root - INFO - [31mstep: 28615 [32mloss: 3.1941 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.75 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 1.0331 [37mglobal_avg_top_loss: 2.1610
+[titan] 2025-09-09 21:55:42,512 - root - INFO - [34mlr: 5.4280e-06 gnorm: 0.40 [35m[2 days, 4:20:13<20:49:23][39m
+[titan] 2025-09-09 21:56:14,457 - root - INFO - [31mstep: 28620 [32mloss: 2.7257 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.89 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7699 [37mglobal_avg_top_loss: 1.9557
+[titan] 2025-09-09 21:56:14,457 - root - INFO - [34mlr: 5.4252e-06 gnorm: 0.37 [35m[2 days, 4:20:45<20:48:50][39m
+[titan] 2025-09-09 21:56:46,459 - root - INFO - [31mstep: 28625 [32mloss: 2.6829 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.01 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7492 [37mglobal_avg_top_loss: 1.9337
+[titan] 2025-09-09 21:56:46,459 - root - INFO - [34mlr: 5.4224e-06 gnorm: 0.38 [35m[2 days, 4:21:17<20:48:17][39m
+[titan] 2025-09-09 21:57:18,580 - root - INFO - [31mstep: 28630 [32mloss: 2.6641 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,202 [36mtflops: 486.21 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.7534 [37mglobal_avg_top_loss: 1.9106
+[titan] 2025-09-09 21:57:18,580 - root - INFO - [34mlr: 5.4196e-06 gnorm: 0.41 [35m[2 days, 4:21:49<20:47:43][39m
+[titan] 2025-09-09 21:57:50,515 - root - INFO - [31mstep: 28635 [32mloss: 2.6440 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.03 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7312 [37mglobal_avg_top_loss: 1.9128
+[titan] 2025-09-09 21:57:50,516 - root - INFO - [34mlr: 5.4168e-06 gnorm: 0.37 [35m[2 days, 4:22:21<20:47:10][39m
+[titan] 2025-09-09 21:58:22,561 - root - INFO - [31mstep: 28640 [32mloss: 2.6476 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.35 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7352 [37mglobal_avg_top_loss: 1.9124
+[titan] 2025-09-09 21:58:22,561 - root - INFO - [34mlr: 5.4140e-06 gnorm: 0.38 [35m[2 days, 4:22:53<20:46:37][39m
+[titan] 2025-09-09 21:58:54,368 - root - INFO - [31mstep: 28645 [32mloss: 2.7637 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,302 [36mtflops: 491.00 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7900 [37mglobal_avg_top_loss: 1.9737
+[titan] 2025-09-09 21:58:54,369 - root - INFO - [34mlr: 5.4112e-06 gnorm: 0.42 [35m[2 days, 4:23:25<20:46:03][39m
+[titan] 2025-09-09 21:59:20,073 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 21:59:26,572 - root - INFO - [31mstep: 28650 [32mloss: 3.1641 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,175 [36mtflops: 484.95 [35mmfu: 49.03%[39m [37mglobal_avg_ntp_loss: 1.0333 [37mglobal_avg_top_loss: 2.1307
+[titan] 2025-09-09 21:59:26,573 - root - INFO - [34mlr: 5.4084e-06 gnorm: 0.39 [35m[2 days, 4:23:57<20:45:30][39m
+[titan] 2025-09-09 21:59:58,660 - root - INFO - [31mstep: 28655 [32mloss: 2.7249 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.72 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7668 [37mglobal_avg_top_loss: 1.9581
+[titan] 2025-09-09 21:59:58,660 - root - INFO - [34mlr: 5.4056e-06 gnorm: 0.39 [35m[2 days, 4:24:29<20:44:57][39m
+[titan] 2025-09-09 22:00:30,606 - root - INFO - [31mstep: 28660 [32mloss: 2.5616 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.87 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.6933 [37mglobal_avg_top_loss: 1.8683
+[titan] 2025-09-09 22:00:30,606 - root - INFO - [34mlr: 5.4028e-06 gnorm: 0.39 [35m[2 days, 4:25:01<20:44:24][39m
+[titan] 2025-09-09 22:01:02,610 - root - INFO - [31mstep: 28665 [32mloss: 3.1232 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.99 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 1.0025 [37mglobal_avg_top_loss: 2.1206
+[titan] 2025-09-09 22:01:02,610 - root - INFO - [34mlr: 5.4000e-06 gnorm: 0.38 [35m[2 days, 4:25:33<20:43:50][39m
+[titan] 2025-09-09 22:01:34,629 - root - INFO - [31mstep: 28670 [32mloss: 2.6952 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.75 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7604 [37mglobal_avg_top_loss: 1.9348
+[titan] 2025-09-09 22:01:34,629 - root - INFO - [34mlr: 5.3972e-06 gnorm: 0.39 [35m[2 days, 4:26:05<20:43:17][39m
+[titan] 2025-09-09 22:01:47,793 - root - INFO - Dumping profiler traces at step 28672
+[titan] 2025-09-09 22:01:47,864 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 22:02:06,898 - root - INFO - [31mstep: 28675 [32mloss: 2.6586 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,155 [36mtflops: 483.98 [35mmfu: 48.94%[39m [37mglobal_avg_ntp_loss: 0.7408 [37mglobal_avg_top_loss: 1.9179
+[titan] 2025-09-09 22:02:06,899 - root - INFO - [34mlr: 5.3944e-06 gnorm: 0.38 [35m[2 days, 4:26:37<20:42:44][39m
+[titan] 2025-09-09 22:02:38,744 - root - INFO - [31mstep: 28680 [32mloss: 2.6551 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.41 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7363 [37mglobal_avg_top_loss: 1.9188
+[titan] 2025-09-09 22:02:38,744 - root - INFO - [34mlr: 5.3916e-06 gnorm: 0.36 [35m[2 days, 4:27:09<20:42:11][39m
+[titan] 2025-09-09 22:03:10,799 - root - INFO - [31mstep: 28685 [32mloss: 2.7268 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.21 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7700 [37mglobal_avg_top_loss: 1.9569
+[titan] 2025-09-09 22:03:10,799 - root - INFO - [34mlr: 5.3888e-06 gnorm: 0.40 [35m[2 days, 4:27:41<20:41:37][39m
+[titan] 2025-09-09 22:03:42,679 - root - INFO - [31mstep: 28690 [32mloss: 2.6472 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.88 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7358 [37mglobal_avg_top_loss: 1.9114
+[titan] 2025-09-09 22:03:42,679 - root - INFO - [34mlr: 5.3860e-06 gnorm: 0.37 [35m[2 days, 4:28:13<20:41:04][39m
+[titan] 2025-09-09 22:04:14,853 - root - INFO - [31mstep: 28695 [32mloss: 3.1900 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,185 [36mtflops: 485.40 [35mmfu: 49.08%[39m [37mglobal_avg_ntp_loss: 1.0345 [37mglobal_avg_top_loss: 2.1555
+[titan] 2025-09-09 22:04:14,854 - root - INFO - [34mlr: 5.3833e-06 gnorm: 0.37 [35m[2 days, 4:28:45<20:40:31][39m
+[titan] 2025-09-09 22:04:40,544 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:04:46,936 - root - INFO - [31mstep: 28700 [32mloss: 2.7025 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,214 [36mtflops: 486.78 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7573 [37mglobal_avg_top_loss: 1.9451
+[titan] 2025-09-09 22:04:46,937 - root - INFO - [34mlr: 5.3805e-06 gnorm: 0.38 [35m[2 days, 4:29:17<20:39:58][39m
+[titan] 2025-09-09 22:05:19,119 - root - INFO - [31mstep: 28705 [32mloss: 2.6819 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,182 [36mtflops: 485.27 [35mmfu: 49.07%[39m [37mglobal_avg_ntp_loss: 0.7505 [37mglobal_avg_top_loss: 1.9314
+[titan] 2025-09-09 22:05:19,120 - root - INFO - [34mlr: 5.3777e-06 gnorm: 0.38 [35m[2 days, 4:29:50<20:39:24][39m
+[titan] 2025-09-09 22:05:51,276 - root - INFO - [31mstep: 28710 [32mloss: 2.6720 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,190 [36mtflops: 485.67 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 0.7448 [37mglobal_avg_top_loss: 1.9272
+[titan] 2025-09-09 22:05:51,276 - root - INFO - [34mlr: 5.3749e-06 gnorm: 0.40 [35m[2 days, 4:30:22<20:38:51][39m
+[titan] 2025-09-09 22:06:23,209 - root - INFO - [31mstep: 28715 [32mloss: 3.1158 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.07 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.9988 [37mglobal_avg_top_loss: 2.1171
+[titan] 2025-09-09 22:06:23,209 - root - INFO - [34mlr: 5.3721e-06 gnorm: 0.39 [35m[2 days, 4:30:54<20:38:18][39m
+[titan] 2025-09-09 22:06:55,018 - root - INFO - [31mstep: 28720 [32mloss: 2.6877 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,302 [36mtflops: 490.97 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7519 [37mglobal_avg_top_loss: 1.9358
+[titan] 2025-09-09 22:06:55,018 - root - INFO - [34mlr: 5.3693e-06 gnorm: 0.40 [35m[2 days, 4:31:26<20:37:44][39m
+[titan] 2025-09-09 22:07:27,132 - root - INFO - [31mstep: 28725 [32mloss: 2.6639 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,204 [36mtflops: 486.31 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.7421 [37mglobal_avg_top_loss: 1.9218
+[titan] 2025-09-09 22:07:27,133 - root - INFO - [34mlr: 5.3665e-06 gnorm: 0.40 [35m[2 days, 4:31:58<20:37:11][39m
+[titan] 2025-09-09 22:07:58,984 - root - INFO - [31mstep: 28730 [32mloss: 3.1616 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,288 [36mtflops: 490.32 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 1.0217 [37mglobal_avg_top_loss: 2.1399
+[titan] 2025-09-09 22:07:58,985 - root - INFO - [34mlr: 5.3637e-06 gnorm: 0.42 [35m[2 days, 4:32:29<20:36:38][39m
+[titan] 2025-09-09 22:08:31,198 - root - INFO - [31mstep: 28735 [32mloss: 2.6441 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,172 [36mtflops: 484.81 [35mmfu: 49.02%[39m [37mglobal_avg_ntp_loss: 0.7313 [37mglobal_avg_top_loss: 1.9128
+[titan] 2025-09-09 22:08:31,198 - root - INFO - [34mlr: 5.3610e-06 gnorm: 0.38 [35m[2 days, 4:33:02<20:36:05][39m
+[titan] 2025-09-09 22:09:03,305 - root - INFO - [31mstep: 28740 [32mloss: 2.7205 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,206 [36mtflops: 486.42 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7682 [37mglobal_avg_top_loss: 1.9522
+[titan] 2025-09-09 22:09:03,305 - root - INFO - [34mlr: 5.3582e-06 gnorm: 0.41 [35m[2 days, 4:33:34<20:35:31][39m
+[titan] 2025-09-09 22:09:35,325 - root - INFO - [31mstep: 28745 [32mloss: 3.0771 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.74 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.9833 [37mglobal_avg_top_loss: 2.0938
+[titan] 2025-09-09 22:09:35,325 - root - INFO - [34mlr: 5.3554e-06 gnorm: 0.39 [35m[2 days, 4:34:06<20:34:58][39m
+[titan] 2025-09-09 22:10:01,068 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:10:07,441 - root - INFO - [31mstep: 28750 [32mloss: 2.6752 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,203 [36mtflops: 486.28 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.7440 [37mglobal_avg_top_loss: 1.9311
+[titan] 2025-09-09 22:10:07,441 - root - INFO - [34mlr: 5.3526e-06 gnorm: 0.38 [35m[2 days, 4:34:38<20:34:25][39m
+[titan] 2025-09-09 22:10:39,495 - root - INFO - [31mstep: 28755 [32mloss: 2.6750 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,223 [36mtflops: 487.23 [35mmfu: 49.26%[39m [37mglobal_avg_ntp_loss: 0.7475 [37mglobal_avg_top_loss: 1.9275
+[titan] 2025-09-09 22:10:39,495 - root - INFO - [34mlr: 5.3498e-06 gnorm: 0.37 [35m[2 days, 4:35:10<20:33:52][39m
+[titan] 2025-09-09 22:11:11,429 - root - INFO - [31mstep: 28760 [32mloss: 2.7428 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.05 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7751 [37mglobal_avg_top_loss: 1.9677
+[titan] 2025-09-09 22:11:11,430 - root - INFO - [34mlr: 5.3471e-06 gnorm: 0.39 [35m[2 days, 4:35:42<20:33:18][39m
+[titan] 2025-09-09 22:11:43,494 - root - INFO - [31mstep: 28765 [32mloss: 2.6364 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,220 [36mtflops: 487.06 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.7254 [37mglobal_avg_top_loss: 1.9110
+[titan] 2025-09-09 22:11:43,494 - root - INFO - [34mlr: 5.3443e-06 gnorm: 0.38 [35m[2 days, 4:36:14<20:32:45][39m
+[titan] 2025-09-09 22:12:15,584 - root - INFO - [31mstep: 28770 [32mloss: 2.6527 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.68 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7365 [37mglobal_avg_top_loss: 1.9163
+[titan] 2025-09-09 22:12:15,584 - root - INFO - [34mlr: 5.3415e-06 gnorm: 0.38 [35m[2 days, 4:36:46<20:32:12][39m
+[titan] 2025-09-09 22:12:47,720 - root - INFO - [31mstep: 28775 [32mloss: 3.1524 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,197 [36mtflops: 485.98 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 1.0147 [37mglobal_avg_top_loss: 2.1377
+[titan] 2025-09-09 22:12:47,720 - root - INFO - [34mlr: 5.3387e-06 gnorm: 0.39 [35m[2 days, 4:37:18<20:31:39][39m
+[titan] 2025-09-09 22:13:19,821 - root - INFO - [31mstep: 28780 [32mloss: 2.6481 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,208 [36mtflops: 486.50 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7351 [37mglobal_avg_top_loss: 1.9131
+[titan] 2025-09-09 22:13:19,822 - root - INFO - [34mlr: 5.3360e-06 gnorm: 0.39 [35m[2 days, 4:37:50<20:31:05][39m
+[titan] 2025-09-09 22:13:52,050 - root - INFO - [31mstep: 28785 [32mloss: 2.6342 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,168 [36mtflops: 484.59 [35mmfu: 49.00%[39m [37mglobal_avg_ntp_loss: 0.7244 [37mglobal_avg_top_loss: 1.9098
+[titan] 2025-09-09 22:13:52,050 - root - INFO - [34mlr: 5.3332e-06 gnorm: 0.40 [35m[2 days, 4:38:23<20:30:32][39m
+[titan] 2025-09-09 22:14:24,061 - root - INFO - [31mstep: 28790 [32mloss: 2.6920 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.87 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7544 [37mglobal_avg_top_loss: 1.9376
+[titan] 2025-09-09 22:14:24,062 - root - INFO - [34mlr: 5.3304e-06 gnorm: 0.40 [35m[2 days, 4:38:55<20:29:59][39m
+[titan] 2025-09-09 22:14:56,308 - root - INFO - [31mstep: 28795 [32mloss: 3.1088 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,162 [36mtflops: 484.31 [35mmfu: 48.97%[39m [37mglobal_avg_ntp_loss: 0.9950 [37mglobal_avg_top_loss: 2.1138
+[titan] 2025-09-09 22:14:56,308 - root - INFO - [34mlr: 5.3276e-06 gnorm: 0.39 [35m[2 days, 4:39:27<20:29:26][39m
+[titan] 2025-09-09 22:15:21,980 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:15:28,411 - root - INFO - [31mstep: 28800 [32mloss: 2.6261 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.47 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7230 [37mglobal_avg_top_loss: 1.9030
+[titan] 2025-09-09 22:15:28,412 - root - INFO - [34mlr: 5.3249e-06 gnorm: 0.38 [35m[2 days, 4:39:59<20:28:53][39m
+[titan] 2025-09-09 22:16:00,462 - root - INFO - [31mstep: 28805 [32mloss: 2.5911 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.27 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7104 [37mglobal_avg_top_loss: 1.8807
+[titan] 2025-09-09 22:16:00,463 - root - INFO - [34mlr: 5.3221e-06 gnorm: 0.37 [35m[2 days, 4:40:31<20:28:19][39m
+[titan] 2025-09-09 22:16:32,557 - root - INFO - [31mstep: 28810 [32mloss: 3.1791 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,210 [36mtflops: 486.60 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 1.0293 [37mglobal_avg_top_loss: 2.1497
+[titan] 2025-09-09 22:16:32,558 - root - INFO - [34mlr: 5.3193e-06 gnorm: 0.39 [35m[2 days, 4:41:03<20:27:46][39m
+[titan] 2025-09-09 22:17:04,649 - root - INFO - [31mstep: 28815 [32mloss: 2.6710 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.64 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7457 [37mglobal_avg_top_loss: 1.9253
+[titan] 2025-09-09 22:17:04,650 - root - INFO - [34mlr: 5.3166e-06 gnorm: 0.44 [35m[2 days, 4:41:35<20:27:13][39m
+[titan] 2025-09-09 22:17:36,857 - root - INFO - [31mstep: 28820 [32mloss: 2.7233 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,174 [36mtflops: 484.89 [35mmfu: 49.03%[39m [37mglobal_avg_ntp_loss: 0.7658 [37mglobal_avg_top_loss: 1.9575
+[titan] 2025-09-09 22:17:36,858 - root - INFO - [34mlr: 5.3138e-06 gnorm: 0.38 [35m[2 days, 4:42:07<20:26:40][39m
+[titan] 2025-09-09 22:18:08,733 - root - INFO - [31mstep: 28825 [32mloss: 2.8850 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.95 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.8689 [37mglobal_avg_top_loss: 2.0160
+[titan] 2025-09-09 22:18:08,733 - root - INFO - [34mlr: 5.3110e-06 gnorm: 0.38 [35m[2 days, 4:42:39<20:26:06][39m
+[titan] 2025-09-09 22:18:40,756 - root - INFO - [31mstep: 28830 [32mloss: 2.6594 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,233 [36mtflops: 487.69 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7398 [37mglobal_avg_top_loss: 1.9196
+[titan] 2025-09-09 22:18:40,756 - root - INFO - [34mlr: 5.3083e-06 gnorm: 0.39 [35m[2 days, 4:43:11<20:25:33][39m
+[titan] 2025-09-09 22:19:12,753 - root - INFO - [31mstep: 28835 [32mloss: 2.6925 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.09 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7521 [37mglobal_avg_top_loss: 1.9404
+[titan] 2025-09-09 22:19:12,754 - root - INFO - [34mlr: 5.3055e-06 gnorm: 0.43 [35m[2 days, 4:43:43<20:25:00][39m
+[titan] 2025-09-09 22:19:44,721 - root - INFO - [31mstep: 28840 [32mloss: 2.6651 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.55 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7416 [37mglobal_avg_top_loss: 1.9235
+[titan] 2025-09-09 22:19:44,721 - root - INFO - [34mlr: 5.3027e-06 gnorm: 0.39 [35m[2 days, 4:44:15<20:24:27][39m
+[titan] 2025-09-09 22:20:16,646 - root - INFO - [31mstep: 28845 [32mloss: 2.7803 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,264 [36mtflops: 489.19 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7930 [37mglobal_avg_top_loss: 1.9873
+[titan] 2025-09-09 22:20:16,646 - root - INFO - [34mlr: 5.3000e-06 gnorm: 0.40 [35m[2 days, 4:44:47<20:23:53][39m
+[titan] 2025-09-09 22:20:42,120 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:20:48,597 - root - INFO - [31mstep: 28850 [32mloss: 2.8029 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.79 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.8124 [37mglobal_avg_top_loss: 1.9906
+[titan] 2025-09-09 22:20:48,598 - root - INFO - [34mlr: 5.2972e-06 gnorm: 0.41 [35m[2 days, 4:45:19<20:23:20][39m
+[titan] 2025-09-09 22:21:20,777 - root - INFO - [31mstep: 28855 [32mloss: 2.6903 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,183 [36mtflops: 485.33 [35mmfu: 49.07%[39m [37mglobal_avg_ntp_loss: 0.7583 [37mglobal_avg_top_loss: 1.9320
+[titan] 2025-09-09 22:21:20,777 - root - INFO - [34mlr: 5.2944e-06 gnorm: 0.41 [35m[2 days, 4:45:51<20:22:47][39m
+[titan] 2025-09-09 22:21:52,887 - root - INFO - [31mstep: 28860 [32mloss: 2.7436 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.36 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7766 [37mglobal_avg_top_loss: 1.9670
+[titan] 2025-09-09 22:21:52,888 - root - INFO - [34mlr: 5.2917e-06 gnorm: 0.38 [35m[2 days, 4:46:23<20:22:14][39m
+[titan] 2025-09-09 22:22:24,858 - root - INFO - [31mstep: 28865 [32mloss: 2.7135 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,250 [36mtflops: 488.49 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7639 [37mglobal_avg_top_loss: 1.9496
+[titan] 2025-09-09 22:22:24,858 - root - INFO - [34mlr: 5.2889e-06 gnorm: 0.41 [35m[2 days, 4:46:55<20:21:40][39m
+[titan] 2025-09-09 22:22:56,995 - root - INFO - [31mstep: 28870 [32mloss: 2.7356 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,197 [36mtflops: 485.97 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.7734 [37mglobal_avg_top_loss: 1.9622
+[titan] 2025-09-09 22:22:56,995 - root - INFO - [34mlr: 5.2862e-06 gnorm: 0.38 [35m[2 days, 4:47:27<20:21:07][39m
+[titan] 2025-09-09 22:23:28,997 - root - INFO - [31mstep: 28875 [32mloss: 3.6074 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.01 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 1.2744 [37mglobal_avg_top_loss: 2.3330
+[titan] 2025-09-09 22:23:28,997 - root - INFO - [34mlr: 5.2834e-06 gnorm: 0.39 [35m[2 days, 4:47:59<20:20:34][39m
+[titan] 2025-09-09 22:24:01,395 - root - INFO - [31mstep: 28880 [32mloss: 2.6378 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,115 [36mtflops: 482.05 [35mmfu: 48.74%[39m [37mglobal_avg_ntp_loss: 0.7282 [37mglobal_avg_top_loss: 1.9096
+[titan] 2025-09-09 22:24:01,395 - root - INFO - [34mlr: 5.2807e-06 gnorm: 0.38 [35m[2 days, 4:48:32<20:20:01][39m
+[titan] 2025-09-09 22:24:33,525 - root - INFO - [31mstep: 28885 [32mloss: 2.6623 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,199 [36mtflops: 486.06 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7431 [37mglobal_avg_top_loss: 1.9191
+[titan] 2025-09-09 22:24:33,525 - root - INFO - [34mlr: 5.2779e-06 gnorm: 0.38 [35m[2 days, 4:49:04<20:19:27][39m
+[titan] 2025-09-09 22:25:05,463 - root - INFO - [31mstep: 28890 [32mloss: 3.1742 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,260 [36mtflops: 489.00 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 1.0287 [37mglobal_avg_top_loss: 2.1455
+[titan] 2025-09-09 22:25:05,463 - root - INFO - [34mlr: 5.2751e-06 gnorm: 0.38 [35m[2 days, 4:49:36<20:18:54][39m
+[titan] 2025-09-09 22:25:37,496 - root - INFO - [31mstep: 28895 [32mloss: 2.6747 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.53 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7448 [37mglobal_avg_top_loss: 1.9299
+[titan] 2025-09-09 22:25:37,497 - root - INFO - [34mlr: 5.2724e-06 gnorm: 0.40 [35m[2 days, 4:50:08<20:18:21][39m
+[titan] 2025-09-09 22:26:02,908 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:26:09,316 - root - INFO - [31mstep: 28900 [32mloss: 2.6546 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.82 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7368 [37mglobal_avg_top_loss: 1.9178
+[titan] 2025-09-09 22:26:09,316 - root - INFO - [34mlr: 5.2696e-06 gnorm: 0.42 [35m[2 days, 4:50:40<20:17:48][39m
+[titan] 2025-09-09 22:26:41,177 - root - INFO - [31mstep: 28905 [32mloss: 2.6410 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,285 [36mtflops: 490.17 [35mmfu: 49.56%[39m [37mglobal_avg_ntp_loss: 0.7279 [37mglobal_avg_top_loss: 1.9131
+[titan] 2025-09-09 22:26:41,177 - root - INFO - [34mlr: 5.2669e-06 gnorm: 0.38 [35m[2 days, 4:51:12<20:17:14][39m
+[titan] 2025-09-09 22:27:13,290 - root - INFO - [31mstep: 28910 [32mloss: 2.6782 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,204 [36mtflops: 486.33 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.7516 [37mglobal_avg_top_loss: 1.9265
+[titan] 2025-09-09 22:27:13,290 - root - INFO - [34mlr: 5.2641e-06 gnorm: 0.38 [35m[2 days, 4:51:44<20:16:41][39m
+[titan] 2025-09-09 22:27:45,302 - root - INFO - [31mstep: 28915 [32mloss: 2.6866 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.85 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7632 [37mglobal_avg_top_loss: 1.9234
+[titan] 2025-09-09 22:27:45,303 - root - INFO - [34mlr: 5.2614e-06 gnorm: 0.37 [35m[2 days, 4:52:16<20:16:08][39m
+[titan] 2025-09-09 22:28:17,219 - root - INFO - [31mstep: 28920 [32mloss: 2.7901 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.33 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7978 [37mglobal_avg_top_loss: 1.9922
+[titan] 2025-09-09 22:28:17,219 - root - INFO - [34mlr: 5.2586e-06 gnorm: 0.39 [35m[2 days, 4:52:48<20:15:34][39m
+[titan] 2025-09-09 22:28:49,229 - root - INFO - [31mstep: 28925 [32mloss: 2.6673 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.89 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7464 [37mglobal_avg_top_loss: 1.9209
+[titan] 2025-09-09 22:28:49,229 - root - INFO - [34mlr: 5.2559e-06 gnorm: 0.38 [35m[2 days, 4:53:20<20:15:01][39m
+[titan] 2025-09-09 22:29:21,338 - root - INFO - [31mstep: 28930 [32mloss: 2.7162 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.38 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7631 [37mglobal_avg_top_loss: 1.9531
+[titan] 2025-09-09 22:29:21,339 - root - INFO - [34mlr: 5.2531e-06 gnorm: 0.41 [35m[2 days, 4:53:52<20:14:28][39m
+[titan] 2025-09-09 22:29:53,381 - root - INFO - [31mstep: 28935 [32mloss: 2.6847 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.39 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7489 [37mglobal_avg_top_loss: 1.9358
+[titan] 2025-09-09 22:29:53,381 - root - INFO - [34mlr: 5.2504e-06 gnorm: 0.40 [35m[2 days, 4:54:24<20:13:55][39m
+[titan] 2025-09-09 22:30:25,603 - root - INFO - [31mstep: 28940 [32mloss: 2.7387 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,170 [36mtflops: 484.68 [35mmfu: 49.01%[39m [37mglobal_avg_ntp_loss: 0.7740 [37mglobal_avg_top_loss: 1.9647
+[titan] 2025-09-09 22:30:25,603 - root - INFO - [34mlr: 5.2476e-06 gnorm: 0.39 [35m[2 days, 4:54:56<20:13:22][39m
+[titan] 2025-09-09 22:30:57,518 - root - INFO - [31mstep: 28945 [32mloss: 2.7068 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,268 [36mtflops: 489.35 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7588 [37mglobal_avg_top_loss: 1.9480
+[titan] 2025-09-09 22:30:57,518 - root - INFO - [34mlr: 5.2449e-06 gnorm: 0.41 [35m[2 days, 4:55:28<20:12:48][39m
+[titan] 2025-09-09 22:31:23,140 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:31:29,555 - root - INFO - [31mstep: 28950 [32mloss: 2.7610 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.47 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7847 [37mglobal_avg_top_loss: 1.9763
+[titan] 2025-09-09 22:31:29,556 - root - INFO - [34mlr: 5.2422e-06 gnorm: 0.39 [35m[2 days, 4:56:00<20:12:15][39m
+[titan] 2025-09-09 22:32:01,535 - root - INFO - [31mstep: 28955 [32mloss: 3.6030 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.35 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 1.2743 [37mglobal_avg_top_loss: 2.3287
+[titan] 2025-09-09 22:32:01,536 - root - INFO - [34mlr: 5.2394e-06 gnorm: 0.42 [35m[2 days, 4:56:32<20:11:42][39m
+[titan] 2025-09-09 22:32:33,518 - root - INFO - [31mstep: 28960 [32mloss: 2.6624 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.31 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7422 [37mglobal_avg_top_loss: 1.9202
+[titan] 2025-09-09 22:32:33,518 - root - INFO - [34mlr: 5.2367e-06 gnorm: 0.38 [35m[2 days, 4:57:04<20:11:08][39m
+[titan] 2025-09-09 22:33:05,436 - root - INFO - [31mstep: 28965 [32mloss: 2.6714 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.30 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7453 [37mglobal_avg_top_loss: 1.9261
+[titan] 2025-09-09 22:33:05,436 - root - INFO - [34mlr: 5.2339e-06 gnorm: 0.40 [35m[2 days, 4:57:36<20:10:35][39m
+[titan] 2025-09-09 22:33:37,573 - root - INFO - [31mstep: 28970 [32mloss: 2.6994 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,197 [36mtflops: 485.96 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.7558 [37mglobal_avg_top_loss: 1.9435
+[titan] 2025-09-09 22:33:37,573 - root - INFO - [34mlr: 5.2312e-06 gnorm: 0.39 [35m[2 days, 4:58:08<20:10:02][39m
+[titan] 2025-09-09 22:34:09,684 - root - INFO - [31mstep: 28975 [32mloss: 2.5462 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.36 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.6863 [37mglobal_avg_top_loss: 1.8599
+[titan] 2025-09-09 22:34:09,685 - root - INFO - [34mlr: 5.2284e-06 gnorm: 0.39 [35m[2 days, 4:58:40<20:09:29][39m
+[titan] 2025-09-09 22:34:41,910 - root - INFO - [31mstep: 28980 [32mloss: 2.6703 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,169 [36mtflops: 484.63 [35mmfu: 49.00%[39m [37mglobal_avg_ntp_loss: 0.7429 [37mglobal_avg_top_loss: 1.9274
+[titan] 2025-09-09 22:34:41,910 - root - INFO - [34mlr: 5.2257e-06 gnorm: 0.39 [35m[2 days, 4:59:12<20:08:56][39m
+[titan] 2025-09-09 22:35:13,710 - root - INFO - [31mstep: 28985 [32mloss: 2.6346 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,305 [36mtflops: 491.11 [35mmfu: 49.66%[39m [37mglobal_avg_ntp_loss: 0.7294 [37mglobal_avg_top_loss: 1.9052
+[titan] 2025-09-09 22:35:13,710 - root - INFO - [34mlr: 5.2230e-06 gnorm: 0.37 [35m[2 days, 4:59:44<20:08:22][39m
+[titan] 2025-09-09 22:35:45,684 - root - INFO - [31mstep: 28990 [32mloss: 2.7377 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,249 [36mtflops: 488.44 [35mmfu: 49.39%[39m [37mglobal_avg_ntp_loss: 0.7751 [37mglobal_avg_top_loss: 1.9626
+[titan] 2025-09-09 22:35:45,685 - root - INFO - [34mlr: 5.2202e-06 gnorm: 0.38 [35m[2 days, 5:00:16<20:07:49][39m
+[titan] 2025-09-09 22:36:17,704 - root - INFO - [31mstep: 28995 [32mloss: 2.6655 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.75 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7393 [37mglobal_avg_top_loss: 1.9262
+[titan] 2025-09-09 22:36:17,704 - root - INFO - [34mlr: 5.2175e-06 gnorm: 0.39 [35m[2 days, 5:00:48<20:07:16][39m
+[titan] 2025-09-09 22:36:43,429 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:36:49,790 - root - INFO - [31mstep: 29000 [32mloss: 2.6706 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,213 [36mtflops: 486.74 [35mmfu: 49.22%[39m [37mglobal_avg_ntp_loss: 0.7425 [37mglobal_avg_top_loss: 1.9281
+[titan] 2025-09-09 22:36:49,790 - root - INFO - [34mlr: 5.2148e-06 gnorm: 0.38 [35m[2 days, 5:01:20<20:06:43][39m
+[titan] 2025-09-09 22:37:21,733 - root - INFO - [31mstep: 29005 [32mloss: 2.6569 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.91 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7413 [37mglobal_avg_top_loss: 1.9156
+[titan] 2025-09-09 22:37:21,734 - root - INFO - [34mlr: 5.2120e-06 gnorm: 0.38 [35m[2 days, 5:01:52<20:06:09][39m
+[titan] 2025-09-09 22:37:53,908 - root - INFO - [31mstep: 29010 [32mloss: 2.7091 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,185 [36mtflops: 485.39 [35mmfu: 49.08%[39m [37mglobal_avg_ntp_loss: 0.7599 [37mglobal_avg_top_loss: 1.9493
+[titan] 2025-09-09 22:37:53,909 - root - INFO - [34mlr: 5.2093e-06 gnorm: 0.39 [35m[2 days, 5:02:24<20:05:36][39m
+[titan] 2025-09-09 22:38:25,933 - root - INFO - [31mstep: 29015 [32mloss: 2.6873 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.67 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.7524 [37mglobal_avg_top_loss: 1.9349
+[titan] 2025-09-09 22:38:25,934 - root - INFO - [34mlr: 5.2066e-06 gnorm: 0.39 [35m[2 days, 5:02:56<20:05:03][39m
+[titan] 2025-09-09 22:38:58,108 - root - INFO - [31mstep: 29020 [32mloss: 2.6560 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,185 [36mtflops: 485.39 [35mmfu: 49.08%[39m [37mglobal_avg_ntp_loss: 0.7429 [37mglobal_avg_top_loss: 1.9131
+[titan] 2025-09-09 22:38:58,109 - root - INFO - [34mlr: 5.2038e-06 gnorm: 0.40 [35m[2 days, 5:03:29<20:04:30][39m
+[titan] 2025-09-09 22:39:30,483 - root - INFO - [31mstep: 29025 [32mloss: 2.6399 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,122 [36mtflops: 482.40 [35mmfu: 48.78%[39m [37mglobal_avg_ntp_loss: 0.7348 [37mglobal_avg_top_loss: 1.9052
+[titan] 2025-09-09 22:39:30,483 - root - INFO - [34mlr: 5.2011e-06 gnorm: 0.47 [35m[2 days, 5:04:01<20:03:57][39m
+[titan] 2025-09-09 22:40:02,312 - root - INFO - [31mstep: 29030 [32mloss: 2.7287 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.67 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7708 [37mglobal_avg_top_loss: 1.9578
+[titan] 2025-09-09 22:40:02,312 - root - INFO - [34mlr: 5.1984e-06 gnorm: 0.39 [35m[2 days, 5:04:33<20:03:23][39m
+[titan] 2025-09-09 22:40:34,212 - root - INFO - [31mstep: 29035 [32mloss: 3.6279 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.57 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 1.2845 [37mglobal_avg_top_loss: 2.3435
+[titan] 2025-09-09 22:40:34,212 - root - INFO - [34mlr: 5.1956e-06 gnorm: 0.38 [35m[2 days, 5:05:05<20:02:50][39m
+[titan] 2025-09-09 22:41:06,176 - root - INFO - [31mstep: 29040 [32mloss: 2.6558 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,252 [36mtflops: 488.59 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7384 [37mglobal_avg_top_loss: 1.9174
+[titan] 2025-09-09 22:41:06,177 - root - INFO - [34mlr: 5.1929e-06 gnorm: 0.40 [35m[2 days, 5:05:37<20:02:17][39m
+[titan] 2025-09-09 22:41:38,478 - root - INFO - [31mstep: 29045 [32mloss: 2.6775 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,145 [36mtflops: 483.49 [35mmfu: 48.89%[39m [37mglobal_avg_ntp_loss: 0.7511 [37mglobal_avg_top_loss: 1.9264
+[titan] 2025-09-09 22:41:38,478 - root - INFO - [34mlr: 5.1902e-06 gnorm: 0.38 [35m[2 days, 5:06:09<20:01:44][39m
+[titan] 2025-09-09 22:42:03,931 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:42:10,293 - root - INFO - [31mstep: 29050 [32mloss: 2.6216 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,300 [36mtflops: 490.88 [35mmfu: 49.63%[39m [37mglobal_avg_ntp_loss: 0.7192 [37mglobal_avg_top_loss: 1.9024
+[titan] 2025-09-09 22:42:10,293 - root - INFO - [34mlr: 5.1875e-06 gnorm: 0.39 [35m[2 days, 5:06:41<20:01:10][39m
+[titan] 2025-09-09 22:42:42,442 - root - INFO - [31mstep: 29055 [32mloss: 2.6030 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,193 [36mtflops: 485.78 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7124 [37mglobal_avg_top_loss: 1.8906
+[titan] 2025-09-09 22:42:42,443 - root - INFO - [34mlr: 5.1847e-06 gnorm: 0.40 [35m[2 days, 5:07:13<20:00:37][39m
+[titan] 2025-09-09 22:43:14,432 - root - INFO - [31mstep: 29060 [32mloss: 2.6507 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.20 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7355 [37mglobal_avg_top_loss: 1.9152
+[titan] 2025-09-09 22:43:14,432 - root - INFO - [34mlr: 5.1820e-06 gnorm: 0.39 [35m[2 days, 5:07:45<20:00:04][39m
+[titan] 2025-09-09 22:43:46,262 - root - INFO - [31mstep: 29065 [32mloss: 2.6855 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,295 [36mtflops: 490.65 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7500 [37mglobal_avg_top_loss: 1.9355
+[titan] 2025-09-09 22:43:46,262 - root - INFO - [34mlr: 5.1793e-06 gnorm: 0.39 [35m[2 days, 5:08:17<19:59:30][39m
+[titan] 2025-09-09 22:44:18,373 - root - INFO - [31mstep: 29070 [32mloss: 2.9020 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.36 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.8524 [37mglobal_avg_top_loss: 2.0496
+[titan] 2025-09-09 22:44:18,374 - root - INFO - [34mlr: 5.1766e-06 gnorm: 0.42 [35m[2 days, 5:08:49<19:58:57][39m
+[titan] 2025-09-09 22:44:50,386 - root - INFO - [31mstep: 29075 [32mloss: 2.7733 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.86 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7906 [37mglobal_avg_top_loss: 1.9827
+[titan] 2025-09-09 22:44:50,386 - root - INFO - [34mlr: 5.1738e-06 gnorm: 0.40 [35m[2 days, 5:09:21<19:58:24][39m
+[titan] 2025-09-09 22:45:22,384 - root - INFO - [31mstep: 29080 [32mloss: 2.7112 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.08 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7622 [37mglobal_avg_top_loss: 1.9490
+[titan] 2025-09-09 22:45:22,384 - root - INFO - [34mlr: 5.1711e-06 gnorm: 0.40 [35m[2 days, 5:09:53<19:57:51][39m
+[titan] 2025-09-09 22:45:54,391 - root - INFO - [31mstep: 29085 [32mloss: 2.6443 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.93 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7329 [37mglobal_avg_top_loss: 1.9114
+[titan] 2025-09-09 22:45:54,392 - root - INFO - [34mlr: 5.1684e-06 gnorm: 0.37 [35m[2 days, 5:10:25<19:57:17][39m
+[titan] 2025-09-09 22:46:26,547 - root - INFO - [31mstep: 29090 [32mloss: 2.6211 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,191 [36mtflops: 485.68 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 0.7190 [37mglobal_avg_top_loss: 1.9021
+[titan] 2025-09-09 22:46:26,547 - root - INFO - [34mlr: 5.1657e-06 gnorm: 0.41 [35m[2 days, 5:10:57<19:56:44][39m
+[titan] 2025-09-09 22:46:58,590 - root - INFO - [31mstep: 29095 [32mloss: 3.2135 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.39 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 1.0503 [37mglobal_avg_top_loss: 2.1632
+[titan] 2025-09-09 22:46:58,590 - root - INFO - [34mlr: 5.1630e-06 gnorm: 0.38 [35m[2 days, 5:11:29<19:56:11][39m
+[titan] 2025-09-09 22:47:24,379 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:47:30,777 - root - INFO - [31mstep: 29100 [32mloss: 2.5813 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,181 [36mtflops: 485.20 [35mmfu: 49.06%[39m [37mglobal_avg_ntp_loss: 0.7079 [37mglobal_avg_top_loss: 1.8733
+[titan] 2025-09-09 22:47:30,778 - root - INFO - [34mlr: 5.1602e-06 gnorm: 0.36 [35m[2 days, 5:12:01<19:55:38][39m
+[titan] 2025-09-09 22:48:02,654 - root - INFO - [31mstep: 29105 [32mloss: 2.6240 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.94 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7261 [37mglobal_avg_top_loss: 1.8979
+[titan] 2025-09-09 22:48:02,654 - root - INFO - [34mlr: 5.1575e-06 gnorm: 0.43 [35m[2 days, 5:12:33<19:55:05][39m
+[titan] 2025-09-09 22:48:34,748 - root - INFO - [31mstep: 29110 [32mloss: 2.6238 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,210 [36mtflops: 486.61 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 0.7265 [37mglobal_avg_top_loss: 1.8973
+[titan] 2025-09-09 22:48:34,749 - root - INFO - [34mlr: 5.1548e-06 gnorm: 0.39 [35m[2 days, 5:13:05<19:54:31][39m
+[titan] 2025-09-09 22:49:06,774 - root - INFO - [31mstep: 29115 [32mloss: 3.0516 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,232 [36mtflops: 487.65 [35mmfu: 49.31%[39m [37mglobal_avg_ntp_loss: 0.9731 [37mglobal_avg_top_loss: 2.0786
+[titan] 2025-09-09 22:49:06,775 - root - INFO - [34mlr: 5.1521e-06 gnorm: 0.37 [35m[2 days, 5:13:37<19:53:58][39m
+[titan] 2025-09-09 22:49:38,841 - root - INFO - [31mstep: 29120 [32mloss: 2.7758 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.03 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7954 [37mglobal_avg_top_loss: 1.9804
+[titan] 2025-09-09 22:49:38,841 - root - INFO - [34mlr: 5.1494e-06 gnorm: 0.41 [35m[2 days, 5:14:09<19:53:25][39m
+[titan] 2025-09-09 22:50:10,762 - root - INFO - [31mstep: 29125 [32mloss: 2.6811 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,266 [36mtflops: 489.25 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7535 [37mglobal_avg_top_loss: 1.9276
+[titan] 2025-09-09 22:50:10,763 - root - INFO - [34mlr: 5.1467e-06 gnorm: 0.41 [35m[2 days, 5:14:41<19:52:52][39m
+[titan] 2025-09-09 22:50:42,971 - root - INFO - [31mstep: 29130 [32mloss: 2.6952 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,174 [36mtflops: 484.88 [35mmfu: 49.03%[39m [37mglobal_avg_ntp_loss: 0.7592 [37mglobal_avg_top_loss: 1.9360
+[titan] 2025-09-09 22:50:42,972 - root - INFO - [34mlr: 5.1440e-06 gnorm: 0.38 [35m[2 days, 5:15:13<19:52:18][39m
+[titan] 2025-09-09 22:51:15,063 - root - INFO - [31mstep: 29135 [32mloss: 2.7569 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.66 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7881 [37mglobal_avg_top_loss: 1.9688
+[titan] 2025-09-09 22:51:15,063 - root - INFO - [34mlr: 5.1413e-06 gnorm: 0.39 [35m[2 days, 5:15:45<19:51:45][39m
+[titan] 2025-09-09 22:51:47,021 - root - INFO - [31mstep: 29140 [32mloss: 2.7331 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.68 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7839 [37mglobal_avg_top_loss: 1.9492
+[titan] 2025-09-09 22:51:47,022 - root - INFO - [34mlr: 5.1385e-06 gnorm: 0.39 [35m[2 days, 5:16:17<19:51:12][39m
+[titan] 2025-09-09 22:52:19,111 - root - INFO - [31mstep: 29145 [32mloss: 2.7940 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.68 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.8079 [37mglobal_avg_top_loss: 1.9861
+[titan] 2025-09-09 22:52:19,112 - root - INFO - [34mlr: 5.1358e-06 gnorm: 0.41 [35m[2 days, 5:16:50<19:50:39][39m
+[titan] 2025-09-09 22:52:44,647 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:52:51,045 - root - INFO - [31mstep: 29150 [32mloss: 2.9851 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,262 [36mtflops: 489.06 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.8898 [37mglobal_avg_top_loss: 2.0953
+[titan] 2025-09-09 22:52:51,045 - root - INFO - [34mlr: 5.1331e-06 gnorm: 0.40 [35m[2 days, 5:17:21<19:50:06][39m
+[titan] 2025-09-09 22:53:22,988 - root - INFO - [31mstep: 29155 [32mloss: 2.6518 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,258 [36mtflops: 488.91 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7328 [37mglobal_avg_top_loss: 1.9191
+[titan] 2025-09-09 22:53:22,989 - root - INFO - [34mlr: 5.1304e-06 gnorm: 0.40 [35m[2 days, 5:17:53<19:49:32][39m
+[titan] 2025-09-09 22:53:54,930 - root - INFO - [31mstep: 29160 [32mloss: 2.8450 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.93 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8424 [37mglobal_avg_top_loss: 2.0026
+[titan] 2025-09-09 22:53:54,931 - root - INFO - [34mlr: 5.1277e-06 gnorm: 0.40 [35m[2 days, 5:18:25<19:48:59][39m
+[titan] 2025-09-09 22:54:26,964 - root - INFO - [31mstep: 29165 [32mloss: 2.7427 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.53 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7772 [37mglobal_avg_top_loss: 1.9655
+[titan] 2025-09-09 22:54:26,965 - root - INFO - [34mlr: 5.1250e-06 gnorm: 0.42 [35m[2 days, 5:18:57<19:48:26][39m
+[titan] 2025-09-09 22:54:59,096 - root - INFO - [31mstep: 29170 [32mloss: 2.7462 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,199 [36mtflops: 486.06 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7783 [37mglobal_avg_top_loss: 1.9678
+[titan] 2025-09-09 22:54:59,096 - root - INFO - [34mlr: 5.1223e-06 gnorm: 0.38 [35m[2 days, 5:19:30<19:47:53][39m
+[titan] 2025-09-09 22:55:30,853 - root - INFO - [31mstep: 29175 [32mloss: 2.7078 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,319 [36mtflops: 491.78 [35mmfu: 49.72%[39m [37mglobal_avg_ntp_loss: 0.7600 [37mglobal_avg_top_loss: 1.9478
+[titan] 2025-09-09 22:55:30,853 - root - INFO - [34mlr: 5.1196e-06 gnorm: 0.40 [35m[2 days, 5:20:01<19:47:19][39m
+[titan] 2025-09-09 22:56:02,956 - root - INFO - [31mstep: 29180 [32mloss: 2.7188 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,208 [36mtflops: 486.48 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7658 [37mglobal_avg_top_loss: 1.9529
+[titan] 2025-09-09 22:56:02,956 - root - INFO - [34mlr: 5.1169e-06 gnorm: 0.38 [35m[2 days, 5:20:33<19:46:46][39m
+[titan] 2025-09-09 22:56:28,921 - root - INFO - Dumping profiler traces at step 29184
+[titan] 2025-09-09 22:56:28,992 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 22:56:35,311 - root - INFO - [31mstep: 29185 [32mloss: 2.6094 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,128 [36mtflops: 482.68 [35mmfu: 48.81%[39m [37mglobal_avg_ntp_loss: 0.7306 [37mglobal_avg_top_loss: 1.8788
+[titan] 2025-09-09 22:56:35,311 - root - INFO - [34mlr: 5.1142e-06 gnorm: 0.48 [35m[2 days, 5:21:06<19:46:13][39m
+[titan] 2025-09-09 22:57:07,091 - root - INFO - [31mstep: 29190 [32mloss: 2.6938 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,311 [36mtflops: 491.43 [35mmfu: 49.69%[39m [37mglobal_avg_ntp_loss: 0.7530 [37mglobal_avg_top_loss: 1.9408
+[titan] 2025-09-09 22:57:07,091 - root - INFO - [34mlr: 5.1115e-06 gnorm: 0.38 [35m[2 days, 5:21:37<19:45:40][39m
+[titan] 2025-09-09 22:57:38,985 - root - INFO - [31mstep: 29195 [32mloss: 3.1577 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.66 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 1.0210 [37mglobal_avg_top_loss: 2.1367
+[titan] 2025-09-09 22:57:38,986 - root - INFO - [34mlr: 5.1088e-06 gnorm: 0.38 [35m[2 days, 5:22:09<19:45:06][39m
+[titan] 2025-09-09 22:58:04,660 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 22:58:11,054 - root - INFO - [31mstep: 29200 [32mloss: 2.6686 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,218 [36mtflops: 486.99 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7422 [37mglobal_avg_top_loss: 1.9263
+[titan] 2025-09-09 22:58:11,055 - root - INFO - [34mlr: 5.1061e-06 gnorm: 0.39 [35m[2 days, 5:22:41<19:44:33][39m
+[titan] 2025-09-09 22:58:42,842 - root - INFO - [31mstep: 29205 [32mloss: 2.6944 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,309 [36mtflops: 491.30 [35mmfu: 49.68%[39m [37mglobal_avg_ntp_loss: 0.7533 [37mglobal_avg_top_loss: 1.9411
+[titan] 2025-09-09 22:58:42,842 - root - INFO - [34mlr: 5.1034e-06 gnorm: 0.39 [35m[2 days, 5:23:13<19:44:00][39m
+[titan] 2025-09-09 22:59:14,821 - root - INFO - [31mstep: 29210 [32mloss: 2.6511 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.36 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7349 [37mglobal_avg_top_loss: 1.9162
+[titan] 2025-09-09 22:59:14,821 - root - INFO - [34mlr: 5.1007e-06 gnorm: 0.39 [35m[2 days, 5:23:45<19:43:27][39m
+[titan] 2025-09-09 22:59:46,629 - root - INFO - [31mstep: 29215 [32mloss: 2.7013 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,302 [36mtflops: 490.99 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7585 [37mglobal_avg_top_loss: 1.9429
+[titan] 2025-09-09 22:59:46,630 - root - INFO - [34mlr: 5.0980e-06 gnorm: 0.39 [35m[2 days, 5:24:17<19:42:53][39m
+[titan] 2025-09-09 23:00:18,596 - root - INFO - [31mstep: 29220 [32mloss: 2.6984 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,251 [36mtflops: 488.55 [35mmfu: 49.40%[39m [37mglobal_avg_ntp_loss: 0.7608 [37mglobal_avg_top_loss: 1.9376
+[titan] 2025-09-09 23:00:18,597 - root - INFO - [34mlr: 5.0953e-06 gnorm: 0.39 [35m[2 days, 5:24:49<19:42:20][39m
+[titan] 2025-09-09 23:00:50,664 - root - INFO - [31mstep: 29225 [32mloss: 2.6469 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.02 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7335 [37mglobal_avg_top_loss: 1.9134
+[titan] 2025-09-09 23:00:50,664 - root - INFO - [34mlr: 5.0926e-06 gnorm: 0.38 [35m[2 days, 5:25:21<19:41:47][39m
+[titan] 2025-09-09 23:01:22,703 - root - INFO - [31mstep: 29230 [32mloss: 2.6773 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.45 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7482 [37mglobal_avg_top_loss: 1.9290
+[titan] 2025-09-09 23:01:22,703 - root - INFO - [34mlr: 5.0899e-06 gnorm: 0.40 [35m[2 days, 5:25:53<19:41:14][39m
+[titan] 2025-09-09 23:01:54,652 - root - INFO - [31mstep: 29235 [32mloss: 2.7078 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.82 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7596 [37mglobal_avg_top_loss: 1.9482
+[titan] 2025-09-09 23:01:54,653 - root - INFO - [34mlr: 5.0872e-06 gnorm: 0.41 [35m[2 days, 5:26:25<19:40:40][39m
+[titan] 2025-09-09 23:02:26,593 - root - INFO - [31mstep: 29240 [32mloss: 2.6131 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.95 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7177 [37mglobal_avg_top_loss: 1.8954
+[titan] 2025-09-09 23:02:26,594 - root - INFO - [34mlr: 5.0846e-06 gnorm: 0.39 [35m[2 days, 5:26:57<19:40:07][39m
+[titan] 2025-09-09 23:02:58,597 - root - INFO - [31mstep: 29245 [32mloss: 2.7600 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.99 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7883 [37mglobal_avg_top_loss: 1.9716
+[titan] 2025-09-09 23:02:58,598 - root - INFO - [34mlr: 5.0819e-06 gnorm: 0.38 [35m[2 days, 5:27:29<19:39:34][39m
+[titan] 2025-09-09 23:03:24,208 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:03:30,592 - root - INFO - [31mstep: 29250 [32mloss: 2.6628 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.12 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7411 [37mglobal_avg_top_loss: 1.9217
+[titan] 2025-09-09 23:03:30,592 - root - INFO - [34mlr: 5.0792e-06 gnorm: 0.38 [35m[2 days, 5:28:01<19:39:01][39m
+[titan] 2025-09-09 23:04:02,496 - root - INFO - [31mstep: 29255 [32mloss: 2.7456 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.52 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7792 [37mglobal_avg_top_loss: 1.9664
+[titan] 2025-09-09 23:04:02,496 - root - INFO - [34mlr: 5.0765e-06 gnorm: 0.39 [35m[2 days, 5:28:33<19:38:27][39m
+[titan] 2025-09-09 23:04:34,402 - root - INFO - [31mstep: 29260 [32mloss: 2.7025 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,270 [36mtflops: 489.48 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7557 [37mglobal_avg_top_loss: 1.9468
+[titan] 2025-09-09 23:04:34,403 - root - INFO - [34mlr: 5.0738e-06 gnorm: 0.39 [35m[2 days, 5:29:05<19:37:54][39m
+[titan] 2025-09-09 23:05:06,379 - root - INFO - [31mstep: 29265 [32mloss: 2.6506 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,248 [36mtflops: 488.41 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7370 [37mglobal_avg_top_loss: 1.9136
+[titan] 2025-09-09 23:05:06,379 - root - INFO - [34mlr: 5.0711e-06 gnorm: 0.38 [35m[2 days, 5:29:37<19:37:21][39m
+[titan] 2025-09-09 23:05:38,292 - root - INFO - [31mstep: 29270 [32mloss: 2.7339 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,268 [36mtflops: 489.37 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7735 [37mglobal_avg_top_loss: 1.9604
+[titan] 2025-09-09 23:05:38,293 - root - INFO - [34mlr: 5.0684e-06 gnorm: 0.39 [35m[2 days, 5:30:09<19:36:48][39m
+[titan] 2025-09-09 23:06:10,247 - root - INFO - [31mstep: 29275 [32mloss: 2.6792 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,255 [36mtflops: 488.73 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7468 [37mglobal_avg_top_loss: 1.9324
+[titan] 2025-09-09 23:06:10,248 - root - INFO - [34mlr: 5.0657e-06 gnorm: 0.41 [35m[2 days, 5:30:41<19:36:14][39m
+[titan] 2025-09-09 23:06:42,444 - root - INFO - [31mstep: 29280 [32mloss: 2.7697 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.07 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7903 [37mglobal_avg_top_loss: 1.9795
+[titan] 2025-09-09 23:06:42,444 - root - INFO - [34mlr: 5.0631e-06 gnorm: 0.38 [35m[2 days, 5:31:13<19:35:41][39m
+[titan] 2025-09-09 23:07:14,579 - root - INFO - [31mstep: 29285 [32mloss: 2.7344 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,197 [36mtflops: 486.00 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.7738 [37mglobal_avg_top_loss: 1.9606
+[titan] 2025-09-09 23:07:14,579 - root - INFO - [34mlr: 5.0604e-06 gnorm: 0.41 [35m[2 days, 5:31:45<19:35:08][39m
+[titan] 2025-09-09 23:07:46,617 - root - INFO - [31mstep: 29290 [32mloss: 2.6415 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.45 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7295 [37mglobal_avg_top_loss: 1.9121
+[titan] 2025-09-09 23:07:46,618 - root - INFO - [34mlr: 5.0577e-06 gnorm: 0.38 [35m[2 days, 5:32:17<19:34:35][39m
+[titan] 2025-09-09 23:08:18,621 - root - INFO - [31mstep: 29295 [32mloss: 2.6646 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 488.00 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7405 [37mglobal_avg_top_loss: 1.9241
+[titan] 2025-09-09 23:08:18,621 - root - INFO - [34mlr: 5.0550e-06 gnorm: 0.46 [35m[2 days, 5:32:49<19:34:01][39m
+[titan] 2025-09-09 23:08:44,164 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:08:50,538 - root - INFO - [31mstep: 29300 [32mloss: 2.7990 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.31 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7976 [37mglobal_avg_top_loss: 2.0014
+[titan] 2025-09-09 23:08:50,539 - root - INFO - [34mlr: 5.0523e-06 gnorm: 0.39 [35m[2 days, 5:33:21<19:33:28][39m
+[titan] 2025-09-09 23:09:22,323 - root - INFO - [31mstep: 29305 [32mloss: 2.6830 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,310 [36mtflops: 491.35 [35mmfu: 49.68%[39m [37mglobal_avg_ntp_loss: 0.7503 [37mglobal_avg_top_loss: 1.9327
+[titan] 2025-09-09 23:09:22,323 - root - INFO - [34mlr: 5.0497e-06 gnorm: 0.39 [35m[2 days, 5:33:53<19:32:55][39m
+[titan] 2025-09-09 23:09:54,600 - root - INFO - [31mstep: 29310 [32mloss: 2.7757 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,152 [36mtflops: 483.85 [35mmfu: 48.92%[39m [37mglobal_avg_ntp_loss: 0.7954 [37mglobal_avg_top_loss: 1.9802
+[titan] 2025-09-09 23:09:54,600 - root - INFO - [34mlr: 5.0470e-06 gnorm: 0.40 [35m[2 days, 5:34:25<19:32:22][39m
+[titan] 2025-09-09 23:10:26,550 - root - INFO - [31mstep: 29315 [32mloss: 2.7023 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,256 [36mtflops: 488.80 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7604 [37mglobal_avg_top_loss: 1.9418
+[titan] 2025-09-09 23:10:26,551 - root - INFO - [34mlr: 5.0443e-06 gnorm: 0.38 [35m[2 days, 5:34:57<19:31:49][39m
+[titan] 2025-09-09 23:10:58,510 - root - INFO - [31mstep: 29320 [32mloss: 2.5561 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,253 [36mtflops: 488.66 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.6911 [37mglobal_avg_top_loss: 1.8650
+[titan] 2025-09-09 23:10:58,510 - root - INFO - [34mlr: 5.0416e-06 gnorm: 0.38 [35m[2 days, 5:35:29<19:31:15][39m
+[titan] 2025-09-09 23:11:30,391 - root - INFO - [31mstep: 29325 [32mloss: 2.6740 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,279 [36mtflops: 489.87 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7473 [37mglobal_avg_top_loss: 1.9267
+[titan] 2025-09-09 23:11:30,391 - root - INFO - [34mlr: 5.0389e-06 gnorm: 0.39 [35m[2 days, 5:36:01<19:30:42][39m
+[titan] 2025-09-09 23:12:02,294 - root - INFO - [31mstep: 29330 [32mloss: 2.7124 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.53 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7615 [37mglobal_avg_top_loss: 1.9509
+[titan] 2025-09-09 23:12:02,294 - root - INFO - [34mlr: 5.0363e-06 gnorm: 0.38 [35m[2 days, 5:36:33<19:30:09][39m
+[titan] 2025-09-09 23:12:34,251 - root - INFO - [31mstep: 29335 [32mloss: 2.6808 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.69 [35mmfu: 49.41%[39m [37mglobal_avg_ntp_loss: 0.7482 [37mglobal_avg_top_loss: 1.9326
+[titan] 2025-09-09 23:12:34,252 - root - INFO - [34mlr: 5.0336e-06 gnorm: 0.40 [35m[2 days, 5:37:05<19:29:35][39m
+[titan] 2025-09-09 23:13:06,286 - root - INFO - [31mstep: 29340 [32mloss: 2.6380 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.52 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7297 [37mglobal_avg_top_loss: 1.9083
+[titan] 2025-09-09 23:13:06,286 - root - INFO - [34mlr: 5.0309e-06 gnorm: 0.38 [35m[2 days, 5:37:37<19:29:02][39m
+[titan] 2025-09-09 23:13:38,327 - root - INFO - [31mstep: 29345 [32mloss: 2.7049 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.42 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7606 [37mglobal_avg_top_loss: 1.9443
+[titan] 2025-09-09 23:13:38,327 - root - INFO - [34mlr: 5.0283e-06 gnorm: 0.38 [35m[2 days, 5:38:09<19:28:29][39m
+[titan] 2025-09-09 23:14:03,863 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:14:10,283 - root - INFO - [31mstep: 29350 [32mloss: 2.6685 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.72 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7434 [37mglobal_avg_top_loss: 1.9251
+[titan] 2025-09-09 23:14:10,284 - root - INFO - [34mlr: 5.0256e-06 gnorm: 0.39 [35m[2 days, 5:38:41<19:27:56][39m
+[titan] 2025-09-09 23:14:42,167 - root - INFO - [31mstep: 29355 [32mloss: 2.7210 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,278 [36mtflops: 489.82 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7705 [37mglobal_avg_top_loss: 1.9505
+[titan] 2025-09-09 23:14:42,167 - root - INFO - [34mlr: 5.0229e-06 gnorm: 0.39 [35m[2 days, 5:39:13<19:27:23][39m
+[titan] 2025-09-09 23:15:14,043 - root - INFO - [31mstep: 29360 [32mloss: 2.6904 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,280 [36mtflops: 489.95 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7531 [37mglobal_avg_top_loss: 1.9373
+[titan] 2025-09-09 23:15:14,043 - root - INFO - [34mlr: 5.0203e-06 gnorm: 0.38 [35m[2 days, 5:39:44<19:26:49][39m
+[titan] 2025-09-09 23:15:46,334 - root - INFO - [31mstep: 29365 [32mloss: 2.6537 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,148 [36mtflops: 483.64 [35mmfu: 48.90%[39m [37mglobal_avg_ntp_loss: 0.7375 [37mglobal_avg_top_loss: 1.9161
+[titan] 2025-09-09 23:15:46,334 - root - INFO - [34mlr: 5.0176e-06 gnorm: 0.41 [35m[2 days, 5:40:17<19:26:16][39m
+[titan] 2025-09-09 23:16:18,020 - root - INFO - [31mstep: 29370 [32mloss: 2.7244 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,342 [36mtflops: 492.89 [35mmfu: 49.84%[39m [37mglobal_avg_ntp_loss: 0.7680 [37mglobal_avg_top_loss: 1.9564
+[titan] 2025-09-09 23:16:18,020 - root - INFO - [34mlr: 5.0149e-06 gnorm: 0.40 [35m[2 days, 5:40:48<19:25:43][39m
+[titan] 2025-09-09 23:16:50,166 - root - INFO - [31mstep: 29375 [32mloss: 2.7198 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,194 [36mtflops: 485.82 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7676 [37mglobal_avg_top_loss: 1.9521
+[titan] 2025-09-09 23:16:50,167 - root - INFO - [34mlr: 5.0123e-06 gnorm: 0.38 [35m[2 days, 5:41:21<19:25:10][39m
+[titan] 2025-09-09 23:17:22,259 - root - INFO - [31mstep: 29380 [32mloss: 2.5785 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.64 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7027 [37mglobal_avg_top_loss: 1.8758
+[titan] 2025-09-09 23:17:22,259 - root - INFO - [34mlr: 5.0096e-06 gnorm: 0.39 [35m[2 days, 5:41:53<19:24:36][39m
+[titan] 2025-09-09 23:17:54,068 - root - INFO - [31mstep: 29385 [32mloss: 2.7280 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,302 [36mtflops: 490.96 [35mmfu: 49.64%[39m [37mglobal_avg_ntp_loss: 0.7727 [37mglobal_avg_top_loss: 1.9553
+[titan] 2025-09-09 23:17:54,069 - root - INFO - [34mlr: 5.0069e-06 gnorm: 0.39 [35m[2 days, 5:42:24<19:24:03][39m
+[titan] 2025-09-09 23:18:25,942 - root - INFO - [31mstep: 29390 [32mloss: 2.6867 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.98 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7522 [37mglobal_avg_top_loss: 1.9344
+[titan] 2025-09-09 23:18:25,943 - root - INFO - [34mlr: 5.0043e-06 gnorm: 0.39 [35m[2 days, 5:42:56<19:23:30][39m
+[titan] 2025-09-09 23:18:58,087 - root - INFO - [31mstep: 29395 [32mloss: 2.5905 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,194 [36mtflops: 485.85 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7117 [37mglobal_avg_top_loss: 1.8787
+[titan] 2025-09-09 23:18:58,087 - root - INFO - [34mlr: 5.0016e-06 gnorm: 0.38 [35m[2 days, 5:43:28<19:22:57][39m
+[titan] 2025-09-09 23:19:23,629 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:19:30,067 - root - INFO - [31mstep: 29400 [32mloss: 2.6546 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,247 [36mtflops: 488.35 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7355 [37mglobal_avg_top_loss: 1.9192
+[titan] 2025-09-09 23:19:30,067 - root - INFO - [34mlr: 4.9989e-06 gnorm: 0.39 [35m[2 days, 5:44:00<19:22:24][39m
+[titan] 2025-09-09 23:20:02,218 - root - INFO - [31mstep: 29405 [32mloss: 2.7080 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,192 [36mtflops: 485.75 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7621 [37mglobal_avg_top_loss: 1.9458
+[titan] 2025-09-09 23:20:02,218 - root - INFO - [34mlr: 4.9963e-06 gnorm: 0.41 [35m[2 days, 5:44:33<19:21:50][39m
+[titan] 2025-09-09 23:20:34,210 - root - INFO - [31mstep: 29410 [32mloss: 2.7444 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.16 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7791 [37mglobal_avg_top_loss: 1.9653
+[titan] 2025-09-09 23:20:34,210 - root - INFO - [34mlr: 4.9936e-06 gnorm: 0.41 [35m[2 days, 5:45:05<19:21:17][39m
+[titan] 2025-09-09 23:21:06,547 - root - INFO - [31mstep: 29415 [32mloss: 2.7118 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,134 [36mtflops: 482.96 [35mmfu: 48.83%[39m [37mglobal_avg_ntp_loss: 0.7672 [37mglobal_avg_top_loss: 1.9445
+[titan] 2025-09-09 23:21:06,547 - root - INFO - [34mlr: 4.9910e-06 gnorm: 0.40 [35m[2 days, 5:45:37<19:20:44][39m
+[titan] 2025-09-09 23:21:38,636 - root - INFO - [31mstep: 29420 [32mloss: 2.7307 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,212 [36mtflops: 486.69 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7686 [37mglobal_avg_top_loss: 1.9621
+[titan] 2025-09-09 23:21:38,636 - root - INFO - [34mlr: 4.9883e-06 gnorm: 0.40 [35m[2 days, 5:46:09<19:20:11][39m
+[titan] 2025-09-09 23:22:10,683 - root - INFO - [31mstep: 29425 [32mloss: 2.7485 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.32 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7819 [37mglobal_avg_top_loss: 1.9666
+[titan] 2025-09-09 23:22:10,684 - root - INFO - [34mlr: 4.9856e-06 gnorm: 0.39 [35m[2 days, 5:46:41<19:19:38][39m
+[titan] 2025-09-09 23:22:42,613 - root - INFO - [31mstep: 29430 [32mloss: 2.7111 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.12 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7616 [37mglobal_avg_top_loss: 1.9495
+[titan] 2025-09-09 23:22:42,614 - root - INFO - [34mlr: 4.9830e-06 gnorm: 0.39 [35m[2 days, 5:47:13<19:19:04][39m
+[titan] 2025-09-09 23:23:14,555 - root - INFO - [31mstep: 29435 [32mloss: 2.7984 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.93 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.8023 [37mglobal_avg_top_loss: 1.9961
+[titan] 2025-09-09 23:23:14,556 - root - INFO - [34mlr: 4.9803e-06 gnorm: 0.44 [35m[2 days, 5:47:45<19:18:31][39m
+[titan] 2025-09-09 23:23:46,631 - root - INFO - [31mstep: 29440 [32mloss: 2.7336 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.89 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7707 [37mglobal_avg_top_loss: 1.9629
+[titan] 2025-09-09 23:23:46,632 - root - INFO - [34mlr: 4.9777e-06 gnorm: 0.39 [35m[2 days, 5:48:17<19:17:58][39m
+[titan] 2025-09-09 23:24:18,631 - root - INFO - [31mstep: 29445 [32mloss: 2.6786 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.06 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7484 [37mglobal_avg_top_loss: 1.9302
+[titan] 2025-09-09 23:24:18,631 - root - INFO - [34mlr: 4.9750e-06 gnorm: 0.39 [35m[2 days, 5:48:49<19:17:25][39m
+[titan] 2025-09-09 23:24:44,351 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:24:50,723 - root - INFO - [31mstep: 29450 [32mloss: 2.6512 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.64 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7335 [37mglobal_avg_top_loss: 1.9177
+[titan] 2025-09-09 23:24:50,723 - root - INFO - [34mlr: 4.9724e-06 gnorm: 0.53 [35m[2 days, 5:49:21<19:16:52][39m
+[titan] 2025-09-09 23:25:22,863 - root - INFO - [31mstep: 29455 [32mloss: 2.6324 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,196 [36mtflops: 485.91 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7294 [37mglobal_avg_top_loss: 1.9030
+[titan] 2025-09-09 23:25:22,864 - root - INFO - [34mlr: 4.9697e-06 gnorm: 0.38 [35m[2 days, 5:49:53<19:16:18][39m
+[titan] 2025-09-09 23:25:54,872 - root - INFO - [31mstep: 29460 [32mloss: 2.6107 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.91 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7171 [37mglobal_avg_top_loss: 1.8937
+[titan] 2025-09-09 23:25:54,873 - root - INFO - [34mlr: 4.9671e-06 gnorm: 0.50 [35m[2 days, 5:50:25<19:15:45][39m
+[titan] 2025-09-09 23:26:26,822 - root - INFO - [31mstep: 29465 [32mloss: 2.7100 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,257 [36mtflops: 488.82 [35mmfu: 49.43%[39m [37mglobal_avg_ntp_loss: 0.7655 [37mglobal_avg_top_loss: 1.9444
+[titan] 2025-09-09 23:26:26,822 - root - INFO - [34mlr: 4.9644e-06 gnorm: 0.39 [35m[2 days, 5:50:57<19:15:12][39m
+[titan] 2025-09-09 23:26:59,115 - root - INFO - [31mstep: 29470 [32mloss: 2.8733 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,147 [36mtflops: 483.61 [35mmfu: 48.90%[39m [37mglobal_avg_ntp_loss: 0.8526 [37mglobal_avg_top_loss: 2.0207
+[titan] 2025-09-09 23:26:59,116 - root - INFO - [34mlr: 4.9618e-06 gnorm: 0.40 [35m[2 days, 5:51:29<19:14:39][39m
+[titan] 2025-09-09 23:27:31,112 - root - INFO - [31mstep: 29475 [32mloss: 2.6146 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.09 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7173 [37mglobal_avg_top_loss: 1.8973
+[titan] 2025-09-09 23:27:31,113 - root - INFO - [34mlr: 4.9591e-06 gnorm: 0.39 [35m[2 days, 5:52:01<19:14:06][39m
+[titan] 2025-09-09 23:28:03,055 - root - INFO - [31mstep: 29480 [32mloss: 2.7077 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.93 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7603 [37mglobal_avg_top_loss: 1.9475
+[titan] 2025-09-09 23:28:03,055 - root - INFO - [34mlr: 4.9565e-06 gnorm: 0.41 [35m[2 days, 5:52:33<19:13:32][39m
+[titan] 2025-09-09 23:28:34,949 - root - INFO - [31mstep: 29485 [32mloss: 2.6600 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,274 [36mtflops: 489.66 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7423 [37mglobal_avg_top_loss: 1.9177
+[titan] 2025-09-09 23:28:34,949 - root - INFO - [34mlr: 4.9538e-06 gnorm: 0.39 [35m[2 days, 5:53:05<19:12:59][39m
+[titan] 2025-09-09 23:29:06,990 - root - INFO - [31mstep: 29490 [32mloss: 2.7052 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,227 [36mtflops: 487.43 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7591 [37mglobal_avg_top_loss: 1.9461
+[titan] 2025-09-09 23:29:06,990 - root - INFO - [34mlr: 4.9512e-06 gnorm: 0.43 [35m[2 days, 5:53:37<19:12:26][39m
+[titan] 2025-09-09 23:29:39,008 - root - INFO - [31mstep: 29495 [32mloss: 2.6494 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.76 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7352 [37mglobal_avg_top_loss: 1.9142
+[titan] 2025-09-09 23:29:39,009 - root - INFO - [34mlr: 4.9486e-06 gnorm: 0.38 [35m[2 days, 5:54:09<19:11:53][39m
+[titan] 2025-09-09 23:30:04,501 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:30:10,900 - root - INFO - [31mstep: 29500 [32mloss: 2.6307 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,275 [36mtflops: 489.70 [35mmfu: 49.51%[39m [37mglobal_avg_ntp_loss: 0.7293 [37mglobal_avg_top_loss: 1.9014
+[titan] 2025-09-09 23:30:10,901 - root - INFO - [34mlr: 4.9459e-06 gnorm: 0.37 [35m[2 days, 5:54:41<19:11:19][39m
+[titan] 2025-09-09 23:30:42,669 - root - INFO - [31mstep: 29505 [32mloss: 2.6899 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,315 [36mtflops: 491.60 [35mmfu: 49.71%[39m [37mglobal_avg_ntp_loss: 0.7533 [37mglobal_avg_top_loss: 1.9365
+[titan] 2025-09-09 23:30:42,670 - root - INFO - [34mlr: 4.9433e-06 gnorm: 0.40 [35m[2 days, 5:55:13<19:10:46][39m
+[titan] 2025-09-09 23:31:14,714 - root - INFO - [31mstep: 29510 [32mloss: 2.7504 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.36 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7848 [37mglobal_avg_top_loss: 1.9656
+[titan] 2025-09-09 23:31:14,714 - root - INFO - [34mlr: 4.9406e-06 gnorm: 0.38 [35m[2 days, 5:55:45<19:10:13][39m
+[titan] 2025-09-09 23:31:46,701 - root - INFO - [31mstep: 29515 [32mloss: 2.6921 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.24 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7494 [37mglobal_avg_top_loss: 1.9427
+[titan] 2025-09-09 23:31:46,702 - root - INFO - [34mlr: 4.9380e-06 gnorm: 0.39 [35m[2 days, 5:56:17<19:09:40][39m
+[titan] 2025-09-09 23:32:18,822 - root - INFO - [31mstep: 29520 [32mloss: 2.6763 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,202 [36mtflops: 486.20 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.7621 [37mglobal_avg_top_loss: 1.9143
+[titan] 2025-09-09 23:32:18,823 - root - INFO - [34mlr: 4.9354e-06 gnorm: 0.39 [35m[2 days, 5:56:49<19:09:07][39m
+[titan] 2025-09-09 23:32:50,838 - root - INFO - [31mstep: 29525 [32mloss: 3.0576 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.81 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.9634 [37mglobal_avg_top_loss: 2.0942
+[titan] 2025-09-09 23:32:50,838 - root - INFO - [34mlr: 4.9327e-06 gnorm: 0.39 [35m[2 days, 5:57:21<19:08:33][39m
+[titan] 2025-09-09 23:33:22,842 - root - INFO - [31mstep: 29530 [32mloss: 2.6854 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.99 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7522 [37mglobal_avg_top_loss: 1.9332
+[titan] 2025-09-09 23:33:22,842 - root - INFO - [34mlr: 4.9301e-06 gnorm: 0.38 [35m[2 days, 5:57:53<19:08:00][39m
+[titan] 2025-09-09 23:33:54,772 - root - INFO - [31mstep: 29535 [32mloss: 2.6469 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,263 [36mtflops: 489.11 [35mmfu: 49.46%[39m [37mglobal_avg_ntp_loss: 0.7293 [37mglobal_avg_top_loss: 1.9177
+[titan] 2025-09-09 23:33:54,772 - root - INFO - [34mlr: 4.9274e-06 gnorm: 0.39 [35m[2 days, 5:58:25<19:07:27][39m
+[titan] 2025-09-09 23:34:26,818 - root - INFO - [31mstep: 29540 [32mloss: 2.6060 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.34 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7195 [37mglobal_avg_top_loss: 1.8864
+[titan] 2025-09-09 23:34:26,818 - root - INFO - [34mlr: 4.9248e-06 gnorm: 0.47 [35m[2 days, 5:58:57<19:06:54][39m
+[titan] 2025-09-09 23:34:58,816 - root - INFO - [31mstep: 29545 [32mloss: 2.6410 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,241 [36mtflops: 488.07 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7293 [37mglobal_avg_top_loss: 1.9117
+[titan] 2025-09-09 23:34:58,817 - root - INFO - [34mlr: 4.9222e-06 gnorm: 0.38 [35m[2 days, 5:59:29<19:06:20][39m
+[titan] 2025-09-09 23:35:24,538 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:35:30,941 - root - INFO - [31mstep: 29550 [32mloss: 2.7671 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,201 [36mtflops: 486.16 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.7879 [37mglobal_avg_top_loss: 1.9792
+[titan] 2025-09-09 23:35:30,941 - root - INFO - [34mlr: 4.9195e-06 gnorm: 0.45 [35m[2 days, 6:00:01<19:05:47][39m
+[titan] 2025-09-09 23:36:03,162 - root - INFO - [31mstep: 29555 [32mloss: 2.6948 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,170 [36mtflops: 484.68 [35mmfu: 49.01%[39m [37mglobal_avg_ntp_loss: 0.7547 [37mglobal_avg_top_loss: 1.9401
+[titan] 2025-09-09 23:36:03,163 - root - INFO - [34mlr: 4.9169e-06 gnorm: 0.40 [35m[2 days, 6:00:33<19:05:14][39m
+[titan] 2025-09-09 23:36:35,000 - root - INFO - [31mstep: 29560 [32mloss: 2.7432 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,293 [36mtflops: 490.54 [35mmfu: 49.60%[39m [37mglobal_avg_ntp_loss: 0.7774 [37mglobal_avg_top_loss: 1.9657
+[titan] 2025-09-09 23:36:35,000 - root - INFO - [34mlr: 4.9143e-06 gnorm: 0.39 [35m[2 days, 6:01:05<19:04:41][39m
+[titan] 2025-09-09 23:37:06,856 - root - INFO - [31mstep: 29565 [32mloss: 2.7192 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.25 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7649 [37mglobal_avg_top_loss: 1.9543
+[titan] 2025-09-09 23:37:06,856 - root - INFO - [34mlr: 4.9117e-06 gnorm: 0.39 [35m[2 days, 6:01:37<19:04:08][39m
+[titan] 2025-09-09 23:37:38,968 - root - INFO - [31mstep: 29570 [32mloss: 2.6684 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,205 [36mtflops: 486.34 [35mmfu: 49.18%[39m [37mglobal_avg_ntp_loss: 0.7420 [37mglobal_avg_top_loss: 1.9264
+[titan] 2025-09-09 23:37:38,968 - root - INFO - [34mlr: 4.9090e-06 gnorm: 0.38 [35m[2 days, 6:02:09<19:03:35][39m
+[titan] 2025-09-09 23:38:10,969 - root - INFO - [31mstep: 29575 [32mloss: 2.6341 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.02 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7259 [37mglobal_avg_top_loss: 1.9081
+[titan] 2025-09-09 23:38:10,970 - root - INFO - [34mlr: 4.9064e-06 gnorm: 0.38 [35m[2 days, 6:02:41<19:03:01][39m
+[titan] 2025-09-09 23:38:43,179 - root - INFO - [31mstep: 29580 [32mloss: 2.5590 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,174 [36mtflops: 484.86 [35mmfu: 49.03%[39m [37mglobal_avg_ntp_loss: 0.6934 [37mglobal_avg_top_loss: 1.8656
+[titan] 2025-09-09 23:38:43,180 - root - INFO - [34mlr: 4.9038e-06 gnorm: 0.40 [35m[2 days, 6:03:13<19:02:28][39m
+[titan] 2025-09-09 23:39:15,183 - root - INFO - [31mstep: 29585 [32mloss: 2.7235 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.99 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7690 [37mglobal_avg_top_loss: 1.9545
+[titan] 2025-09-09 23:39:15,184 - root - INFO - [34mlr: 4.9011e-06 gnorm: 0.39 [35m[2 days, 6:03:46<19:01:55][39m
+[titan] 2025-09-09 23:39:47,246 - root - INFO - [31mstep: 29590 [32mloss: 2.8727 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,220 [36mtflops: 487.09 [35mmfu: 49.25%[39m [37mglobal_avg_ntp_loss: 0.8667 [37mglobal_avg_top_loss: 2.0060
+[titan] 2025-09-09 23:39:47,246 - root - INFO - [34mlr: 4.8985e-06 gnorm: 0.39 [35m[2 days, 6:04:18<19:01:22][39m
+[titan] 2025-09-09 23:40:19,391 - root - INFO - [31mstep: 29595 [32mloss: 2.6904 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,194 [36mtflops: 485.84 [35mmfu: 49.12%[39m [37mglobal_avg_ntp_loss: 0.7493 [37mglobal_avg_top_loss: 1.9411
+[titan] 2025-09-09 23:40:19,391 - root - INFO - [34mlr: 4.8959e-06 gnorm: 0.39 [35m[2 days, 6:04:50<19:00:49][39m
+[titan] 2025-09-09 23:40:44,958 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:40:51,412 - root - INFO - [31mstep: 29600 [32mloss: 2.6792 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.73 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7478 [37mglobal_avg_top_loss: 1.9314
+[titan] 2025-09-09 23:40:51,412 - root - INFO - [34mlr: 4.8933e-06 gnorm: 0.39 [35m[2 days, 6:05:22<19:00:15][39m
+[titan] 2025-09-09 23:41:23,486 - root - INFO - [31mstep: 29605 [32mloss: 2.6825 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.92 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7484 [37mglobal_avg_top_loss: 1.9341
+[titan] 2025-09-09 23:41:23,486 - root - INFO - [34mlr: 4.8907e-06 gnorm: 0.40 [35m[2 days, 6:05:54<18:59:42][39m
+[titan] 2025-09-09 23:41:55,559 - root - INFO - [31mstep: 29610 [32mloss: 2.6834 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.94 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7484 [37mglobal_avg_top_loss: 1.9349
+[titan] 2025-09-09 23:41:55,559 - root - INFO - [34mlr: 4.8880e-06 gnorm: 0.41 [35m[2 days, 6:06:26<18:59:09][39m
+[titan] 2025-09-09 23:42:27,592 - root - INFO - [31mstep: 29615 [32mloss: 2.7094 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.53 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.7609 [37mglobal_avg_top_loss: 1.9485
+[titan] 2025-09-09 23:42:27,593 - root - INFO - [34mlr: 4.8854e-06 gnorm: 0.39 [35m[2 days, 6:06:58<18:58:36][39m
+[titan] 2025-09-09 23:42:59,640 - root - INFO - [31mstep: 29620 [32mloss: 3.0617 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.32 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.9728 [37mglobal_avg_top_loss: 2.0889
+[titan] 2025-09-09 23:42:59,641 - root - INFO - [34mlr: 4.8828e-06 gnorm: 0.39 [35m[2 days, 6:07:30<18:58:03][39m
+[titan] 2025-09-09 23:43:31,486 - root - INFO - [31mstep: 29625 [32mloss: 2.8192 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.41 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.8222 [37mglobal_avg_top_loss: 1.9970
+[titan] 2025-09-09 23:43:31,487 - root - INFO - [34mlr: 4.8802e-06 gnorm: 0.39 [35m[2 days, 6:08:02<18:57:29][39m
+[titan] 2025-09-09 23:44:03,370 - root - INFO - [31mstep: 29630 [32mloss: 2.6848 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,277 [36mtflops: 489.82 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7519 [37mglobal_avg_top_loss: 1.9330
+[titan] 2025-09-09 23:44:03,371 - root - INFO - [34mlr: 4.8776e-06 gnorm: 0.38 [35m[2 days, 6:08:34<18:56:56][39m
+[titan] 2025-09-09 23:44:35,405 - root - INFO - [31mstep: 29635 [32mloss: 2.6599 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.52 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7397 [37mglobal_avg_top_loss: 1.9202
+[titan] 2025-09-09 23:44:35,406 - root - INFO - [34mlr: 4.8749e-06 gnorm: 0.38 [35m[2 days, 6:09:06<18:56:23][39m
+[titan] 2025-09-09 23:45:07,457 - root - INFO - [31mstep: 29640 [32mloss: 2.7675 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,224 [36mtflops: 487.26 [35mmfu: 49.27%[39m [37mglobal_avg_ntp_loss: 0.7881 [37mglobal_avg_top_loss: 1.9794
+[titan] 2025-09-09 23:45:07,457 - root - INFO - [34mlr: 4.8723e-06 gnorm: 0.39 [35m[2 days, 6:09:38<18:55:50][39m
+[titan] 2025-09-09 23:45:39,587 - root - INFO - [31mstep: 29645 [32mloss: 2.6829 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,199 [36mtflops: 486.07 [35mmfu: 49.15%[39m [37mglobal_avg_ntp_loss: 0.7513 [37mglobal_avg_top_loss: 1.9316
+[titan] 2025-09-09 23:45:39,588 - root - INFO - [34mlr: 4.8697e-06 gnorm: 0.38 [35m[2 days, 6:10:10<18:55:17][39m
+[titan] 2025-09-09 23:46:05,078 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:46:11,475 - root - INFO - [31mstep: 29650 [32mloss: 2.6912 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,276 [36mtflops: 489.76 [35mmfu: 49.52%[39m [37mglobal_avg_ntp_loss: 0.7516 [37mglobal_avg_top_loss: 1.9395
+[titan] 2025-09-09 23:46:11,476 - root - INFO - [34mlr: 4.8671e-06 gnorm: 0.64 [35m[2 days, 6:10:42<18:54:43][39m
+[titan] 2025-09-09 23:46:43,410 - root - INFO - [31mstep: 29655 [32mloss: 2.6293 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,261 [36mtflops: 489.04 [35mmfu: 49.45%[39m [37mglobal_avg_ntp_loss: 0.7226 [37mglobal_avg_top_loss: 1.9067
+[titan] 2025-09-09 23:46:43,411 - root - INFO - [34mlr: 4.8645e-06 gnorm: 0.38 [35m[2 days, 6:11:14<18:54:10][39m
+[titan] 2025-09-09 23:47:15,405 - root - INFO - [31mstep: 29660 [32mloss: 2.7703 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.12 [35mmfu: 49.35%[39m [37mglobal_avg_ntp_loss: 0.7866 [37mglobal_avg_top_loss: 1.9838
+[titan] 2025-09-09 23:47:15,406 - root - INFO - [34mlr: 4.8619e-06 gnorm: 0.39 [35m[2 days, 6:11:46<18:53:37][39m
+[titan] 2025-09-09 23:47:47,232 - root - INFO - [31mstep: 29665 [32mloss: 2.7646 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.70 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.7845 [37mglobal_avg_top_loss: 1.9800
+[titan] 2025-09-09 23:47:47,233 - root - INFO - [34mlr: 4.8593e-06 gnorm: 0.38 [35m[2 days, 6:12:18<18:53:04][39m
+[titan] 2025-09-09 23:48:19,241 - root - INFO - [31mstep: 29670 [32mloss: 2.6916 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,237 [36mtflops: 487.91 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7538 [37mglobal_avg_top_loss: 1.9378
+[titan] 2025-09-09 23:48:19,242 - root - INFO - [34mlr: 4.8567e-06 gnorm: 0.38 [35m[2 days, 6:12:50<18:52:31][39m
+[titan] 2025-09-09 23:48:51,378 - root - INFO - [31mstep: 29675 [32mloss: 2.7000 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,197 [36mtflops: 485.97 [35mmfu: 49.14%[39m [37mglobal_avg_ntp_loss: 0.7544 [37mglobal_avg_top_loss: 1.9456
+[titan] 2025-09-09 23:48:51,378 - root - INFO - [34mlr: 4.8541e-06 gnorm: 0.38 [35m[2 days, 6:13:22<18:51:57][39m
+[titan] 2025-09-09 23:49:23,199 - root - INFO - [31mstep: 29680 [32mloss: 2.7927 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,298 [36mtflops: 490.79 [35mmfu: 49.62%[39m [37mglobal_avg_ntp_loss: 0.7960 [37mglobal_avg_top_loss: 1.9967
+[titan] 2025-09-09 23:49:23,199 - root - INFO - [34mlr: 4.8514e-06 gnorm: 0.40 [35m[2 days, 6:13:53<18:51:24][39m
+[titan] 2025-09-09 23:49:55,237 - root - INFO - [31mstep: 29685 [32mloss: 2.7399 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,228 [36mtflops: 487.47 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7752 [37mglobal_avg_top_loss: 1.9647
+[titan] 2025-09-09 23:49:55,237 - root - INFO - [34mlr: 4.8488e-06 gnorm: 0.39 [35m[2 days, 6:14:26<18:50:51][39m
+[titan] 2025-09-09 23:50:27,228 - root - INFO - [31mstep: 29690 [32mloss: 2.7406 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,243 [36mtflops: 488.18 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7764 [37mglobal_avg_top_loss: 1.9642
+[titan] 2025-09-09 23:50:27,228 - root - INFO - [34mlr: 4.8462e-06 gnorm: 0.39 [35m[2 days, 6:14:58<18:50:18][39m
+[titan] 2025-09-09 23:50:59,184 - root - INFO - [31mstep: 29695 [32mloss: 2.6932 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.72 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7544 [37mglobal_avg_top_loss: 1.9388
+[titan] 2025-09-09 23:50:59,185 - root - INFO - [34mlr: 4.8436e-06 gnorm: 0.40 [35m[2 days, 6:15:29<18:49:44][39m
+[titan] 2025-09-09 23:51:05,815 - root - INFO - Dumping profiler traces at step 29696
+[titan] 2025-09-09 23:51:05,887 - root - INFO - Finished dumping profiler traces in 0.07 seconds
+[titan] 2025-09-09 23:51:24,941 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:51:31,380 - root - INFO - [31mstep: 29700 [32mloss: 2.7486 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.08 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7782 [37mglobal_avg_top_loss: 1.9704
+[titan] 2025-09-09 23:51:31,381 - root - INFO - [34mlr: 4.8410e-06 gnorm: 0.39 [35m[2 days, 6:16:02<18:49:11][39m
+[titan] 2025-09-09 23:52:03,289 - root - INFO - [31mstep: 29705 [32mloss: 2.6199 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,269 [36mtflops: 489.44 [35mmfu: 49.49%[39m [37mglobal_avg_ntp_loss: 0.7254 [37mglobal_avg_top_loss: 1.8945
+[titan] 2025-09-09 23:52:03,290 - root - INFO - [34mlr: 4.8384e-06 gnorm: 0.39 [35m[2 days, 6:16:34<18:48:38][39m
+[titan] 2025-09-09 23:52:35,048 - root - INFO - [31mstep: 29710 [32mloss: 2.6929 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,318 [36mtflops: 491.75 [35mmfu: 49.72%[39m [37mglobal_avg_ntp_loss: 0.7557 [37mglobal_avg_top_loss: 1.9372
+[titan] 2025-09-09 23:52:35,048 - root - INFO - [34mlr: 4.8358e-06 gnorm: 0.40 [35m[2 days, 6:17:05<18:48:05][39m
+[titan] 2025-09-09 23:53:06,855 - root - INFO - [31mstep: 29715 [32mloss: 2.6783 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,302 [36mtflops: 491.00 [35mmfu: 49.65%[39m [37mglobal_avg_ntp_loss: 0.7491 [37mglobal_avg_top_loss: 1.9293
+[titan] 2025-09-09 23:53:06,856 - root - INFO - [34mlr: 4.8332e-06 gnorm: 0.38 [35m[2 days, 6:17:37<18:47:32][39m
+[titan] 2025-09-09 23:53:38,701 - root - INFO - [31mstep: 29720 [32mloss: 2.7316 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,290 [36mtflops: 490.40 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7712 [37mglobal_avg_top_loss: 1.9604
+[titan] 2025-09-09 23:53:38,702 - root - INFO - [34mlr: 4.8306e-06 gnorm: 0.39 [35m[2 days, 6:18:09<18:46:58][39m
+[titan] 2025-09-09 23:54:10,529 - root - INFO - [31mstep: 29725 [32mloss: 2.6654 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,296 [36mtflops: 490.69 [35mmfu: 49.61%[39m [37mglobal_avg_ntp_loss: 0.7409 [37mglobal_avg_top_loss: 1.9245
+[titan] 2025-09-09 23:54:10,529 - root - INFO - [34mlr: 4.8280e-06 gnorm: 0.39 [35m[2 days, 6:18:41<18:46:25][39m
+[titan] 2025-09-09 23:54:42,620 - root - INFO - [31mstep: 29730 [32mloss: 2.6882 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.66 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7527 [37mglobal_avg_top_loss: 1.9356
+[titan] 2025-09-09 23:54:42,620 - root - INFO - [34mlr: 4.8254e-06 gnorm: 0.38 [35m[2 days, 6:19:13<18:45:52][39m
+[titan] 2025-09-09 23:55:14,622 - root - INFO - [31mstep: 29735 [32mloss: 3.1121 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,240 [36mtflops: 488.01 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 1.0012 [37mglobal_avg_top_loss: 2.1108
+[titan] 2025-09-09 23:55:14,623 - root - INFO - [34mlr: 4.8228e-06 gnorm: 0.41 [35m[2 days, 6:19:45<18:45:19][39m
+[titan] 2025-09-09 23:55:46,563 - root - INFO - [31mstep: 29740 [32mloss: 2.7354 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.95 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7733 [37mglobal_avg_top_loss: 1.9620
+[titan] 2025-09-09 23:55:46,563 - root - INFO - [34mlr: 4.8202e-06 gnorm: 0.39 [35m[2 days, 6:20:17<18:44:46][39m
+[titan] 2025-09-09 23:56:18,419 - root - INFO - [31mstep: 29745 [32mloss: 2.8026 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,287 [36mtflops: 490.25 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.8278 [37mglobal_avg_top_loss: 1.9747
+[titan] 2025-09-09 23:56:18,419 - root - INFO - [34mlr: 4.8176e-06 gnorm: 0.38 [35m[2 days, 6:20:49<18:44:12][39m
+[titan] 2025-09-09 23:56:44,039 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-09 23:56:50,341 - root - INFO - [31mstep: 29750 [32mloss: 2.5971 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.24 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 0.7216 [37mglobal_avg_top_loss: 1.8755
+[titan] 2025-09-09 23:56:50,341 - root - INFO - [34mlr: 4.8150e-06 gnorm: 1.13 [35m[2 days, 6:21:21<18:43:39][39m
+[titan] 2025-09-09 23:57:22,521 - root - INFO - [31mstep: 29755 [32mloss: 3.0763 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,183 [36mtflops: 485.31 [35mmfu: 49.07%[39m [37mglobal_avg_ntp_loss: 0.9800 [37mglobal_avg_top_loss: 2.0963
+[titan] 2025-09-09 23:57:22,521 - root - INFO - [34mlr: 4.8124e-06 gnorm: 0.37 [35m[2 days, 6:21:53<18:43:06][39m
+[titan] 2025-09-09 23:57:54,378 - root - INFO - [31mstep: 29760 [32mloss: 2.6953 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.23 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7549 [37mglobal_avg_top_loss: 1.9404
+[titan] 2025-09-09 23:57:54,378 - root - INFO - [34mlr: 4.8099e-06 gnorm: 0.43 [35m[2 days, 6:22:25<18:42:33][39m
+[titan] 2025-09-09 23:58:26,318 - root - INFO - [31mstep: 29765 [32mloss: 2.6468 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,259 [36mtflops: 488.96 [35mmfu: 49.44%[39m [37mglobal_avg_ntp_loss: 0.7329 [37mglobal_avg_top_loss: 1.9139
+[titan] 2025-09-09 23:58:26,318 - root - INFO - [34mlr: 4.8073e-06 gnorm: 0.43 [35m[2 days, 6:22:57<18:41:59][39m
+[titan] 2025-09-09 23:58:58,421 - root - INFO - [31mstep: 29770 [32mloss: 2.6920 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,207 [36mtflops: 486.48 [35mmfu: 49.19%[39m [37mglobal_avg_ntp_loss: 0.7535 [37mglobal_avg_top_loss: 1.9385
+[titan] 2025-09-09 23:58:58,422 - root - INFO - [34mlr: 4.8047e-06 gnorm: 0.40 [35m[2 days, 6:23:29<18:41:26][39m
+[titan] 2025-09-09 23:59:30,710 - root - INFO - [31mstep: 29775 [32mloss: 2.6059 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,149 [36mtflops: 483.69 [35mmfu: 48.91%[39m [37mglobal_avg_ntp_loss: 0.7144 [37mglobal_avg_top_loss: 1.8915
+[titan] 2025-09-09 23:59:30,710 - root - INFO - [34mlr: 4.8021e-06 gnorm: 0.37 [35m[2 days, 6:24:01<18:40:53][39m
+[titan] 2025-09-10 00:00:02,498 - root - INFO - [31mstep: 29780 [32mloss: 2.6640 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,309 [36mtflops: 491.30 [35mmfu: 49.68%[39m [37mglobal_avg_ntp_loss: 0.7417 [37mglobal_avg_top_loss: 1.9223
+[titan] 2025-09-10 00:00:02,498 - root - INFO - [34mlr: 4.7995e-06 gnorm: 0.40 [35m[2 days, 6:24:33<18:40:20][39m
+[titan] 2025-09-10 00:00:34,400 - root - INFO - [31mstep: 29785 [32mloss: 2.5482 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,272 [36mtflops: 489.54 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.6895 [37mglobal_avg_top_loss: 1.8587
+[titan] 2025-09-10 00:00:34,401 - root - INFO - [34mlr: 4.7969e-06 gnorm: 0.37 [35m[2 days, 6:25:05<18:39:47][39m
+[titan] 2025-09-10 00:01:06,418 - root - INFO - [31mstep: 29790 [32mloss: 2.7028 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,235 [36mtflops: 487.78 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.7597 [37mglobal_avg_top_loss: 1.9431
+[titan] 2025-09-10 00:01:06,418 - root - INFO - [34mlr: 4.7943e-06 gnorm: 0.39 [35m[2 days, 6:25:37<18:39:14][39m
+[titan] 2025-09-10 00:01:38,374 - root - INFO - [31mstep: 29795 [32mloss: 2.6177 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,254 [36mtflops: 488.72 [35mmfu: 49.42%[39m [37mglobal_avg_ntp_loss: 0.7188 [37mglobal_avg_top_loss: 1.8989
+[titan] 2025-09-10 00:01:38,374 - root - INFO - [34mlr: 4.7917e-06 gnorm: 0.39 [35m[2 days, 6:26:09<18:38:40][39m
+[titan] 2025-09-10 00:02:03,862 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-10 00:02:10,277 - root - INFO - [31mstep: 29800 [32mloss: 2.6450 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,271 [36mtflops: 489.53 [35mmfu: 49.50%[39m [37mglobal_avg_ntp_loss: 0.7344 [37mglobal_avg_top_loss: 1.9106
+[titan] 2025-09-10 00:02:10,277 - root - INFO - [34mlr: 4.7892e-06 gnorm: 0.39 [35m[2 days, 6:26:41<18:38:07][39m
+[titan] 2025-09-10 00:02:42,328 - root - INFO - [31mstep: 29805 [32mloss: 2.8560 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,225 [36mtflops: 487.33 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.8431 [37mglobal_avg_top_loss: 2.0129
+[titan] 2025-09-10 00:02:42,329 - root - INFO - [34mlr: 4.7866e-06 gnorm: 0.38 [35m[2 days, 6:27:13<18:37:34][39m
+[titan] 2025-09-10 00:03:14,336 - root - INFO - [31mstep: 29810 [32mloss: 2.6454 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.93 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7318 [37mglobal_avg_top_loss: 1.9136
+[titan] 2025-09-10 00:03:14,336 - root - INFO - [34mlr: 4.7840e-06 gnorm: 0.38 [35m[2 days, 6:27:45<18:37:01][39m
+[titan] 2025-09-10 00:03:46,354 - root - INFO - [31mstep: 29815 [32mloss: 2.5502 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,234 [36mtflops: 487.76 [35mmfu: 49.32%[39m [37mglobal_avg_ntp_loss: 0.6868 [37mglobal_avg_top_loss: 1.8634
+[titan] 2025-09-10 00:03:46,355 - root - INFO - [34mlr: 4.7814e-06 gnorm: 0.38 [35m[2 days, 6:28:17<18:36:28][39m
+[titan] 2025-09-10 00:04:18,238 - root - INFO - [31mstep: 29820 [32mloss: 2.6542 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,278 [36mtflops: 489.83 [35mmfu: 49.53%[39m [37mglobal_avg_ntp_loss: 0.7341 [37mglobal_avg_top_loss: 1.9202
+[titan] 2025-09-10 00:04:18,238 - root - INFO - [34mlr: 4.7788e-06 gnorm: 0.41 [35m[2 days, 6:28:49<18:35:54][39m
+[titan] 2025-09-10 00:04:50,377 - root - INFO - [31mstep: 29825 [32mloss: 2.5905 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,196 [36mtflops: 485.92 [35mmfu: 49.13%[39m [37mglobal_avg_ntp_loss: 0.7089 [37mglobal_avg_top_loss: 1.8816
+[titan] 2025-09-10 00:04:50,378 - root - INFO - [34mlr: 4.7762e-06 gnorm: 0.37 [35m[2 days, 6:29:21<18:35:21][39m
+[titan] 2025-09-10 00:05:22,383 - root - INFO - [31mstep: 29830 [32mloss: 2.5534 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 487.96 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.6934 [37mglobal_avg_top_loss: 1.8600
+[titan] 2025-09-10 00:05:22,383 - root - INFO - [34mlr: 4.7737e-06 gnorm: 0.39 [35m[2 days, 6:29:53<18:34:48][39m
+[titan] 2025-09-10 00:05:54,578 - root - INFO - [31mstep: 29835 [32mloss: 2.6753 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,178 [36mtflops: 485.08 [35mmfu: 49.05%[39m [37mglobal_avg_ntp_loss: 0.7460 [37mglobal_avg_top_loss: 1.9292
+[titan] 2025-09-10 00:05:54,579 - root - INFO - [34mlr: 4.7711e-06 gnorm: 0.38 [35m[2 days, 6:30:25<18:34:15][39m
+[titan] 2025-09-10 00:06:26,566 - root - INFO - [31mstep: 29840 [32mloss: 2.7302 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.24 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7746 [37mglobal_avg_top_loss: 1.9556
+[titan] 2025-09-10 00:06:26,566 - root - INFO - [34mlr: 4.7685e-06 gnorm: 0.38 [35m[2 days, 6:30:57<18:33:42][39m
+[titan] 2025-09-10 00:06:58,803 - root - INFO - [31mstep: 29845 [32mloss: 2.6537 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,165 [36mtflops: 484.45 [35mmfu: 48.98%[39m [37mglobal_avg_ntp_loss: 0.7356 [37mglobal_avg_top_loss: 1.9181
+[titan] 2025-09-10 00:06:58,803 - root - INFO - [34mlr: 4.7659e-06 gnorm: 0.38 [35m[2 days, 6:31:29<18:33:09][39m
+[titan] 2025-09-10 00:07:24,447 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-10 00:07:30,806 - root - INFO - [31mstep: 29850 [32mloss: 2.7362 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,239 [36mtflops: 488.00 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7745 [37mglobal_avg_top_loss: 1.9617
+[titan] 2025-09-10 00:07:30,806 - root - INFO - [34mlr: 4.7634e-06 gnorm: 0.38 [35m[2 days, 6:32:01<18:32:35][39m
+[titan] 2025-09-10 00:08:02,841 - root - INFO - [31mstep: 29855 [32mloss: 2.7087 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,229 [36mtflops: 487.51 [35mmfu: 49.29%[39m [37mglobal_avg_ntp_loss: 0.7605 [37mglobal_avg_top_loss: 1.9483
+[titan] 2025-09-10 00:08:02,841 - root - INFO - [34mlr: 4.7608e-06 gnorm: 0.39 [35m[2 days, 6:32:33<18:32:02][39m
+[titan] 2025-09-10 00:08:34,684 - root - INFO - [31mstep: 29860 [32mloss: 2.6531 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,291 [36mtflops: 490.46 [35mmfu: 49.59%[39m [37mglobal_avg_ntp_loss: 0.7355 [37mglobal_avg_top_loss: 1.9176
+[titan] 2025-09-10 00:08:34,684 - root - INFO - [34mlr: 4.7582e-06 gnorm: 0.39 [35m[2 days, 6:33:05<18:31:29][39m
+[titan] 2025-09-10 00:09:06,601 - root - INFO - [31mstep: 29865 [32mloss: 2.7168 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,267 [36mtflops: 489.31 [35mmfu: 49.48%[39m [37mglobal_avg_ntp_loss: 0.7646 [37mglobal_avg_top_loss: 1.9522
+[titan] 2025-09-10 00:09:06,602 - root - INFO - [34mlr: 4.7557e-06 gnorm: 0.39 [35m[2 days, 6:33:37<18:30:56][39m
+[titan] 2025-09-10 00:09:38,451 - root - INFO - [31mstep: 29870 [32mloss: 2.7116 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,289 [36mtflops: 490.35 [35mmfu: 49.58%[39m [37mglobal_avg_ntp_loss: 0.7654 [37mglobal_avg_top_loss: 1.9462
+[titan] 2025-09-10 00:09:38,452 - root - INFO - [34mlr: 4.7531e-06 gnorm: 0.39 [35m[2 days, 6:34:09<18:30:23][39m
+[titan] 2025-09-10 00:10:10,458 - root - INFO - [31mstep: 29875 [32mloss: 2.7238 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,238 [36mtflops: 487.93 [35mmfu: 49.34%[39m [37mglobal_avg_ntp_loss: 0.7679 [37mglobal_avg_top_loss: 1.9559
+[titan] 2025-09-10 00:10:10,459 - root - INFO - [34mlr: 4.7505e-06 gnorm: 0.40 [35m[2 days, 6:34:41<18:29:49][39m
+[titan] 2025-09-10 00:10:42,572 - root - INFO - [31mstep: 29880 [32mloss: 2.6896 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,204 [36mtflops: 486.33 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.7511 [37mglobal_avg_top_loss: 1.9384
+[titan] 2025-09-10 00:10:42,572 - root - INFO - [34mlr: 4.7479e-06 gnorm: 0.40 [35m[2 days, 6:35:13<18:29:16][39m
+[titan] 2025-09-10 00:11:14,690 - root - INFO - [31mstep: 29885 [32mloss: 3.1702 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,202 [36mtflops: 486.24 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 1.0232 [37mglobal_avg_top_loss: 2.1470
+[titan] 2025-09-10 00:11:14,691 - root - INFO - [34mlr: 4.7454e-06 gnorm: 0.39 [35m[2 days, 6:35:45<18:28:43][39m
+[titan] 2025-09-10 00:11:46,939 - root - INFO - [31mstep: 29890 [32mloss: 2.6405 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,161 [36mtflops: 484.28 [35mmfu: 48.97%[39m [37mglobal_avg_ntp_loss: 0.7311 [37mglobal_avg_top_loss: 1.9094
+[titan] 2025-09-10 00:11:46,940 - root - INFO - [34mlr: 4.7428e-06 gnorm: 0.38 [35m[2 days, 6:36:17<18:28:10][39m
+[titan] 2025-09-10 00:12:19,176 - root - INFO - [31mstep: 29895 [32mloss: 2.7750 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,165 [36mtflops: 484.46 [35mmfu: 48.99%[39m [37mglobal_avg_ntp_loss: 0.7865 [37mglobal_avg_top_loss: 1.9884
+[titan] 2025-09-10 00:12:19,177 - root - INFO - [34mlr: 4.7402e-06 gnorm: 0.60 [35m[2 days, 6:36:49<18:27:37][39m
+[titan] 2025-09-10 00:12:44,693 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-10 00:12:51,251 - root - INFO - [31mstep: 29900 [32mloss: 2.6360 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,216 [36mtflops: 486.90 [35mmfu: 49.23%[39m [37mglobal_avg_ntp_loss: 0.7259 [37mglobal_avg_top_loss: 1.9101
+[titan] 2025-09-10 00:12:51,252 - root - INFO - [34mlr: 4.7377e-06 gnorm: 0.40 [35m[2 days, 6:37:22<18:27:04][39m
+[titan] 2025-09-10 00:13:23,319 - root - INFO - [31mstep: 29905 [32mloss: 2.7614 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,219 [36mtflops: 487.01 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7881 [37mglobal_avg_top_loss: 1.9733
+[titan] 2025-09-10 00:13:23,320 - root - INFO - [34mlr: 4.7351e-06 gnorm: 0.40 [35m[2 days, 6:37:54<18:26:31][39m
+[titan] 2025-09-10 00:13:55,301 - root - INFO - [31mstep: 29910 [32mloss: 2.5872 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,246 [36mtflops: 488.33 [35mmfu: 49.38%[39m [37mglobal_avg_ntp_loss: 0.7106 [37mglobal_avg_top_loss: 1.8765
+[titan] 2025-09-10 00:13:55,302 - root - INFO - [34mlr: 4.7326e-06 gnorm: 0.38 [35m[2 days, 6:38:26<18:25:57][39m
+[titan] 2025-09-10 00:14:27,418 - root - INFO - [31mstep: 29915 [32mloss: 2.9859 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,203 [36mtflops: 486.26 [35mmfu: 49.17%[39m [37mglobal_avg_ntp_loss: 0.9417 [37mglobal_avg_top_loss: 2.0442
+[titan] 2025-09-10 00:14:27,419 - root - INFO - [34mlr: 4.7300e-06 gnorm: 0.39 [35m[2 days, 6:38:58<18:25:24][39m
+[titan] 2025-09-10 00:14:59,292 - root - INFO - [31mstep: 29920 [32mloss: 2.7048 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,281 [36mtflops: 489.98 [35mmfu: 49.54%[39m [37mglobal_avg_ntp_loss: 0.7620 [37mglobal_avg_top_loss: 1.9428
+[titan] 2025-09-10 00:14:59,292 - root - INFO - [34mlr: 4.7274e-06 gnorm: 0.38 [35m[2 days, 6:39:30<18:24:51][39m
+[titan] 2025-09-10 00:15:31,364 - root - INFO - [31mstep: 29925 [32mloss: 2.6873 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.95 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7512 [37mglobal_avg_top_loss: 1.9362
+[titan] 2025-09-10 00:15:31,364 - root - INFO - [34mlr: 4.7249e-06 gnorm: 0.42 [35m[2 days, 6:40:02<18:24:18][39m
+[titan] 2025-09-10 00:16:03,584 - root - INFO - [31mstep: 29930 [32mloss: 2.6387 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,170 [36mtflops: 484.71 [35mmfu: 49.01%[39m [37mglobal_avg_ntp_loss: 0.7322 [37mglobal_avg_top_loss: 1.9065
+[titan] 2025-09-10 00:16:03,584 - root - INFO - [34mlr: 4.7223e-06 gnorm: 0.39 [35m[2 days, 6:40:34<18:23:45][39m
+[titan] 2025-09-10 00:16:35,629 - root - INFO - [31mstep: 29935 [32mloss: 2.6520 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,226 [36mtflops: 487.36 [35mmfu: 49.28%[39m [37mglobal_avg_ntp_loss: 0.7334 [37mglobal_avg_top_loss: 1.9186
+[titan] 2025-09-10 00:16:35,629 - root - INFO - [34mlr: 4.7198e-06 gnorm: 0.40 [35m[2 days, 6:41:06<18:23:12][39m
+[titan] 2025-09-10 00:17:07,617 - root - INFO - [31mstep: 29940 [32mloss: 2.7165 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,244 [36mtflops: 488.23 [35mmfu: 49.37%[39m [37mglobal_avg_ntp_loss: 0.7679 [37mglobal_avg_top_loss: 1.9486
+[titan] 2025-09-10 00:17:07,617 - root - INFO - [34mlr: 4.7172e-06 gnorm: 0.39 [35m[2 days, 6:41:38<18:22:38][39m
+[titan] 2025-09-10 00:17:39,816 - root - INFO - [31mstep: 29945 [32mloss: 2.7217 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,177 [36mtflops: 485.03 [35mmfu: 49.04%[39m [37mglobal_avg_ntp_loss: 0.7665 [37mglobal_avg_top_loss: 1.9552
+[titan] 2025-09-10 00:17:39,816 - root - INFO - [34mlr: 4.7146e-06 gnorm: 0.39 [35m[2 days, 6:42:10<18:22:05][39m
+[titan] 2025-09-10 00:18:05,464 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds.
+[titan] 2025-09-10 00:18:11,908 - root - INFO - [31mstep: 29950 [32mloss: 2.6150 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,211 [36mtflops: 486.64 [35mmfu: 49.21%[39m [37mglobal_avg_ntp_loss: 0.7222 [37mglobal_avg_top_loss: 1.8928
+[titan] 2025-09-10 00:18:11,909 - root - INFO - [34mlr: 4.7121e-06 gnorm: 0.38 [35m[2 days, 6:42:42<18:21:32][39m
+[titan] 2025-09-10 00:18:44,033 - root - INFO - [31mstep: 29955 [32mloss: 2.7258 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,201 [36mtflops: 486.16 [35mmfu: 49.16%[39m [37mglobal_avg_ntp_loss: 0.7661 [37mglobal_avg_top_loss: 1.9597
+[titan] 2025-09-10 00:18:44,033 - root - INFO - [34mlr: 4.7095e-06 gnorm: 0.40 [35m[2 days, 6:43:14<18:20:59][39m
+[titan] 2025-09-10 00:19:16,106 - root - INFO - [31mstep: 29960 [32mloss: 2.5972 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,217 [36mtflops: 486.93 [35mmfu: 49.24%[39m [37mglobal_avg_ntp_loss: 0.7099 [37mglobal_avg_top_loss: 1.8873
+[titan] 2025-09-10 00:19:16,106 - root - INFO - [34mlr: 4.7070e-06 gnorm: 0.39 [35m[2 days, 6:43:46<18:20:26][39m
+[titan] 2025-09-10 00:19:48,029 - root - INFO - [31mstep: 29965 [32mloss: 3.8520 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,265 [36mtflops: 489.22 [35mmfu: 49.47%[39m [37mglobal_avg_ntp_loss: 1.3764 [37mglobal_avg_top_loss: 2.4756
+[titan] 2025-09-10 00:19:48,029 - root - INFO - [34mlr: 4.7044e-06 gnorm: 0.41 [35m[2 days, 6:44:18<18:19:53][39m
+[titan] 2025-09-10 00:20:20,187 - root - INFO - [31mstep: 29970 [32mloss: 2.6966 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,190 [36mtflops: 485.64 [35mmfu: 49.10%[39m [37mglobal_avg_ntp_loss: 0.7559 [37mglobal_avg_top_loss: 1.9408
+[titan] 2025-09-10 00:20:20,187 - root - INFO - [34mlr: 4.7019e-06 gnorm: 0.41 [35m[2 days, 6:44:50<18:19:20][39m
+[titan] 2025-09-10 00:20:52,285 - root - INFO - [31mstep: 29975 [32mloss: 3.1491 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,209 [36mtflops: 486.56 [35mmfu: 49.20%[39m [37mglobal_avg_ntp_loss: 1.0164 [37mglobal_avg_top_loss: 2.1327
+[titan] 2025-09-10 00:20:52,285 - root - INFO - [34mlr: 4.6993e-06 gnorm: 0.38 [35m[2 days, 6:45:23<18:18:46][39m
+[titan] 2025-09-10 00:21:24,143 - root - INFO - [31mstep: 29980 [32mloss: 2.7489 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,286 [36mtflops: 490.22 [35mmfu: 49.57%[39m [37mglobal_avg_ntp_loss: 0.7802 [37mglobal_avg_top_loss: 1.9688
+[titan] 2025-09-10 00:21:24,143 - root - INFO - [34mlr: 4.6968e-06 gnorm: 0.38 [35m[2 days, 6:45:54<18:18:13][39m
+[titan] 2025-09-10 00:21:56,300 - root - INFO - [31mstep: 29985 [32mloss: 2.6586 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,190 [36mtflops: 485.66 [35mmfu: 49.11%[39m [37mglobal_avg_ntp_loss: 0.7358 [37mglobal_avg_top_loss: 1.9227
+[titan] 2025-09-10 00:21:56,301 - root - INFO - [34mlr: 4.6942e-06 gnorm: 0.39 [35m[2 days, 6:46:27<18:17:40][39m
+[titan] 2025-09-10 00:22:28,294 - root - INFO - [31mstep: 29990 [32mloss: 2.5988 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,242 [36mtflops: 488.14 [35mmfu: 49.36%[39m [37mglobal_avg_ntp_loss: 0.7105 [37mglobal_avg_top_loss: 1.8884
+[titan] 2025-09-10 00:22:28,295 - root - INFO - [34mlr: 4.6917e-06 gnorm: 0.39 [35m[2 days, 6:46:59<18:17:07][39m
+[titan] 2025-09-10 00:23:00,308 - root - INFO - [31mstep: 29995 [32mloss: 2.5806 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,236 [36mtflops: 487.84 [35mmfu: 49.33%[39m [37mglobal_avg_ntp_loss: 0.7012 [37mglobal_avg_top_loss: 1.8794
+[titan] 2025-09-10 00:23:00,308 - root - INFO - [34mlr: 4.6891e-06 gnorm: 0.38 [35m[2 days, 6:47:31<18:16:34][39m
+[titan] 2025-09-10 00:23:25,917 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
+[titan] 2025-09-10 00:23:32,341 - root - INFO - [31mstep: 30000 [32mloss: 2.5194 [33mmemory: 122.04GiB(87.57%) [34mtps: 10,230 [36mtflops: 487.54 [35mmfu: 49.30%[39m [37mglobal_avg_ntp_loss: 0.6778 [37mglobal_avg_top_loss: 1.8416
+[titan] 2025-09-10 00:23:32,341 - root - INFO - [34mlr: 4.6866e-06 gnorm: 0.39 [35m[2 days, 6:48:03<18:16:01][39m
+[titan] 2025-09-10 00:23:32,341 - root - INFO - Saving the checkpoint (or staging if async is enabled).
+[titan] 2025-09-10 00:24:07,145 - root - INFO - [GC] GC collection invoked by checkpointer. 0.01 seconds.
+[titan] 2025-09-10 00:24:07,145 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 34.80 seconds.
+[titan] 2025-09-10 00:24:07,145 - root - INFO - Ensuring repository zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757 exists...
+[titan] 2025-09-10 00:24:07,296 - root - INFO - Repository zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757 ensured.
+[titan] 2025-09-10 00:24:07,296 - root - INFO - Uploading exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/checkpoint/step-30000 to zaydzuhri/top-code-7B-4096-batch8x2-steps40000-20250909-061757/step-30000 on Hugging Face Hub...
+Processing Files (8 / 9) : 100%|█████████▉| 83.2GB / 83.3GB, 114MB/s
+New Data Upload : 100%|█████████▉| 83.2GB / 83.3GB, 114MB/s
+ ...ine/checkpoint/step-30000/.metadata: 100%|██████████| 2.47MB / 2.47MB
+ .../checkpoint/step-30000/__1_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-30000/__5_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-30000/__3_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-30000/__7_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-30000/__0_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-30000/__4_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-30000/__2_0.distcp: 100%|██████████| 10.4GB / 10.4GB
+ .../checkpoint/step-30000/__6_0.distcp: 99%|█████████▉| 10.4GB / 10.4GB
diff --git a/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/requirements.txt b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cf79d25031b268c8bf0ae12f6799c04bf1cbbba1
--- /dev/null
+++ b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/requirements.txt
@@ -0,0 +1,207 @@
+flame==0.1.0
+pluggy==1.6.0
+triton==3.2.0
+sympy==1.13.1
+wcwidth==0.2.13
+nvidia-cusolver-cu12==11.6.1.9
+peft==0.17.0
+smart_open==7.3.0.post1
+cymem==2.0.11
+spacy-legacy==3.0.12
+h11==0.16.0
+pytablewriter==1.2.1
+idna==3.10
+regex==2025.7.34
+antlr4-python3-runtime==4.13.2
+wandb==0.21.0
+nvidia-cuda-cupti-cu12==12.4.127
+sentencepiece==0.2.1
+zstandard==0.23.0
+pybind11==3.0.0
+inquirerpy==0.3.4
+contourpy==1.3.3
+Pygments==2.19.2
+sniffio==1.3.1
+Jinja2==3.1.6
+packaging==25.0
+Markdown==3.8.2
+astunparse==1.6.3
+spacy==3.8.7
+pyparsing==3.2.3
+networkx==3.5
+ninja==1.11.1.4
+tf-slim==1.1.0
+PyYAML==6.0.2
+smmap==5.0.2
+tiktoken==0.9.0
+flatbuffers==25.2.10
+tensorflow==2.20.0
+langcodes==3.5.0
+nvidia-cuda-nvrtc-cu12==12.4.127
+numexpr==2.11.0
+charset-normalizer==3.4.3
+frozenlist==1.7.0
+setuptools==80.9.0
+cycler==0.12.1
+weasel==0.4.1
+tzdata==2025.2
+sacrebleu==2.5.1
+rouge_score==0.1.2
+requests==2.32.5
+nvidia-nvjitlink-cu12==12.4.127
+grpcio==1.74.0
+nvidia-cusparse-cu12==12.3.1.170
+mdurl==0.1.2
+pandas==2.3.1
+preshed==3.0.10
+attrs==25.3.0
+tensorboard-data-server==0.7.2
+aiohappyeyeballs==2.6.1
+keras==3.11.2
+wrapt==1.17.3
+aiosignal==1.4.0
+tcolorpy==0.1.7
+platformdirs==4.3.8
+tqdm-multiprocess==0.0.11
+python-dotenv==1.1.1
+wasabi==1.1.3
+google-pasta==0.2.0
+optree==0.17.0
+MarkupSafe==3.0.2
+colorlog==6.9.0
+nvidia-cufft-cu12==11.2.1.3
+lm_eval==0.4.9.1
+lxml==6.0.0
+protobuf==6.32.0
+radgraph==0.1.18
+scipy==1.16.1
+click==8.2.1
+wheel==0.45.1
+marisa-trie==1.3.0
+pathvalidate==3.3.1
+nvidia-nccl-cu12==2.21.5
+evaluate==0.4.5
+nvidia-cuda-runtime-cu12==12.4.127
+transformers==4.51.3
+aenum==3.1.15
+typing-inspection==0.4.1
+gitdb==4.0.12
+iniconfig==2.1.0
+multidict==6.6.3
+huggingface-hub==0.34.4
+tokenizers==0.21.4
+tabledata==1.3.4
+mbstrdecoder==1.1.4
+Werkzeug==3.1.3
+accelerate==1.10.0
+hf-xet==1.1.8
+tensorboard==2.20.0
+ml_dtypes==0.5.3
+pytest==8.4.1
+namex==0.1.0
+pillow==11.3.0
+datasets==3.6.0
+tqdm==4.67.1
+murmurhash==1.0.13
+fonttools==4.59.1
+absl-py==2.3.1
+multiprocess==0.70.16
+fsspec==2025.3.0
+transformers==4.51.3
+dill==0.3.8
+propcache==0.3.2
+jsonpickle==4.1.1
+BLEURT==0.0.2
+yarl==1.20.1
+portalocker==3.2.0
+httpx==0.27.2
+numpy==2.3.2
+mpmath==1.3.0
+pyarrow==21.0.0
+matplotlib==3.10.5
+typepy==1.3.4
+pycountry==24.6.1
+word2number==1.1
+psutil==7.0.0
+catalogue==2.0.10
+latex2sympy2_extended==1.0.6
+pydantic_core==2.33.2
+threadpoolctl==3.6.0
+spacy-loggers==1.0.5
+certifi==2025.8.3
+confection==0.1.5
+flame==0.1.0
+pfzy==0.3.4
+safetensors==0.6.2
+pip==25.1
+DataProperty==1.1.0
+lighteval==0.10.1.dev0
+jsonlines==4.0.0
+scikit-learn==1.7.1
+torch==2.6.0
+pytz==2025.2
+python-dateutil==2.9.0.post0
+nltk==3.9.1
+sqlitedict==2.1.0
+gast==0.6.0
+nvidia-curand-cu12==10.3.5.147
+rich==14.1.0
+sentry-sdk==2.33.2
+nvidia-cusparselt-cu12==0.6.2
+kiwisolver==1.4.9
+appdirs==1.4.4
+bert-score==0.3.13
+blis==1.3.0
+GitPython==3.1.45
+chardet==5.2.0
+more-itertools==10.7.0
+filelock==3.19.1
+transformers==4.51.3
+httpcore==1.0.9
+termcolor==3.1.0
+typer==0.16.1
+einops==0.8.1
+torchdata==0.11.0
+six==1.17.0
+colorama==0.4.6
+aiohttp==3.12.14
+srsly==2.5.1
+urllib3==2.5.0
+nvidia-cublas-cu12==12.4.5.8
+cloudpathlib==0.21.1
+h5py==3.14.0
+thinc==8.3.6
+markdown-it-py==4.0.0
+flash-attn==2.7.3
+prompt_toolkit==3.0.52
+nvidia-nvtx-cu12==12.4.127
+en_core_web_sm==3.8.0
+xxhash==3.5.0
+anyio==4.10.0
+joblib==1.5.1
+pydantic==2.11.7
+opt_einsum==3.4.0
+dotmap==1.3.30
+language_data==1.3.0
+shellingham==1.5.4
+nvidia-cudnn-cu12==9.1.0.70
+typing_extensions==4.14.1
+libclang==18.1.1
+tabulate==0.9.0
+annotated-types==0.7.0
+jaraco.context==5.3.0
+autocommand==2.2.2
+more-itertools==10.3.0
+tomli==2.0.1
+jaraco.functools==4.0.1
+zipp==3.19.2
+backports.tarfile==1.2.0
+wheel==0.45.1
+platformdirs==4.2.2
+inflect==7.3.1
+typing_extensions==4.12.2
+jaraco.text==3.12.1
+typeguard==4.3.0
+importlib_metadata==8.0.0
+packaging==24.2
+jaraco.collections==5.1.0
diff --git a/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..050118e4b9f12bc5327706ed04d6d2d1cbd6ce67
--- /dev/null
+++ b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log
@@ -0,0 +1,10 @@
+{"time":"2025-09-09T06:19:20.029854482Z","level":"INFO","msg":"stream: starting","core version":"0.21.0"}
+{"time":"2025-09-09T06:19:20.338868384Z","level":"INFO","msg":"stream: created new stream","id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"}
+{"time":"2025-09-09T06:19:20.338942945Z","level":"INFO","msg":"stream: started","id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"}
+{"time":"2025-09-09T06:19:20.338955936Z","level":"INFO","msg":"handler: started","stream_id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"}
+{"time":"2025-09-09T06:19:20.33900181Z","level":"INFO","msg":"writer: Do: started","stream_id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"}
+{"time":"2025-09-09T06:19:20.339014387Z","level":"INFO","msg":"sender: started","stream_id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"}
+{"time":"2025-09-09T16:55:51.461783187Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+{"time":"2025-09-09T17:52:23.968650788Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/file_stream","body":"\n\n\n502 Server Error\n\n\nError: Server Error
\nThe server encountered a temporary error and could not complete your request.
Please try again in 30 seconds.\n
\n\n"}
+{"time":"2025-09-09T22:51:18.011409168Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"}
+{"time":"2025-09-09T22:58:20.165767227Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"}
diff --git a/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..b1fe6180502b5e0c9a480097879c081c907d671c
--- /dev/null
+++ b/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log
@@ -0,0 +1,21 @@
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Configure stats pid to 795439
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/.config/wandb/settings
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/flame/wandb/settings
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from environment variables
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():703] Logging user logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():830] calling init triggers
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():871] starting backend
+2025-09-09 06:19:20,025 INFO MainThread:795439 [wandb_init.py:init():874] sending inform_init request
+2025-09-09 06:19:20,027 INFO MainThread:795439 [wandb_init.py:init():882] backend started and connected
+2025-09-09 06:19:20,033 INFO MainThread:795439 [wandb_init.py:init():953] updated telemetry
+2025-09-09 06:19:20,039 INFO MainThread:795439 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
+2025-09-09 06:19:20,682 INFO MainThread:795439 [wandb_init.py:init():1029] starting run threads in backend
+2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_console_start():2458] atexit reg
+2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2306] redirect: wrap_raw
+2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2375] Wrapping output streams.
+2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2398] Redirects installed.
+2025-09-09 06:19:20,817 INFO MainThread:795439 [wandb_init.py:init():1075] run started, returning control to user process
diff --git a/torchtitan/components/__pycache__/dataloader.cpython-312.pyc b/torchtitan/components/__pycache__/dataloader.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7c2ae5f7ce6a985bba4e4780597cf9971f42316
Binary files /dev/null and b/torchtitan/components/__pycache__/dataloader.cpython-312.pyc differ
diff --git a/torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc b/torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86532b3bc4da661183c020f21ccd5f27ea079a04
Binary files /dev/null and b/torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc differ
diff --git a/torchtitan/components/__pycache__/metrics.cpython-312.pyc b/torchtitan/components/__pycache__/metrics.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00c9b41b8f5e4e1e3d80eac861f354891c01494a
Binary files /dev/null and b/torchtitan/components/__pycache__/metrics.cpython-312.pyc differ
diff --git a/torchtitan/components/__pycache__/tokenizer.cpython-312.pyc b/torchtitan/components/__pycache__/tokenizer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e6ef655c5f8cc3fe62d17d3f86b66a7ea3c4ab5
Binary files /dev/null and b/torchtitan/components/__pycache__/tokenizer.cpython-312.pyc differ
diff --git a/torchtitan/components/metrics.py b/torchtitan/components/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c90ed54a4af3d644abb552615675a7af5f15910
--- /dev/null
+++ b/torchtitan/components/metrics.py
@@ -0,0 +1,435 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import time
+from collections import namedtuple
+from datetime import datetime
+from typing import Any
+
+import torch
+from torch.utils.tensorboard import SummaryWriter
+from torchtitan.components.lr_scheduler import LRSchedulersContainer
+from torchtitan.components.optimizer import OptimizersContainer
+from torchtitan.config_manager import JobConfig
+from torchtitan.distributed import ParallelDims
+from torchtitan.tools import utils
+from torchtitan.tools.logging import logger
+from torchtitan.tools.utils import Color, device_module, device_type
+
+# Immutable record used to hand peak device-memory statistics to the logging code.
+# Field order matters to any caller that builds or unpacks instances positionally.
+DeviceMemStats = namedtuple(
+    "DeviceMemStats",
+    (
+        "max_active_gib",
+        "max_active_pct",
+        "max_reserved_gib",
+        "max_reserved_pct",
+        "num_alloc_retries",
+        "num_ooms",
+    ),
+)
+
+
+class DeviceMemoryMonitor:
+    """Report peak memory statistics for an accelerator device.
+
+    On construction the device name and total memory are queried through
+    ``device_module``; the peak-memory counters are then reset and the
+    allocator cache is emptied.
+    """
+
+    def __init__(self, device: str = f"{device_type}:0"):
+        self.device = torch.device(device)  # device object
+        self.device_name = device_module.get_device_name(self.device)
+        self.device_index = device_module.current_device()
+        properties = device_module.get_device_properties(self.device)
+        self.device_capacity = properties.total_memory
+        self.device_capacity_gib = self._to_gib(self.device_capacity)
+
+        device_module.reset_peak_memory_stats()
+        device_module.empty_cache()
+
+    def _to_gib(self, memory_in_bytes):
+        """Convert a byte count to GiB (1 GiB = 1024**3 bytes; a GB would be 1000**3)."""
+        return memory_in_bytes / (1024 * 1024 * 1024)
+
+    def _to_pct(self, memory):
+        """Express ``memory`` (bytes) as a percentage of ``self.device_capacity``."""
+        return 100 * memory / self.device_capacity
+
+    def get_peak_stats(self):
+        """Read the device memory statistics and return them as ``DeviceMemStats``.
+
+        Entries missing from the statistics mapping default to -1. A warning
+        is logged when the retry count or the OOM count is positive.
+        """
+        stats = device_module.memory_stats(self.device)
+
+        peak_active = stats.get("active_bytes.all.peak", -1)
+        peak_reserved = stats.get("reserved_bytes.all.peak", -1)
+        retries = stats.get("num_alloc_retries", -1)
+        ooms = stats.get("num_ooms", -1)
+
+        if retries > 0:
+            logger.warning(
+                f"{retries} {device_type.upper()} memory allocation retries."
+            )
+        if ooms > 0:
+            logger.warning(f"{ooms} {device_type.upper()} OOM errors thrown.")
+
+        return DeviceMemStats(
+            max_active_gib=self._to_gib(peak_active),
+            max_active_pct=self._to_pct(peak_active),
+            max_reserved_gib=self._to_gib(peak_reserved),
+            max_reserved_pct=self._to_pct(peak_reserved),
+            num_alloc_retries=retries,
+            num_ooms=ooms,
+        )
+
+    def reset_peak_stats(self):
+        """Reset the peak-memory counters via ``device_module``."""
+        device_module.reset_peak_memory_stats()
+
+
+def build_device_memory_monitor():
+    """Create a ``DeviceMemoryMonitor`` for ``device_type`` and log its capacity."""
+    monitor = DeviceMemoryMonitor(device_type)
+    device_label = device_type.upper()
+    logger.info(
+        f"{device_label} capacity: {monitor.device_name} "
+        f"with {monitor.device_capacity_gib:.2f}GiB memory"
+    )
+    return monitor
+
+
+class BaseLogger:
+    """Logger that does nothing, used when logging is disabled."""
+
+    def log(self, metrics: dict[str, Any], step: int) -> None:
+        """Accept ``metrics`` for ``step`` and discard them."""
+        return None
+
+    def close(self) -> None:
+        """Nothing to release for the no-op logger."""
+        return None
+
+
+class TensorBoardLogger(BaseLogger):
+    """Metric logger that writes scalars through a TensorBoard ``SummaryWriter``."""
+
+    def __init__(self, log_dir: str, tag: str | None = None):
+        self.tag = tag
+        self.writer = SummaryWriter(log_dir, max_queue=1000)
+        logger.info(f"TensorBoard logging enabled. Logs will be saved at {log_dir}")
+
+    def log(self, metrics: dict[str, Any], step: int) -> None:
+        """Write every metric as a scalar at ``step``, prefixed with ``tag/`` if a tag is set."""
+        if self.tag is None:
+            # No tag configured: metric names are used unchanged.
+            for name, value in metrics.items():
+                self.writer.add_scalar(name, value, step)
+            return
+
+        for name, value in metrics.items():
+            self.writer.add_scalar(f"{self.tag}/{name}", value, step)
+
+    def close(self) -> None:
+        """Close the underlying ``SummaryWriter``."""
+        self.writer.close()
+
+
+class WandBLogger(BaseLogger):
+    """Metric logger that forwards metrics to Weights & Biases."""
+
+    def __init__(self, log_dir: str, tag: str | None = None):
+        # Imported lazily so that wandb is only required when this logger is used.
+        import wandb
+
+        self.wandb = wandb
+        self.tag = tag
+
+        # The run directory must exist before wandb.init is pointed at it.
+        os.makedirs(log_dir, exist_ok=True)
+
+        project_name = os.getenv("WANDB_PROJECT", "torchtitan")
+        self.wandb.init(project=project_name, dir=log_dir)
+        logger.info("WandB logging enabled")
+
+    def log(self, metrics: dict[str, Any], step: int) -> None:
+        """Send ``metrics`` to wandb at ``step``, prefixing names with ``tag/`` if a tag is set."""
+        if self.tag is None:
+            payload = dict(metrics)
+        else:
+            payload = {f"{self.tag}/{name}": value for name, value in metrics.items()}
+        self.wandb.log(payload, step=step)
+
+    def close(self) -> None:
+        """Finish the wandb run if one is active."""
+        if self.wandb.run is None:
+            return
+        self.wandb.finish()
+
+
+def ensure_pp_loss_visible(
+    parallel_dims: ParallelDims, job_config: JobConfig, color: Color
+) -> None:
+    """
+    Warn when the rank that reports the pipeline-parallel loss is not being logged.
+
+    For pipeline-parallel training, the loss is only visible on the last pipeline stage.
+    This function checks whether the first rank of that stage is listed in the
+    comma-separated LOG_RANK environment variable and logs a warning if it is not.
+
+    Args:
+        parallel_dims: Provides ``world_size`` and ``pp`` (pipeline-parallel size).
+        job_config: Provides ``parallelism.pipeline_parallel_schedule``.
+        color: Color codes used to format the warning message.
+    """
+
+    # V Block Schedules return loss on rank 0
+    if job_config.parallelism.pipeline_parallel_schedule == "ZBVZeroBubble":
+        return
+
+    # Calculate the rank where loss is visible (first rank of the last pipeline stage)
+    world_size = parallel_dims.world_size
+    pp_size = parallel_dims.pp
+    loss_visible_rank = (world_size // pp_size) * (pp_size - 1)
+
+    # Parse LOG_RANK tolerantly: surrounding whitespace ("0, 3") and empty
+    # entries ("0,3," or an unset variable) must not cause a spurious warning.
+    env_logged_ranks = {
+        entry.strip()
+        for entry in os.environ.get("LOG_RANK", "").split(",")
+        if entry.strip()
+    }
+
+    if str(loss_visible_rank) not in env_logged_ranks:
+        logger.warning(
+            f"{color.red}Pipeline Parallel loss is not visible. "
+            f"Please add {color.yellow}rank {loss_visible_rank}{color.red} "
+            f"to LOG_RANK environment variable in run_train.sh.{color.reset}"
+        )
+
+
+def _get_metrics_rank(
+    parallel_dims: ParallelDims,
+    job_config: JobConfig,
+) -> int:
+    """
+    Pick the rank that is responsible for logging metrics.
+
+    Returns:
+        int: 0 when pipeline parallelism is disabled or the schedule is
+        'ZBVZeroBubble' (V Block Schedules return loss on rank 0); otherwise
+        the first rank of the last pipeline stage.
+    """
+    uses_last_stage = (
+        parallel_dims.pp_enabled
+        and job_config.parallelism.pipeline_parallel_schedule != "ZBVZeroBubble"
+    )
+    if not uses_last_stage:
+        return 0
+
+    # First rank of the last pipeline stage.
+    total_ranks = parallel_dims.world_size
+    num_stages = parallel_dims.pp
+    ranks_per_stage = total_ranks // num_stages
+    return ranks_per_stage * (num_stages - 1)
+
+
+def _build_metric_logger(
+    job_config: JobConfig, parallel_dims: ParallelDims, tag: str | None = None
+) -> BaseLogger:
+    """
+    Build an appropriate metric logger based on configuration.
+
+    Args:
+        job_config: Job configuration; ``metrics`` and ``job.dump_folder`` are read.
+        parallel_dims: Used to determine which rank logs metrics.
+        tag: Optional prefix applied to every metric name.
+
+    Returns:
+        A ``WandBLogger`` if WandB is enabled and could be created, otherwise a
+        ``TensorBoardLogger`` if TensorBoard is enabled, otherwise a no-op
+        ``BaseLogger`` (also returned on ranks that should not log).
+    """
+    metrics_config = job_config.metrics
+
+    # Log initial config state
+    logger.debug(
+        f"Building logger with config: wandb={metrics_config.enable_wandb}, "
+        f"tensorboard={metrics_config.enable_tensorboard}"
+    )
+
+    # Check if any logging backend is enabled
+    has_logging_enabled = (
+        metrics_config.enable_tensorboard or metrics_config.enable_wandb
+    )
+
+    # Determine if this rank should log
+    should_log = has_logging_enabled
+    if (not metrics_config.save_for_all_ranks) and should_log:
+        metrics_rank = _get_metrics_rank(parallel_dims, job_config)
+        should_log = torch.distributed.get_rank() == metrics_rank
+
+    logger.debug(
+        f"Logging decision: has_logging_enabled={has_logging_enabled}, should_log={should_log}"
+    )
+
+    if not should_log:
+        logger.debug("Returning BaseLogger due to should_log=False")
+        return BaseLogger()
+
+    # Setup logging directory
+    dump_dir = job_config.job.dump_folder
+    base_log_dir = os.path.join(
+        dump_dir, metrics_config.save_tb_folder, datetime.now().strftime("%Y%m%d-%H%M")
+    )
+
+    if metrics_config.save_for_all_ranks:
+        base_log_dir = os.path.join(
+            base_log_dir, f"rank_{torch.distributed.get_rank()}"
+        )
+
+    # Create loggers in priority order. A WandB failure is logged and does not
+    # abort the job: creation falls through to TensorBoard or the no-op logger.
+    if metrics_config.enable_wandb:
+        logger.debug("Attempting to create WandB logger")
+        try:
+            return WandBLogger(base_log_dir, tag)
+        except ModuleNotFoundError as e:
+            # Identify a missing wandb install from the exception itself
+            # instead of matching on the text of its message.
+            if e.name == "wandb":
+                logger.error(
+                    "Failed to create WandB logger: No module named 'wandb'. Please install it using 'pip install wandb'."
+                )
+            else:
+                logger.error(f"Failed to create WandB logger: {e}")
+        except Exception as e:
+            logger.error(f"Failed to create WandB logger: {e}")
+
+    if metrics_config.enable_tensorboard:
+        logger.debug("Creating TensorBoard logger")
+        return TensorBoardLogger(base_log_dir, tag)
+
+    logger.debug("No loggers enabled, returning BaseLogger")
+    return BaseLogger()
+
+
+class MetricsProcessor:
+    """Metrics processor that computes training metrics and logs them.
+
+    Some metrics are logged to STDOUT and the full set is sent to the
+    configured metric logger (TensorBoard or WandB).
+
+    Args:
+        job_config (JobConfig): Job configuration.
+        parallel_dims (ParallelDims): Parallel dimensions.
+        tag (Optional[str]): Tag to use for TensorBoard or WandB. Defaults to None.
+    """
+
+    logger: BaseLogger
+    parallel_dims: ParallelDims
+    job_config: JobConfig
+    device_memory_monitor: DeviceMemoryMonitor
+    color: utils.NoColor | utils.Color
+
+    gpu_peak_flops: int
+    ntokens_since_last_log: int
+    data_loading_times: list[float]
+    time_last_log: float
+
+    num_flops_per_token: int
+    optimizers: OptimizersContainer | None
+    lr_schedulers: LRSchedulersContainer | None
+
+    def __init__(
+        self,
+        job_config: JobConfig,
+        parallel_dims: ParallelDims,
+        tag: str | None = None,
+    ):
+        self.logger = _build_metric_logger(job_config, parallel_dims, tag)
+        self.parallel_dims = parallel_dims
+        self.job_config = job_config
+        self.device_memory_monitor = build_device_memory_monitor()
+        # used for colorful printing
+        self.color = (
+            utils.NoColor()
+            if job_config.metrics.disable_color_printing
+            else utils.Color()
+        )
+
+        self.gpu_peak_flops = utils.get_peak_flops(
+            self.device_memory_monitor.device_name
+        )
+        self.ntokens_since_last_log = 0
+        self.data_loading_times = []
+        self.time_last_log = time.perf_counter()
+        self.device_memory_monitor.reset_peak_stats()
+
+        # These variables have to be set later as they depend on other components or model.
+        self.num_flops_per_token = -1
+        self.optimizers = None
+        self.lr_schedulers = None
+
+    def should_log(self, step: int) -> bool:
+        """Return True on step 1 and on every multiple of ``metrics.log_freq``."""
+        return step == 1 or step % self.job_config.metrics.log_freq == 0
+
+    def log(
+        self,
+        step: int,
+        global_avg_loss: float,
+        global_max_loss: float,
+        extra_metrics: dict[str, Any] | None = None,
+    ):
+        """Compute the metrics for the window since the last call and log them.
+
+        The full metric dict goes to the metric logger and a one-line summary
+        goes to the console logger. Afterwards the token counter, the
+        data-loading times, the timer and the peak-memory counters are reset.
+
+        Args:
+            step: Current training step.
+            global_avg_loss: Average loss to report.
+            global_max_loss: Maximum loss to report.
+            extra_metrics: Optional additional metrics merged into the logged
+                dict; entries whose key contains "loss" are also appended to
+                the console line.
+
+        Raises:
+            AssertionError: If ``num_flops_per_token`` has not been set to a
+                positive value.
+        """
+        assert self.num_flops_per_token > 0, "num_flops_per_token must be set"
+
+        time_delta = time.perf_counter() - self.time_last_log
+
+        # tokens per second per device, abbreviated as tps
+        tps = self.ntokens_since_last_log / (
+            time_delta * self.parallel_dims.non_data_parallel_size
+        )
+        # model FLOPS utilization
+        # For its definition and calculation, please refer to the PaLM paper:
+        # https://arxiv.org/abs/2204.02311
+        mfu = 100 * self.num_flops_per_token * tps / self.gpu_peak_flops
+        tflops = self.num_flops_per_token * tps / 1e12
+
+        time_end_to_end = time_delta / self.job_config.metrics.log_freq
+        total_data_loading = sum(self.data_loading_times)
+        num_recorded = len(self.data_loading_times)
+        # Report 0 rather than dividing by zero when no data-loading time was
+        # recorded during this logging window.
+        time_data_loading = total_data_loading / num_recorded if num_recorded else 0.0
+        time_data_loading_pct = 100 * total_data_loading / time_delta
+
+        device_mem_stats = self.device_memory_monitor.get_peak_stats()
+
+        metrics = {
+            "loss_metrics/global_avg_loss": global_avg_loss,
+            "loss_metrics/global_max_loss": global_max_loss,
+            "throughput(tps)": tps,
+            "tflops": tflops,
+            "mfu(%)": mfu,
+            "time_metrics/end_to_end(s)": time_end_to_end,
+            "time_metrics/data_loading(s)": time_data_loading,
+            "time_metrics/data_loading(%)": time_data_loading_pct,
+            "memory/max_active(GiB)": device_mem_stats.max_active_gib,
+            "memory/max_active(%)": device_mem_stats.max_active_pct,
+            "memory/max_reserved(GiB)": device_mem_stats.max_reserved_gib,
+            "memory/max_reserved(%)": device_mem_stats.max_reserved_pct,
+            "memory/num_alloc_retries": device_mem_stats.num_alloc_retries,
+            "memory/num_ooms": device_mem_stats.num_ooms,
+        }
+
+        if extra_metrics:
+            metrics.update(extra_metrics)
+
+        self.logger.log(metrics, step)
+
+        color = self.color
+        construct_string = (
+            f"{color.red}step: {step:2} "
+            f"{color.green}loss: {global_avg_loss:7.4f} "
+            f"{color.yellow}memory: {device_mem_stats.max_reserved_gib:5.2f}GiB"
+            f"({device_mem_stats.max_reserved_pct:.2f}%) "
+            f"{color.blue}tps: {round(tps):,} "
+            f"{color.cyan}tflops: {tflops:,.2f} "
+            f"{color.magenta}mfu: {mfu:.2f}%{color.reset}"
+        )
+
+        if extra_metrics:
+            for k, v in extra_metrics.items():
+                if "loss" in k:
+                    # removeprefix drops the literal "loss_metrics/" prefix only.
+                    # str.lstrip would treat the argument as a set of characters
+                    # and also eat the start of the metric name itself.
+                    display_name = k.removeprefix("loss_metrics/")
+                    construct_string += f" {color.white}{display_name}: {v:7.4f}"
+        logger.info(construct_string)
+
+        self.ntokens_since_last_log = 0
+        self.data_loading_times.clear()
+        self.time_last_log = time.perf_counter()
+        self.device_memory_monitor.reset_peak_stats()
+
+    def close(self):
+        """Close the underlying metric logger."""
+        self.logger.close()
+
+
+def build_metrics_processor(
+    job_config: JobConfig, parallel_dims: ParallelDims, tag: str | None = None
+) -> MetricsProcessor:
+    """Construct a ``MetricsProcessor`` from the given configuration.
+
+    Args:
+        job_config (JobConfig): Job configuration.
+        parallel_dims (ParallelDims): Parallel dimensions.
+        tag (Optional[str]): Tag to use for TensorBoard or WandB. Defaults to None.
+
+    Returns:
+        MetricsProcessor: The newly created metrics processor.
+    """
+    processor = MetricsProcessor(
+        job_config=job_config,
+        parallel_dims=parallel_dims,
+        tag=tag,
+    )
+    return processor
diff --git a/torchtitan/experiments/deepseek_v3/LICENSE-CODE b/torchtitan/experiments/deepseek_v3/LICENSE-CODE
new file mode 100644
index 0000000000000000000000000000000000000000..d84f527e101b2cdd171e2b14253f84ea4fedabe9
--- /dev/null
+++ b/torchtitan/experiments/deepseek_v3/LICENSE-CODE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 DeepSeek
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/torchtitan/experiments/deepseek_v3/README.md b/torchtitan/experiments/deepseek_v3/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a1c4303a91e1fc4d31f88f0f4b345af90a9ec3f0
--- /dev/null
+++ b/torchtitan/experiments/deepseek_v3/README.md
@@ -0,0 +1,40 @@
+# Running DeepSeek in Titan (experimental)
+
+This folder contains a DeepSeek model supporting v2 and v3 as well as kernels
+and scripts needed to run it.
+
+## Inference
+
+### Prerequisites:
+
+You will need to download a DeepSeek model's weights if you want to run a
+pre-trained checkpoint. We provide a script to download the weights from the
+HuggingFace Model Hub:
+```bash
+python download.py [vX]
+```
+where `vX` can be `v2` or `v3`; both are supported. You may be required to
+create a HuggingFace account and log in first.
+
+### Running inference:
+
+The inference script is in `generate.py`. You can run it with the following
+command:
+```bash
+torchrun --standalone --nproc-per-node 4 generate.py
+```
+This will run inference on the `DeepSeek-V2-Lite-Chat` model using 4 GPUs by
+default.
+
+Alternatively, you can run inference by using `bash inference.sh`, optionally
+followed by your prompt.
+
+## Training
+
+The training script is in `train.py`. You can run it with the following command:
+```bash
+torchrun --standalone --nproc-per-node 8 train.py
+```
+
+This will run training on the `DeepSeek-V2-Lite-Chat` model using 8 GPUs by
+default, with pipeline parallel, expert parallel, and data parallel enabled.
diff --git a/torchtitan/experiments/deepseek_v3/checkpoint.py b/torchtitan/experiments/deepseek_v3/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..535ac7fe069a88555841181dddc1e870c2d30934
--- /dev/null
+++ b/torchtitan/experiments/deepseek_v3/checkpoint.py
@@ -0,0 +1,154 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import logging
+import os
+from typing import Dict, Optional, Set, Tuple
+
+import torch
+from safetensors import safe_open
+
+from transformers.utils import cached_file
+
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_SAFETENSOR_FILE_NAME = "model.safetensors.index.json"
+
+
+def read_weights_from_json(file_path: str) -> Optional[Dict[str, str]]:
+    """Read the ``weight_map`` dictionary from a safetensors index JSON file.
+
+    This is best-effort: problems are logged and reported as ``None`` rather
+    than raised.
+
+    Args:
+        file_path: Path of the JSON index file.
+
+    Returns:
+        The ``weight_map`` dictionary, or ``None`` if the file cannot be read
+        or parsed, or does not contain a ``weight_map`` dictionary.
+    """
+    try:
+        # JSON is UTF-8 by specification; do not depend on the locale default.
+        with open(file_path, "r", encoding="utf-8") as file:
+            data = json.load(file)
+    except Exception as e:
+        logger.info(f"An error occurred while reading the JSON file: {str(e)}")
+        return None
+
+    # The top-level JSON value may be something other than an object.
+    weight_map = data.get("weight_map") if isinstance(data, dict) else None
+    if isinstance(weight_map, dict):
+        return weight_map
+
+    logger.info("No 'weight_map' dictionary found in the JSON file.")
+    return None
+
+
+def get_hf_weight_map_and_path(
+    model_id: str,
+) -> Tuple[Dict[str, str], str]:
+    """Get the weight map for a given HF model id and the cache directory holding its weights.
+
+    Args:
+        model_id: Hugging Face model id, e.g. "deepseek-ai/DeepSeek-V2-Lite".
+
+    Returns:
+        A ``(weight_map, weight_path)`` tuple: the parameter-name to file-name
+        mapping read from the index file, and the directory containing that
+        index file.
+
+    Raises:
+        Exception: Whatever ``cached_file`` raises when the index file cannot
+            be resolved; it is re-raised after logging a hint.
+        ValueError: If the index file has no usable ``weight_map``.
+    """
+    try:
+        index_file = cached_file(model_id, _DEFAULT_SAFETENSOR_FILE_NAME)
+    except Exception:
+        # download.py only accepts a predefined key or "custom <path>", so an
+        # arbitrary model id has to be passed through the "custom" option.
+        logger.error(
+            f"Model `{model_id}` not found in HF cache. "
+            f"You can download the model using `python download.py custom {model_id}`"
+        )
+        raise
+
+    weight_map = read_weights_from_json(index_file)
+    if weight_map is None:
+        # Fail here with a clear message instead of handing None to callers
+        # that expect a dictionary.
+        raise ValueError(f"No usable 'weight_map' found in index file: {index_file}")
+
+    weight_path = os.path.dirname(index_file)
+    logger.info(f"Loading weights from: {weight_path}")
+    return weight_map, weight_path
+
+
+def get_needed_files(
+    state_dict: Dict[str, torch.Tensor], weight_map: Dict[str, str]
+) -> Set[str]:
+    """Collect the weight files that hold the parameters named in ``state_dict``.
+
+    Args:
+        state_dict: State dict whose keys are the parameter names to look up.
+        weight_map: Mapping of parameter name to the file that stores it.
+
+    Returns:
+        The set of file names referenced by at least one key of ``state_dict``.
+
+    Raises:
+        ValueError: If a key ending in "weight" has no entry in ``weight_map``.
+            Other unmapped keys are skipped.
+    """
+    needed_files: Set[str] = set()
+    for param in state_dict:
+        file = weight_map.get(param)
+        if not file:
+            if param.endswith("weight"):
+                raise ValueError(
+                    f"Parameter {param} not found in weight map, please check..."
+                )
+            continue
+        needed_files.add(file)
+    logger.info(f"Needed files: {needed_files}")
+    return needed_files
+
+
+def load_safetensor_file(
+    full_path: str, device: torch.device
+) -> Dict[str, torch.Tensor]:
+    """Load every tensor stored in one safetensors file.
+
+    Args:
+        full_path: Path of the safetensors file.
+        device: Device passed to ``safe_open`` for the loaded tensors.
+
+    Returns:
+        Mapping of tensor name to tensor for all keys in the file.
+    """
+    with safe_open(full_path, framework="pt", device=device) as handle:
+        tensors = {name: handle.get_tensor(name) for name in handle.keys()}
+    logger.info(f"Loaded {len(tensors)} tensors from {full_path}")
+    return tensors
+
+
+def load_safetensor_weights(
+    model: torch.nn.Module,
+    weight_map: Dict[str, str],
+    file_location: str,
+    device: torch.device,
+):
+    """
+    Load safetensor weights into a `nn.Module`.
+
+    Each needed file is read on CPU; tensors whose names match the model's
+    state dict are shape-checked and moved to `device`, and the resulting
+    state dict is loaded into the model.
+
+    Args:
+        model (Module): The PyTorch module to load weights into. It may be a
+            model chunk or a full model.
+        weight_map (Dict[str, str]): Mapping of model parameters to file names.
+        file_location (str): Directory containing the weight files.
+        device (torch.device): The device to load tensors onto.
+
+    Raises:
+        FileNotFoundError: If a needed weight file does not exist.
+        Exception: Any other error raised while reading a weight file is
+            logged and re-raised.
+        ValueError: If a checkpoint tensor's shape differs from the model's.
+        RuntimeError: If some state-dict entries were not found in any file.
+    """
+    model_state_dict = model.state_dict()
+    needed_files = get_needed_files(model_state_dict, weight_map)
+    updated_states: Set[str] = set()
+
+    for file in needed_files:
+        full_path = os.path.join(file_location, file)
+        try:
+            checkpoint = load_safetensor_file(full_path, "cpu")
+        except FileNotFoundError:
+            logger.error(f"File not found: {full_path}")
+            # Re-raise: continuing would use an unbound `checkpoint` on the
+            # first file, or the previous file's tensors on later ones.
+            raise
+        except Exception as e:
+            logger.error(f"Error during checkpoint processing of {full_path}: {str(e)}")
+            raise
+
+        matched_keys = checkpoint.keys() & model_state_dict.keys()
+        for key in matched_keys:
+            # Check shape
+            if model_state_dict[key].shape != checkpoint[key].shape:
+                raise ValueError(
+                    f"Shape mismatch for {key}: "
+                    f"model needs {model_state_dict[key].shape}, but "
+                    f"checkpoint has {checkpoint[key].shape}"
+                )
+            model_state_dict[key] = checkpoint[key].to(device)
+
+        updated_states.update(matched_keys)
+
+    missing_keys = set(model_state_dict.keys()) - updated_states
+    if missing_keys:
+        raise RuntimeError(
+            f"Partially updated state dict. Missing parameters: {missing_keys}"
+        )
+
+    model.load_state_dict(model_state_dict, strict=False, assign=True)
+    logger.info(f"Successfully loaded {len(updated_states)} weights into model")
+
+
+def load_weights_from_hf(
+    model: torch.nn.Module,
+    distribution: str,
+    device: torch.device,
+):
+    """
+    Fill `model` with weights stored in Hugging Face format (an index file
+    plus multiple safetensor files).
+
+    Args:
+        model (Module): The module to load weights into.
+        distribution (str): Hugging Face model id used to locate the index
+            file and the weight directory.
+        device (torch.device): The device to load tensors onto.
+    """
+    hf_weight_map, hf_weight_dir = get_hf_weight_map_and_path(distribution)
+
+    load_safetensor_weights(
+        model=model,
+        weight_map=hf_weight_map,
+        file_location=hf_weight_dir,
+        device=device,
+    )
diff --git a/torchtitan/experiments/deepseek_v3/download.py b/torchtitan/experiments/deepseek_v3/download.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b9ec3104d716cbd6142c6564d83f042f128770f
--- /dev/null
+++ b/torchtitan/experiments/deepseek_v3/download.py
@@ -0,0 +1,70 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Usage:
+# Downloads a given model to the HF cache. Pass one of the predefined keys listed below (e.g. "v3"), or "custom" followed by your own model path.
+# python download.py {model_id} [custom_model_path]
+# Examples:
+# python download.py v2 # Use predefined model: deepseek-ai/DeepSeek-V2
+# python download.py custom "deepseek-ai/new-model" # Download a custom model path
+
+# Available models:
+# "v2-lite-chat": "deepseek-ai/DeepSeek-V2-Lite-Chat",
+# "v2-lite": "deepseek-ai/DeepSeek-V2-Lite",
+# "v2": "deepseek-ai/DeepSeek-V2",
+# "v3": "deepseek-ai/deepseek-v3",
+# "v3-0324": "deepseek-ai/DeepSeek-V3-0324",
+# "custom": None, # Placeholder for custom models
+
+
+import sys
+
+from transformers import AutoModelForCausalLM
+
+
+# Maps the short key accepted as the first command-line argument to the model
+# id that is passed to `from_pretrained`. The "custom" key has no fixed id; the
+# id is taken from the second command-line argument instead.
+MODELS = {
+ "v2-lite-chat": "deepseek-ai/DeepSeek-V2-Lite-Chat",
+ "v2-lite": "deepseek-ai/DeepSeek-V2-Lite",
+ "v2": "deepseek-ai/DeepSeek-V2",
+ "v3": "deepseek-ai/deepseek-v3",
+ "v3-0324": "deepseek-ai/DeepSeek-V3-0324",
+ "custom": None, # For custom (any) models
+}
+
+
+def print_usage():
+ """Print the command-line usage and the predefined models, then exit with status 1."""
+ print("Usage:")
+ print(" python download.py [model_version]")
+ print(" python download.py custom [custom_model_path]")
+ print("\nAvailable predefined models:")
+ for key, model in MODELS.items():
+ if key != "custom": # Skip the custom placeholder
+ print(f" {key}: {model}")
+ print("\nFor custom models:")
+ print(" custom: Specify your own model path")
+ print(' Example: python download.py custom "organization/model-name"')
+ # This function never returns to its caller: it always terminates the process.
+ sys.exit(1)
+
+
+# Process command line arguments
+# `print_usage()` calls `sys.exit(1)`, so nothing below runs after it is invoked.
+if len(sys.argv) < 2 or sys.argv[1] not in MODELS:
+ print_usage()
+
+if sys.argv[1] == "custom":
+ # "custom" requires exactly one extra argument: the model path.
+ if len(sys.argv) != 3:
+ print("Error: Custom model requires a model path")
+ print_usage()
+ model_id = sys.argv[2]
+ print(f"Using custom model: {model_id}")
+else:
+ model_id = MODELS[sys.argv[1]]
+print(f"Downloading model: {model_id}")
+
+# NOTE(review): `from_pretrained` does not only fetch files, it also builds the
+# model object; for the larger checkpoints this presumably needs substantial
+# memory -- confirm this is intended for a download-only script.
+# NOTE(review): `trust_remote_code=True` allows modeling code shipped in the
+# model repository to run locally; this matters most for "custom" model paths.
+model = AutoModelForCausalLM.from_pretrained(
+ model_id,
+ device_map="auto",
+ trust_remote_code=True,
+)
diff --git a/torchtitan/experiments/deepseek_v3/model.py b/torchtitan/experiments/deepseek_v3/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..0669df9528b3db0de3325db36f010312b5b3eac7
--- /dev/null
+++ b/torchtitan/experiments/deepseek_v3/model.py
@@ -0,0 +1,1325 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This code is based on model definition of `deepseek-ai/DeepSeek-V3-Base` on
+# Hugging Face Model Hub. Url:
+# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/modeling_deepseek.py
+# https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/resolve/main/configuration_deepseek.py
+#
+# It has been modified from its original forms to accommodate naming convention
+# and usage patterns of the TorchTitan project.
+
+# Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch DeepSeek model."""
+import math
+from typing import Optional, Tuple
+
+import torch
+import torch.distributed as dist
+
+import torch.distributed._symmetric_memory as symm_mem
+import torch.nn.functional as F
+import torch.utils.checkpoint
+
+from attn_mask_utils import _prepare_4d_causal_attention_mask
+from indices import generate_permute_indices
+from model_config import ModelArgs
+from symm_mem_recipes import OnDeviceAllToAllV
+from torch import nn
+from torch.distributed._functional_collectives import all_to_all_single_autograd
+
+from torchtitan.experiments.kernels.triton_mg_group_gemm.torchao_pr import (
+ ALIGN_SIZE_M,
+ grouped_gemm_forward,
+)
+
+# Get model parallel subgroup by name:
+# e.g. "pp", "ep", None
+def get_group(dim_name: Optional[str] = None) -> dist.ProcessGroup:
+ """
+ Return the process group of the current device mesh for dimension `dim_name`.
+
+ The mesh is obtained from the private `_mesh_resources.get_current_mesh()`
+ helper, so this presumably has to be called while a device mesh is active
+ -- TODO confirm.
+ """
+ glob = torch.distributed.device_mesh._mesh_resources.get_current_mesh()
+ return glob.get_group(dim_name)
+
+
+class RMSNorm(nn.Module):
+ """Root-mean-square normalization over the last dimension with a learnable scale."""
+
+ def __init__(self, hidden_size, eps=1e-6):
+ super().__init__()
+ # Learnable scale, initialized to ones.
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ # Added to the mean square before the reciprocal square root.
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ # The statistics are computed in float32; the normalized values are cast
+ # back to the input dtype before being multiplied by `weight`.
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+
+class RotaryEmbedding(nn.Module):
+ """
+ Precomputes and caches the cos/sin tables used for rotary position embeddings.
+
+ `inv_freq`, `cos_cached` and `sin_cached` are registered as non-persistent
+ buffers. The tables are built once in `__init__` for `max_position_embeddings`
+ positions and rebuilt by `forward` when a longer `seq_len` is requested.
+ """
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ # One inverse frequency per pair of channels: base ** (-2i / dim).
+ inv_freq = 1.0 / (
+ self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
+ )
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings,
+ device=self.inv_freq.device,
+ dtype=torch.get_default_dtype(),
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ # (Re)build the cos/sin tables for positions 0 .. seq_len - 1.
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(
+ self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
+ )
+
+ freqs = torch.outer(t, self.inv_freq.to(t.device))
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ # Returns the first `seq_len` rows of the cos and sin tables, cast to x.dtype.
+ # NOTE(review): `max_seq_len_cached` is always set by `__init__`, so calling
+ # this with the default `seq_len=None` reaches `None > int` below and would
+ # raise a TypeError -- callers appear to be expected to always pass `seq_len`.
+ if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
+ )
+
+
+class LinearScalingRotaryEmbedding(RotaryEmbedding):
+ """RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(
+ self,
+ dim,
+ max_position_embeddings=2048,
+ base=10000,
+ device=None,
+ scaling_factor=1.0,
+ ):
+ # Must be set before calling the base constructor: the base `__init__`
+ # calls `_set_cos_sin_cache`, which reads `self.scaling_factor`.
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ # Same as the base implementation, except that the position indices are
+ # divided by `scaling_factor` before the frequencies are computed.
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(
+ self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
+ )
+ t = t / self.scaling_factor
+
+ freqs = torch.outer(t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Deepseek
+class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
+ """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+ def __init__(
+ self,
+ dim,
+ max_position_embeddings=2048,
+ base=10000,
+ device=None,
+ scaling_factor=1.0,
+ ):
+ # Stored before the base constructor runs because the base `__init__`
+ # calls `_set_cos_sin_cache`, which may read `self.scaling_factor`.
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
+ # For sequences longer than `max_position_embeddings`, a larger base is
+ # derived from the ratio seq_len / max_position_embeddings and `inv_freq`
+ # is rebuilt from it; otherwise the existing `inv_freq` is used as is.
+ if seq_len > self.max_position_embeddings:
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings)
+ - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (
+ base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
+ )
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(
+ self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
+ )
+
+ freqs = torch.outer(t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+# Inverse dim formula to find dim based on number of rotations
+def yarn_find_correction_dim(
+ num_rotations, dim, base=10000, max_position_embeddings=2048
+):
+ """
+ Return dim * ln(max_position_embeddings / (2 * pi * num_rotations)) / (2 * ln(base)).
+
+ The result is a float and is not rounded or clamped here.
+ """
+ return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
+ 2 * math.log(base)
+ )
+
+
+# Find dim range bounds based on rotations
+def yarn_find_correction_range(
+ low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
+):
+ """
+ Return the integer pair `(low, high)` of correction dims for the two rotation counts.
+
+ `low` is the floor of the correction dim for `low_rot` and `high` the ceiling
+ of the correction dim for `high_rot`; `low` is clamped to be at least 0 and
+ `high` to be at most `dim - 1`.
+ """
+ low = math.floor(
+ yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
+ )
+ high = math.ceil(
+ yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
+ )
+ return max(low, 0), min(high, dim - 1) # Clamp values just in case
+
+
+def yarn_get_mscale(scale=1, mscale=1):
+ """Return 1.0 when `scale <= 1`, otherwise `0.1 * mscale * ln(scale) + 1.0`."""
+ if scale <= 1:
+ return 1.0
+ return 0.1 * mscale * math.log(scale) + 1.0
+
+
+def yarn_linear_ramp_mask(min, max, dim):
+ """
+ Return a float32 tensor of length `dim` holding a linear ramp clamped to [0, 1].
+
+ Element i is `(i - min) / (max - min)` before clamping. The parameter names
+ shadow the `min`/`max` builtins inside this function.
+ """
+ if min == max:
+ max += 0.001 # Prevent singularity
+
+ linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+ ramp_func = torch.clamp(linear_func, 0, 1)
+ return ramp_func
+
+
+class YarnRotaryEmbedding(RotaryEmbedding):
+ """
+ RotaryEmbedding variant whose inverse frequencies blend an unscaled set with a
+ set divided by `scaling_factor`, and whose cos/sin tables are multiplied by a
+ magnitude factor derived from `mscale` and `mscale_all_dim`.
+ """
+
+ def __init__(
+ self,
+ dim,
+ max_position_embeddings=2048,
+ base=10000,
+ device=None,
+ scaling_factor=1.0,
+ original_max_position_embeddings=4096,
+ beta_fast=32,
+ beta_slow=1,
+ mscale=1,
+ mscale_all_dim=0,
+ ):
+ # All of these are read by `_set_cos_sin_cache`, which the base `__init__`
+ # calls, so they have to be assigned before `super().__init__`.
+ self.scaling_factor = scaling_factor
+ self.original_max_position_embeddings = original_max_position_embeddings
+ self.beta_fast = beta_fast
+ self.beta_slow = beta_slow
+ self.mscale = mscale
+ self.mscale_all_dim = mscale_all_dim
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ dim = self.dim
+
+ # Unscaled inverse frequencies.
+ freq_extra = 1.0 / (
+ self.base
+ ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
+ )
+ # Same frequencies divided by `scaling_factor` (`**` binds tighter than `*`).
+ freq_inter = 1.0 / (
+ self.scaling_factor
+ * self.base
+ ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
+ )
+
+ low, high = yarn_find_correction_range(
+ self.beta_fast,
+ self.beta_slow,
+ dim,
+ self.base,
+ self.original_max_position_embeddings,
+ )
+ # Mask is 1 below `low`, 0 above `high`, and linear in between.
+ inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
+ device=device, dtype=torch.float32
+ )
+ # Where the mask is 1 the unscaled frequency is kept; where it is 0 the
+ # scaled one is used.
+ inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(seq_len, device=device, dtype=torch.float32)
+
+ freqs = torch.outer(t, inv_freq)
+
+ # Ratio of the two magnitude scales; it multiplies both tables below.
+ _mscale = float(
+ yarn_get_mscale(self.scaling_factor, self.mscale)
+ / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
+ )
+
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer(
+ "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
+ )
+ self.register_buffer(
+ "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
+ )
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ # Split the last dimension into two halves (x1, x2) and return (-x2, x1).
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ position_ids (`torch.Tensor`):
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+ used to pass offsetted position ids when working with a KV-cache.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+ Returns:
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+
+ # Both q and k are unpacked as 4-D tensors here. The view/transpose/reshape
+ # reorders the last dimension from interleaved pairs (x0, x1, x2, x3, ...)
+ # to all even-indexed elements followed by all odd-indexed elements, which is
+ # the half-split layout that `rotate_half` operates on.
+ b, h, s, d = q.shape
+ q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+
+ b, h, s, d = k.shape
+ k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
+
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class MLP(nn.Module):
+ """Gated feed-forward block: down_proj(act_fn(gate_proj(x)) * up_proj(x)), all without bias."""
+
+ # Class-level activation; it is also referenced as `MLP.act_fn` by
+ # `MoE.moe_on_device` in this file.
+ act_fn = nn.SiLU()
+
+ def __init__(self, config, hidden_size=None, intermediate_size=None):
+ super().__init__()
+ self.config = config
+ # Explicit arguments take precedence over the values from `config`.
+ self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
+ self.intermediate_size = (
+ config.intermediate_size if intermediate_size is None else intermediate_size
+ )
+
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+
+ def forward(self, x):
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+ return down_proj
+
+
+class MoEGate(nn.Module):
+ """
+ Router for the routed experts: scores every token against every routed
+ expert and returns, per token, the indices and weights of `top_k` experts.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.top_k = config.num_experts_per_tok
+ self.n_routed_experts = config.n_routed_experts
+ self.routed_scaling_factor = config.routed_scaling_factor
+ self.scoring_func = config.scoring_func
+ self.seq_aux = config.seq_aux
+ self.topk_method = config.topk_method
+ self.n_group = config.n_group
+ self.topk_group = config.topk_group
+
+ # topk selection algorithm
+ self.norm_topk_prob = config.norm_topk_prob
+ self.gating_dim = config.hidden_size
+ # One row of gating weights per routed expert; filled by `reset_parameters`.
+ self.weight = nn.Parameter(
+ torch.empty((self.n_routed_experts, self.gating_dim))
+ )
+ # The correction bias only exists for the "noaux_tc" top-k method.
+ if self.topk_method == "noaux_tc":
+ self.e_score_correction_bias = nn.Parameter(
+ # Changed from torch.empty to torch.rand to avoid non-even
+ # distribution for runs without actual weights
+ torch.rand((self.n_routed_experts))
+ )
+ self.reset_parameters()
+
+ def reset_parameters(self) -> None:
+ # Only `weight` is (re)initialized here; the correction bias is untouched.
+ import torch.nn.init as init
+
+ init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+
+ def forward(self, hidden_states):
+ # `hidden_states` is unpacked as (bsz, seq_len, h) and flattened to
+ # (bsz * seq_len, h). Returns `(topk_idx, topk_weight)`.
+ bsz, seq_len, h = hidden_states.shape
+ # compute gating score
+ hidden_states = hidden_states.view(-1, h)
+ # The gating logits are computed in float32.
+ logits = F.linear(
+ hidden_states.type(torch.float32), self.weight.type(torch.float32), None
+ )
+ if self.scoring_func == "sigmoid":
+ scores = logits.sigmoid()
+ elif self.scoring_func == "softmax":
+ scores = logits.softmax(dim=-1, dtype=torch.float32)
+ else:
+ raise NotImplementedError(
+ f"insupportable scoring function for MoE gating: {self.scoring_func}"
+ )
+
+ # select top-k experts
+ if self.topk_method == "noaux_tc":
+ # The bias is added only for choosing experts; the returned weights
+ # are gathered from the unbiased `scores` further down.
+ scores_for_choice = scores.view(
+ bsz * seq_len, -1
+ ) + self.e_score_correction_bias.unsqueeze(0)
+ # Each group is scored by the sum of its two highest biased scores.
+ group_scores = (
+ scores_for_choice.view(bsz * seq_len, self.n_group, -1)
+ .topk(2, dim=-1)[0]
+ .sum(dim=-1)
+ ) # [n, n_group]
+ group_idx = torch.topk(
+ group_scores, k=self.topk_group, dim=-1, sorted=False
+ )[
+ 1
+ ] # [n, top_k_group]
+ group_mask = torch.zeros_like(group_scores) # [n, n_group]
+ group_mask.scatter_(1, group_idx, 1) # [n, n_group]
+ # Expand the per-group mask to a per-expert mask.
+ score_mask = (
+ group_mask.unsqueeze(-1)
+ .expand(
+ bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
+ )
+ .reshape(bsz * seq_len, -1)
+ ) # [n, e]
+ # NOTE(review): experts outside the selected groups are filled with 0.0,
+ # not -inf; if a biased score can be negative, a masked expert could
+ # still be picked by the top-k below -- confirm this cannot happen.
+ tmp_scores = scores_for_choice.masked_fill(
+ ~score_mask.bool(), 0.0
+ ) # [n, e]
+ _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
+ topk_weight = scores.gather(1, topk_idx)
+ elif self.topk_method == "greedy":
+ topk_weight, topk_idx = torch.topk(
+ scores, k=self.top_k, dim=-1, sorted=False
+ )
+ else:
+ raise NotImplementedError(
+ f"insupportable TopK function for MoE gating: {self.topk_method}"
+ )
+
+ # norm gate to sum 1
+ if self.top_k > 1 and self.norm_topk_prob:
+ # The small constant guards against division by zero.
+ denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
+ topk_weight = topk_weight / denominator
+ topk_weight = (
+ topk_weight * self.routed_scaling_factor
+ ) # must multiply the scaling factor
+
+ return topk_idx, topk_weight
+
+
+class MoE(nn.Module):
+ """
+ A mixed expert module containing shared experts.
+ """
+
+ # Class attributes:
+ # Two shuffle methods are supported:
+ # 1. "torch_all_to_all"
+ # 2. "symm_mem" (see `setup_symm_mem` below)
+ shuffle_method = "torch_all_to_all"
+
+ # Symmetric memory buffers shared by all MoE instances across layers
+ token_send_buf: Optional[torch.Tensor] = None
+ token_gather_buf: Optional[torch.Tensor] = None
+
+ def __init__(self, config):
+ # Builds the experts owned by this EP rank, the gate and, when
+ # `config.n_shared_experts` is set, the shared-experts MLP.
+ # Raises ValueError when `config.ep_size <= 1`.
+ super().__init__()
+ self.config = config
+ self.num_experts_per_tok = config.num_experts_per_tok
+
+ # ep_size is the number of ranks in expert dimension
+ if config.ep_size <= 1:
+ raise ValueError(
+ "For code simplicity, this model only supports distributed experts, "
+ "thus EP size must be > 1, please modify your model config"
+ )
+ self.ep_group = get_group("ep")
+ # The configured EP size has to match the size of the "ep" process group.
+ assert config.ep_size == self.ep_group.size()
+ self.ep_size = config.ep_size
+ self.ep_rank = self.ep_group.rank()
+ # Integer division: any remainder of n_routed_experts / ep_size is dropped.
+ self.experts_per_rank = config.n_routed_experts // config.ep_size
+ # Use ModuleDict instead of ModuleList to preserve absolute expert
+ # IDs while avoiding `None` experts. The absolute expert IDs match
+ # with checkpoint FQNs.
+ self.experts = nn.ModuleDict()
+ for i in range(self.experts_per_rank):
+ abs_expert_id = self.ep_rank * self.experts_per_rank + i
+ self.experts[str(abs_expert_id)] = MLP(
+ config, intermediate_size=config.moe_intermediate_size
+ )
+ self.gate = MoEGate(config)
+ if config.n_shared_experts is not None:
+ # The shared experts are one MLP whose width is scaled by their count.
+ intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+ self.shared_experts = MLP(
+ config=config, intermediate_size=intermediate_size
+ )
+
+ def combine_experts(self, submod_name):
+ # Concatenates the `submod_name` weight of every local expert along dim 0
+ # and registers the result on this module as `{submod_name}_weight`.
+ # Each expert's own `weight` attribute is set to None in the process, so
+ # the per-expert linear layers hold no weight afterwards.
+ all_weights = []
+ for expert in self.experts.values():
+ lin = expert.get_submodule(submod_name)
+ all_weights.append(lin.weight)
+ lin.weight = None
+
+ concat_weight = torch.cat(all_weights)
+ self.register_parameter(f"{submod_name}_weight", nn.Parameter(concat_weight))
+
+ # This function creates the symmetric memory buffers used by MoE layers.
+ # They allow shuffling tokens fully "on-device", as compared to traditional
+ # torch all_to_all APIs, which require a GPU-to-CPU sync of the splits. If a
+ # user calls this function, the `shuffle_method` switches from
+ # `torch_all_to_all` to `symm_mem`.
+ def setup_symm_mem(self, dtype: torch.dtype, device: torch.device):
+ # Switches this instance to the "symm_mem" shuffle method, combines the
+ # expert weights, and allocates the class-level symmetric memory buffers
+ # with the given `dtype` and `device` if they do not exist yet.
+ # Switch shuffle method
+ # (assigned on the instance, so the class-level default is left unchanged)
+ self.shuffle_method = "symm_mem"
+
+ # Combine expert weights
+ # This happens before the early return below, i.e. for every instance.
+ print("Combining expert weights for Group GEMM")
+ self.combine_experts("gate_proj")
+ self.combine_experts("up_proj")
+ self.combine_experts("down_proj")
+
+ # Assuming worst case, 2x tokens are routed to one EP rank
+ overflow = 2
+ # This is set on the `OnDeviceAllToAllV` class, not on an instance.
+ OnDeviceAllToAllV.max_output_len = (
+ self.config.max_seq_len * self.num_experts_per_tok * overflow
+ )
+
+ # Symmetric memory buffers are shared by all MoE instances across
+ # layers, we only need to initialize them once
+ if MoE.token_send_buf is not None:
+ return
+
+ # Input buffer for DP-to-EP shuffle
+ MoE.token_send_buf = symm_mem.empty(
+ self.config.max_seq_len
+ * self.num_experts_per_tok, # seq len * top k (flattened)
+ self.config.hidden_size, # hidden dim
+ dtype=dtype,
+ device=device,
+ )
+ # Input buffer for EP-to-DP shuffle
+ MoE.token_gather_buf = symm_mem.empty(
+ self.config.max_seq_len
+ * self.num_experts_per_tok # seq len * top k (flattened)
+ * overflow,
+ self.config.hidden_size, # hidden dim
+ dtype=dtype,
+ device=device,
+ )
+ print(f"EP rank [{self.ep_rank}]: Created Symmetric Memory for MoE")
+
+ def get_send_buf(self):
+ # Returns a detached view of the shared send buffer after clearing its grad.
+ # [Why detach?] During a first forward-backward step, the buffer would
+ # be included in a computational graph. In a second step, autograd will
+ # return an error saying "Trying to backward through the graph a second
+ # time (or directly access saved tensors more than once)". This is
+ # because the buffer is still in the graph, and autograd is trying to
+ # backward through the graph a second time. To avoid this, we detach the
+ # buffer from the graph. `detach()` returns a new tensor, which shares
+ # the same storage with the original one.
+ self.token_send_buf.grad = None
+ return self.token_send_buf.detach()
+
+ def get_gather_buf(self):
+ # Returns a detached view of the shared gather buffer after clearing its grad.
+ # See [Why detach?] in `get_send_buf`
+ self.token_gather_buf.grad = None
+ return self.token_gather_buf.detach()
+
+ def forward(self, hidden_states):
+ # Routes every token through its selected experts and returns a tensor
+ # with the same shape as the input. When shared experts are configured,
+ # their output on the unmodified input is added to the routed output.
+ identity = hidden_states
+ orig_shape = hidden_states.shape
+ # for each token, select top-k experts, and compute the weight for each expert
+ topk_idx, topk_weight = self.gate(hidden_states)
+ # Flatten all leading dimensions into one token dimension.
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+ if self.shuffle_method == "symm_mem":
+ y = self.moe_on_device(hidden_states, topk_idx, topk_weight)
+ else: # "torch_all_to_all"
+ y = self.moe_forward(hidden_states, topk_idx, topk_weight)
+
+ y = y.view(*orig_shape)
+ if self.config.n_shared_experts is not None:
+ y = y + self.shared_experts(identity)
+ return y
+
+ def moe_forward(self, x, topk_ids, topk_weight):
+ # Expert computation using the "torch_all_to_all" shuffle: sort tokens by
+ # expert, exchange them across the EP group, run the local experts one by
+ # one, send the results back and combine them with `topk_weight`.
+ # NOTE(review): `forward` dispatches the "symm_mem" method to
+ # `moe_on_device`, so the "symm_mem" branches in this method are only
+ # reached if it is called directly. After `setup_symm_mem` the per-expert
+ # weights have been set to None by `combine_experts`, so the expert loop
+ # below would presumably fail in that case -- confirm whether these
+ # branches are still needed.
+ # This part sorts the token indices so that tokens routed to the same expert reside consecutively.
+ # An implication is that tokens to the same "expert group" (i.e., device) are also consecutive.
+ # Since this is an "artificial" index creation (final outcome being
+ # `idxs`), we don't need gradients here.
+ with torch.no_grad():
+ # [seq_len, n_routed_experts]
+ cnts = topk_ids.new_zeros((topk_ids.shape[0], self.config.n_routed_experts))
+ # Fill 1 to the selected experts
+ cnts.scatter_(1, topk_ids, 1)
+ tokens_per_expert = cnts.sum(dim=0)
+ # Token indices for each expert
+ idxs = topk_ids.view(-1).argsort()
+ sorted_tokens_shape = idxs.shape + x.shape[1:]
+
+ # `idxs` indexes the flattened (token, top-k slot) pairs; integer division
+ # by the top-k width maps each entry back to its token row in `x`.
+ sorted_tokens = x[idxs // topk_ids.shape[1]]
+ assert sorted_tokens.shape == sorted_tokens_shape
+
+ # This part exchange the information about the number of tokens send and
+ # received by each expert. We can understand this information as "side
+ # band", which is not part of the actual data. Thus no gradient is
+ # needed.
+ with torch.no_grad():
+ # Sum the tokens over local experts, then we get tokens per EP rank,
+ # which is the input splits
+ tokens_per_expert_group = tokens_per_expert.new_empty(
+ tokens_per_expert.shape[0]
+ )
+ dist.all_to_all_single(
+ tokens_per_expert_group, tokens_per_expert, group=self.ep_group
+ )
+ input_splits = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
+
+ # DP to EP token shuffle. This part needs gradient.
+ if self.shuffle_method == "symm_mem":
+ # Move input to the `token_send_buf` symm mem
+ token_send_buf = self.get_send_buf()
+ token_send_buf[: idxs.shape[0]].copy_(sorted_tokens)
+ # Note: `out=` avoids copy, but it is not differentiable
+ # torch.index_select(x, 0, idxs // topk_ids.shape[1], out=self.token_send_buf[: idxs.shape[0]])
+ token_gather_buf, output_splits = OnDeviceAllToAllV.apply(
+ token_send_buf,
+ input_splits,
+ self.ep_group,
+ )
+ with torch.no_grad():
+ # Received tokens from all other ranks. TODO: use mask instead
+ received = output_splits.sum()
+ # TODO: don't use `received`
+ gathered_tokens = token_gather_buf[:received]
+ else: # "torch_all_to_all"
+ # Prepare input and output splits
+ with torch.no_grad():
+ output_splits = tokens_per_expert_group.view(self.ep_size, -1).sum(
+ dim=1
+ )
+ # The splits are converted to Python lists for this API.
+ gathered_tokens = all_to_all_single_autograd(
+ sorted_tokens,
+ output_splits.tolist(),
+ input_splits.tolist(),
+ self.ep_group,
+ )
+
+ # This part prepares a 1D tensor with the same length as
+ # `gathered_tokens`. The 1D tensor is filled with local expert IDs which
+ # the tokens in `gathered_tokens` are headed for. This part doesn't need
+ # gradient.
+ with torch.no_grad():
+ gatherd_idxs = (
+ torch.arange(
+ tokens_per_expert_group.numel(),
+ device=tokens_per_expert_group.device,
+ )
+ % self.experts_per_rank
+ )
+ gatherd_idxs = gatherd_idxs.repeat_interleave(tokens_per_expert_group)
+
+ # Prepare buffer for tokens processed by experts
+ if self.shuffle_method == "symm_mem":
+ # Take necessary space from `token_gather_buf` symm mem because we are
+ # going to send them out after expert processing
+ processed_tokens = self.get_gather_buf()[: gathered_tokens.shape[0]]
+ else: # "torch_all_to_all"
+ processed_tokens = torch.empty_like(gathered_tokens)
+
+ # This part processes the tokens routed to the local experts.
+ # TODO: can we use group GEMM here?
+ for i, expert in enumerate(self.experts.values()):
+ processed_tokens[gatherd_idxs == i] = expert(
+ gathered_tokens[gatherd_idxs == i]
+ )
+
+ # Now shuffle the tokens back to their original owner, i.e. EP to DP shuffle.
+ # The input/output splits are just a reverse of the previous shuffle.
+ if self.shuffle_method == "symm_mem":
+ token_return_buf, _ = OnDeviceAllToAllV.apply(
+ processed_tokens,
+ output_splits,
+ self.ep_group,
+ )
+ returned_tokens = token_return_buf[: sorted_tokens_shape[0]]
+ else: # "torch_all_to_all"
+ returned_tokens = all_to_all_single_autograd(
+ processed_tokens,
+ input_splits.tolist(),
+ output_splits.tolist(),
+ self.ep_group,
+ )
+
+ # Undo the expert sort, then take the `topk_weight`-weighted sum over the
+ # top-k slots of each token; the result is cast back to the dtype of the
+ # returned tokens.
+ output_tokens = torch.empty_like(returned_tokens)
+ output_tokens[idxs] = returned_tokens
+ final_out = (
+ output_tokens.view(*topk_ids.shape, -1)
+ .type(topk_weight.dtype)
+ .mul_(topk_weight.unsqueeze(dim=-1))
+ .sum(dim=1)
+ .type(returned_tokens.dtype)
+ )
+ return final_out
+
+ def moe_on_device(self, x, topk_ids, topk_weight):
+ # Expert computation using the "symm_mem" shuffle and grouped GEMMs.
+ # It reads the `*_proj_weight` parameters registered by `combine_experts`
+ # and the shared buffers allocated by `setup_symm_mem`, so
+ # `setup_symm_mem` has to be called before this method is used.
+ # This part sorts the token indices so that tokens routed to the same expert reside consecutively.
+ # An implication is that tokens to the same "expert group" (i.e., device) are also consecutive.
+ # Since this is an "artificial" index creation (final outcome being
+ # `idxs`), we don't need gradients here.
+ with torch.no_grad():
+ # [seq_len, n_routed_experts]
+ cnts = topk_ids.new_zeros((topk_ids.shape[0], self.config.n_routed_experts))
+ # Fill 1 to the selected experts
+ cnts.scatter_(1, topk_ids, 1)
+ tokens_per_expert = cnts.sum(dim=0)
+ # Token indices for each expert
+ idxs = topk_ids.view(-1).argsort()
+ sorted_tokens_shape = idxs.shape + x.shape[1:]
+
+ # `idxs` indexes the flattened (token, top-k slot) pairs; integer division
+ # by the top-k width maps each entry back to its token row in `x`.
+ sorted_tokens = x[idxs // topk_ids.shape[1]]
+ assert sorted_tokens.shape == sorted_tokens_shape
+
+ # This part exchange the information about the number of tokens send and
+ # received by each expert. We can understand this information as "side
+ # band", which is not part of the actual data. Thus no gradient is
+ # needed.
+ with torch.no_grad():
+ # Sum the tokens over local experts, then we get tokens per EP rank,
+ # which is the input splits
+ tokens_per_expert_group = tokens_per_expert.new_empty(
+ tokens_per_expert.shape[0]
+ )
+ dist.all_to_all_single(
+ tokens_per_expert_group, tokens_per_expert, group=self.ep_group
+ )
+ input_splits = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
+
+ # Move input to the `token_send_buf` symm mem
+ token_send_buf = self.get_send_buf()
+ token_send_buf[: idxs.shape[0]].copy_(sorted_tokens)
+ # Note: `out=` avoids copy, but it is not differentiable
+ # torch.index_select(x, 0, idxs // topk_ids.shape[1], out=self.token_send_buf[: idxs.shape[0]])
+ token_gather_buf, output_splits = OnDeviceAllToAllV.apply(
+ token_send_buf,
+ input_splits,
+ self.ep_group,
+ )
+
+ # We need to permute the received tokens so that tokens for the same expert are contiguous.
+ # This part prepares a 1D tensor `permuted_indices` for such permutation.
+ # This part doesn't need gradient.
+ with torch.no_grad():
+ # `m_sizes` is passed to every grouped GEMM below; presumably it holds
+ # the per-expert row counts padded to `ALIGN_SIZE_M` -- TODO confirm
+ # against `generate_permute_indices`.
+ permuted_indices, m_sizes = generate_permute_indices(
+ tokens_per_expert_group,
+ self.experts_per_rank,
+ self.ep_size,
+ token_gather_buf.shape[0],
+ ALIGN_SIZE_M,
+ )
+
+ # Permute the received tokens so that tokens for the same expert are contiguous.
+ contig_tokens = token_gather_buf[permuted_indices]
+
+ # Run the first grouped GEMM
+ w1 = self.get_parameter("gate_proj_weight")
+ gate_proj = grouped_gemm_forward(contig_tokens, w1, m_sizes)
+
+ # Run the second grouped GEMM
+ w3 = self.get_parameter("up_proj_weight")
+ up_proj = grouped_gemm_forward(contig_tokens, w3, m_sizes)
+
+ # Apply activation
+ hidden_outputs = MLP.act_fn(gate_proj) * up_proj
+
+ # Run the third grouped GEMM
+ w2 = self.get_parameter("down_proj_weight")
+ hidden_outputs = grouped_gemm_forward(hidden_outputs, w2, m_sizes)
+
+ # Prepare buffer for tokens processed by experts
+ # Take necessary space from `token_gather_buf` symm mem because we are
+ # going to send them out after expert processing
+ processed_tokens = self.get_gather_buf()
+
+ # Move into Symmetric Memory for the return shuffle
+ # (this scatters the rows back to the positions they were gathered from)
+ processed_tokens[permuted_indices] = hidden_outputs
+
+ # Now shuffle the tokens back to their original owner, i.e. EP to DP shuffle.
+ # The input/output splits are just a reverse of the previous shuffle.
+ token_return_buf, _ = OnDeviceAllToAllV.apply(
+ processed_tokens,
+ output_splits,
+ self.ep_group,
+ )
+ returned_tokens = token_return_buf[: sorted_tokens_shape[0]]
+
+ # Undo the expert sort, then take the `topk_weight`-weighted sum over the
+ # top-k slots of each token; the result is cast back to the dtype of the
+ # returned tokens.
+ output_tokens = torch.empty_like(returned_tokens)
+ output_tokens[idxs] = returned_tokens
+ final_out = (
+ output_tokens.view(*topk_ids.shape, -1)
+ .type(topk_weight.dtype)
+ .mul_(topk_weight.unsqueeze(dim=-1))
+ .sum(dim=1)
+ .type(returned_tokens.dtype)
+ )
+ return final_out
+
+
+class Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: ModelArgs, layer_idx: Optional[int] = None):
+ super().__init__()
+ self.config = config
+ self.layer_idx = layer_idx
+ self.attention_dropout = config.attention_dropout
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+ self.q_lora_rank = config.q_lora_rank
+ self.qk_rope_head_dim = config.qk_rope_head_dim
+ self.kv_lora_rank = config.kv_lora_rank
+ self.v_head_dim = config.v_head_dim
+ self.qk_nope_head_dim = config.qk_nope_head_dim
+ self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim
+
+ self.is_causal = True
+
+ if self.q_lora_rank is None:
+ self.q_proj = nn.Linear(
+ self.hidden_size, self.num_heads * self.q_head_dim, bias=False
+ )
+ else:
+ self.q_a_proj = nn.Linear(
+ self.hidden_size, config.q_lora_rank, bias=config.attention_bias
+ )
+ self.q_a_layernorm = RMSNorm(config.q_lora_rank)
+ self.q_b_proj = nn.Linear(
+ config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
+ )
+
+ self.kv_a_proj_with_mqa = nn.Linear(
+ self.hidden_size,
+ config.kv_lora_rank + config.qk_rope_head_dim,
+ bias=config.attention_bias,
+ )
+ self.kv_a_layernorm = RMSNorm(config.kv_lora_rank)
+ self.kv_b_proj = nn.Linear(
+ config.kv_lora_rank,
+ self.num_heads
+ * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
+ bias=False,
+ )
+
+ self.o_proj = nn.Linear(
+ self.num_heads * self.v_head_dim,
+ self.hidden_size,
+ bias=config.attention_bias,
+ )
+ self._init_rope()
+
+ self.softmax_scale = self.q_head_dim ** (-0.5)
+ if self.config.rope_scaling is not None:
+ mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
+ scaling_factor = self.config.rope_scaling["factor"]
+ if mscale_all_dim:
+ mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
+ self.softmax_scale = self.softmax_scale * mscale * mscale
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = RotaryEmbedding(
+ self.qk_rope_head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = LinearScalingRotaryEmbedding(
+ self.qk_rope_head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "dynamic":
+ self.rotary_emb = DynamicNTKScalingRotaryEmbedding(
+ self.qk_rope_head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "yarn":
+ kwargs = {
+ key: self.config.rope_scaling[key]
+ for key in [
+ "original_max_position_embeddings",
+ "beta_fast",
+ "beta_slow",
+ "mscale",
+ "mscale_all_dim",
+ ]
+ if key in self.config.rope_scaling
+ }
+ self.rotary_emb = YarnRotaryEmbedding(
+ self.qk_rope_head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ **kwargs,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+    ) -> torch.Tensor:
+        """Attend over `hidden_states` (bsz, q_len, hidden) and return the `o_proj` output.
+
+        Raises ValueError if the prepared mask is not (bsz, 1, q_len, kv_seq_len).
+        """
+        bsz, q_len, _ = hidden_states.size()
+        if self.q_lora_rank is None:
+            q = self.q_proj(hidden_states)
+        else:
+            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
+        q_nope, q_pe = torch.split(
+            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
+        )
+
+        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+        compressed_kv, k_pe = torch.split(
+            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
+        )
+        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
+        kv = (
+            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
+            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
+            .transpose(1, 2)
+        )
+
+        k_nope, value_states = torch.split(
+            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
+        )
+        kv_seq_len = value_states.shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
+
+        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
+        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
+
+        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
+        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
+        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
+
+        if attention_mask is not None:
+            # Expand the user-provided 2D mask into the 4D causal mask passed to SDPA.
+            # If sequences are packed together instead of padded, the data loader
+            # is expected to pass in a mask that carries that information.
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,  # user-provided mask in 2D
+                (bsz, q_len),
+                hidden_states,
+                0,  # past_key_values_length, 0 when training
+            )
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query=query_states,
+            key=key_states,
+            value=value_states,
+            attn_mask=attention_mask,
+            # SDPA applies dropout_p unconditionally, so disable it outside training.
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=attention_mask is None,
+            scale=self.softmax_scale,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output
+
+
+class DecoderLayer(nn.Module):
+ def __init__(self, config: ModelArgs, layer_idx: int):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+
+ self.self_attn = Attention(config=config, layer_idx=layer_idx)
+
+ self.mlp = (
+ MoE(config)
+ if (
+ config.n_routed_experts is not None
+ and layer_idx >= config.first_k_dense_replace
+ and layer_idx % config.moe_layer_freq == 0
+ )
+ else MLP(config)
+ )
+ self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = RMSNorm(
+ config.hidden_size, eps=config.rms_norm_eps
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> torch.Tensor:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*):
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+ query_sequence_length, key_sequence_length)` if default attention is used.
+ """
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ return hidden_states
+
+
+Deepseek_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ Two formats are allowed:
+ - a [`~cache_utils.Cache`] instance;
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+ cache format.
+
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+ legacy cache format will be returned.
+
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class DeepseekModel(torch.nn.Module):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DecoderLayer`]
+
+ Args:
+ config: ModelArgs
+ """
+
+ def __init__(self, config: ModelArgs):
+ super().__init__()
+ self.config = config
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ # Creating model parts related to my stage
+ assert (
+ config.stage_idx < config.num_stages
+ ), f"Stage {config.stage_idx} is not in the model"
+ print(f"Creating model stage {config.stage_idx} of {config.num_stages}")
+
+ self.embed_tokens = (
+ nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ if config.stage_idx == 0
+ else None
+ )
+
+ self.layers = torch.nn.ModuleDict()
+ division = config.num_hidden_layers // config.num_stages
+ residual = config.num_hidden_layers % config.num_stages
+ # Some earlier stages may have 1 more layer than latter stages because
+ # the division may have residual; this is more even than giving the
+ # entire residual to the last stage.
+ layers_per_stage = [
+ division + 1 if stage < residual else division
+ for stage in range(config.num_stages)
+ ]
+ assert sum(layers_per_stage) == config.num_hidden_layers
+ layer_id_start = sum(layers_per_stage[: config.stage_idx])
+ layer_id_end = layer_id_start + layers_per_stage[config.stage_idx]
+ for layer_id in range(layer_id_start, layer_id_end):
+ self.layers[str(layer_id)] = DecoderLayer(config, layer_id)
+
+ self.norm = (
+ RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ if config.stage_idx == config.num_stages - 1
+ else None
+ )
+
+ # Initialize weights and apply final processing
+ self.apply(self._init_weights)
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ def forward(
+ self,
+ tokens: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> torch.Tensor:
+ # Embedding
+ hidden_states = (
+ self.embed_tokens(tokens) if self.embed_tokens is not None else tokens
+ )
+
+ # decoder layers
+ for decoder_layer in self.layers.values():
+ hidden_states = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ )
+
+ hidden_states = (
+ self.norm(hidden_states) if self.norm is not None else hidden_states
+ )
+ return hidden_states
+
+
+class DeepseekForCausalLM(torch.nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.model = DeepseekModel(config)
+ self.lm_head = (
+ nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+ if config.stage_idx == config.num_stages - 1
+ else None
+ )
+
+ # Initialize weights and apply final processing
+ # self.post_init()
+
+ def forward(
+ self,
+ tokens: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ ) -> Tuple:
+ r"""
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, DeepseekForCausalLM
+
+ >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+ hidden_states = self.model(
+ tokens,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ )
+
+ logits = (
+ self.lm_head(hidden_states) if self.lm_head is not None else hidden_states
+ )
+ return logits
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids,
+ past_key_values=None,
+ attention_mask=None,
+ **kwargs,
+ ):
+ if past_key_values is not None:
+ # Assuming isinstance(past_key_values, Cache):
+ cache_length = past_key_values.get_seq_length()
+ past_length = past_key_values.seen_tokens
+ max_cache_length = past_key_values.get_max_length()
+
+ # Keep only the unprocessed tokens:
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+ # input)
+ if (
+ attention_mask is not None
+ and attention_mask.shape[1] > input_ids.shape[1]
+ ):
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+ # input_ids based on the past_length.
+ elif past_length < input_ids.shape[1]:
+ input_ids = input_ids[:, past_length:]
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+ if (
+ max_cache_length is not None
+ and attention_mask is not None
+ and cache_length + input_ids.shape[1] > max_cache_length
+ ):
+ attention_mask = attention_mask[:, -max_cache_length:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -input_ids.shape[1] :]
+
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(
+ past_state.index_select(0, beam_idx.to(past_state.device))
+ for past_state in layer_past
+ ),
+ )
+ return reordered_past
+
+ # Setup Symmetric Memory for MoE token shuffle.
+ # Supports inference currently.
+ def setup_symm_mem(self, dtype: torch.dtype, device: torch.device):
+ for layer in self.model.layers.values():
+ if not isinstance(layer.mlp, MoE):
+ continue
+ layer.mlp.setup_symm_mem(dtype, device)
diff --git a/torchtitan/experiments/deepseek_v3/requirements.txt b/torchtitan/experiments/deepseek_v3/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2b66a52d87be39b1c4fb36e822c24958d40dfa81
--- /dev/null
+++ b/torchtitan/experiments/deepseek_v3/requirements.txt
@@ -0,0 +1,5 @@
+transformers
+accelerate
+torchdata >= 0.8.0
+datasets >= 2.21.0
+tomli >= 1.1.0 ; python_version < "3.11"
diff --git a/torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_utils.py b/torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed00317084d85abd10e13cc4f18437d6e9337a75
--- /dev/null
+++ b/torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_utils.py
@@ -0,0 +1,63 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def get_tid():
+ return tl.inline_asm_elementwise(
+ """
+ mov.u32 $0, %tid.x;
+ mov.u32 $1, %tid.y;
+ mov.u32 $2, %tid.z;
+ """,
+ "=r,=r,=r",
+ [],
+ dtype=(tl.uint32, tl.uint32, tl.uint32),
+ is_pure=True,
+ pack=1,
+ )
+
+
+@triton.jit
+def get_ntid():
+ return tl.inline_asm_elementwise(
+ """
+ mov.u32 $0, %ntid.x;
+ mov.u32 $1, %ntid.y;
+ mov.u32 $2, %ntid.z;
+ """,
+ "=r,=r,=r",
+ [],
+ dtype=(tl.uint32, tl.uint32, tl.uint32),
+ is_pure=True,
+ pack=1,
+ )
+
+
+@triton.jit
+def get_flat_tid():
+ tid_x, tid_y, tid_z = get_tid()
+ ntid_x, ntid_y, _ = get_ntid()
+ return tid_z * ntid_y * ntid_x + tid_y * ntid_x + tid_x
+
+
+@triton.jit
+def get_flat_bid():
+ return (
+ tl.program_id(2) * tl.num_programs(1) * tl.num_programs(0)
+ + tl.program_id(1) * tl.num_programs(0)
+ + tl.program_id(0)
+ )
+
+
+@triton.jit
+def sync_threads():
+ tl.inline_asm_elementwise(
+ "bar.sync 0;", "=r", [], dtype=tl.int32, is_pure=False, pack=1
+ )
diff --git a/torchtitan/experiments/flux/README.md b/torchtitan/experiments/flux/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2e56939b6eea7769d5130703cd3acb58f7eb5f5a
--- /dev/null
+++ b/torchtitan/experiments/flux/README.md
@@ -0,0 +1,23 @@
+# FLUX model in torchtitan
+
+## Overview
+
+## Usage
+First, download the autoencoder model from HuggingFace with your own access token:
+```bash
+python torchtitan/experiments/flux/scripts/download_autoencoder.py --repo_id black-forest-labs/FLUX.1-dev --ae_path ae.safetensors --hf_token <your_access_token>
+```
+This step will download the autoencoder model from HuggingFace and save it to the `torchtitan/experiments/flux/assets/autoencoder/ae.safetensors` file.
+
+Run the following command to train the model on a single GPU:
+```bash
+PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --nproc_per_node=1 torchtitan/experiments/flux/train.py --job.config_file torchtitan/experiments/flux/train_configs/debug_model.toml
+```
+
+## TODO
+- [ ] Support for multiple GPUs is coming soon (FSDP, etc.)
+- [ ] Implement test cases in CI for the FLUX model, and add more unit tests (e.g., a unit test for the preprocessor)
+- [ ] More parallelism support (Tensor Parallelism, Context Parallelism, etc.)
+- [ ] Support for distributed checkpointing and loading
+- [ ] Implement init_weights() function to initialize the model weights
+- [ ] Implement the num_flops_per_token calculation in get_nparams_and_flops() function
diff --git a/torchtitan/experiments/flux/__pycache__/__init__.cpython-312.pyc b/torchtitan/experiments/flux/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f71ead984739bd6ad7c0808e5bb122786a517b4
Binary files /dev/null and b/torchtitan/experiments/flux/__pycache__/__init__.cpython-312.pyc differ
diff --git a/torchtitan/experiments/flux/dataset/flux_dataset.py b/torchtitan/experiments/flux/dataset/flux_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..995f0af3b4152052bcfb21b4331e8dcff8ddd7da
--- /dev/null
+++ b/torchtitan/experiments/flux/dataset/flux_dataset.py
@@ -0,0 +1,267 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import random
+from dataclasses import dataclass
+from typing import Any, Callable, Optional
+
+import numpy as np
+
+import torch
+
+from datasets import Dataset, load_dataset
+from datasets.distributed import split_dataset_by_node
+from PIL import Image
+
+from torch.distributed.checkpoint.stateful import Stateful
+
+from torch.utils.data import IterableDataset
+from torchtitan.components.dataloader import ParallelAwareDataloader
+
+from torchtitan.config_manager import JobConfig
+from torchtitan.experiments.flux.dataset.tokenizer import FluxTokenizer
+from torchtitan.tools.logging import logger
+
+
+def _process_cc12m_image(
+    img: Image.Image,
+    output_size: int = 256,
+) -> Optional[torch.Tensor]:
+    """Resize and randomly crop a CC12M image to an `output_size` square tensor.
+
+    Returns a float tensor (last array axis moved to the front) scaled by 1/255, or
+    None when either side is below `output_size` or the crop is grayscale ("L").
+    """
+    width, height = img.size
+    # Skip low resolution images
+    if width < output_size or height < output_size:
+        return None
+
+    if width >= height:
+        # resize height to be equal to output_size, then crop
+        new_width, new_height = math.ceil(output_size / height * width), output_size
+        img = img.resize((new_width, new_height))
+        left = random.randint(0, new_width - output_size)
+        resized_img = img.crop((left, 0, left + output_size, output_size))
+    else:
+        # resize width to be equal to output_size, then crop
+        new_width, new_height = output_size, math.ceil(output_size / width * height)
+        img = img.resize((new_width, new_height))
+        # Offset along the height: the width already equals output_size here.
+        lower = random.randint(0, new_height - output_size)
+        resized_img = img.crop((0, lower, output_size, lower + output_size))
+
+    assert resized_img.size[0] == resized_img.size[1] == output_size
+
+    # Skip grayscale images
+    if resized_img.mode == "L":
+        return None
+
+    np_img = np.array(resized_img).transpose((2, 0, 1))
+    tensor_img = torch.tensor(np_img).float() / 255.0
+
+    # NOTE: The following commented code is an alternative way
+    # img_transform = transforms.Compose(
+    #     [
+    #         transforms.Resize(max(output_size, output_size)),
+    #         transforms.CenterCrop((output_size, output_size)),
+    #         transforms.ToTensor(),
+    #     ]
+    # )
+    # tensor_img = img_transform(img)
+    return tensor_img
+
+
+def _flux_data_processor(
+    sample: dict[str, Any],
+    t5_tokenizer: FluxTokenizer,
+    clip_tokenizer: FluxTokenizer,
+    output_size: int = 256,
+) -> dict[str, Any]:
+    """
+    Preprocess a CC12M dataset sample's image and text for the Flux model.
+
+    Args:
+        sample: A sample from the dataset; its "jpg" and "txt" entries are read.
+        t5_tokenizer: Tokenizer used to encode `sample["txt"]` into "t5_tokens".
+        clip_tokenizer: Tokenizer used to encode `sample["txt"]` into "clip_tokens".
+        output_size: The output image size, passed to `_process_cc12m_image`.
+    Returns: dict with "image" (may be None), "clip_tokens" and "t5_tokens".
+    """
+    img = _process_cc12m_image(sample["jpg"], output_size=output_size)
+    t5_tokens = t5_tokenizer.encode(sample["txt"])
+    clip_tokens = clip_tokenizer.encode(sample["txt"])
+
+    return {
+        "image": img,
+        "clip_tokens": clip_tokens,  # type: List[int]
+        "t5_tokens": t5_tokens,  # type: List[int]
+    }
+
+
+@dataclass
+class TextToImageDatasetConfig:
+ path: str
+ loader: Callable
+ data_processor: Callable
+
+
+DATASETS = {
+ "cc12m": TextToImageDatasetConfig(
+ path="pixparse/cc12m-wds",
+ loader=lambda path: load_dataset(path, split="train", streaming=True),
+ data_processor=_flux_data_processor,
+ ),
+}
+
+
+def _validate_dataset(
+ dataset_name: str, dataset_path: Optional[str] = None
+) -> tuple[str, Callable, Callable]:
+ """Validate dataset name and path."""
+ if dataset_name not in DATASETS:
+ raise ValueError(
+ f"Dataset {dataset_name} is not supported. "
+ f"Supported datasets are: {list(DATASETS.keys())}"
+ )
+
+ config = DATASETS[dataset_name]
+ path = dataset_path or config.path
+ logger.info(f"Preparing {dataset_name} dataset from {path}")
+ return path, config.loader, config.data_processor
+
+
+class FluxDataset(IterableDataset, Stateful):
+ """Dataset for FLUX text-to-image model.
+
+ Args:
+ dataset_name (str): Name of the dataset.
+ dataset_path (str): Path to the dataset.
+ model_transform (Transform): Callable that applies model-specific preprocessing to the sample.
+ dp_rank (int): Data parallel rank.
+ dp_world_size (int): Data parallel world size.
+ infinite (bool): Whether to loop over the dataset infinitely.
+ """
+
+ def __init__(
+ self,
+ dataset_name: str,
+ dataset_path: Optional[str],
+ t5_tokenizer: FluxTokenizer,
+ clip_tokenizer: FluxTokenizer,
+ job_config: Optional[JobConfig] = None,
+ dp_rank: int = 0,
+ dp_world_size: int = 1,
+ infinite: bool = False,
+ ) -> None:
+
+ # Force lowercase for consistent comparison
+ dataset_name = dataset_name.lower()
+
+ path, dataset_loader, data_processor = _validate_dataset(
+ dataset_name, dataset_path
+ )
+ ds = dataset_loader(path)
+
+ self.dataset_name = dataset_name
+ self._data = split_dataset_by_node(ds, dp_rank, dp_world_size)
+
+ self._t5_tokenizer = t5_tokenizer
+ self._clip_tokenizer = clip_tokenizer
+ self._data_processor = data_processor
+ self.job_config = job_config
+
+ self.infinite = infinite
+
+ # Variables for checkpointing
+ self._sample_idx = 0
+ self._all_samples: list[dict[str, Any]] = []
+
+ def _get_data_iter(self):
+ if isinstance(self._data, Dataset) and self._sample_idx == len(self._data):
+ return iter([])
+
+ it = iter(self._data)
+ for _ in range(self._sample_idx):
+ next(it)
+ return it
+
+    def __iter__(self):
+        """Yield (inputs, image) pairs, skipping images the processor rejects."""
+        while True:
+            for sample in self._get_data_iter():
+                # Count every raw sample consumed, skipped ones included, so the
+                # skip-ahead in _get_data_iter() resumes at the right position.
+                self._sample_idx += 1
+                sample_dict = self._data_processor(
+                    sample, self._t5_tokenizer, self._clip_tokenizer, output_size=256
+                )
+
+                # skip low quality image or image with color channel = 1
+                if sample_dict["image"] is None:
+                    logger.warning(
+                        f"Low quality image {sample['__key__']} is skipped in Flux Dataloader"
+                    )
+                    continue
+
+                labels = sample_dict.pop("image")
+                yield sample_dict, labels
+
+            if not self.infinite:
+                logger.warning(f"Dataset {self.dataset_name} has run out of data")
+                break
+            else:
+                # Reset offset for the next iteration
+                self._sample_idx = 0
+                logger.warning(f"Dataset {self.dataset_name} is being re-looped")
+
+ def load_state_dict(self, state_dict):
+ self._sample_idx = state_dict["sample_idx"]
+ self._all_samples = state_dict["all_samples"]
+
+ def state_dict(self):
+ return {
+ "all_samples": self._all_samples,
+ "sample_idx": self._sample_idx,
+ }
+
+
+def build_flux_dataloader(
+ dp_world_size: int,
+ dp_rank: int,
+ job_config: JobConfig,
+ # This parameter is not used, keep it for compatibility
+ tokenizer: FluxTokenizer | None,
+ infinite: bool = True,
+) -> ParallelAwareDataloader:
+ """Build a data loader for HuggingFace datasets."""
+ dataset_name = job_config.training.dataset
+ dataset_path = job_config.training.dataset_path
+ batch_size = job_config.training.batch_size
+
+ t5_encoder_name = job_config.encoder.t5_encoder
+ clip_encoder_name = job_config.encoder.clip_encoder
+ max_t5_encoding_len = job_config.encoder.max_t5_encoding_len
+
+ ds = FluxDataset(
+ dataset_name=dataset_name,
+ dataset_path=dataset_path,
+ t5_tokenizer=FluxTokenizer(t5_encoder_name, max_length=max_t5_encoding_len),
+ clip_tokenizer=FluxTokenizer(
+ clip_encoder_name, max_length=77
+ ), # fix max_length for CLIP
+ dp_rank=dp_rank,
+ dp_world_size=dp_world_size,
+ infinite=infinite,
+ )
+
+ return ParallelAwareDataloader(
+ dataset=ds,
+ dp_rank=dp_rank,
+ dp_world_size=dp_world_size,
+ batch_size=batch_size,
+ )
diff --git a/torchtitan/experiments/flux/dataset/tokenizer.py b/torchtitan/experiments/flux/dataset/tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..090bfc955152d87614f03793fd606330995da39d
--- /dev/null
+++ b/torchtitan/experiments/flux/dataset/tokenizer.py
@@ -0,0 +1,64 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
+
+
+from typing import List
+
+from torchtitan.components.tokenizer import Tokenizer
+from transformers import CLIPTokenizer, T5Tokenizer
+
+
+class FluxTokenizer(Tokenizer):
+    """
+    Tokenizing and encoding/decoding text using the T5 or Clip tokenizer.
+
+    Args:
+        model_path (str): Path to the tokenizer from Hugging Face.
+        max_length (int): Length that `encode` truncates and pads to.
+    """
+
+    def __init__(self, model_path: str = "t5-small", max_length: int = 77):
+        super().__init__()
+        self._n_words = 8  # TODO(jianiw): check
+        self._max_length = max_length
+        # Paths starting with "openai" select the CLIP tokenizer; anything else uses T5.
+        self.is_clip = model_path.startswith("openai")
+
+        if self.is_clip:
+            self._tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
+                model_path, max_length=max_length
+            )
+        else:
+            self._tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
+                model_path, max_length=max_length
+            )
+
+    def encode(
+        self,
+        s: str,
+    ) -> List[int]:
+        """
+        Tokenize `s`, truncated and padded to `max_length`, and return its "input_ids".
+        """
+        tokens = self._tokenizer(
+            s,
+            truncation=True,
+            max_length=self._max_length,
+            return_length=False,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",  # returns PyTorch tensors; NOTE(review): annotation says List[int]
+        )["input_ids"]
+        return tokens
+
+    def decode(self, t: List[int]) -> str:
+        """
+        Decode token ids back to text. This function is not expected to be called.
+        """
+        return self._tokenizer.decode(t)
diff --git a/torchtitan/experiments/flux/model/__pycache__/layers.cpython-312.pyc b/torchtitan/experiments/flux/model/__pycache__/layers.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6880dfb59cfd0796ca3f69a67fec3d4d7b3a472
Binary files /dev/null and b/torchtitan/experiments/flux/model/__pycache__/layers.cpython-312.pyc differ
diff --git a/torchtitan/experiments/flux/model/hf_embedder.py b/torchtitan/experiments/flux/model/hf_embedder.py
new file mode 100644
index 0000000000000000000000000000000000000000..495fd7a81d16cc0cadeaab3b390a638339ff0f94
--- /dev/null
+++ b/torchtitan/experiments/flux/model/hf_embedder.py
@@ -0,0 +1,40 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torch import nn, Tensor
+from transformers import CLIPTextModel, T5EncoderModel
+
+
+class FluxEmbedder(nn.Module):
+ def __init__(self, version: str, **hf_kwargs):
+ super().__init__()
+ self.is_clip = version.startswith("openai")
+ self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
+
+ if self.is_clip:
+ self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
+ version, **hf_kwargs
+ )
+ else:
+ self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
+ version, **hf_kwargs
+ )
+
+ self.hf_module = self.hf_module.eval().requires_grad_(False)
+
+ def forward(self, batch_tokens: Tensor) -> Tensor:
+ """
+ batch_tokens: [bsz, embedding_length]
+
+        For T5 Encoder, embedding_length is 768
+ For CLIP, embedding_length is 256
+ """
+ outputs = self.hf_module(
+ input_ids=batch_tokens.to(self.hf_module.device),
+ attention_mask=None,
+ output_hidden_states=False,
+ )
+ return outputs[self.output_key]
diff --git a/torchtitan/experiments/flux/model/model.py b/torchtitan/experiments/flux/model/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..67b9e6aeaacee709c4fdc7d86f338eec050bf322
--- /dev/null
+++ b/torchtitan/experiments/flux/model/model.py
@@ -0,0 +1,177 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+
+import torch
+
+from torch import nn, Tensor
+from torchtitan.components.tokenizer import Tokenizer
+from torchtitan.config_manager import JobConfig
+
+from torchtitan.experiments.flux.model.autoencoder import AutoEncoderParams
+from torchtitan.experiments.flux.model.layers import (
+ DoubleStreamBlock,
+ EmbedND,
+ LastLayer,
+ MLPEmbedder,
+ SingleStreamBlock,
+ timestep_embedding,
+)
+
+from torchtitan.protocols.train_spec import BaseModelArgs, ModelProtocol
+from torchtitan.tools.logging import logger
+
+
+@dataclass
+class FluxModelArgs(BaseModelArgs):
+ in_channels: int = 64
+ out_channels: int = 64
+ vec_in_dim: int = 768
+ context_in_dim: int = 512
+ hidden_size: int = 3072
+ mlp_ratio: float = 4.0
+ num_heads: int = 24
+ depth: int = 19
+ depth_single_blocks: int = 38
+ axes_dim: tuple = (16, 56, 56)
+ theta: int = 10_000
+ qkv_bias: bool = True
+ guidance_embed: bool = True
+ autoencoder_params: AutoEncoderParams = field(default_factory=AutoEncoderParams)
+
+ def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
+ # context_in_dim is the same as the T5 embedding dimension
+ self.context_in_dim = job_config.encoder.max_t5_encoding_len
+
+ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]:
+ # TODO(jianiw): Add the number of flops for the autoencoder
+ nparams = sum(p.numel() for p in model.parameters())
+ logger.warning("FLUX model haven't implement get_nparams_and_flops() function")
+ return nparams, 1
+
+
+class FluxModel(nn.Module, ModelProtocol):
+ """
+ Transformer model for flow matching on sequences.
+
+ Args:
+ model_args: FluxModelArgs.
+
+ Attributes:
+ model_args (FluxModelArgs): Model configuration arguments.
+ """
+
+ def __init__(self, model_args: FluxModelArgs):
+ super().__init__()
+
+ self.model_args = model_args
+ self.in_channels = model_args.in_channels
+ self.out_channels = model_args.out_channels
+ if model_args.hidden_size % model_args.num_heads != 0:
+ raise ValueError(
+ f"Hidden size {model_args.hidden_size} must be divisible by num_heads {model_args.num_heads}"
+ )
+ pe_dim = model_args.hidden_size // model_args.num_heads
+ if sum(model_args.axes_dim) != pe_dim:
+ raise ValueError(
+ f"Got {model_args.axes_dim} but expected positional dim {pe_dim}"
+ )
+ self.hidden_size = model_args.hidden_size
+ self.num_heads = model_args.num_heads
+ self.pe_embedder = EmbedND(
+ dim=pe_dim, theta=model_args.theta, axes_dim=model_args.axes_dim
+ )
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
+ self.vector_in = MLPEmbedder(model_args.vec_in_dim, self.hidden_size)
+ self.guidance_in = (
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
+ if model_args.guidance_embed
+ else nn.Identity()
+ )
+ self.txt_in = nn.Linear(model_args.context_in_dim, self.hidden_size)
+
+ self.double_blocks = nn.ModuleList(
+ [
+ DoubleStreamBlock(
+ self.hidden_size,
+ self.num_heads,
+ mlp_ratio=model_args.mlp_ratio,
+ qkv_bias=model_args.qkv_bias,
+ )
+ for _ in range(model_args.depth)
+ ]
+ )
+
+ self.single_blocks = nn.ModuleList(
+ [
+ SingleStreamBlock(
+ self.hidden_size, self.num_heads, mlp_ratio=model_args.mlp_ratio
+ )
+ for _ in range(model_args.depth_single_blocks)
+ ]
+ )
+
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
+
+ def init_weights(self, buffer_device=None):
+ # TODO(jianiw): replace placeholder with real weight init
+ for param in self.parameters():
+ param.data.uniform_(0, 0.1)
+
+ def forward(
+ self,
+ img: Tensor,
+ img_ids: Tensor,
+ txt: Tensor,
+ txt_ids: Tensor,
+ timesteps: Tensor,
+ y: Tensor,
+ guidance: Tensor | None = None,
+ ) -> Tensor:
+ if img.ndim != 3 or txt.ndim != 3:
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
+
+ # embed the image sequence into the model's hidden size
+ img = self.img_in(img)
+ vec = self.time_in(timestep_embedding(timesteps, 256))
+ if self.model_args.guidance_embed:
+ if guidance is None:
+ raise ValueError(
+ "Didn't get guidance strength for guidance distilled model."
+ )
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
+ vec = vec + self.vector_in(y)
+ txt = self.txt_in(txt)
+
+ ids = torch.cat((txt_ids, img_ids), dim=1)
+ pe = self.pe_embedder(ids)
+
+ for block in self.double_blocks:
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
+
+ img = torch.cat((txt, img), 1)
+ for block in self.single_blocks:
+ img = block(img, vec=vec, pe=pe)
+ img = img[:, txt.shape[1] :, ...]
+
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
+ return img
+
+ @classmethod
+ def from_model_args(cls, model_args: FluxModelArgs) -> "FluxModel":
+ """
+ Initialize a Flux model from a FluxModelArgs object.
+
+ Args:
+ model_args (FluxModelArgs): Model configuration arguments.
+
+ Returns:
+ FluxModel: A new FluxModel instance built from model_args.
+
+ """
+ return cls(model_args)
diff --git a/torchtitan/experiments/flux/tests/test_flux_dataloader.py b/torchtitan/experiments/flux/tests/test_flux_dataloader.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc87f1b8b4ae3ad7daf1558835716720127e3b42
--- /dev/null
+++ b/torchtitan/experiments/flux/tests/test_flux_dataloader.py
@@ -0,0 +1,103 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import sys
+
+from torchtitan.config_manager import JobConfig
+from torchtitan.experiments.flux.dataset.flux_dataset import build_flux_dataloader
+from torchtitan.tools.profiling import (
+ maybe_enable_memory_snapshot,
+ maybe_enable_profiling,
+)
+
+
+class TestFluxDataLoader:
+ def test_flux_dataloader(self):
+ dataset_name = "cc12m"
+ batch_size = 32
+ world_size = 4
+ rank = 0
+
+ num_steps = 10
+
+ path = "torchtitan.experiments.flux.flux_argparser"
+ sys.argv.append(f"--experimental.custom_args_module={path}")
+ config = JobConfig()
+ config.maybe_add_custom_args()
+ config.parse_args(
+ [
+ # Profiling options
+ # "--profiling.enable_profiling",
+ # "--profiling.profile_freq",
+ # "5",
+ # "--profiling.enable_memory_snapshot",
+ # "--profiling.save_memory_snapshot_folder",
+ # "memory_snapshot_flux",
+ "--training.dataset",
+ dataset_name,
+ "--training.batch_size",
+ str(batch_size),
+ "--encoder.t5_encoder",
+ "google/t5-v1_1-small",
+ "--encoder.clip_encoder",
+ "openai/clip-vit-large-patch14",
+ "--encoder.max_t5_encoding_len",
+ "512",
+ ]
+ )
+
+ with maybe_enable_profiling(
+ config, global_step=0
+ ) as torch_profiler, maybe_enable_memory_snapshot(
+ config, global_step=0
+ ) as memory_profiler:
+ dl = self._build_dataloader(
+ config,
+ world_size,
+ rank,
+ )
+ dl = iter(dl)
+
+ for i in range(0, num_steps):
+ input_data, labels = next(dl)
+ print(f"Step {i} image size: {labels.shape}")
+ if torch_profiler:
+ torch_profiler.step()
+ if memory_profiler:
+ memory_profiler.step()
+
+ print(len(input_data["clip_tokens"]))
+ for k, v in input_data.items():
+ print(f"Step {i} {k} value: {type(v), v.shape}")
+
+ assert len(input_data) == 2 # (clip_encodings, t5_encodings)
+ assert labels.shape == (batch_size, 3, 256, 256)
+ # assert input_data["clip_tokens"].shape[0] == batch_size
+ # assert input_data["t5_tokens"].shape == (batch_size, 512, 512)
+
+ if torch_profiler:
+ torch_profiler.step()
+ if memory_profiler:
+ memory_profiler.step(exit_ctx=True)
+
+ def test_preprocess(self):
+ # TODO
+ pass
+
+ def _build_dataloader(
+ self,
+ job_config,
+ world_size,
+ rank,
+ ):
+
+ return build_flux_dataloader(
+ dp_world_size=world_size,
+ dp_rank=rank,
+ job_config=job_config,
+ tokenizer=None,
+ infinite=False,
+ )
diff --git a/torchtitan/experiments/flux/tests/test_generate_image.py b/torchtitan/experiments/flux/tests/test_generate_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..86d8d16cfbbcbfaa706e6ff6713403520744efd5
--- /dev/null
+++ b/torchtitan/experiments/flux/tests/test_generate_image.py
@@ -0,0 +1,252 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import os
+import time
+from typing import Callable
+
+import torch
+from einops import rearrange
+
+from PIL import ExifTags, Image
+
+from torch import Tensor
+
+from torchtitan.experiments.flux.dataset.tokenizer import FluxTokenizer
+
+from torchtitan.experiments.flux.model.autoencoder import (
+ AutoEncoder,
+ AutoEncoderParams,
+ load_ae,
+)
+from torchtitan.experiments.flux.model.hf_embedder import FluxEmbedder
+
+from torchtitan.experiments.flux.model.model import FluxModel, FluxModelArgs
+from torchtitan.experiments.flux.utils import (
+ create_position_encoding_for_latents,
+ generate_noise_latent,
+ pack_latents,
+ preprocess_flux_data,
+ unpack_latents,
+)
+
+
+def time_shift(mu: float, sigma: float, t: Tensor):
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
+
+
+def get_lin_function(
+ x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
+) -> Callable[[float], float]:
+ m = (y2 - y1) / (x2 - x1)
+ b = y1 - m * x1
+ return lambda x: m * x + b
+
+
+def get_schedule(
+ num_steps: int,
+ image_seq_len: int,
+ base_shift: float = 0.5,
+ max_shift: float = 1.15,
+ shift: bool = True,
+) -> list[float]:
+ # extra step for zero
+ timesteps = torch.linspace(1, 0, num_steps + 1)
+
+ # shifting the schedule to favor high timesteps for higher signal images
+ if shift:
+ # estimate mu based on linear estimation between two points
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
+ timesteps = time_shift(mu, 1.0, timesteps)
+
+ return timesteps.tolist()
+
+
+class TestGenerateImage:
+ def test_generate_image(self):
+ """
+ Run a forward pass of flux model to generate an image.
+ """
+ name = "flux-dev"
+ img_width = 512
+ img_height = 512
+ seed = None
+ prompt = (
+ "a photo of a forest with mist swirling around the tree trunks. The word "
+ '"FLUX" is painted over it in big, red brush strokes with visible texture'
+ )
+ device = "cuda"
+ num_steps = None
+ loop = False
+ guidance = 3.5
+ output_dir = "output"
+ add_sampling_metadata = True
+
+ prompt = prompt.split("|")
+ if len(prompt) == 1:
+ prompt = prompt[0]
+ additional_prompts = None
+ else:
+ additional_prompts = prompt[1:]
+ prompt = prompt[0]
+
+ assert not (
+ (additional_prompts is not None) and loop
+ ), "Do not provide additional prompts and set loop to True"
+
+ torch_device = torch.device(device)
+ if num_steps is None:
+ num_steps = 30
+
+ # allow for packing and conversion to latent space
+ img_height = 16 * (img_height // 16)
+ img_width = 16 * (img_width // 16)
+
+ # init all components
+ model = FluxModel(FluxModelArgs()).to(device=torch_device, dtype=torch.bfloat16)
+
+ ae = load_ae(
+ ckpt_path="assets/autoencoder/ae.safetensors",
+ autoencoder_params=AutoEncoderParams(),
+ device=torch_device,
+ dtype=torch.bfloat16,
+ )
+ clip_tokenizer = FluxTokenizer(
+ model_path="openai/clip-vit-large-patch14", max_length=77
+ )
+ t5_tokenizer = FluxTokenizer(model_path="google/t5-v1_1-small", max_length=512)
+ clip_encoder = FluxEmbedder(version="openai/clip-vit-large-patch14").to(
+ torch_device, dtype=torch.bfloat16
+ )
+ t5_encoder = FluxEmbedder(version="google/t5-v1_1-small").to(
+ torch_device, dtype=torch.bfloat16
+ )
+
+ rng = torch.Generator(device="cpu")
+
+ if seed is None:
+ seed = rng.seed()
+ print(f"Generating with seed {seed}:\n{prompt}")
+ t0 = time.perf_counter()
+ output_name = os.path.join(output_dir, f"img_{seed}.jpg")
+
+ # Tokenize the prompt, on CPU
+ clip_tokens = clip_tokenizer.encode(prompt)
+ t5_tokens = t5_tokenizer.encode(prompt)
+
+ batch = preprocess_flux_data(
+ device=torch_device,
+ dtype=torch.bfloat16,
+ autoencoder=None,
+ clip_encoder=clip_encoder,
+ t5_encoder=t5_encoder,
+ batch={
+ "clip_tokens": clip_tokens,
+ "t5_tokens": t5_tokens,
+ },
+ )
+
+ img = self._generate_images(
+ device=torch_device,
+ dtype=torch.bfloat16,
+ model=model,
+ decoder=ae,
+ img_width=img_width,
+ img_height=img_height,
+ denoising_steps=num_steps,
+ seed=seed,
+ clip_encodings=batch["clip_encodings"],
+ t5_encodings=batch["t5_encodings"],
+ guidance=guidance,
+ )
+
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
+ t1 = time.perf_counter()
+
+ print(f"Done in {t1 - t0:.1f}s.")
+
+ self._save_image(name, output_name, img, add_sampling_metadata, prompt)
+
+ def _generate_images(
+ self,
+ device: torch.device,
+ dtype: torch.dtype,
+ model: FluxModel,
+ decoder: AutoEncoder,
+ # image params:
+ img_width: int,
+ img_height: int,
+ # sampling params:
+ denoising_steps: int,
+ seed: int,
+ clip_encodings: torch.Tensor,
+ t5_encodings: torch.Tensor,
+ guidance: float = 4.0,
+ ):
+
+ bsz = clip_encodings.shape[0]
+ latents = generate_noise_latent(bsz, img_height, img_width, device, dtype, seed)
+ _, latent_channels, latent_height, latent_width = latents.shape
+
+ # create denoising schedule
+ timesteps = get_schedule(denoising_steps, latent_channels, shift=True)
+
+ # create positional encodings
+ POSITION_DIM = 3 # constant for Flux flow model
+ latent_pos_enc = create_position_encoding_for_latents(
+ bsz, latent_height, latent_width, POSITION_DIM
+ ).to(latents)
+ text_pos_enc = torch.zeros(bsz, t5_encodings.shape[1], POSITION_DIM).to(latents)
+
+ # convert img-like latents into sequences of patches
+ latents = pack_latents(latents)
+
+ # this is ignored for schnell
+ guidance_vec = torch.full((bsz,), guidance, device=device, dtype=dtype)
+ for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
+ t_vec = torch.full((bsz,), t_curr, dtype=dtype, device=device)
+ pred = model(
+ img=latents,
+ img_ids=latent_pos_enc,
+ txt=t5_encodings,
+ txt_ids=text_pos_enc,
+ y=clip_encodings,
+ timesteps=t_vec,
+ guidance=guidance_vec,
+ )
+
+ latents = latents + (t_prev - t_curr) * pred
+
+ # convert sequences of patches into img-like latents
+ latents = unpack_latents(latents, latent_height, latent_width)
+
+ img = decoder.decode(latents)
+ return img
+
+ def _save_image(
+ self,
+ name: str,
+ output_name: str,
+ x: torch.Tensor,
+ add_sampling_metadata: bool,
+ prompt: str,
+ ):
+ print(f"Saving {output_name}")
+ # bring into PIL format and save
+ x = x.clamp(-1, 1)
+ x = rearrange(x[0], "c h w -> h w c")
+
+ img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
+
+ exif_data = Image.Exif()
+ exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
+ exif_data[ExifTags.Base.Make] = "Black Forest Labs"
+ exif_data[ExifTags.Base.Model] = name
+ if add_sampling_metadata:
+ exif_data[ExifTags.Base.ImageDescription] = prompt
+ img.save(output_name, exif=exif_data, quality=95, subsampling=0)
diff --git a/torchtitan/experiments/flux/train_configs/debug_model.toml b/torchtitan/experiments/flux/train_configs/debug_model.toml
new file mode 100644
index 0000000000000000000000000000000000000000..250a71d60ec28028b548803bad7f14b6b3a6db62
--- /dev/null
+++ b/torchtitan/experiments/flux/train_configs/debug_model.toml
@@ -0,0 +1,68 @@
+
+[job]
+dump_folder = "./outputs"
+description = "Flux debug model"
+print_args = false
+use_for_integration_test = true
+
+[profiling]
+enable_profiling = false
+save_traces_folder = "profile_trace"
+profile_freq = 10
+enable_memory_snapshot = false
+save_memory_snapshot_folder = "memory_snapshot"
+
+[metrics]
+log_freq = 1
+disable_color_printing = false
+enable_tensorboard = false
+save_tb_folder = "tb"
+enable_wandb = false
+
+[model]
+name = "flux"
+flavor = "flux-debug"
+norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm
+# test tokenizer.model, for debug purpose only
+# tokenizer_path = "./tests/assets/test_tiktoken.model"
+# converters = "float8"
+
+
+[optimizer]
+name = "AdamW"
+lr = 8e-4
+eps = 1e-8
+
+[lr_scheduler]
+warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps
+decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps
+decay_type = "linear"
+lr_min = 0.0
+
+[training]
+batch_size = 32
+seq_len = 512
+max_norm = 1.0 # grad norm clipping
+steps = 10
+compile = false
+dataset = "cc12m"
+guidance = 3.5
+seed = 0
+
+[encoder]
+t5_encoder="google/t5-v1_1-small"
+clip_encoder="openai/clip-vit-large-patch14"
+max_t5_encoding_len=512
+auto_encoder_path="torchtitan/experiments/flux/assets/autoencoder/ae.safetensors" # Autoencoder to use for image
+
+[parallelism]
+data_parallel_replicate_degree = 1
+data_parallel_shard_degree = 1
+fsdp_reshard_after_forward = "default" # default / never / always
+tensor_parallel_degree = 1
+enable_async_tensor_parallel = false
+pipeline_parallel_degree = 1
+context_parallel_degree = 1
+
+[experimental]
+custom_args_module = "torchtitan.experiments.flux.flux_argparser"
diff --git a/torchtitan/experiments/kernels/triton_mg_group_gemm/benchmark.py b/torchtitan/experiments/kernels/triton_mg_group_gemm/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dbabd1317a5923545f24c9a77feca46f5a92130
--- /dev/null
+++ b/torchtitan/experiments/kernels/triton_mg_group_gemm/benchmark.py
@@ -0,0 +1,630 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# Benchmark comparing reference PyTorch vs optimized M*G group GEMM implementation
+
+import argparse
+import logging
+import time
+
+# from typing import Dict, List, Optional, Tuple
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import triton
+
+# import triton.language as tl
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+# Try to import the optimized implementations
+try:
+ from torchao_pr.mg_grouped_gemm import grouped_gemm_forward
+
+except ImportError:
+ logging.error(
+ "Error importing MG grouped GEMM modules. Make sure the implementation files are in the correct path."
+ )
+ raise
+
+
+def compute_reference_forward(x, w, m_sizes):
+ """
+ Reference PyTorch implementation of M*G grouped GEMM forward pass.
+
+ Args:
+ x (torch.Tensor): Input tensor of shape (M, K)
+ w (torch.Tensor): Weight tensor of shape (N, K)
+ m_sizes (torch.Tensor): Group sizes tensor of shape (G)
+
+ Returns:
+ torch.Tensor: Output tensor of shape (M, N)
+ """
+ result = torch.zeros((x.shape[0], w.shape[0]), dtype=x.dtype, device=x.device)
+
+ m_start = 0
+ for g in range(len(m_sizes)):
+ m_size = m_sizes[g].item()
+ if m_size > 0:
+ m_end = m_start + m_size
+
+ # Extract group input
+ x_g = x[m_start:m_end]
+
+ # Compute group output
+ y_g = torch.matmul(x_g, w.T)
+
+ # Store result
+ result[m_start:m_end] = y_g
+
+ # Update start index
+ m_start = m_end
+
+ return result
+
+
+@triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["N"], # We'll vary the output dimension
+ x_vals=[1024, 2048, 4096, 8192, 16384], # Different output dimensions to test
+ # x_vals=[8192, 16384],
+ line_arg="provider", # We'll compare different providers
+ line_vals=["pytorch_reference", "M*G grouped GEMM"],
+ line_names=["PyTorch Reference", "M*G grouped Kernel"],
+ styles=[("blue", "-"), ("red", "-")],
+ ylabel="TFLOPS", # We'll measure TFLOPS
+ plot_name="mg_grouped_gemm_comparison",
+ args={
+ "M": 8192, # Batch dimension, fixed for all tests
+ "K": 7168, # Hidden dimension, fixed for all tests
+ "G": 8, # Number of groups
+ "dtype": torch.float16,
+ "device": "cuda",
+ },
+ )
+)
+def benchmark_forward(M, K, N, G, provider, dtype=torch.float16, device="cuda"):
+ """
+ Benchmark the forward pass of the grouped GEMM implementation.
+
+ Args:
+ M (int): Total batch size dimension
+ K (int): Hidden dimension
+ N (int): Output dimension
+ G (int): Number of groups
+ provider (str): Provider to use ('pytorch_reference' or 'M*G grouped GEMM'; any value other than 'pytorch_reference' runs the grouped GEMM kernel)
+ dtype (torch.dtype): Data type to use
+ device (str): Device to use
+
+ Returns:
+ float: Performance in TFLOPS
+ """
+ # Create group sizes for M dimension (balanced across groups)
+ base_size = M // G
+ remainder = M % G
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
+
+ print(f"N: {N}, M: {M}, K: {K}, G: {G}, dtype: {dtype}, device: {device}")
+
+ # Create input and weight tensors
+ x = torch.randn(M, K, dtype=dtype, device=device)
+ w = torch.randn(N, K, dtype=dtype, device=device)
+
+ # Pre-compute for PyTorch reference to ensure fair comparison
+ if provider == "pytorch_reference":
+ # Warmup
+ torch.cuda.synchronize()
+ compute_reference_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+
+ # Benchmark
+ start_time = time.time()
+ for _ in range(10): # Average over 10 runs
+ compute_reference_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+ end_time = time.time()
+ else: # Optimized kernel
+ # Warmup
+ torch.cuda.synchronize()
+ grouped_gemm_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+
+ # Benchmark
+ start_time = time.time()
+ for _ in range(10): # Average over 10 runs
+ grouped_gemm_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+ end_time = time.time()
+
+ # Calculate FLOPs
+ # For GEMM: 2 * M * N * K FLOPs (multiply-add counts as 2 FLOPs)
+ flops = 2 * M * N * K
+
+ # Convert to TFLOPS (tera-FLOPS)
+ avg_time = (end_time - start_time) / 10 # Average time per run
+ tflops = flops / avg_time / 1e12
+
+ return tflops
+
+
+@triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["G"], # We'll vary the number of groups
+ x_vals=[1, 2, 4, 8, 16], # Different numbers of groups to test
+ line_arg="provider", # We'll compare different providers
+ line_vals=["pytorch_reference", "optimized_kernel"],
+ line_names=["PyTorch Reference", "Optimized Kernel"],
+ styles=[("blue", "-"), ("red", "-")],
+ ylabel="TFLOPS", # We'll measure TFLOPS
+ plot_name="mg_grouped_gemm_group_scaling",
+ args={
+ "M": 8192, # Batch dimension, fixed for all tests
+ "K": 4096, # Hidden dimension, fixed for all tests
+ "N": 8192, # Output dimension, fixed for all tests
+ "dtype": torch.float16,
+ "device": "cuda",
+ },
+ )
+)
+def benchmark_forward_groups(M, K, N, G, provider, dtype=torch.float16, device="cuda"):
+ """
+ Benchmark how performance scales with number of groups.
+
+ Args:
+ M (int): Total batch size dimension
+ K (int): Hidden dimension
+ N (int): Output dimension
+ G (int): Number of groups
+ provider (str): Provider to use ('pytorch_reference' or 'optimized_kernel')
+ dtype (torch.dtype): Data type to use
+ device (str): Device to use
+
+ Returns:
+ float: Performance in TFLOPS
+ """
+ # Create group sizes for M dimension (balanced across groups)
+ base_size = M // G
+ remainder = M % G
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
+
+ # Create input and weight tensors
+ x = torch.randn(M, K, dtype=dtype, device=device)
+ w = torch.randn(N, K, dtype=dtype, device=device)
+
+ # Benchmark logic - same as previous function
+ if provider == "pytorch_reference":
+ torch.cuda.synchronize()
+ compute_reference_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+
+ start_time = time.time()
+ for _ in range(10):
+ compute_reference_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+ end_time = time.time()
+ else:
+ torch.cuda.synchronize()
+ grouped_gemm_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+
+ start_time = time.time()
+ for _ in range(10):
+ grouped_gemm_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+ end_time = time.time()
+
+ # Calculate FLOPs and TFLOPS
+ flops = 2 * M * N * K
+ avg_time = (end_time - start_time) / 10
+ tflops = flops / avg_time / 1e12
+
+ return tflops
+
+
+@triton.testing.perf_report(
+ triton.testing.Benchmark(
+ x_names=["group_balance"], # We'll vary the group balance factor
+ x_vals=[
+ 0.0,
+ 0.25,
+ 0.5,
+ 0.75,
+ 0.9,
+ ], # Different imbalance factors (0 = balanced, 1 = max imbalance)
+ line_arg="provider", # We'll compare different providers
+ line_vals=["pytorch_reference", "optimized_kernel"],
+ line_names=["PyTorch Reference", "Optimized Kernel"],
+ styles=[("blue", "-"), ("red", "-")],
+ ylabel="TFLOPS", # We'll measure TFLOPS
+ plot_name="mg_grouped_gemm_imbalance",
+ args={
+ "M": 8192, # Batch dimension, fixed for all tests
+ "K": 4096, # Hidden dimension, fixed for all tests
+ "N": 8192, # Output dimension, fixed for all tests
+ "G": 4, # Number of groups
+ "dtype": torch.float16,
+ "device": "cuda",
+ },
+ )
+)
+def benchmark_imbalance(
+ M, K, N, G, group_balance, provider, dtype=torch.float16, device="cuda"
+):
+ """
+ Benchmark how performance is affected by imbalanced group sizes.
+
+ Args:
+ M (int): Total batch size dimension
+ K (int): Hidden dimension
+ N (int): Output dimension
+ G (int): Number of groups
+ group_balance (float): Balance factor from 0 to 1 (0 = balanced, 1 = max imbalance)
+ provider (str): Provider to use ('pytorch_reference' or 'optimized_kernel')
+ dtype (torch.dtype): Data type to use
+ device (str): Device to use
+
+ Returns:
+ float: Performance in TFLOPS
+ """
+ # Create imbalanced group sizes for M dimension
+ if group_balance == 0:
+ # Balanced case
+ base_size = M // G
+ remainder = M % G
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
+ else:
+ # Imbalanced case
+ # First group gets more elements, last group gets fewer
+ # The imbalance is controlled by the group_balance factor
+ remaining = M
+ M_sizes = []
+ for g in range(G):
+ # Interpolate from balanced to imbalanced based on group_balance
+ # For balanced (group_balance=0), each group gets M/G
+ # For imbalanced (group_balance=1), first group gets much more than last group
+ balanced_size = remaining // (G - g)
+
+ # Adjusting size based on position and imbalance factor
+ # First groups get more, last groups get less
+ if g < G // 2:
+ # First half of groups get more
+ adjustment = int(balanced_size * group_balance * (1 - g / (G - 1)))
+ size = balanced_size + adjustment
+ else:
+ # Second half of groups get less
+ adjustment = int(balanced_size * group_balance * ((g / (G - 1)) - 0.5))
+ size = balanced_size - adjustment
+
+ # Ensure we don't go below 1 or take more than remaining
+ size = max(1, min(size, remaining))
+ M_sizes.append(size)
+ remaining -= size
+
+ # Handle any remaining elements
+ if remaining > 0:
+ M_sizes[-1] += remaining
+
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
+
+ # Create input and weight tensors
+ x = torch.randn(M, K, dtype=dtype, device=device)
+ w = torch.randn(N, K, dtype=dtype, device=device)
+
+ # Benchmark logic
+ if provider == "pytorch_reference":
+ torch.cuda.synchronize()
+ compute_reference_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+
+ start_time = time.time()
+ for _ in range(10):
+ compute_reference_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+ end_time = time.time()
+ else:
+ torch.cuda.synchronize()
+ grouped_gemm_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+
+ start_time = time.time()
+ for _ in range(10):
+ grouped_gemm_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+ end_time = time.time()
+
+ # Calculate FLOPs and TFLOPS
+ flops = 2 * M * N * K
+ avg_time = (end_time - start_time) / 10
+ tflops = flops / avg_time / 1e12
+
+ return tflops
+
+
+def benchmark_model_configs():
+ """
+ Benchmark common model configurations used in DeepSeek-like models.
+ """
+ # Model configurations: (M, K, N, G)
+ configs = [
+ (8192, 7168, 4096, 4), # Config 1
+ (8192, 2048, 7168, 4), # Config 2
+ (4096, 7168, 4096, 8), # Config 3
+ (4096, 2048, 7168, 8), # Config 4
+ ]
+
+ results = []
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ dtype = torch.float16
+
+ for config_idx, (M, K, N, G) in enumerate(configs):
+ logging.info(f"\n===== Benchmarking DeepSeek Config {config_idx + 1} =====")
+ logging.info(f"M={M}, K={K}, N={N}, G={G}")
+
+ # Create group sizes for M dimension
+ base_size = M // G
+ remainder = M % G
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
+
+ # Create tensors
+ x = torch.randn(M, K, dtype=dtype, device=device)
+ w = torch.randn(N, K, dtype=dtype, device=device)
+
+ # Benchmark PyTorch reference
+ torch.cuda.synchronize()
+ compute_reference_forward(x, w, m_sizes) # Warmup
+ torch.cuda.synchronize()
+
+ logging.info("Benchmarking PyTorch reference...")
+ torch.cuda.reset_peak_memory_stats()
+ start_time = time.time()
+ for _ in range(10):
+ compute_reference_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+ end_time = time.time()
+ pt_time = (end_time - start_time) / 10
+ pt_memory = torch.cuda.max_memory_allocated() / (1024**2) # MB
+
+ # Benchmark optimized kernel
+ torch.cuda.synchronize()
+ grouped_gemm_forward(x, w, m_sizes) # Warmup
+ torch.cuda.synchronize()
+
+ logging.info("Benchmarking optimized kernel...")
+ torch.cuda.reset_peak_memory_stats()
+ start_time = time.time()
+ for _ in range(10):
+ grouped_gemm_forward(x, w, m_sizes)
+ torch.cuda.synchronize()
+ end_time = time.time()
+ opt_time = (end_time - start_time) / 10
+ opt_memory = torch.cuda.max_memory_allocated() / (1024**2) # MB
+
+ # Calculate FLOPs and speedup
+ flops = 2 * M * N * K
+ pt_tflops = flops / pt_time / 1e12
+ opt_tflops = flops / opt_time / 1e12
+ speedup = pt_time / opt_time
+
+ # Store results
+ results.append(
+ {
+ "config": f"Config {config_idx + 1}",
+ "dimensions": f"M={M}, K={K}, N={N}, G={G}",
+ "pt_time_ms": pt_time * 1000,
+ "opt_time_ms": opt_time * 1000,
+ "pt_tflops": pt_tflops,
+ "opt_tflops": opt_tflops,
+ "speedup": speedup,
+ "pt_memory_mb": pt_memory,
+ "opt_memory_mb": opt_memory,
+ "memory_savings": (
+ (pt_memory - opt_memory) / pt_memory * 100 if pt_memory > 0 else 0
+ ),
+ }
+ )
+
+ logging.info(
+ f"PyTorch Reference: {pt_time * 1000:.2f} ms, {pt_tflops:.2f} TFLOPS, {pt_memory:.2f} MB"
+ )
+ logging.info(
+ f"Optimized Kernel: {opt_time * 1000:.2f} ms, {opt_tflops:.2f} TFLOPS, {opt_memory:.2f} MB"
+ )
+ logging.info(
+ f"Speedup: {speedup:.2f}x, Memory savings: {results[-1]['memory_savings']:.2f}%"
+ )
+
+ # Print summary table
+ logging.info("\n===== Benchmark Results Summary =====")
+ logging.info(
+ f"{'Config':<10} | {'Time (ms)':<20} | {'TFLOPS':<20} | {'Speedup':<10} | {'Memory (MB)':<20} | {'Memory Saved':<12}"
+ )
+ logging.info(
+ f"{'':<10} | {'PyTorch':<9} {'Kernel':<9} | {'PyTorch':<9} {'Kernel':<9} | {'':<10} | "
+ f"{'PyTorch':<9} {'Kernel':<9} | {'':<12}"
+ )
+ logging.info("-" * 100)
+
+ for result in results:
+ logging.info(
+ f"{result['config']:<10} | "
+ f"{result['pt_time_ms']:<9.2f} {result['opt_time_ms']:<9.2f} | "
+ f"{result['pt_tflops']:<9.2f} {result['opt_tflops']:<9.2f} | "
+ f"{result['speedup']:<10.2f} | "
+ f"{result['pt_memory_mb']:<9.2f} {result['opt_memory_mb']:<9.2f} | "
+ f"{result['memory_savings']:<12.2f}%"
+ )
+
+ return results
+
+
+def plot_benchmark_results(results):
+ """
+ Plot benchmark results as bar charts.
+ """
+ # Extract data
+ configs = [r["config"] for r in results]
+ pt_tflops = [r["pt_tflops"] for r in results]
+ opt_tflops = [r["opt_tflops"] for r in results]
+ speedups = [r["speedup"] for r in results]
+
+ # Create figure with subplots
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
+
+ # Plot TFLOPS comparison
+ x = np.arange(len(configs))
+ width = 0.35
+ ax1.bar(x - width / 2, pt_tflops, width, label="PyTorch Reference")
+ ax1.bar(x + width / 2, opt_tflops, width, label="Optimized Kernel")
+ ax1.set_xlabel("Model Configuration")
+ ax1.set_ylabel("TFLOPS")
+ ax1.set_title("Performance Comparison (Higher is Better)")
+ ax1.set_xticks(x)
+ ax1.set_xticklabels(configs)
+ ax1.legend()
+ ax1.grid(axis="y", linestyle="--", alpha=0.7)
+
+ # Plot speedup
+ ax2.bar(x, speedups, width=0.6, color="green")
+ ax2.set_xlabel("Model Configuration")
+ ax2.set_ylabel("Speedup (x)")
+ ax2.set_title("Speedup Factor (Higher is Better)")
+ ax2.set_xticks(x)
+ ax2.set_xticklabels(configs)
+ ax2.grid(axis="y", linestyle="--", alpha=0.7)
+
+ # Add speedup values on top of bars
+ for i, v in enumerate(speedups):
+ ax2.text(i, v + 0.1, f"{v:.2f}x", ha="center")
+
+ plt.tight_layout()
+ plt.savefig("mg_grouped_gemm_benchmark_results.png")
+ logging.info(
+ "Benchmark results plot saved to 'mg_grouped_gemm_benchmark_results.png'"
+ )
+
+
+def compare_mg_implementations():
+ """
+ Combine the M*G and N*G benchmark results for comparison.
+ """
+ # Only run this if both NG and MG benchmarks have been run
+ try:
+ import pandas as pd
+
+ # Try to load previous benchmark results
+ mg_results = pd.read_csv("mg_grouped_gemm_benchmark_results.csv")
+ ng_results = pd.read_csv("ng_grouped_gemm_benchmark_results.csv")
+
+ # Create comparison plot
+ fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+
+ # Plot speedup comparison
+ configs = mg_results["config"].unique()
+ mg_speedups = mg_results.groupby("config")["speedup"].mean()
+ ng_speedups = ng_results.groupby("config")["speedup"].mean()
+
+ x = np.arange(len(configs))
+ width = 0.35
+
+ axes[0].bar(x - width / 2, mg_speedups, width, label="M*G Grouping")
+ axes[0].bar(x + width / 2, ng_speedups, width, label="N*G Grouping")
+ axes[0].set_xlabel("Model Configuration")
+ axes[0].set_ylabel("Speedup (x)")
+ axes[0].set_title("Speedup Comparison: M*G vs N*G")
+ axes[0].set_xticks(x)
+ axes[0].set_xticklabels(configs)
+ axes[0].legend()
+ axes[0].grid(axis="y", linestyle="--", alpha=0.7)
+
+ # Plot TFLOPS comparison for optimized kernels
+ mg_tflops = (
+ mg_results[mg_results["implementation"] == "optimized"]
+ .groupby("config")["tflops"]
+ .mean()
+ )
+ ng_tflops = (
+ ng_results[ng_results["implementation"] == "optimized"]
+ .groupby("config")["tflops"]
+ .mean()
+ )
+
+ axes[1].bar(x - width / 2, mg_tflops, width, label="M*G Grouping")
+ axes[1].bar(x + width / 2, ng_tflops, width, label="N*G Grouping")
+ axes[1].set_xlabel("Model Configuration")
+ axes[1].set_ylabel("TFLOPS")
+ axes[1].set_title("Performance Comparison: M*G vs N*G")
+ axes[1].set_xticks(x)
+ axes[1].set_xticklabels(configs)
+ axes[1].legend()
+ axes[1].grid(axis="y", linestyle="--", alpha=0.7)
+
+ plt.tight_layout()
+ plt.savefig("mg_vs_ng_comparison.png")
+ logging.info("Comparison plot saved to 'mg_vs_ng_comparison.png'")
+
+ except Exception as e:
+ logging.error(f"Could not create comparison plot: {e}")
+ logging.info(
+ "Run both M*G and N*G benchmarks first to generate comparison plots"
+ )
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Benchmark M*G Grouped GEMM implementations"
+ )
+ parser.add_argument("--run-all", action="store_true", help="Run all benchmarks")
+ parser.add_argument(
+ "--triton-bench", action="store_true", help="Run Triton performance reports"
+ )
+ parser.add_argument(
+ "--model-configs", action="store_true", help="Benchmark model configurations"
+ )
+ parser.add_argument(
+ "--compare-mg-ng",
+ action="store_true",
+ help="Compare M*G and N*G implementations",
+ )
+ args = parser.parse_args()
+
+ # Check if CUDA is available
+ if not torch.cuda.is_available():
+ logging.error(
+ "CUDA is not available. This benchmark requires a CUDA-capable GPU."
+ )
+ exit(1)
+
+ if args.run_all or args.model_configs:
+ # Benchmark model configurations
+ logging.info("Running benchmark for model configurations...")
+ results = benchmark_model_configs()
+ plot_benchmark_results(results)
+
+ if args.run_all or args.triton_bench:
+ # Run Triton performance reports
+ logging.info("Running Triton performance reports...")
+ benchmark_forward.run(save_path="mg_grouped_gemm_benchmark_results")
+ benchmark_forward_groups.run(save_path="mg_grouped_gemm_benchmark_results")
+ benchmark_imbalance.run(save_path="mg_grouped_gemm_benchmark_results")
+ logging.info(
+ "Triton performance reports saved to 'mg_grouped_gemm_benchmark_results' directory"
+ )
+
+ if args.run_all or args.compare_mg_ng:
+ # Compare M*G and N*G implementations
+ logging.info("Comparing M*G and N*G implementations...")
+ compare_mg_implementations()
diff --git a/torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_forwards.py b/torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_forwards.py
new file mode 100644
index 0000000000000000000000000000000000000000..2429432d756ae4d5bb6f91a6108c7ba8a4b9c627
--- /dev/null
+++ b/torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_forwards.py
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+import logging
+import unittest
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+
+from mg_grouped_gemm import grouped_gemm_forward
+
+
+class TestMG_GroupedGEMM(unittest.TestCase):
+ def setUp(self) -> None:
+ torch.manual_seed(2020)
+
+ def _run_grouped_gemm_test(
+ self,
+ shape: Tuple[int, int, int, int],
+ device: torch.device,
+ dtype: torch.dtype = torch.bfloat16,
+ atol: float = 1e-5,
+ rtol: float = 1.6e-2,
+ ) -> None:
+ G, M, N, K = shape
+ # In M*G grouping, input is [M*G, K] and weights are [N*G, K]
+ a = torch.randn(M * G, K, dtype=dtype, device=device)
+ b = torch.randn(N * G, K, dtype=dtype, device=device)
+
+ # Create equal-sized groups for simplicity
+ m_size = M
+ m_sizes = torch.full((G,), m_size, device=device, dtype=torch.int32)
+
+ result = grouped_gemm_forward(a, b, m_sizes)
+ self.assertTrue(result.shape == (M * G, N))
+
+ expected_result = torch.zeros(M * G, N, dtype=dtype, device=device)
+ m_start = 0
+ for g in range(G):
+ m_end = m_start + m_sizes[g]
+ b_slice = b[N * g : N * (g+1), :]
+ expected_result[m_start:m_end, :] = a[m_start:m_end, :] @ b_slice.T
+ m_start = m_end
+
+ # Convert result to match input dtype if needed
+ result = result.to(dtype)
+ torch.testing.assert_close(result, expected_result, atol=atol, rtol=rtol)
+
+ def test_MG_grouped_gemm_bf16(self) -> None:
+ for G in (1, 4, 16):
+ for M in (128, 512, 1024):
+ print(f"Testing BF16 M*G GroupGeMM with G={G}, M={M}")
+ self._run_grouped_gemm_test(
+ (G, M, 1024, 1024),
+ torch.device("cuda"),
+ dtype=torch.bfloat16,
+ atol=1e-5,
+ rtol=1.6e-2,
+ )
+
+ def test_MG_grouped_gemm_deepseek_shapes(self) -> None:
+ """Test with shapes from Deepseek model."""
+ deepseek_shapes = [
+ (4, 2048, 4096, 7168), # G, M, N, K
+ (4, 2048, 7168, 2048),
+ (8, 512, 4096, 7168),
+ (8, 512, 7168, 2048),
+ ]
+
+ device = torch.device("cuda")
+
+ for shape in deepseek_shapes:
+ G, M, N, K = shape
+ print(f"Testing BF16 M*G Deepseek shape: G={G}, M={M}, N={N}, K={K}")
+ self._run_grouped_gemm_test(
+ shape, device, dtype=torch.bfloat16, atol=1e-5, rtol=1.6e-2
+ )
diff --git a/torchtitan/experiments/llama4/infra/__pycache__/parallelize_llama.cpython-312.pyc b/torchtitan/experiments/llama4/infra/__pycache__/parallelize_llama.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27907cdc4b5f16d3ba623e3576da6fea69e8ae47
Binary files /dev/null and b/torchtitan/experiments/llama4/infra/__pycache__/parallelize_llama.cpython-312.pyc differ
diff --git a/torchtitan/experiments/llama4/infra/parallelize_llama.py b/torchtitan/experiments/llama4/infra/parallelize_llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..72842fc04f896896772beca4ec7b50b0ce66a7b5
--- /dev/null
+++ b/torchtitan/experiments/llama4/infra/parallelize_llama.py
@@ -0,0 +1,159 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+import torch.nn as nn
+from torch.distributed.device_mesh import DeviceMesh
+
+from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
+from torchtitan.distributed import ParallelDims
+
+from torchtitan.models.llama3.parallelize_llama import (
+ apply_ac,
+ apply_compile,
+ apply_ddp,
+ apply_fsdp,
+ apply_tp,
+)
+from torchtitan.tools.logging import logger
+
+
+def parallelize_llama(
+ model: nn.Module,
+ world_mesh: DeviceMesh,
+ parallel_dims: ParallelDims,
+ job_config: JobConfig,
+):
+ """
+ Apply tensor parallelism, activation checkpointing, torch.compile, and data
+ parallelism to the model.
+
+ NOTE: The passed-in model preferably should be on meta device. Otherwise,
+ the model must fit on GPU or CPU memory.
+ """
+
+ if parallel_dims.tp_enabled:
+ if (
+ job_config.parallelism.enable_async_tensor_parallel
+ and not job_config.training.compile
+ ):
+ raise RuntimeError("Async TP requires --training.compile")
+
+ enable_float8_linear = "float8" in job_config.model.converters
+ float8_is_rowwise = job_config.float8.recipe_name in (
+ "rowwise",
+ "rowwise_with_gw_hp",
+ )
+
+ # For now, float8 all-gather with TP is only supported for tensorwise
+ # float8 scaling recipes. For rowwise recipes, we use regular TP and
+ # all-gather happens in high precision.
+ enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise
+
+ apply_tp(
+ model,
+ world_mesh["tp"],
+ loss_parallel=parallel_dims.loss_parallel_enabled,
+ enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
+ enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
+ )
+
+ apply_moe_tp(model, world_mesh["tp"])
+
+ if job_config.activation_checkpoint.mode != "none":
+ if (
+ job_config.activation_checkpoint.mode == "selective"
+ and job_config.model.use_flex_attn
+ ):
+ raise ValueError(
+ "FlexAttention is not compatible with selective AC yet. "
+ "See https://github.com/pytorch/pytorch/issues/147879"
+ )
+ apply_ac(model, job_config.activation_checkpoint)
+
+ # turn on per-TransformerBlock compile after AC wrapping and before FSDP
+ if job_config.training.compile:
+ apply_compile(model)
+
+ # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE
+ torch._dynamo.config.capture_scalar_outputs = True
+
+ if (
+ parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
+ ): # apply FSDP or HSDP, potentially with Context Parallel
+ if parallel_dims.dp_replicate_enabled:
+ dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
+ else:
+ dp_mesh_dim_names = ("dp_shard_cp",)
+
+ apply_fsdp(
+ model,
+ world_mesh[tuple(dp_mesh_dim_names)],
+ param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
+ reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
+ pp_enabled=parallel_dims.pp_enabled,
+ cpu_offload=job_config.training.enable_cpu_offload,
+ reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward,
+ )
+
+ if parallel_dims.dp_replicate_enabled:
+ logger.info("Applied HSDP to the model")
+ else:
+ logger.info("Applied FSDP to the model")
+
+ if parallel_dims.cp_enabled:
+ logger.info("Applied Context Parallel to the model")
+
+ if job_config.training.enable_cpu_offload:
+ logger.info("Applied CPU Offloading to the model")
+ elif parallel_dims.dp_replicate_enabled:
+ if world_mesh.ndim > 1:
+ raise RuntimeError("DDP has not supported > 1D parallelism")
+ apply_ddp(
+ model,
+ world_mesh,
+ enable_compile=job_config.training.compile,
+ enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
+ )
+
+ return model
+
+
+def apply_moe_tp(
+ model: nn.Module,
+ tp_mesh: DeviceMesh,
+):
+ from torch.distributed.tensor import Partial, Replicate, Shard
+ from torch.distributed.tensor.parallel import (
+ parallelize_module,
+ PrepareModuleInputOutput,
+ )
+
+ from .expert_parallel import NoParallel, TensorParallel
+
+ for _, transformer_block in model.layers.items():
+ moe_layer_plan = {
+ # input / output sharding on the seqlen dim
+ # all-gather for input, reduce-scatter for output
+ "moe": PrepareModuleInputOutput(
+ input_layouts=(Shard(1),),
+ desired_input_layouts=(Replicate(),),
+ use_local_input=True,
+ output_layouts=(Partial(),),
+ desired_output_layouts=(Shard(1),),
+ ),
+ # replicate computation for the router
+ "moe.router.gate": NoParallel(),
+ # input Replicate, output Partial
+ "moe.experts": TensorParallel(),
+ "moe.shared_expert": TensorParallel(),
+ }
+ parallelize_module(
+ module=transformer_block,
+ device_mesh=tp_mesh,
+ parallelize_plan=moe_layer_plan,
+ )
diff --git a/torchtitan/experiments/llama4/model/__pycache__/args.cpython-312.pyc b/torchtitan/experiments/llama4/model/__pycache__/args.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75650d462a816282cc77f1f0feb67300f3c708e6
Binary files /dev/null and b/torchtitan/experiments/llama4/model/__pycache__/args.cpython-312.pyc differ
diff --git a/torchtitan/experiments/llama4/model/__pycache__/model.cpython-312.pyc b/torchtitan/experiments/llama4/model/__pycache__/model.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..98abd5c8366e3acbf5fc8e031f0dcb5ba3225180
Binary files /dev/null and b/torchtitan/experiments/llama4/model/__pycache__/model.cpython-312.pyc differ
diff --git a/torchtitan/experiments/llama4/model/moe.py b/torchtitan/experiments/llama4/model/moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b925b36207875dedc13a16be10890c3671cdabb
--- /dev/null
+++ b/torchtitan/experiments/llama4/model/moe.py
@@ -0,0 +1,228 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from .args import TransformerModelArgs
+
+
+class GroupedExperts(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ hidden_dim: int,
+ num_experts: int,
+ ):
+ super().__init__()
+ self.num_experts = num_experts
+ self.w1 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
+ self.w2 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
+ self.w3 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ num_local_tokens_per_expert: torch.Tensor | None = None,
+ ) -> torch.Tensor:
+ if num_local_tokens_per_expert is not None:
+ # a tuple of tensors indexed by experts
+ # each with shape (tokens_per_expert(varying), dim)
+ x = torch.split(
+ x,
+ split_size_or_sections=num_local_tokens_per_expert.tolist(),
+ dim=0,
+ )
+ out_experts_splits = []
+ for expert_idx, x_expert in enumerate(x):
+ w1, w2, w3 = (
+ self.w1[expert_idx],
+ self.w2[expert_idx],
+ self.w3[expert_idx],
+ )
+ h = F.silu(torch.matmul(x_expert, w1))
+ h = h * torch.matmul(x_expert, w3)
+ h = torch.matmul(h, w2)
+ # h shape (tokens_per_expert(varying), dim)
+ out_experts_splits.append(h)
+ out = torch.cat(out_experts_splits, dim=0)
+
+ # TODO: optimize with GroupedGEMM
+ # https://github.com/pytorch/pytorch/pull/150374
+ # _grouped_mm requires shapes to be a multiple of 8
+ # offsets = torch.cumsum(num_local_tokens_per_expert, dim=0, dtype=torch.int32)
+ # h = F.silu(torch._grouped_mm(x, self.w1.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16))
+ # h = h * torch._grouped_mm(x, self.w3.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16)
+ # out = torch._grouped_mm(h, self.w2.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16)
+ else:
+ # x shape (num_experts, tokens_per_expert, dim)
+ h = F.silu(torch.bmm(x, self.w1))
+ h = h * torch.bmm(x, self.w3)
+ # out shape (num_experts, tokens_per_expert, dim)
+ out = torch.bmm(h, self.w2)
+ return out
+
+ def init_weights(self, init_std: float):
+ nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02)
+ nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std)
+ nn.init.trunc_normal_(self.w3, mean=0.0, std=init_std)
+
+
+class TokenChoiceTopKRouter(nn.Module):
+ """This class implements token-choice routing. In token-choice top-K routing, each token is
+ routed to top K experts based on the router scores.
+
+ Args:
+ gate (nn.Module): Gate module to calculate the scores, typically nn.Linear(dim, num_experts).
+ dim (int): Dimension of input tokens.
+ num_experts (int): Number of experts in each moe layer.
+ top_k (int): Number of experts each token will be routed to in token-choice routing.
+ use_sigmoid (bool): Whether to use sigmoid or softmax for router scores. Default is False.
+ """
+
+ def __init__(
+ self,
+ dim: int,
+ num_experts: int,
+ top_k: int,
+ use_sigmoid: bool = False,
+ ):
+ super().__init__()
+ self.gate = nn.Linear(dim, num_experts, bias=False)
+ self.num_experts = num_experts
+ self.top_k = top_k
+ self.use_sigmoid = use_sigmoid
+
+ def forward(
+ self, x: torch.Tensor
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Args:
+ x (torch.Tensor): Input tensor with shape ``(bs*slen, dim)``.
+
+ Returns:
+ top_scores (torch.Tensor):
+ Router scores of the selected experts, ordered by expert index, with shape ``(bs*slen*top_k,)``.
+ token_indices_experts_sorted (torch.Tensor):
+ Index of the source token for each entry of top_scores, with shape ``(bs*slen*top_k,)``.
+ num_local_tokens_per_expert (torch.Tensor):
+ Number of tokens assigned to each expert with shape ``(num_experts,)``.
+ """
+ # scores shape (bs*slen, num_experts)
+ scores = self.gate(x)
+
+ # By default, sigmoid or softmax is performed in float32 to avoid loss explosion
+ if self.use_sigmoid:
+ scores = torch.sigmoid(scores.to(torch.float32)).to(x.dtype)
+ else:
+ scores = F.softmax(scores.to(torch.float32), dim=1).to(x.dtype)
+
+ # top scores shape (bs*slen, top_k)
+ top_scores, selected_experts_indices = torch.topk(scores, k=self.top_k, dim=1)
+ # top_scores /= top_scores.sum(dim=-1, keepdim=True).to(x.dtype)
+
+ # group tokens together by expert indices from 0 to num_experts and pass that to experts forward
+ num_local_tokens_per_expert = torch.histc(
+ selected_experts_indices.view(-1),
+ bins=self.num_experts,
+ min=0,
+ max=self.num_experts,
+ )
+ # token_indices_experts_sorted shape (bs*slen*top_k,)
+ token_indices_experts_sorted = torch.argsort(
+ selected_experts_indices.view(-1), stable=True
+ )
+ top_scores = top_scores.view(-1)[token_indices_experts_sorted]
+ token_indices_experts_sorted = token_indices_experts_sorted // self.top_k
+
+ return top_scores, token_indices_experts_sorted, num_local_tokens_per_expert
+
+ def init_weights(self, init_std: float):
+ nn.init.trunc_normal_(self.gate.weight, mean=0.0, std=init_std)
+
+
+# TODO: implement load balancing auxiliary loss for token-choice routing
+class MoE(nn.Module):
+ def __init__(self, model_args: TransformerModelArgs):
+ super().__init__()
+ dim = model_args.dim
+ hidden_dim = 4 * model_args.dim
+ ffn_dim_multiplier = model_args.ffn_dim_multiplier
+ hidden_dim = int(2 * hidden_dim / 3)
+ if ffn_dim_multiplier is not None:
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+
+ num_experts = model_args.num_experts
+
+ hidden_dim_denom = 1
+ if model_args.auto_scale_hidden_dim:
+ hidden_dim_denom = model_args.top_k + int(model_args.use_shared_expert)
+
+ if model_args.auto_scale_hidden_dim:
+ hidden_dim = int(hidden_dim / hidden_dim_denom)
+ hidden_dim += -hidden_dim % model_args.multiple_of
+
+ self.experts = GroupedExperts(
+ dim=dim, hidden_dim=hidden_dim, num_experts=num_experts
+ )
+ self.router = TokenChoiceTopKRouter(
+ dim=dim, num_experts=num_experts, top_k=model_args.top_k
+ )
+ self.shared_expert = (
+ GroupedExperts(dim=dim, hidden_dim=hidden_dim, num_experts=1)
+ if model_args.use_shared_expert
+ else None
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ x (torch.Tensor): Input tensor with shape ``(bs, slen, dim)``.
+
+ Returns:
+ out (torch.Tensor): Output tensor with shape ``(bs, slen, dim)``.
+ """
+ bs, slen, dim = x.shape
+ # top_scores and token_indices shape (bs*slen*top_k,)
+ # num_local_tokens_per_expert shape (num_experts,)
+ (
+ top_scores,
+ token_indices,
+ num_local_tokens_per_expert,
+ ) = self.router(x.reshape(bs * slen, dim))
+
+ # shape (bs*slen*top_k, dim)
+ token_indices = token_indices.reshape(-1, 1).expand(-1, dim)
+
+ # shape (bs*slen*top_k, dim)
+ routed_input = torch.gather(
+ x.view(-1, dim),
+ dim=0,
+ index=token_indices,
+ )
+ routed_input = routed_input * top_scores.reshape(-1, 1)
+
+ # shape (bs*slen*top_k, dim)
+ routed_output = self.experts(routed_input, num_local_tokens_per_expert)
+
+ # shared expert
+ if self.shared_expert is not None:
+ out = self.shared_expert(x.reshape(1, bs * slen, dim)).reshape(
+ bs * slen, dim
+ )
+ else:
+ out = torch.zeros_like(x.reshape(bs * slen, dim))
+
+ out = out.scatter_add(dim=0, index=token_indices, src=routed_output)
+ out = out.reshape(bs, slen, dim)
+ return out
+
+ def init_weights(self, init_std: float):
+ self.experts.init_weights(init_std)
+ self.router.init_weights(init_std)
+ if self.shared_expert is not None:
+ self.shared_expert.init_weights(init_std)
diff --git a/torchtitan/experiments/llama4/scripts/convert_hf_to_dcp_with_gpus.sh b/torchtitan/experiments/llama4/scripts/convert_hf_to_dcp_with_gpus.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6530b8ce992c8c33ccec94614e026d73964710ee
--- /dev/null
+++ b/torchtitan/experiments/llama4/scripts/convert_hf_to_dcp_with_gpus.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+set -ex
+
+# use envs as local overrides for convenience
+# e.g.
+# LOG_RANK=0,1 NGPU=4 ./convert_hf_to_dcp_with_gpus.sh
+NGPU=${NGPU:-"8"}
+LOG_RANK=${LOG_RANK:-0,1,2,3,4,5,6,7}
+CONFIG_FILE=${CONFIG_FILE:-"../train_configs/llama4_17bx16e.toml"}
+
+overrides=""
+if [ $# -ne 0 ]; then
+ overrides="$*"
+fi
+
+PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
+torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
+--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
+convert_hf_to_dcp_with_gpus.py --job.config_file ${CONFIG_FILE} $overrides
diff --git a/torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.py b/torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.py
new file mode 100644
index 0000000000000000000000000000000000000000..7756afe3de1527f469a38fc6a0bdc6c62eaa2526
--- /dev/null
+++ b/torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.py
@@ -0,0 +1,536 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import os
+import time
+from dataclasses import dataclass
+from typing import Any, Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed.tensor import DeviceMesh, distribute_tensor, DTensor, Shard
+from torch.distributed.tensor._utils import compute_local_shape_and_global_offset
+from torchtitan.components.checkpoint import MODEL
+from torchtitan.config_manager import JobConfig
+from torchtitan.tools.logging import init_logger, logger
+from torchtitan.train import Trainer
+
+# Sharding dims for MP checkpoints
+
+column_parallel = [
+ "tok_embeddings",
+ "wq",
+ "wk",
+ "wv",
+ "wqkv",
+ "w_in_shared_FD",
+ "w_out_eF_D",
+ "w_swiglu_FD",
+ "output",
+ "_linear",
+ "c_fc",
+ "vision_projection",
+]
+
+row_parallel = [
+ "wo",
+ "w_out_shared_DF",
+ "w_in_eD_F",
+ "moe_w_swiglu_eD_F",
+ "c_proj",
+]
+
+
+def convert_to_titan_fqns(fqn: str) -> list[str]:
+ # From the stored checkpoint keys to TorchTitan keys.
+ if "wqkv" in fqn and "layer_norm_weight" not in fqn:
+ ret = []
+ for k in ("wq", "wk", "wv"):
+ ret.append(fqn.replace("wqkv", k))
+ return ret
+ return [fqn]
+
+
+def get_shard_dim(fqn: str) -> Optional[int]:
+ if "bias" in fqn:
+ # Some bias params are still sharded
+ if "resblocks" in fqn:
+ for k in ("wq", "wk", "wv", "c_fc"):
+ if k in fqn:
+ return 0
+ return None
+ elif any([x in fqn for x in column_parallel]):
+ return 0
+ elif any([x in fqn for x in row_parallel]):
+ return 1
+ else:
+ return None
+
+
+def split_fused_qkv(shards: list[torch.Tensor]) -> tuple[torch.Tensor, ...]:
+ qkvs = [torch.split(shard, [640, 128, 128]) for shard in shards]
+ q = torch.cat([qkv[0] for qkv in qkvs], dim=0)
+ k = torch.cat([qkv[1] for qkv in qkvs], dim=0)
+ v = torch.cat([qkv[2] for qkv in qkvs], dim=0)
+ return q, k, v
+
+
+@dataclass
+class _Assignment:
+ loader_id: int
+ filename: str
+ fqns: tuple[str, ...]
+ shapes: tuple[torch.Size, ...]
+ dtypes: tuple[torch.dtype, ...]
+
+
+@dataclass
+class _AssignmentRound:
+ loader_assignments: dict[int, _Assignment] # Maps each loader id to its assignment for this round
+
+
+class CheckpointConverter:
+ TOTAL_SHARDS = 8
+
+ def __init__(
+ self,
+ process_group: dist.ProcessGroup,
+ path: str,
+ loader_every_n_ranks: int = 8,
+ ) -> None:
+ self.path = path
+ self.pg = process_group
+ self.my_rank = dist.get_rank(self.pg)
+ self.loader_every_n_ranks = loader_every_n_ranks
+ self.loader_id = self.my_rank // loader_every_n_ranks
+ self.should_load = (
+ self.my_rank % loader_every_n_ranks == 0
+ and self.loader_id < CheckpointConverter.TOTAL_SHARDS
+ )
+ self.total_loader = CheckpointConverter.TOTAL_SHARDS
+ self.titan_fqn_to_stored_fqn: dict[str, str] = {}
+ self.stored_fqn_to_titan_fqn: dict[str, list[str]] = {}
+ self.total_send_bytes = 0
+ self.total_recv_bytes = 0
+
+ def convert(self, state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+ begin = time.time()
+ self._load_metadata()
+ self._create_fqn_mappings(state_dict)
+ rounds = self._get_load_assignments(state_dict)
+
+ for assignments in rounds:
+ loader_assignments = assignments.loader_assignments
+ loaded_state_dict = None
+ # Let each loader load its own data and move it to its GPU.
+ for i in range(self.total_loader):
+ # This loader doesn't have any loading assignment for this round.
+ if i not in loader_assignments:
+ continue
+ # This rank is not the loader
+ if i != self.loader_id or not self.should_load:
+ continue
+ loaded_state_dict = self._load_round(loader_assignments[i])
+
+ results = []
+ for i in range(self.total_loader):
+ if i not in loader_assignments:
+ continue
+
+ if i == self.loader_id and self.should_load:
+ # This rank is the loader. It needs to send the loaded data to
+ # the other ranks.
+ assert loaded_state_dict is not None
+ results.append(
+ self._reshard_send(loader_assignments[i], loaded_state_dict)
+ )
+ else:
+ results.append(
+ self._reshard_receive(loader_assignments[i], state_dict)
+ )
+
+ self._reshard(results, state_dict)
+
+ torch.cuda.synchronize()
+ logger.info(f"Checkpoint conversion took {time.time() - begin:.2f} seconds.")
+ logger.info(f"Total send bytes: {self.total_send_bytes / 1e9:.2f} GB")
+ logger.info(f"Total recv bytes: {self.total_recv_bytes / 1e9:.2f} GB")
+ return state_dict
+
+ def _get_file_path(self, loader_id: int) -> str:
+ return os.path.join(self.path, f"consolidated.0{loader_id}.pth")
+
+ def _load_metadata(self) -> None:
+ if not self.should_load:
+ self.read_dict = {}
+ return
+ self.read_dict = torch.load(
+ self._get_file_path(self.loader_id),
+ mmap=True,
+ weights_only=False,
+ )
+
+ def _create_fqn_mappings(self, state_dict: dict[str, torch.Tensor]) -> None:
+ if not self.read_dict:
+ return
+
+ # Create the mapping from the stored checkpoint keys to TorchTitan keys.
+ for fqn in list(self.read_dict.keys()):
+ titan_fqns = convert_to_titan_fqns(fqn)
+ # We don't know how to process _extra_state
+ if "_extra_state" in fqn:
+ self.read_dict.pop(fqn)
+ continue
+
+ if titan_fqns[0] not in state_dict:
+ for titan_fqn in titan_fqns:
+ assert titan_fqns[0] not in state_dict
+ self.read_dict.pop(fqn)
+ continue
+ self.stored_fqn_to_titan_fqn[fqn] = titan_fqns
+ for titan_fqn in titan_fqns:
+ self.titan_fqn_to_stored_fqn[titan_fqn] = fqn
+
+ assert set(state_dict.keys()) == set(self.titan_fqn_to_stored_fqn.keys()), (
+ set(state_dict.keys()) - set(self.titan_fqn_to_stored_fqn.keys()),
+ set(self.titan_fqn_to_stored_fqn.keys()) - set(state_dict.keys()),
+ )
+
+ def _get_load_assignments(
+ self, state_dict: dict[str, torch.Tensor]
+ ) -> list[_AssignmentRound]:
+ if self.my_rank == 0:
+ rounds: list[_AssignmentRound] = []
+ size = 0
+ fqns = []
+ shapes = []
+ dtypes = []
+
+ # All loaders must load all the FQNs because the checkpoint is purely TP sharded.
+ all_keys = list(self.read_dict.keys())
+ for fqn in all_keys:
+ fqns.append(fqn)
+ shapes.append(self.read_dict[fqn].shape)
+ dtypes.append(self.read_dict[fqn].dtype)
+ size += self.read_dict[fqn].numel() * self.read_dict[fqn].element_size()
+ if size < 1e9 and fqn != all_keys[-1]:
+ continue
+
+ logger.info(f"Adding {fqns} to round {len(rounds)}")
+ round_assignment = _AssignmentRound(loader_assignments={})
+ for loader_id in range(self.total_loader):
+ path = self._get_file_path(loader_id)
+ round_assignment.loader_assignments[loader_id] = _Assignment(
+ filename=path,
+ fqns=tuple(fqns),
+ shapes=tuple(shapes),
+ dtypes=tuple(dtypes),
+ loader_id=loader_id,
+ )
+ rounds.append(round_assignment)
+ size = 0
+ fqns.clear()
+ shapes.clear()
+ dtypes.clear()
+
+ object_list: list[Any] = [
+ rounds,
+ self.titan_fqn_to_stored_fqn,
+ self.stored_fqn_to_titan_fqn,
+ ]
+ else:
+ object_list = [None, None, None]
+
+ dist.broadcast_object_list(object_list, src=0, group=self.pg)
+ rounds = object_list[0]
+ self.titan_fqn_to_stored_fqn = object_list[1]
+ self.stored_fqn_to_titan_fqn = object_list[2]
+ return rounds
+
+ def _load_round(self, assignment: _Assignment) -> dict[str, torch.Tensor]:
+ ret = {}
+ assert self.read_dict
+ for fqn in assignment.fqns:
+ ret[fqn] = self.read_dict[fqn].to(device="cuda")
+ return ret
+
+ def _reshard_send(
+ self,
+ assignment: _Assignment,
+ loaded_state_dict: dict[str, torch.Tensor],
+ ) -> dict[str, torch.Tensor]:
+ flatten_tensors = [t.flatten() for t in loaded_state_dict.values()]
+ flatten_tensor = torch.concat(flatten_tensors)
+ assert self.loader_id == assignment.loader_id
+ rank = self.loader_id * self.loader_every_n_ranks
+ assert rank == self.my_rank
+ logger.info(f"Sending {assignment.filename} from {rank} {self.loader_id}")
+ logger.info(f"Sending {assignment.fqns}")
+ dist.broadcast(flatten_tensor, src=rank, group=self.pg)
+ self.total_send_bytes += flatten_tensor.numel() * flatten_tensor.element_size()
+ return loaded_state_dict
+
+ def _reshard_receive(
+ self, assignment: _Assignment, state_dict: dict[str, torch.Tensor]
+ ) -> dict[str, torch.Tensor]:
+ flatten_tensor = torch.empty(
+ sum(math.prod(s) for s, d in zip(assignment.shapes, assignment.dtypes)),
+ dtype=assignment.dtypes[0],
+ device="cuda",
+ )
+ rank = assignment.loader_id * self.loader_every_n_ranks
+ dist.broadcast(flatten_tensor, src=rank, group=self.pg)
+ self.total_recv_bytes += flatten_tensor.numel() * flatten_tensor.element_size()
+
+ ret: dict[str, torch.Tensor] = {}
+ loc = 0
+ for fqn, shape, dtype in zip(
+ assignment.fqns, assignment.shapes, assignment.dtypes
+ ):
+ n_ele = math.prod(shape)
+ ret[fqn] = flatten_tensor[loc : loc + n_ele].view(shape)
+ loc += n_ele
+ return ret
+
+ def _reshard(
+ self,
+ results: list[dict[str, torch.Tensor]],
+ state_dict: dict[str, torch.Tensor],
+ ) -> None:
+ def _inplace_copy(fqn: str, full_tensors: tuple[torch.Tensor, ...]):
+     """Copy the locally-owned slice of each full tensor into its DTensor."""
+     titan_fqns = self.stored_fqn_to_titan_fqn[fqn]
+     assert len(titan_fqns) == len(full_tensors)
+     for titan_fqn, full_tensor in zip(titan_fqns, full_tensors):
+         dtensor = state_dict[titan_fqn]
+         logger.info(f"{titan_fqn} {full_tensor.sum()}")
+         assert isinstance(dtensor, DTensor)
+         shape, offset = compute_local_shape_and_global_offset(
+             full_tensor.shape, dtensor.device_mesh, dtensor.placements
+         )
+         slices = [slice(o, o + s) for s, o in zip(shape, offset)]
+         logger.info(
+             f"Copying {titan_fqn} with {slices=} {dtensor._local_tensor.shape=} "
+             f"{shape=} {offset=} {self.my_rank=} {dtensor.shape=} {full_tensor.shape=} "
+             f"{dtensor.placements=} {dtensor.device_mesh=} "
+         )
+         # Index with a tuple: indexing a tensor with a *list* of slices is
+         # deprecated in PyTorch and is slated to change meaning.
+         dtensor.to_local().copy_(full_tensor[tuple(slices)])
+
+ def _concat_shards(fqn, shards: list[torch.Tensor]) -> tuple[torch.Tensor, ...]:
+ if "wqkv" in fqn:
+ if "layer_norm" in fqn:
+ return (shards[0],)
+ return split_fused_qkv(shards)
+
+ shard_dim = get_shard_dim(fqn)
+ if shard_dim is None:
+ return (shards[0],)
+ return (torch.cat(shards, dim=shard_dim),)
+
+ fqns = list(results[0].keys())
+ for result in results:
+ assert list(result.keys()) == fqns
+
+ for fqn in fqns:
+ full_tensors = _concat_shards(fqn, [result[fqn] for result in results])
+ _inplace_copy(fqn, full_tensors)
+
+
+def _create_verified_state_dict(
+ pg: dist.ProcessGroup, mesh: DeviceMesh
+) -> dict[str, torch.Tensor]:
+ placements = [Shard(0)]
+ state_dict = {
+ "tok_embeddings.weight": torch.rand(
+ 25256 * 8, 5120, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.attention.wqkv.layer_norm_weight": torch.rand(
+ 5120, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.attention.wq.weight": torch.rand(
+ 640 * 8, 5120, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.attention.wk.weight": torch.rand(
+ 128 * 8, 5120, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.attention.wv.weight": torch.rand(
+ 128 * 8, 5120, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.attention.wo.weight": torch.rand(
+ 5120, 640 * 8, device="cuda", dtype=torch.bfloat16
+ ),
+ # "layers.47.feed_forward.router_DE": torch.rand(5120, 128, device="cuda", dtype=torch.bfloat16),
+ # "layers.47.feed_forward.running_gate_stats_3E": torch.rand(3, 128, device="cuda", dtype=torch.bfloat16),
+ # "layers.47.feed_forward.global_gate_stats_3E": torch.rand(3, 128, device="cuda", dtype=torch.bfloat16),
+ "layers.47.feed_forward.w_in_shared_FD.weight": torch.rand(
+ 1024 * 8, 5120, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.feed_forward.w_out_shared_DF.weight": torch.rand(
+ 5120, 1024 * 8, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.feed_forward.w_swiglu_FD.weight": torch.rand(
+ 1024 * 8, 5120, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.feed_forward.norm.weight": torch.rand(
+ 5120, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.feed_forward.experts.moe_w_in_eD_F": torch.rand(
+ 655360, 1024 * 8, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.feed_forward.experts.moe_w_out_eF_D": torch.rand(
+ 131072 * 8, 5120, device="cuda", dtype=torch.bfloat16
+ ),
+ "layers.47.feed_forward.experts.moe_w_swiglu_eD_F": torch.rand(
+ 655360, 1024 * 8, device="cuda", dtype=torch.bfloat16
+ ),
+ }
+ return {k: distribute_tensor(v, mesh, placements) for k, v in state_dict.items()}
+
+
+def _verify_state_dict(
+ state_dict: dict[str, torch.Tensor], path: str, rank: int
+) -> None:
+ stored_state_dicts = [
+ torch.load(
+ os.path.join(path, f"consolidated.0{i}.pth"),
+ map_location="cpu",
+ weights_only=False,
+ mmap=True,
+ )
+ for i in range(8)
+ ]
+
+ def read_and_verify_tensor(fqn: str, dtensor: DTensor) -> None:
+     """Compare the gathered DTensor for ``fqn`` against the 8 stored shards."""
+     logger.info(f"Verifying {fqn} {dtensor.shape=} {dtensor.placements=} ")
+     shards = [stored_state_dicts[i][fqn] for i in range(8)]
+     full_tensor = dtensor.full_tensor()  # NOTE(review): presumably a collective, hence before the rank check
+     logger.info(f"Gather {fqn} {full_tensor.shape} completely.")
+     if rank > 0:  # only rank 0 performs the comparison
+         return
+
+     if len(shards[0].shape) == 1:
+         assert full_tensor.shape == shards[0].shape, fqn
+         assert torch.allclose(shards[0].to(device="cuda"), full_tensor), fqn
+         return
+     elif shards[0].shape[0] == full_tensor.shape[0]:
+         concat_shards = torch.cat(shards, dim=1)
+     elif shards[0].shape[1] == full_tensor.shape[1]:
+         concat_shards = torch.cat(shards, dim=0)
+     else:  # neither dim lines up: fail here instead of an UnboundLocalError below
+         raise ValueError(f"Cannot infer shard dim for {fqn}: {shards[0].shape=} {full_tensor.shape=}")
+     logger.info(f"Load {fqn} completely.")
+     concat_shards = concat_shards.to(device="cuda")
+     logger.info(f"Move to GPU {fqn} completely.")
+
+     assert concat_shards.shape == full_tensor.shape, fqn
+     assert concat_shards.dtype == full_tensor.dtype, fqn
+     assert concat_shards.device == full_tensor.device, fqn
+     assert torch.allclose(concat_shards, full_tensor), fqn
+
+ for k, v in state_dict.items():
+ if "wq" in k and "wqkv" not in k:
+ pass
+ elif "wk" in k:
+ pass
+ elif "wv" in k:
+ pass
+ else:
+ assert v is not None, k
+ read_and_verify_tensor(k, v)
+
+
+if __name__ == "__main__":
+ init_logger()
+ config = JobConfig()
+ config.parser.add_argument(
+ "--checkpoint.convert_path",
+ type=str,
+ default="",
+ help="""Specify the path of the target checkpoint to convert.""",
+ )
+ config.parser.add_argument(
+ "--checkpoint.convert_load_every_n_ranks",
+ type=int,
+ default=8,
+ help="""
+ Specify the interval at which ranks are assigned to load checkpoints.
+
+ For example, if this number is 4, then ranks 0, 4, 8, ... will load the
+ checkpoint. Each loader is responsible for loading one file. If there
+ are more loaders than files, only the first few loaders will be assigned
+ to load the checkpoint. The default value is 8.
+ """,
+ )
+ config.parser.add_argument(
+ "--checkpoint.fake_model",
+ action="store_true",
+ help="""If true, the model will be fake.""",
+ )
+ config.parse_args()
+ assert config.checkpoint.convert_path != ""
+
+ trainer: Optional[Trainer] = None
+
+ try:
+ trainer = Trainer(config)
+ if os.path.exists(trainer.checkpointer.folder):
+ raise RuntimeError(
+ "The checkpoint folder already exists. Abort to avoid overwriting "
+ f"the checkpoint. {trainer.checkpointer.folder=}"
+ )
+ if config.checkpoint.fake_model:
+ state_dict = _create_verified_state_dict(
+ trainer.world_mesh.get_group(), trainer.world_mesh
+ )
+ else:
+ state_dict = trainer.checkpointer.states[MODEL].state_dict()
+
+ size = 0
+ for v in state_dict.values():
+ size += v.numel() * v.element_size()
+ logger.info(f"Total size of the model: {size / 1e9:.2f} GB")
+
+ # Do not support PP yet, we will need to iterate over the PP dimension and
+ # extract the corresponding state_dict and device_mesh.
+ if "freqs_cis" in state_dict:  # guard key was misspelled "freq_cis", so the pop never ran
+     state_dict.pop("freqs_cis")
+
+ state_dict = CheckpointConverter(
+ process_group=trainer.world_mesh.get_group(),
+ path=config.checkpoint.convert_path,
+ loader_every_n_ranks=config.checkpoint.convert_load_every_n_ranks,
+ ).convert(state_dict)
+
+ class DummyModel:
+ def __init__(self, state_dict: dict[str, torch.Tensor]) -> None:
+ self._state_dict = state_dict
+
+ def state_dict(self) -> dict[str, torch.Tensor]:
+ return self._state_dict
+
+ if config.checkpoint.fake_model:
+ begin = time.time()
+ _verify_state_dict(
+ state_dict,
+ config.checkpoint.convert_path,
+ trainer.world_mesh.get_rank(),
+ )
+ dist.barrier()
+ logger.info(f"Verifies state_dict {time.time() - begin}.")
+ else:
+ # oh, this is pretty bad, when can we get rid of the freqs_cis issue?
+ state_dict["freqs_cis"] = None
+ trainer.checkpointer.states[MODEL] = DummyModel(state_dict)
+ trainer.checkpointer.model_weights_only = True
+ trainer.checkpointer.export_dtype = next(iter(state_dict.values())).dtype
+ trainer.checkpointer.save(curr_step=0, force=True)
+ time.sleep(2)
+ finally:
+ pass
diff --git a/torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.sh b/torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f3fd310934b1181ed83fa9fc4463f0c2336b46fc
--- /dev/null
+++ b/torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+# use envs as local overrides for convenience
+# e.g.
+# LOG_RANK=0,1 NGPU=4 ./convert_meta_to_dcp_with_gpus.sh
+NGPU=${NGPU:-"8"}
+LOG_RANK=${LOG_RANK:-0,1,2,3,4,5,6,7}
+CONFIG_FILE=${CONFIG_FILE:-"../train_configs/llama4_17bx16e.toml"}
+
+overrides=""
+if [ $# -ne 0 ]; then
+ overrides="$*"
+fi
+
+PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
+torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
+--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
+convert_meta_to_dcp_with_gpus_meta.py --job.config_file ${CONFIG_FILE} $overrides
diff --git a/torchtitan/experiments/multimodal/__pycache__/__init__.cpython-312.pyc b/torchtitan/experiments/multimodal/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bcece22f4ab407a7702d83af4b95ac824c05cc45
Binary files /dev/null and b/torchtitan/experiments/multimodal/__pycache__/__init__.cpython-312.pyc differ
diff --git a/torchtitan/experiments/multimodal/tests/__init__.py b/torchtitan/experiments/multimodal/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e41cd717f6a439a9c08d76a9d0e4a54e190fc5a
--- /dev/null
+++ b/torchtitan/experiments/multimodal/tests/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/torchtitan/experiments/multimodal/tests/test_multimodal_model.py b/torchtitan/experiments/multimodal/tests/test_multimodal_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5acc51bb3d186674267a4fc47d9075f04122a60
--- /dev/null
+++ b/torchtitan/experiments/multimodal/tests/test_multimodal_model.py
@@ -0,0 +1,128 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pytest
+import torch
+
+from torchtitan.experiments.llama_multimodal import (
+ ModelArgs,
+ MultimodalDecoder,
+ VisionEncoder,
+)
+
+from .test_utils import fixed_init_model, fixed_init_tensor
+
+
+@pytest.fixture
+def encoder_config():
+ return ModelArgs(
+ encoder_embed_dim=32,
+ encoder_num_layers=2,
+ encoder_num_heads=4,
+ tile_size=49,
+ patch_size=9,
+ max_num_tiles=4,
+ in_channels=3,
+ return_intermediates=[0, 1],
+ num_layers_projection=2,
+ decoder_embed_dim=128,
+ )
+
+
+@pytest.fixture
+def decoder_config():
+ return ModelArgs(
+ decoder_embed_dim=512,
+ vocab_size=10000,
+ fusion_interval=2,
+ num_special_tokens=3,
+ decoder_num_layers=6,
+ decoder_num_heads=8,
+ decoder_num_kv_heads=4,
+ max_seq_len=512,
+ rope_theta=50000.0,
+ )
+
+
+class TestMultimodalModelVisionEncoder:
+ @pytest.fixture(autouse=True)
+ def setup_class(self, encoder_config):
+ self.model_args = encoder_config
+ self.batch_size = 1
+ self.num_imgs = 2
+ self.num_tiles = 4
+ self.aspect_ratio = torch.tensor([[1, 3], [2, 2]]).reshape(
+ self.batch_size, self.num_imgs, 2
+ )
+ image = torch.rand(
+ (
+ self.batch_size,
+ self.num_imgs,
+ self.num_tiles,
+ self.model_args.in_channels,
+ self.model_args.tile_size,
+ self.model_args.tile_size,
+ )
+ )
+ self.image = fixed_init_tensor(image.shape, min_val=-1, max_val=1)
+
+ def test_llama_mm_vision_encoder(self):
+ model = VisionEncoder(self.model_args)
+ fixed_init_model(model, min_val=-1, max_val=1)
+ output = model(self.image, self.aspect_ratio)
+ expected_shape = (
+ self.batch_size,
+ self.num_imgs * self.num_tiles * (model.vit.patches_per_tile + 1),
+ self.model_args.decoder_embed_dim,
+ )
+ assert (
+ output.shape == expected_shape
+ ), f"Expected shape {expected_shape}, but got {output.shape}"
+
+ # TODO: Need to ensure numerical stability before doing convergence test.
+ # output.mean() = 3.994, we need to debug why it is not close to 5.28800, which is
+ # the test value from the original torch tune test
+ # assert torch.allclose(
+ # output.mean(), torch.tensor(5.28800), atol=1e-3, rtol=1e-3
+ # )
+
+
+class TestMultimodalModelDecoder:
+ @pytest.fixture(autouse=True)
+ def setup_class(self, decoder_config):
+ self.model_args = decoder_config
+ self.batch_size = 1
+ self.decoder_embed_dim = self.model_args.decoder_embed_dim
+ self.vocab_size = self.model_args.vocab_size
+ self.seq_len = 128
+ self.input = {
+ "tokens": torch.arange(self.batch_size * self.seq_len).reshape(
+ self.batch_size, self.seq_len
+ ),
+ "encoder_input": fixed_init_tensor(
+ (self.batch_size, self.seq_len, self.decoder_embed_dim),
+ min_val=-1,
+ max_val=1,
+ ),
+ "encoder_mask": None,
+ }
+
+ @torch.no_grad()
+ def test_llama_mm_decoder(self):
+ model = MultimodalDecoder(self.model_args)
+ fixed_init_model(model, min_val=-1, max_val=1)
+ output = model(**self.input)
+ expected_shape = (self.batch_size, self.seq_len, self.vocab_size)
+ assert (
+ output.shape == expected_shape
+ ), f"Expected shape {expected_shape}, but got {output.shape}"
+
+ # TODO: Need to ensure numerical stability before doing convergence test.
+ # output.mean() = -0.0134, we need to debug why it is not close to -9.47548e-5, which is
+ # the test value from the original torch tune test
+ # assert torch.allclose(
+ # output.mean(), torch.tensor(-9.47548e-5), atol=1e-3, rtol=1e-3
+ # )
diff --git a/torchtitan/experiments/multimodal/tests/test_utils.py b/torchtitan/experiments/multimodal/tests/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c3817db8699966a8d848ad744ccd6b6dabb3836
--- /dev/null
+++ b/torchtitan/experiments/multimodal/tests/test_utils.py
@@ -0,0 +1,58 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+
+def fixed_init_tensor(
+ shape: torch.Size,
+ min_val: Union[float, int] = 0.0,
+ max_val: Union[float, int] = 1.0,
+ nonlinear: bool = False,
+ dtype: torch.dtype = torch.float,
+):
+ """
+ Utility for generating deterministic tensors of a given shape. In general stuff
+ like torch.ones, torch.eye, etc can result in trivial outputs. This utility
+ generates a range tensor [min_val, max_val) of a specified dtype, applies
+ a sine function if nonlinear=True, then reshapes to the appropriate shape.
+ """
+ n_elements = math.prod(shape)
+ step_size = (max_val - min_val) / n_elements
+ x = torch.arange(min_val, max_val, step_size, dtype=dtype)
+ x = x.reshape(shape)
+ if nonlinear:
+ return torch.sin(x)
+ return x
+
+
+@torch.no_grad
+def fixed_init_model(
+ model: nn.Module,
+ min_val: Union[float, int] = 0.0,
+ max_val: Union[float, int] = 1.0,
+ nonlinear: bool = False,
+ dtype: Optional[torch.dtype] = None,
+):
+ """
+ This utility initializes all parameters of a model deterministically using the
+ function fixed_init_tensor above. See that docstring for details of each parameter.
+ """
+ for _, param in model.named_parameters():
+ param.copy_(
+ fixed_init_tensor(
+ param.shape,
+ min_val=min_val,
+ max_val=max_val,
+ nonlinear=nonlinear,
+ dtype=param.dtype if dtype is None else dtype,
+ )
+ )
diff --git a/torchtitan/experiments/multimodal/transform.py b/torchtitan/experiments/multimodal/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecb0f989acd0b818f20116a60813c26e68438cec
--- /dev/null
+++ b/torchtitan/experiments/multimodal/transform.py
@@ -0,0 +1,185 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, List, Mapping, Optional, Tuple
+
+import torch
+
+import torchvision
+from torchvision.transforms.v2 import functional as F
+
+from utils import (
+ find_supported_resolutions,
+ get_canvas_best_fit,
+ resize_with_pad,
+ tile_crop,
+)
+
+from torchtitan.tools.logging import logger
+
+
+class CLIPTransform:
+ """
+ This class accepts images of any size and dynamically resizes, pads, normalizes and tiles it
+ based on the image aspect ratio and the number of image tiles we allow.
+
+ The algorithm will NOT distort the image to fit a certain aspect ratio, because
+ that leads to a significant degradation in image quality.
+
+ The user can choose if they want to allow upscaling by using the flag ``resize_to_max_canvas``.
+
+ For example, if an input image is of size 300x800, and we want to allow
+ a maximum of 16 image tiles, with side 224px, then:
+
+ If ``resize_to_max_canvas=False``, then:
+ best_resolution = (448, 896) -> smallest canvas, up to 16 tiles, that doesn't require downscaling
+ image is NOT resized
+ image is padded (300, 800) -> 448,896
+ Image is tiled 2x4, for a final output shape of (8, 3, 224, 224)
+
+ If ``resize_to_max_canvas=True``, then:
+ best_resolution = (448, 1344) # canvas that allows maximum upscaling, with minimum padding, up to 16 tiles
+ image is resized without distortion (300,800) -> (448, 1194) #448 is the limiting side for the resize
+ image is padded (448, 1194) -> (448, 1344)
+ Image is tiled 2x6, for a final output shape of (12, 3, 224, 224)
+
+ Args:
+ image_mean (Optional[List[float]]): Mean values of each channel, used for normalization.
+ Should be the same used for the pre-trained model. If None, no normalization is performed. Default None.
+ image_std (Optional[List[float]]): Standard deviation values of each channel, used for normalization.
+ Should be the same used for the pre-trained model. If None, no normalization is performed. Default None.
+ possible_resolutions (Optional[List[Tuple[int, int]]]): List of possible resolutions as tuples (height, width).
+ where each tuple represents a possible canvas to fit the image into when calling ``get_canvas_best_fit``.
+ If None, this will be calculated using max_num_tiles and tile_size. Default None.
+ tile_size (int): Size of the tiles to divide the image into. Default 224.
+ max_num_tiles (Optional[int]): Only used if possible_resolutions is NOT given.
+ Maximum number of tiles to break an image into.
+ This will be used to generate possible_resolutions,
+ e.g. [(224, 224), (224, 448), (448, 224)] if max_num_tiles = 2 and tile_size = 224.
+ Default 4.
+ dtype (torch.dtype): Data type of the output image. Default torch.bfloat16.
+ resample (str): Resampling method used when resizing images. Supports any enum of
+ ``torchvision.transforms.InterpolationMode``, e.g. "nearest", "nearest_exact", "bilinear", "bicubic".
+ Default 'bilinear'.
+ resize_to_max_canvas (bool): If True, the image will be upscaled without distortion to fit the largest possible
+ resolution from possible_resolutions.
+ If False, it will pick the resolution that minimizes downscaling, including no downscaling at all.
+ In this case, the image will only be upscaled if its size < tile_size. Default False.
+
+ Examples:
+ >>> image_transform = CLIPTransform(
+ ... image_mean=None,
+ ... image_std=None,
+ ... tile_size=224,
+ ... possible_resolutions=None,
+ ... max_num_tiles=4,
+ ... resample="bilinear",
+ ... resize_to_max_canvas=True,
+ ...)
+ >>> # create random image
+ >>> image = (np.random.rand(100,200,3) * 255).astype(np.uint8)
+ >>> image = PIL.Image.fromarray(image)
+ >>> output = image_transform(image)
+ >>> output['image'].shape # [num_tiles, num_channels, tile_size, tile_size]
+ torch.Size([2, 3, 224, 224])
+ >>> output['aspect_ratio'] # image best fits the canvas 224x448
+ torch.tensor([1,2])
+ """
+
+ def __init__(
+ self,
+ *,
+ image_mean: Optional[List[float]] = None,
+ image_std: Optional[List[float]] = None,
+ possible_resolutions: Optional[List[Tuple[int, int]]] = None,
+ tile_size: int = 224,
+ max_num_tiles: Optional[int] = 4,
+ dtype: torch.dtype = torch.bfloat16,
+ resample: str = "bilinear",
+ resize_to_max_canvas: bool = False,
+ ) -> None:
+
+ # get_canvas_best_fit
+ assert (
+ possible_resolutions is not None or max_num_tiles is not None
+ ), f"Either possible_resolutions or max_num_tiles must be given. Got {possible_resolutions} and {max_num_tiles}"
+
+ # If possible_resolutions are not given, then calculate possible ones based on max_num_tiles
+ if not possible_resolutions and max_num_tiles:
+ possible_resolutions = find_supported_resolutions(
+ max_num_tiles=max_num_tiles, tile_size=tile_size
+ )
+ else:
+ possible_resolutions = possible_resolutions
+
+ self.possible_resolutions = torch.tensor(possible_resolutions).reshape(-1, 2)
+ logger.debug(
+ f"Found possible_resolutions: {self.possible_resolutions}. Will fit the images into the canvas with best fit."
+ )
+
+ self.resize_to_max_canvas = resize_to_max_canvas
+
+ # normalize
+ assert (image_mean is None) == (
+ image_std is None
+ ), f"Need to provide both or none of image_mean and image_std. Got {image_mean=} and {image_std=}"
+ self.mean = image_mean
+ self.std = image_std
+
+ # resize_with_pad
+ self.max_size = None if resize_to_max_canvas else tile_size
+ self.dtype = dtype
+ self.resample = torchvision.transforms.InterpolationMode[resample.upper()]
+
+ # tile_crop
+ self.tile_size = tile_size
+
+ def __call__(self, image: torch.Tensor) -> Mapping[str, Any]:
+ """
+ Convert, resize, pad, normalize and tile a single image.
+
+ Args:
+ image (torch.Tensor): The image to transform; anything other than a
+ torch.Tensor fails the assertion below.
+
+ Returns:
+ Mapping[str, Any]: A dict with the tiled image under "image" and the
+ canvas size in tiles (best_resolution // tile_size) under "aspect_ratio".
+ """
+ assert isinstance(image, torch.Tensor), "Input image must be a torch.Tensor."
+
+ image = F.to_image(image)
+ image = F.grayscale_to_rgb_image(image)
+ image = F.to_dtype(image, dtype=self.dtype, scale=True)
+
+ # Find the best canvas to fit the image without distortion
+ best_resolution = get_canvas_best_fit(
+ image=image,
+ possible_resolutions=self.possible_resolutions,
+ resize_to_max_canvas=self.resize_to_max_canvas,
+ )
+
+ # resize without distortion + pad to fit best_resolution
+ image = resize_with_pad(
+ image=image,
+ target_size=best_resolution,
+ resample=self.resample,
+ max_size=self.max_size,
+ )
+
+ # Normalize
+ if self.mean:
+ image = F.normalize(image, mean=self.mean, std=self.std)
+
+ # Divide the image into equally sized tiles
+ image = tile_crop(image=image, tile_size=self.tile_size)
+
+ aspect_ratio = torch.tensor(best_resolution).reshape(-1) // self.tile_size
+
+ return {
+ "image": image,
+ "aspect_ratio": aspect_ratio,
+ }
diff --git a/torchtitan/experiments/multimodal/utils.py b/torchtitan/experiments/multimodal/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c927772a5ef95ba65123c9387de4ead1e732490f
--- /dev/null
+++ b/torchtitan/experiments/multimodal/utils.py
@@ -0,0 +1,437 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+from collections import defaultdict
+
+from pathlib import Path
+from typing import List, Optional, Set, Tuple, Union
+from urllib import request
+
+import torch
+import torchvision
+from torchvision.transforms.v2 import functional as F
+
+# NOTE Copied from torchtune.modules.transforms.vision_utils.tile_crop.py
+def tile_crop(image: torch.Tensor, tile_size: int) -> torch.Tensor:
+ """
+ Divides a tensor into equally sized tiles. The tensor should be divisible by tile_size.
+
+ Args:
+ image (torch.Tensor): Input image to crop into tiles.
+ tile_size (int): Size of each tile.
+
+ Returns:
+ torch.Tensor: torch.Tensor of shape [num_tiles, channel_size, tile_size, tile_size]
+
+ Examples:
+ >>> image = torch.rand(3, 200, 300)
+ >>> tiles = tile_crop(image, tile_size=50)
+ >>> tiles.shape # 4x6 = 24 tiles
+ torch.Size([24, 3, 50, 50])
+
+ >>> image = torch.rand(3, 400, 600)
+ >>> tiles = tile_crop(image, tile_size=200)
+ >>> tiles.shape # 2x3 = 6 tiles
+ torch.Size([6, 3, 200, 200])
+ """
+
+ channel_size, height, width = image.shape
+
+ # assert sizes are divisible
+ assert (
+ height % tile_size == 0 and width % tile_size == 0
+ ), f"Image size {height}x{width} is not divisible by tile size {tile_size}"
+
+ # Reshape to split height and width into tile_size blocks
+ tiles_height = height // tile_size
+ tiles_width = width // tile_size
+
+ reshaped = image.view(channel_size, tiles_height, tile_size, tiles_width, tile_size)
+
+ # Transpose to bring tiles together
+ # We want [tiles_height, tiles_width, channel_size, tile_size, tile_size]
+ transposed = reshaped.permute(1, 3, 0, 2, 4)
+
+ # Flatten the tiles
+ tiles = transposed.contiguous().view(
+ tiles_height * tiles_width, channel_size, tile_size, tile_size
+ )
+
+ return tiles
+
+
+# NOTE Copied from torchtune.modules.transforms.vision_utils.resize_with_pad.py
+def resize_with_pad(
+ image: torch.Tensor,
+ target_size: Tuple[int, int],
+ resample: torchvision.transforms.InterpolationMode,
+ max_size: Optional[int] = None,
+) -> torch.Tensor:
+ """
+ Resizes and pads an image to target_size without causing distortion.
+ The user can set max_size to limit upscaling when target_size exceeds image_size.
+
+ Args:
+ image (torch.Tensor): The input image tensor in the format [..., H, W].
+ target_size (Tuple[int, int]): The desired resolution to fit the image into in the format [height, width].
+ resample (torchvision.transforms.InterpolationMode): Resampling method used when resizing images.
+ Supports torchvision.transforms.InterpolationMode.NEAREST, InterpolationMode.NEAREST_EXACT,
+ InterpolationMode.BILINEAR and InterpolationMode.BICUBIC.
+ max_size (Optional[int]): The maximum size to upscale the image to.
+ If None, will upscale up to target_size.
+
+ Returns:
+ torch.Tensor: The resized and padded image tensor in the format [..., H, W].
+
+ Examples:
+
+ Example 1: The image will be upscaled from (300, 800) to (448, 1194), since 448 is the limiting side,
+ and then padded from (448, 1194) to (448, 1344).
+
+ >>> max_size = None
+ >>> image = torch.rand([3, 300, 800])
+ >>> target_size = (448, 1344)
+ >>> resample = torchvision.transforms.InterpolationMode.BILINEAR
+ >>> output = resize_with_pad(image, target_size, resample, max_size)
+
+ Example 2: The image will stay as is, since 800 > 600, and then padded from (300, 800) to (448, 1344).
+
+ >>> max_size = 600
+ >>> image = torch.rand([3, 300, 800])
+ >>> target_size = (448, 1344)
+ >>> resample = torchvision.transforms.InterpolationMode.BILINEAR
+ >>> output = resize_with_pad(image, target_size, resample, max_size)
+
+ Example 3: The image will be downscaled from (500, 1000) to (224, 448),
+ and padded from (224, 448) to (448, 448).
+
+ >>> max_size = 600
+ >>> image = torch.rand([3, 500, 1000])
+ >>> target_size = (448, 448)
+ >>> resample = torchvision.transforms.InterpolationMode.BILINEAR
+ >>> output = resize_with_pad(image, target_size, resample, max_size)
+
+ """
+
+ image_height, image_width = image.shape[-2:]
+ image_size = (image_height, image_width)
+
+ # If target_size requires upscaling, we might want to limit the upscaling to max_size
+ if max_size is not None:
+ new_target_height = min(max(image_height, max_size), target_size[0])
+ new_target_width = min(max(image_width, max_size), target_size[1])
+ target_size_resize = (new_target_height, new_target_width)
+ else:
+ target_size_resize = target_size
+
+ # resize to target_size while preserving aspect ratio
+ new_size_preserving_aspect_ratio = _get_max_res_without_distortion(
+ image_size=image_size,
+ target_size=target_size_resize,
+ )
+
+ image = F.resize(
+ inpt=image,
+ size=list(new_size_preserving_aspect_ratio),
+ interpolation=resample,
+ antialias=True,
+ )
+
+ image = _pad_image_top_left(image=image, target_size=target_size)
+
+ return image
+
+
+# NOTE Copied from torchtune.modules.transforms.vision_utils.resize_with_pad.py
+def _pad_image_top_left(
+ image: torch.Tensor,
+ target_size: Tuple[int, int],
+) -> torch.Tensor:
+ """
+ Places the image at the top left of the canvas and pads with 0 the right and bottom
+ to fit to the target resolution. If target_size < image_size, it will crop the image.
+
+ Args:
+ image (torch.Tensor): The input image tensor in the format [..., H, W].
+ target_size (Tuple[int, int]): The desired resolution to fit the image into in the format [height, width].
+
+ Returns:
+ torch.Tensor: The padded image tensor in the format [..., H, W].
+ """
+
+ image_size = image.shape[-2:]
+
+ height, width = image_size
+ target_height, target_width = target_size
+
+ pad_x = target_width - width
+ pad_y = target_height - height
+
+ padding = [0, 0, pad_x, pad_y]
+ return F.pad(inpt=image, padding=padding)
+
+
+# NOTE Copied from torchtune.modules.transforms.vision_utils.resize_with_pad.py
+def _get_max_res_without_distortion(
+ image_size: Tuple[int, int],
+ target_size: Tuple[int, int],
+) -> Tuple[int, int]:
+ """
+ Determines the maximum resolution to which an image can be resized to without distorting its
+ aspect ratio, based on the target resolution.
+
+ For example, if image_size = (200,400) and target_size = (600,800),
+ scale_h = 600/200 = 3
+ scale_w = 800/400 = 2
+ So the maximum that we can upscale without distortion is min(scale_h, scale_w) = 2
+
+ Since scale_w is the limiting side, then new_w = target_w, and new_h = old_h*scale_w
+
+ Args:
+ image_size (Tuple[int, int]): The original resolution of the image.
+ target_size (Tuple[int, int]): The desired resolution to fit the image into.
+ Returns:
+ Tuple[int, int]: The optimal dimensions to which the image should be resized.
+ Examples:
+ >>> _get_max_res_without_distortion([200, 300], target_size = (450, 200))
+ (133, 200)
+ >>> _get_max_res_without_distortion([800, 600], target_size = (450, 1300))
+ (450, 337)
+ """
+
+ original_height, original_width = image_size
+ target_height, target_width = target_size
+
+ scale_w = target_width / original_width
+ scale_h = target_height / original_height
+
+ if scale_w < scale_h:
+ new_width = target_width
+ new_height = min(math.floor(original_height * scale_w), target_height)
+ else:
+ new_height = target_height
+ new_width = min(math.floor(original_width * scale_h), target_width)
+
+ return new_height, new_width
+
+
+# NOTE Copied from torchtune.modules.transforms.vision_utils.get_canvas_best_fit.py
+def _get_factors(n: int) -> Set[int]:
+ """
+ Calculate all factors of a given number, i.e. a divisor that leaves no remainder.
+
+ Args:
+ n (int): The number to find factors for.
+
+ Returns:
+ set: A set containing all factors of the number.
+
+ Examples:
+ >>> _get_factors(n=12)
+ {1, 2, 3, 4, 6, 12}
+ """
+ factors_set = set()
+
+ for i in range(1, int(n**0.5) + 1):
+ if n % i == 0:
+ factors_set.add(i)
+ factors_set.add(n // i)
+ return factors_set
+
+
+# NOTE Copied from torchtune.modules.transforms.vision_utils.get_canvas_best_fit.py
+def get_canvas_best_fit(
+ image: torch.Tensor, possible_resolutions: torch.Tensor, resize_to_max_canvas: bool
+) -> Tuple[int, int]:
+ """
+ Determines the best canvas possible from a list of possible resolutions to
+ resize an image to, without distortion.
+
+ For each possible resolution, calculates the scaling factors for
+ width and height, and selects the smallest one, which is the limiting side.
+ E.g. if to match a canvas shape you have to upscale an image's height by 2x, and width by 1.5x,
+ then the maximum upscaling without distortion is min(2, 1.5) = 1.5.
+
+ If there are multiple canvases that satisfy the conditions,
+ we pick the one with the lowest area to minimize padding.
+
+ Args:
+ image (torch.Tensor): The image we want to fit into a canvas.
+ possible_resolutions (torch.Tensor): A tensor of shape (N, 2) where each
+ row represents a possible canvas.
+ resize_to_max_canvas (bool): If True, pick the canvas that allows maximum scaling.
+ If False, pick the canvas that minimizes downscaling, including no downscaling at all.
+
+ Returns:
+ Tuple[int, int]: The best resolution to fit the image into.
+
+ Examples:
+ >>> image = torch.rand(3, 200, 300)
+ >>> possible_resolutions = torch.tensor([
+ ... [224, 672],
+ ... [672, 224],
+ ... [224, 448],
+ ... [448, 224],
+ ... [224, 224]
+ ... ])
+ >>> get_canvas_best_fit(image, possible_resolutions, resize_to_max_canvas=False)
+ (224, 448)
+
+ In the example above, we calculate the scaling factors for each possible resolution
+
+ >>> scale_height = torch.tensor([1.1200, 3.3600, 1.1200, 2.2400, 1.1200])
+ >>> scale_width = torch.tensor([2.2400, 0.7467, 1.4933, 0.7467, 0.7467])
+ >>> scales = torch.tensor([1.1200, 0.7467, 1.1200, 0.7467, 0.7467])
+
+ Two options have scaling_factor > 1; since resize_to_max_canvas is False, we pick the smallest
+
+ >>> upscaling_options = torch.tensor([1.1200, 1.1200])
+ >>> selected_scale = torch.tensor(1.1200)
+
+ There are two possible options, so we pick the one with the smallest area
+
+ >>> areas = torch.tensor([150528, 100352]) # for resolutions [224, 672] and [224, 448], respectively
+ >>> optimal_canvas = torch.tensor([224, 448]) # resolution with the smallest area
+ """
+
+ original_height, original_width = image.shape[-2:]
+
+ # possible resolutions heights/widths
+ target_heights, target_widths = (
+ possible_resolutions[:, 0],
+ possible_resolutions[:, 1],
+ )
+
+ # scaling factors to resize the image without distortion
+ scale_w = target_widths / original_width
+ scale_h = target_heights / original_height
+
+ # get limiting side scaling -> no distortion
+ scales = torch.where(scale_w > scale_h, scale_h, scale_w)
+
+ # keep only scales that need no downscaling (scale >= 1 includes the exact-fit case)
+ upscaling_options = scales[scales >= 1]
+ if len(upscaling_options) > 0:
+ if resize_to_max_canvas:
+ selected_scale = torch.max(upscaling_options)
+ else:
+ selected_scale = torch.min(upscaling_options)
+ else:
+ # no upscaling possible,
+ # get the minimum downscaling (max scale for scales<1)
+ downscaling_options = scales[scales < 1]
+ selected_scale = torch.max(downscaling_options)
+
+ # get all resolutions that support this scaling factor,
+ # e.g. you can upscale to 224x224, 224x448, 224x672 without distortion
+ chosen_canvas = possible_resolutions[scales == selected_scale]
+
+ # if there are multiple resolutions,
+ # get the one with minimum area to reduce padding
+ if len(chosen_canvas) > 1:
+ areas = chosen_canvas[:, 0] * chosen_canvas[:, 1]
+ optimal_idx = torch.argmin(areas)
+ optimal_canvas = chosen_canvas[optimal_idx]
+ else:
+ optimal_canvas = chosen_canvas[0]
+
+ return tuple(optimal_canvas.tolist())
+
+
+# NOTE Copied from torchtune.modules.transforms.vision_utils.get_canvas_best_fit.py
+def find_supported_resolutions(
+ max_num_tiles: int, tile_size: int
+) -> List[Tuple[int, int]]:
+ """
+ Computes all resolutions that are multiples of tile_size and
+ contain up to max_num_tiles tiles. Useful for when dividing an image into tiles.
+
+ For example, if we want at most 2 tiles per image, then we can support the
+ following resolutions: (1x1, 1x2, 2x1) * tile_size
+
+ Args:
+ max_num_tiles (int): Maximum number of tiles.
+ tile_size (int): Size of the side of the tile.
+
+ Returns:
+ List[Tuple[int, int]]: List of possible resolutions as tuples (height, width).
+
+ Examples:
+
+ >>> max_num_tiles = 4
+ >>> tile_size = 224
+ >>> find_supported_resolutions(max_num_tiles, tile_size)
+ [(224, 896), (448, 448), (224, 224), (896, 224), (224, 672), (672, 224), (224, 448), (448, 224)]
+ """
+
+ # create dictionary {aspect_ratio: [resolution1, ..., resolution n]}
+ # example {0.25: [(1,4)], 1.0: [(2,2), (1,1)], 4.0: [(4,1)]}
+ asp_dict = defaultdict(list)
+ for _tile_size in range(max_num_tiles, 0, -1):
+ factors = sorted(_get_factors(_tile_size))
+ asp_ratios = [(factor, _tile_size // factor) for factor in factors]
+ for height, width in asp_ratios:
+ ratio_float = height / width
+ asp_dict[ratio_float].append((height, width))
+
+ # get the resolutions multiplied by the tile_size
+ possible_resolutions = []
+ for ar, resolution in asp_dict.items():
+ for height, width in resolution:
+ possible_resolutions.append((height * tile_size, width * tile_size))
+
+ return possible_resolutions
+
+
+# NOTE Copied from torchtune.data._utils.py
+def load_image(image_loc: Union[Path, str]) -> torch.Tensor:
+ """
+ Convenience method to load an image in torch.Tensor format from a local file path or remote source.
+
+ Args:
+ image_loc (Union[Path, str]): Local file path or remote source pointing to the image
+ which will be decoded into an RGB :class:`~torch.Tensor`.
+
+ Note:
+ If loading an image from a remote source, the function expects the URL provided in ``image_loc``
+ to start with "http" or "https" e.g. "https://www.wikipedia.org/en/bird.jpg".
+
+ Raises:
+ ValueError: If the image cannot be loaded from remote source, **or**
+ if the image cannot be opened as a :class:`~torch.Tensor`.
+
+ Examples:
+ >>> # Load from remote source
+ >>> image = load_image("https://www.wikipedia.org/en/bird.jpg")
+
+ >>> # Load from local file path
+ >>> image = load_image(Path("/home/user/bird.jpg"))
+
+ Returns:
+ torch.Tensor: The loaded image.
+ """
+
+ # If pointing to a remote source, download the raw bytes and decode them
+ if isinstance(image_loc, str) and image_loc.startswith("http"):
+ try:
+ image_loc = request.urlopen(image_loc).read()
+ image = torchvision.io.decode_image(
+ torch.frombuffer(image_loc, dtype=torch.uint8),
+ mode="RGB",
+ )
+ except Exception as e:
+ raise ValueError("Failed to load remote image as torch.Tensor") from e
+
+ # Open the local image as a Tensor image
+ else:
+ try:
+ image = torchvision.io.decode_image(image_loc, mode="RGB")
+ except Exception as e:
+ raise ValueError("Failed to load local image as torch.Tensor") from e
+
+ return image
diff --git a/torchtitan/experiments/simple_fsdp/__pycache__/__init__.cpython-312.pyc b/torchtitan/experiments/simple_fsdp/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc198d440c85b33f72d66a3b6434ac78f6591c29
Binary files /dev/null and b/torchtitan/experiments/simple_fsdp/__pycache__/__init__.cpython-312.pyc differ
diff --git a/torchtitan/experiments/simple_fsdp/__pycache__/model.cpython-312.pyc b/torchtitan/experiments/simple_fsdp/__pycache__/model.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6eccaf665f354d725ca23d132b2c4b5e7bce82c5
Binary files /dev/null and b/torchtitan/experiments/simple_fsdp/__pycache__/model.cpython-312.pyc differ
diff --git a/torchtitan/experiments/simple_fsdp/__pycache__/parallelize_llama.cpython-312.pyc b/torchtitan/experiments/simple_fsdp/__pycache__/parallelize_llama.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a492e16243709f2ae35304eda0898aa37f7c096
Binary files /dev/null and b/torchtitan/experiments/simple_fsdp/__pycache__/parallelize_llama.cpython-312.pyc differ
diff --git a/torchtitan/experiments/simple_fsdp/__pycache__/simple_fsdp.cpython-312.pyc b/torchtitan/experiments/simple_fsdp/__pycache__/simple_fsdp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3718b57ca56ac3984a3e90236764f0fd00d74c16
Binary files /dev/null and b/torchtitan/experiments/simple_fsdp/__pycache__/simple_fsdp.cpython-312.pyc differ
diff --git a/torchtitan/experiments/simple_fsdp/model.py b/torchtitan/experiments/simple_fsdp/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..63104169b8fa14ed7032182c1ad08b782cd715fe
--- /dev/null
+++ b/torchtitan/experiments/simple_fsdp/model.py
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from torchtitan.models.llama3 import Transformer, TransformerModelArgs
+from .simple_fsdp import disable_data_parallel
+
+
+class SimpleFSDPTransformer(Transformer):
+ def __init__(self, model_args: TransformerModelArgs):
+ super().__init__(model_args)
+ self.init_weights()
+
+ def init_weights(self, *args, **kwargs):
+ with disable_data_parallel():
+ super().init_weights(*args, **kwargs)
diff --git a/torchtitan/experiments/simple_fsdp/parallelize_llama.py b/torchtitan/experiments/simple_fsdp/parallelize_llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..25d696db27e90e292465aa7b9c6ffa20ae8f0508
--- /dev/null
+++ b/torchtitan/experiments/simple_fsdp/parallelize_llama.py
@@ -0,0 +1,98 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+from torch.distributed import DeviceMesh
+
+from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
+from torchtitan.distributed import ParallelDims
+from torchtitan.models.llama3.parallelize_llama import apply_ac
+from torchtitan.tools.logging import logger
+
+from .simple_fsdp import data_parallel, MixedPrecisionPolicy
+
+
+def parallelize_llama(
+ model: nn.Module,
+ world_mesh: DeviceMesh,
+ parallel_dims: ParallelDims,
+ job_config: JobConfig,
+):
+ """
+ Apply activation checkpointing, data parallelism, and torch.compile to the
+ model. Tensor parallelism is not applied here yet (see the TODO in the body).
+
+ NOTE: The passed-in model preferably should be on meta device. Otherwise,
+ the model must fit on GPU or CPU memory.
+ """
+ # TODO(ruisizhang123): Add support for TP (on-going)
+ # if parallel_dims.tp_enabled:
+ # if (
+ # job_config.parallelism.enable_async_tensor_parallel
+ # and not job_config.training.compile
+ # ):
+ # raise RuntimeError("Async TP requires --training.compile")
+
+ # enable_float8_linear = "float8" in job_config.model.converters
+ # float8_is_rowwise = job_config.float8.recipe_name in (
+ # "rowwise",
+ # "rowwise_with_gw_hp",
+ # )
+
+ # # For now, float8 all-gather with TP is only supported for tensorwise
+ # # float8 scaling recipes. For rowwise recipes, we use regular TP and
+ # # all-gather happens in high precision.
+ # enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise
+
+ # apply_tp(
+ # model,
+ # world_mesh["tp"],
+ # loss_parallel=parallel_dims.loss_parallel_enabled,
+ # enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
+ # enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
+ # )
+
+ if job_config.activation_checkpoint.mode != "none":
+ apply_ac(model, job_config.activation_checkpoint)
+
+ # apply data parallel
+ if (
+ parallel_dims.dp_replicate_enabled
+ or parallel_dims.dp_shard_enabled
+ or parallel_dims.cp_enabled
+ ):
+ if parallel_dims.dp_replicate_enabled:
+ if parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled:
+ dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
+ dp_mode = "hybrid_shard"
+ else:
+ dp_mesh_dim_names = ("dp_replicate",)
+ dp_mode = "replicate"
+ else:
+ dp_mesh_dim_names = ("dp_shard_cp",)
+ dp_mode = "fully_shard"
+
+ mp_policy = MixedPrecisionPolicy(
+ param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
+ reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
+ )
+
+ model = data_parallel(
+ model,
+ world_mesh[tuple(dp_mesh_dim_names)],
+ mode=dp_mode,
+ ac_mode=job_config.activation_checkpoint.mode,
+ mp_policy=mp_policy,
+ )
+ logger.info("Applied Data Parallel (dp mode=%s) to the model", dp_mode)
+
+ if job_config.training.compile:
+ torch._inductor.config.reorder_for_peak_memory = False
+ model = torch.compile(model, fullgraph=True)
+
+ return model
diff --git a/torchtitan/experiments/simple_fsdp/tests/__pycache__/test_numerics.cpython-312-pytest-8.4.1.pyc b/torchtitan/experiments/simple_fsdp/tests/__pycache__/test_numerics.cpython-312-pytest-8.4.1.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24e814b57101af3d3137611c1ca31d0d9bc46326
Binary files /dev/null and b/torchtitan/experiments/simple_fsdp/tests/__pycache__/test_numerics.cpython-312-pytest-8.4.1.pyc differ
diff --git a/torchtitan/models/__pycache__/attention.cpython-312.pyc b/torchtitan/models/__pycache__/attention.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3bd8501a02d87fc1646b31bd8df09b34531abf20
Binary files /dev/null and b/torchtitan/models/__pycache__/attention.cpython-312.pyc differ
diff --git a/torchtitan/models/llama3/__pycache__/parallelize_llama.cpython-312.pyc b/torchtitan/models/llama3/__pycache__/parallelize_llama.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95e80b52715c0488bc26290ce159be14aef87949
Binary files /dev/null and b/torchtitan/models/llama3/__pycache__/parallelize_llama.cpython-312.pyc differ
diff --git a/torchtitan/models/llama3/parallelize_llama.py b/torchtitan/models/llama3/parallelize_llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed2e6f0c78eb4acb3a4d561aa0717758fdf3b1c1
--- /dev/null
+++ b/torchtitan/models/llama3/parallelize_llama.py
@@ -0,0 +1,398 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This file applies the PT-D parallelisms (except pipeline parallelism) and various
+# training techniques (e.g. activation checkpointing and compile) to the Llama model.
+
+from collections import defaultdict
+
+import torch
+import torch.nn as nn
+from torch.distributed._composable.replicate import replicate
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+ checkpoint_wrapper as ptd_checkpoint_wrapper,
+)
+
+from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy
+from torch.distributed.tensor import Replicate, Shard
+from torch.distributed.tensor.parallel import (
+ ColwiseParallel,
+ parallelize_module,
+ PrepareModuleInput,
+ RowwiseParallel,
+ SequenceParallel,
+)
+
+from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
+from torchtitan.distributed import ParallelDims
+from torchtitan.tools.logging import logger
+
+
+def parallelize_llama(
+ model: nn.Module,
+ world_mesh: DeviceMesh,
+ parallel_dims: ParallelDims,
+ job_config: JobConfig,
+):
+ """
+ Apply tensor parallelism, activation checkpointing, torch.compile, and data
+ parallelism to the model.
+
+ NOTE: The passed-in model preferably should be on meta device. Otherwise,
+ the model must fit on GPU or CPU memory.
+ """
+
+ if parallel_dims.tp_enabled:
+ if (
+ job_config.parallelism.enable_async_tensor_parallel
+ and not job_config.training.compile
+ ):
+ raise RuntimeError("Async TP requires --training.compile")
+
+ enable_float8_linear = "float8" in job_config.model.converters
+ float8_is_rowwise = job_config.float8.recipe_name in (
+ "rowwise",
+ "rowwise_with_gw_hp",
+ )
+
+ # For now, float8 all-gather with TP is only supported for tensorwise
+ # float8 scaling recipes. For rowwise recipes, we use regular TP and
+ # all-gather happens in high precision.
+ enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise
+
+ apply_tp(
+ model,
+ world_mesh["tp"],
+ loss_parallel=parallel_dims.loss_parallel_enabled,
+ enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
+ enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
+ )
+
+ if job_config.model.use_flex_attn:
+ if job_config.activation_checkpoint.mode == "selective":
+ raise ValueError(
+ "FlexAttention is not compatible with selective AC yet. "
+ "See https://github.com/pytorch/pytorch/issues/147879"
+ )
+
+ if parallel_dims.cp_enabled:
+ raise ValueError(
+ "FlexAttention is not compatible with CP yet. "
+ "We are still working on this."
+ )
+
+ if job_config.activation_checkpoint.mode != "none":
+ apply_ac(model, job_config.activation_checkpoint)
+
+ # turn on per-TransformerBlock compile after AC wrapping and before FSDP
+ if job_config.training.compile:
+ apply_compile(model)
+
+ if (
+ parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
+ ): # apply FSDP or HSDP, potentially with Context Parallel
+ if parallel_dims.dp_replicate_enabled:
+ dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
+ else:
+ dp_mesh_dim_names = ("dp_shard_cp",)
+
+ apply_fsdp(
+ model,
+ world_mesh[tuple(dp_mesh_dim_names)],
+ param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
+ reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
+ pp_enabled=parallel_dims.pp_enabled,
+ cpu_offload=job_config.training.enable_cpu_offload,
+ reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward,
+ )
+
+ if parallel_dims.dp_replicate_enabled:
+ logger.info("Applied HSDP to the model")
+ else:
+ logger.info("Applied FSDP to the model")
+
+ if parallel_dims.cp_enabled:
+ logger.info("Applied Context Parallel to the model")
+
+ if job_config.training.enable_cpu_offload:
+ logger.info("Applied CPU Offloading to the model")
+ elif parallel_dims.dp_replicate_enabled:
+ if world_mesh.ndim > 1:
+ raise RuntimeError("DDP has not supported > 1D parallelism")
+ apply_ddp(
+ model,
+ world_mesh,
+ enable_compile=job_config.training.compile,
+ enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
+ )
+
+ return model
+
+
+def apply_tp(
+ model: nn.Module,
+ tp_mesh: DeviceMesh,
+ loss_parallel: bool,
+ enable_float8_tensorwise_tp: bool,
+ enable_async_tp: bool,
+):
+ """Apply tensor parallelism."""
+ # 1. Parallelize the embedding and shard its outputs (which are the first
+ # transformer block's inputs)
+ # 2. Parallelize the root norm layer over the sequence dim
+ # 3. Parallelize the final linear output layer
+ parallelize_module(
+ model,
+ tp_mesh,
+ {
+ "tok_embeddings": RowwiseParallel(
+ input_layouts=Replicate(),
+ output_layouts=Shard(1),
+ ),
+ "norm": SequenceParallel(),
+ "output": ColwiseParallel(
+ input_layouts=Shard(1),
+ output_layouts=Shard(-1) if loss_parallel else Replicate(),
+ use_local_output=not loss_parallel,
+ ),
+ },
+ )
+
+ # Parallel styles used for transformer block linear weights and their
+ # inputs may be different for float8 linears with tensorwise scaling.
+ if enable_float8_tensorwise_tp:
+ # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there
+ from torchao.float8.float8_tensor_parallel import (
+ Float8ColwiseParallel,
+ Float8RowwiseParallel,
+ PrepareFloat8ModuleInput,
+ )
+
+ rowwise_parallel, colwise_parallel, prepare_module_input = (
+ Float8RowwiseParallel,
+ Float8ColwiseParallel,
+ PrepareFloat8ModuleInput,
+ )
+ else:
+ rowwise_parallel, colwise_parallel, prepare_module_input = (
+ RowwiseParallel,
+ ColwiseParallel,
+ PrepareModuleInput,
+ )
+
+ # Apply tensor + sequence parallelism to every transformer block
+ # NOTE: At the cost of model code change, we can accelerate Sequence Parallel
+ # by folding (and unfolding) the batch dimension and the sequence dimension.
+ # Examples can be found at https://github.com/pytorch/torchtitan/pull/437
+ for layer_id, transformer_block in model.layers.items():
+ layer_plan = {
+ "attention_norm": SequenceParallel(),
+ "attention": prepare_module_input(
+ input_layouts=(Shard(1), None),
+ desired_input_layouts=(Replicate(), None),
+ ),
+ "attention.wq": colwise_parallel(),
+ "attention.wk": colwise_parallel(),
+ "attention.wv": colwise_parallel(),
+ "attention.wo": rowwise_parallel(output_layouts=Shard(1)),
+ "ffn_norm": SequenceParallel(),
+ "feed_forward": prepare_module_input(
+ input_layouts=(Shard(1),),
+ desired_input_layouts=(Replicate(),),
+ ),
+ "feed_forward.w1": colwise_parallel(),
+ "feed_forward.w2": rowwise_parallel(output_layouts=Shard(1)),
+ "feed_forward.w3": colwise_parallel(),
+ }
+
+ parallelize_module(
+ module=transformer_block,
+ device_mesh=tp_mesh,
+ parallelize_plan=layer_plan,
+ )
+
+ if enable_async_tp:
+ from torch.distributed._symmetric_memory import enable_symm_mem_for_group
+
+ torch._inductor.config._micro_pipeline_tp = True
+ enable_symm_mem_for_group(tp_mesh.get_group().group_name)
+
+ logger.info(
+ f"Applied {'Float8 tensorwise ' if enable_float8_tensorwise_tp else ''}{'Async ' if enable_async_tp else ''}"
+ "Tensor Parallelism to the model"
+ )
+
+
+# for selective op activation checkpointing
+_save_list = {
+ torch.ops.aten.mm.default,
+ torch.ops.aten._scaled_dot_product_efficient_attention.default,
+ torch.ops.aten._scaled_dot_product_flash_attention.default,
+ # for low precision training, it's useful to always save
+ # the result of max, since the absolute maximum is
+ # used to compute the scaling factor for quantization.
+ torch.ops.aten.max.default,
+}
+
+
+def _apply_ac_to_transformer_block(module: nn.Module, ac_config):
+ valid_ac_modes = ("full", "selective")
+ if ac_config.mode not in valid_ac_modes:
+ raise ValueError(
+ f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}"
+ )
+
+ if ac_config.mode == "full":
+ return ptd_checkpoint_wrapper(module, preserve_rng_state=False)
+
+ assert ac_config.mode == "selective", f"{ac_config.mode}"
+ use_op_sac = ac_config.selective_ac_option == "op"
+ use_layer_sac = ac_config.selective_ac_option.isdigit()
+ if not use_op_sac and not use_layer_sac:
+ raise ValueError(
+ f"Invalid selective AC option: {ac_config.selective_ac_option}. "
+ f"Valid options: 'op' or a positive int representing layer frequency"
+ )
+ if use_op_sac:
+ from torch.utils.checkpoint import (
+ CheckpointPolicy,
+ create_selective_checkpoint_contexts,
+ )
+
+ def _get_custom_policy(meta):
+ def _custom_policy(ctx, func, *args, **kwargs):
+ mode = "recompute" if ctx.is_recompute else "forward"
+ mm_count_key = f"{mode}_mm_count"
+ if func == torch.ops.aten.mm.default:
+ meta[mm_count_key] += 1
+ # Saves output of the ops in _save_list, except every second mm
+ to_save = func in _save_list and not (
+ func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0
+ )
+ return (
+ CheckpointPolicy.MUST_SAVE
+ if to_save
+ else CheckpointPolicy.PREFER_RECOMPUTE
+ )
+
+ return _custom_policy
+
+ def selective_checkpointing_context_fn():
+ meta = defaultdict(int)
+ return create_selective_checkpoint_contexts(_get_custom_policy(meta))
+
+ return ptd_checkpoint_wrapper(
+ module,
+ context_fn=selective_checkpointing_context_fn,
+ preserve_rng_state=False,
+ )
+ elif use_layer_sac:
+ # Checkpoint every `ac_freq` of the modules passed to this function
+ ac_freq = int(ac_config.selective_ac_option)
+ ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0)
+ ptd_checkpoint_wrapper._count += 1
+ if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0:
+ return ptd_checkpoint_wrapper(module, preserve_rng_state=False)
+ else:
+ return module
+
+
+def apply_ac(model: nn.Module, ac_config):
+ """Apply activation checkpointing to the model."""
+ for layer_id, transformer_block in model.layers.named_children():
+ transformer_block = _apply_ac_to_transformer_block(transformer_block, ac_config)
+ model.layers.register_module(layer_id, transformer_block)
+
+ logger.info(f"Applied {ac_config.mode} activation checkpointing to the model")
+
+
+def apply_compile(model: nn.Module):
+ """
+ Apply torch.compile to each TransformerBlock, which makes compilation efficient due to
+ repeated structure. Alternatively one can compile the whole model (after applying DP).
+ """
+ for layer_id, transformer_block in model.layers.named_children():
+ transformer_block = torch.compile(transformer_block, fullgraph=True)
+ model.layers.register_module(layer_id, transformer_block)
+
+ logger.info("Compiling each TransformerBlock with torch.compile")
+
+
+def apply_fsdp(
+ model: nn.Module,
+ dp_mesh: DeviceMesh,
+ param_dtype: torch.dtype,
+ reduce_dtype: torch.dtype,
+ pp_enabled: bool,
+ cpu_offload: bool = False,
+ reshard_after_forward_policy: str = "default",
+):
+ """
+ Apply data parallelism (via FSDP2) to the model.
+
+ Args:
+ model (nn.Module): The model to apply data parallelism to.
+ dp_mesh (DeviceMesh): The device mesh to use for data parallelism.
+ param_dtype (torch.dtype): The data type to use for model parameters.
+ reduce_dtype (torch.dtype): The data type to use for reduction operations.
+ pp_enabled (bool): Whether pipeline parallelism is enabled.
+ cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False.
+ reshard_after_forward_policy (str, optional): The policy to use for resharding after forward pass. Defaults to "default".
+ Other options: "never", "always".
+ - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios.
+ - "always" will enable `reshard_after_forward` for all forward passes.
+ - "never" will disable `reshard_after_forward` for all forward passes.
+
+ """
+ mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype)
+ fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy}
+ if cpu_offload:
+ fsdp_config["offload_policy"] = CPUOffloadPolicy()
+
+ for layer_id, transformer_block in model.layers.items():
+ if reshard_after_forward_policy == "always":
+ reshard_after_forward = True
+ elif reshard_after_forward_policy == "never":
+ reshard_after_forward = False
+ elif reshard_after_forward_policy == "default":
+ if pp_enabled:
+ # For PP, do not reshard after forward to avoid per-microbatch
+ # all-gathers, which can be expensive and non-overlapped
+ reshard_after_forward = False
+ else:
+ # As an optimization, do not reshard after forward for the last
+ # transformer block since FSDP would prefetch it immediately
+ reshard_after_forward = int(layer_id) < len(model.layers) - 1
+ else:
+ raise ValueError(
+ f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}."
+ )
+ fully_shard(
+ transformer_block,
+ **fsdp_config,
+ reshard_after_forward=reshard_after_forward,
+ )
+ fully_shard(model, **fsdp_config, reshard_after_forward=not pp_enabled)
+
+
+def apply_ddp(
+ model: nn.Module,
+ dp_mesh: DeviceMesh,
+ enable_compile: bool,
+ enable_compiled_autograd: bool,
+):
+ if enable_compile:
+ if enable_compiled_autograd:
+ torch._dynamo.config.optimize_ddp = (
+ "python_reducer_without_compiled_forward"
+ )
+ else:
+ torch._dynamo.config.optimize_ddp = "ddp_optimizer"
+
+ replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100)
+
+ logger.info("Applied DDP to the model")
diff --git a/torchtitan/protocols/__pycache__/model_converter.cpython-312.pyc b/torchtitan/protocols/__pycache__/model_converter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da8e5f2c191b348fce11d2a6a3cb9fe7fb03519d
Binary files /dev/null and b/torchtitan/protocols/__pycache__/model_converter.cpython-312.pyc differ