projecti7 commited on Jan 13

Commit

4eb905e

verified ·

1 Parent(s): 5e42353

Auto-sync checkpoint during training

Browse files

Files changed (37) hide show

bpe.model +3 -0
checkpoint-1000.pt +3 -0
checkpoint-2000.pt +3 -0
log/log-train-2026-01-13-08-17-26 +95 -0
log/log-train-2026-01-13-08-30-12 +95 -0
log/log-train-2026-01-13-08-32-30 +97 -0
log/log-train-2026-01-13-08-34-44 +105 -0
log/log-train-2026-01-13-08-36-31 +107 -0
log/log-train-2026-01-13-08-40-26 +102 -0
log/log-train-2026-01-13-08-44-52 +107 -0
log/log-train-2026-01-13-09-13-56 +109 -0
log/log-train-2026-01-13-09-17-30 +109 -0
log/log-train-2026-01-13-09-20-52 +109 -0
log/log-train-2026-01-13-09-23-46 +110 -0
log/log-train-2026-01-13-09-26-41 +110 -0
log/log-train-2026-01-13-09-30-18 +110 -0
log/log-train-2026-01-13-09-36-17 +95 -0
log/log-train-2026-01-13-09-38-48 +525 -0
log/log-train-2026-01-13-09-59-44 +153 -0
log/log-train-2026-01-13-10-02-58 +544 -0
tensorboard/events.out.tfevents.1768292246.8e64ffbd666a.2056.0 +3 -0
tensorboard/events.out.tfevents.1768293012.8e64ffbd666a.2141.0 +3 -0
tensorboard/events.out.tfevents.1768293150.8e64ffbd666a.2188.0 +3 -0
tensorboard/events.out.tfevents.1768293284.8e64ffbd666a.2227.0 +3 -0
tensorboard/events.out.tfevents.1768293391.8e64ffbd666a.2263.0 +3 -0
tensorboard/events.out.tfevents.1768293626.8e64ffbd666a.2307.0 +3 -0
tensorboard/events.out.tfevents.1768293892.8e64ffbd666a.2350.0 +3 -0
tensorboard/events.out.tfevents.1768295636.8e64ffbd666a.3204.0 +3 -0
tensorboard/events.out.tfevents.1768295850.8e64ffbd666a.3255.0 +3 -0
tensorboard/events.out.tfevents.1768296052.8e64ffbd666a.3309.0 +3 -0
tensorboard/events.out.tfevents.1768296226.8e64ffbd666a.3355.0 +3 -0
tensorboard/events.out.tfevents.1768296401.8e64ffbd666a.3410.0 +3 -0
tensorboard/events.out.tfevents.1768296618.8e64ffbd666a.3456.0 +3 -0
tensorboard/events.out.tfevents.1768296977.8e64ffbd666a.3504.0 +3 -0
tensorboard/events.out.tfevents.1768297128.8e64ffbd666a.3540.0 +3 -0
tensorboard/events.out.tfevents.1768298384.8e64ffbd666a.19994.0 +3 -0
tensorboard/events.out.tfevents.1768298578.8e64ffbd666a.24203.0 +3 -0

bpe.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76c0a1e6ddcd31c5d6e93b27047ce285569ff243e8a89e1af44a08aec82d36d7
+size 256555

checkpoint-1000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f0ecd694c9bf70c25cc0eb18bbc84174a0b417f80f0d795eb2b0abc6f88e09fb
+size 1141963947

checkpoint-2000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4fd7775f96144e213bb18ca99005488da4c2ce4b666ac16f949395b0d1d1a2e9
+size 1141963947

log/log-train-2026-01-13-08-17-26 ADDED Viewed

	@@ -0,0 +1,95 @@

+2026-01-13 08:17:26,910 INFO [train.py:967] Training started
+2026-01-13 08:17:26,911 INFO [train.py:977] Device: cuda:0
+2026-01-13 08:17:26,915 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-clean",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": true,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 150,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": false,
+  "valid_interval": 3000,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 08:17:26,915 INFO [train.py:988] About to create model
+2026-01-13 08:17:27,495 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 08:17:27,513 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 08:17:30,538 INFO [asr_datamodule.py:443] About to get the shuffled train-clean-100,             train-clean-360 and train-other-500 cuts

log/log-train-2026-01-13-08-30-12 ADDED Viewed

	@@ -0,0 +1,95 @@

+2026-01-13 08:30:12,644 INFO [train.py:967] Training started
+2026-01-13 08:30:12,645 INFO [train.py:977] Device: cuda:0
+2026-01-13 08:30:12,647 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-clean",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": true,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 150,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 3000,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 08:30:12,648 INFO [train.py:988] About to create model
+2026-01-13 08:30:13,269 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 08:30:13,287 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 08:30:15,364 INFO [asr_datamodule.py:443] About to get the shuffled train-clean-100,             train-clean-360 and train-other-500 cuts

log/log-train-2026-01-13-08-32-30 ADDED Viewed

	@@ -0,0 +1,97 @@

+2026-01-13 08:32:30,673 INFO [train.py:967] Training started
+2026-01-13 08:32:30,674 INFO [train.py:977] Device: cuda:0
+2026-01-13 08:32:30,676 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-clean",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 100,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 08:32:30,677 INFO [train.py:988] About to create model
+2026-01-13 08:32:31,260 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 08:32:31,277 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 08:32:33,347 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 08:32:33,348 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 08:32:33,349 INFO [asr_datamodule.py:233] About to get Musan cuts

log/log-train-2026-01-13-08-34-44 ADDED Viewed

	@@ -0,0 +1,105 @@

+2026-01-13 08:34:44,600 INFO [train.py:967] Training started
+2026-01-13 08:34:44,601 INFO [train.py:977] Device: cuda:0
+2026-01-13 08:34:44,603 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-clean",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 100,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 08:34:44,604 INFO [train.py:988] About to create model
+2026-01-13 08:34:45,181 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 08:34:45,199 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 08:34:47,258 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 08:34:47,259 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 08:34:47,259 INFO [asr_datamodule.py:233] About to get Musan cuts
+2026-01-13 08:34:47,294 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 08:34:47,294 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 08:34:47,294 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 08:34:47,295 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 08:34:47,295 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 08:34:47,661 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 08:34:47,661 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 08:34:47,662 INFO [asr_datamodule.py:467] About to get dev-other cuts

log/log-train-2026-01-13-08-36-31 ADDED Viewed

	@@ -0,0 +1,107 @@

+2026-01-13 08:36:31,536 INFO [train.py:967] Training started
+2026-01-13 08:36:31,537 INFO [train.py:977] Device: cuda:0
+2026-01-13 08:36:31,540 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-clean",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 100,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 08:36:31,540 INFO [train.py:988] About to create model
+2026-01-13 08:36:32,126 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 08:36:32,143 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 08:36:34,210 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 08:36:34,211 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 08:36:34,211 INFO [asr_datamodule.py:233] About to get Musan cuts
+2026-01-13 08:36:34,245 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 08:36:34,245 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 08:36:34,246 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 08:36:34,246 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 08:36:34,246 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 08:36:34,600 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 08:36:34,601 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 08:36:34,601 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 08:36:34,602 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 08:36:34,827 INFO [asr_datamodule.py:372] About to create dev dataloader

log/log-train-2026-01-13-08-40-26 ADDED Viewed

	@@ -0,0 +1,102 @@

+2026-01-13 08:40:26,429 INFO [train.py:967] Training started
+2026-01-13 08:40:26,430 INFO [train.py:977] Device: cuda:0
+2026-01-13 08:40:26,432 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-clean",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 100,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 08:40:26,433 INFO [train.py:988] About to create model
+2026-01-13 08:40:27,011 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 08:40:27,029 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 08:40:29,057 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 08:40:29,057 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 08:40:29,058 INFO [asr_datamodule.py:233] About to get Musan cuts
+2026-01-13 08:40:29,090 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 08:40:29,090 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 08:40:29,090 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 08:40:29,090 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 08:40:29,090 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.

log/log-train-2026-01-13-08-44-52 ADDED Viewed

	@@ -0,0 +1,107 @@

+2026-01-13 08:44:52,830 INFO [train.py:967] Training started
+2026-01-13 08:44:52,831 INFO [train.py:977] Device: cuda:0
+2026-01-13 08:44:52,834 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-clean",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 100,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 08:44:52,834 INFO [train.py:988] About to create model
+2026-01-13 08:44:53,431 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 08:44:53,448 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 08:44:55,485 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 08:44:55,487 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 08:44:55,487 INFO [asr_datamodule.py:233] About to get Musan cuts
+2026-01-13 08:44:55,519 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 08:44:55,519 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 08:44:55,520 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 08:44:55,520 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 08:44:55,520 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 08:44:55,806 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 08:44:55,806 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 08:44:55,807 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 08:44:55,807 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 08:44:55,989 INFO [asr_datamodule.py:372] About to create dev dataloader

log/log-train-2026-01-13-09-13-56 ADDED Viewed

	@@ -0,0 +1,109 @@

+2026-01-13 09:13:56,956 INFO [train.py:967] Training started
+2026-01-13 09:13:56,957 INFO [train.py:977] Device: cuda:0
+2026-01-13 09:13:56,960 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-clean",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 80,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 09:13:56,960 INFO [train.py:988] About to create model
+2026-01-13 09:13:57,530 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 09:13:57,547 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 09:13:59,584 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 09:13:59,585 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 09:13:59,586 INFO [asr_datamodule.py:233] About to get Musan cuts
+2026-01-13 09:13:59,589 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 09:13:59,589 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 09:13:59,589 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 09:13:59,590 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 09:13:59,590 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 09:13:59,886 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 09:13:59,886 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 09:13:59,887 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 09:13:59,887 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 09:14:00,077 INFO [asr_datamodule.py:372] About to create dev dataloader
+2026-01-13 09:14:02,016 INFO [train.py:1204] Saving batch to /kaggle/working/amharic_training/exp_amharic_streaming/batch-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
+2026-01-13 09:14:02,022 INFO [train.py:1210] features shape: torch.Size([5, 1515, 80])

log/log-train-2026-01-13-09-17-30 ADDED Viewed

	@@ -0,0 +1,109 @@

+2026-01-13 09:17:30,681 INFO [train.py:967] Training started
+2026-01-13 09:17:30,682 INFO [train.py:977] Device: cuda:0
+2026-01-13 09:17:30,685 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-clean",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 60,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 09:17:30,685 INFO [train.py:988] About to create model
+2026-01-13 09:17:31,267 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 09:17:31,284 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 09:17:33,315 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 09:17:33,317 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 09:17:33,317 INFO [asr_datamodule.py:233] About to get Musan cuts
+2026-01-13 09:17:33,321 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 09:17:33,321 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 09:17:33,321 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 09:17:33,321 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 09:17:33,322 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 09:17:33,673 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 09:17:33,674 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 09:17:33,674 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 09:17:33,675 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 09:17:33,906 INFO [asr_datamodule.py:372] About to create dev dataloader
+2026-01-13 09:17:36,035 INFO [train.py:1204] Saving batch to /kaggle/working/amharic_training/exp_amharic_streaming/batch-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
+2026-01-13 09:17:36,040 INFO [train.py:1210] features shape: torch.Size([3, 1515, 80])

log/log-train-2026-01-13-09-20-52 ADDED Viewed

	@@ -0,0 +1,109 @@

+2026-01-13 09:20:52,690 INFO [train.py:967] Training started
+2026-01-13 09:20:52,691 INFO [train.py:977] Device: cuda:0
+2026-01-13 09:20:52,694 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-clean",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 60,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 09:20:52,694 INFO [train.py:988] About to create model
+2026-01-13 09:20:53,295 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 09:20:53,313 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 09:20:55,371 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 09:20:55,372 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 09:20:55,372 INFO [asr_datamodule.py:233] About to get Musan cuts
+2026-01-13 09:20:55,377 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 09:20:55,377 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 09:20:55,377 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 09:20:55,377 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 09:20:55,377 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 09:20:55,729 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 09:20:55,729 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 09:20:55,730 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 09:20:55,730 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 09:20:55,957 INFO [asr_datamodule.py:372] About to create dev dataloader
+2026-01-13 09:20:58,069 INFO [train.py:1204] Saving batch to /kaggle/working/amharic_training/exp_amharic_streaming/batch-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
+2026-01-13 09:20:58,073 INFO [train.py:1210] features shape: torch.Size([3, 1515, 80])

log/log-train-2026-01-13-09-23-46 ADDED Viewed

	@@ -0,0 +1,110 @@

+2026-01-13 09:23:46,523 INFO [train.py:967] Training started
+2026-01-13 09:23:46,524 INFO [train.py:977] Device: cuda:0
+2026-01-13 09:23:46,526 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-dirty",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 60,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 09:23:46,527 INFO [train.py:988] About to create model
+2026-01-13 09:23:47,105 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 09:23:47,122 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 09:23:49,153 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 09:23:49,155 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 09:23:49,155 INFO [asr_datamodule.py:233] About to get Musan cuts
+2026-01-13 09:23:49,159 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 09:23:49,159 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 09:23:49,159 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 09:23:49,159 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 09:23:49,159 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 09:23:49,454 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 09:23:49,454 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 09:23:49,455 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 09:23:49,456 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 09:23:49,642 INFO [asr_datamodule.py:372] About to create dev dataloader
+2026-01-13 09:23:51,502 INFO [train.py:1204] Saving batch to /kaggle/working/amharic_training/exp_amharic_streaming/batch-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
+2026-01-13 09:23:51,504 INFO [train.py:1210] features shape: torch.Size([3, 1515, 80])
+2026-01-13 09:23:51,505 INFO [train.py:1214] num tokens: 197

log/log-train-2026-01-13-09-26-41 ADDED Viewed

	@@ -0,0 +1,110 @@

+2026-01-13 09:26:41,655 INFO [train.py:967] Training started
+2026-01-13 09:26:41,656 INFO [train.py:977] Device: cuda:0
+2026-01-13 09:26:41,659 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-dirty",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 60,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 0,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 09:26:41,659 INFO [train.py:988] About to create model
+2026-01-13 09:26:42,230 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 09:26:42,247 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 09:26:44,308 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 09:26:44,309 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 09:26:44,309 INFO [asr_datamodule.py:233] About to get Musan cuts
+2026-01-13 09:26:44,312 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 09:26:44,312 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 09:26:44,312 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 09:26:44,313 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 09:26:44,313 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 09:26:44,604 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 09:26:44,604 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 09:26:44,605 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 09:26:44,605 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 09:26:44,787 INFO [asr_datamodule.py:372] About to create dev dataloader
+2026-01-13 09:26:46,366 INFO [train.py:1204] Saving batch to /kaggle/working/amharic_training/exp_amharic_streaming/batch-b74d0fb1-32e7-0629-8fad-c1a606cb0fb3.pt
+2026-01-13 09:26:46,368 INFO [train.py:1210] features shape: torch.Size([3, 1515, 80])
+2026-01-13 09:26:46,368 INFO [train.py:1214] num tokens: 197

log/log-train-2026-01-13-09-30-18 ADDED Viewed

	@@ -0,0 +1,110 @@

+2026-01-13 09:30:18,369 INFO [train.py:967] Training started
+2026-01-13 09:30:18,370 INFO [train.py:977] Device: cuda:0
+2026-01-13 09:30:18,372 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": true,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-dirty",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 60,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 0,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 09:30:18,373 INFO [train.py:988] About to create model
+2026-01-13 09:30:18,944 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 09:30:18,960 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 09:30:20,978 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 09:30:20,979 INFO [asr_datamodule.py:232] Enable MUSAN
+2026-01-13 09:30:20,979 INFO [asr_datamodule.py:233] About to get Musan cuts
+2026-01-13 09:30:20,983 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 09:30:20,983 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 09:30:20,983 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 09:30:20,983 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 09:30:20,983 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 09:30:21,264 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 09:30:21,264 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 09:30:21,264 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 09:30:21,265 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 09:30:21,444 INFO [asr_datamodule.py:372] About to create dev dataloader
+2026-01-13 09:30:23,025 INFO [train.py:1204] Saving batch to /kaggle/working/amharic_training/exp_amharic_streaming/batch-b74d0fb1-32e7-0629-8fad-c1a606cb0fb3.pt
+2026-01-13 09:30:23,027 INFO [train.py:1210] features shape: torch.Size([3, 1515, 80])
+2026-01-13 09:30:23,028 INFO [train.py:1214] num tokens: 197

log/log-train-2026-01-13-09-36-17 ADDED Viewed

	@@ -0,0 +1,95 @@

+2026-01-13 09:36:17,322 INFO [train.py:967] Training started
+2026-01-13 09:36:17,323 INFO [train.py:977] Device: cuda:0
+2026-01-13 09:36:17,326 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": false,
+  "enable_spec_aug": false,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-dirty",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": true,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 120,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 3000,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 09:36:17,326 INFO [train.py:988] About to create model
+2026-01-13 09:36:17,900 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 09:36:17,917 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 09:36:19,966 INFO [asr_datamodule.py:443] About to get the shuffled train-clean-100,             train-clean-360 and train-other-500 cuts

log/log-train-2026-01-13-09-38-48 ADDED Viewed

	@@ -0,0 +1,525 @@

+2026-01-13 09:38:48,908 INFO [train.py:967] Training started
+2026-01-13 09:38:48,909 INFO [train.py:977] Device: cuda:0
+2026-01-13 09:38:48,912 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": false,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-dirty",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 100,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 09:38:48,912 INFO [train.py:988] About to create model
+2026-01-13 09:38:49,490 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 09:38:49,507 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 09:38:51,546 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 09:38:51,548 INFO [asr_datamodule.py:239] Disable MUSAN
+2026-01-13 09:38:51,548 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 09:38:51,548 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 09:38:51,548 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 09:38:51,548 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 09:38:51,548 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 09:38:51,839 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 09:38:51,839 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 09:38:51,840 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 09:38:51,840 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 09:38:52,028 INFO [asr_datamodule.py:372] About to create dev dataloader
+2026-01-13 09:38:55,662 INFO [train.py:895] Epoch 1, batch 0, loss[loss=8.196, simple_loss=7.458, pruned_loss=7.367, over 2262.00 frames. ], tot_loss[loss=8.196, simple_loss=7.458, pruned_loss=7.367, over 2262.00 frames. ], batch size: 6, lr: 2.50e-02, grad_scale: 2.0
+2026-01-13 09:38:55,663 INFO [train.py:920] Computing validation loss
+2026-01-13 09:39:40,650 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.9124, 2.9127, 2.9131, 2.9103, 2.9125, 2.9127, 2.9127, 2.9128],
+       device='cuda:0'), covar=tensor([0.0180, 0.0263, 0.0187, 0.0132, 0.0140, 0.0314, 0.0158, 0.0139],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.5573e-06, 8.6458e-06, 8.6548e-06, 8.5687e-06, 8.8456e-06, 8.6909e-06,
+        8.7530e-06, 8.7242e-06], device='cuda:0')
+2026-01-13 09:40:12,314 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.2631, 4.2631, 4.2631, 4.2631, 4.2631, 4.2631, 4.2631, 4.2631],
+       device='cuda:0'), covar=tensor([0.0011, 0.0010, 0.0010, 0.0007, 0.0005, 0.0007, 0.0005, 0.0008],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0008],
+       device='cuda:0'), out_proj_covar=tensor([8.8449e-06, 8.8559e-06, 8.7936e-06, 8.6492e-06, 8.7990e-06, 8.7099e-06,
+        8.5965e-06, 8.7138e-06], device='cuda:0')
+2026-01-13 09:40:24,340 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.5677, 3.5751, 3.5746, 3.5756, 3.5726, 3.5743, 3.5743, 3.5759],
+       device='cuda:0'), covar=tensor([0.0066, 0.0065, 0.0093, 0.0109, 0.0074, 0.0122, 0.0076, 0.0135],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7006e-06, 8.7711e-06, 8.6196e-06, 8.7975e-06, 8.6467e-06, 8.7045e-06,
+        8.6997e-06, 8.8221e-06], device='cuda:0')
+2026-01-13 09:40:43,154 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.9190, 2.9195, 2.9200, 2.9152, 2.9188, 2.9196, 2.9196, 2.9197],
+       device='cuda:0'), covar=tensor([0.0052, 0.0086, 0.0089, 0.0054, 0.0055, 0.0080, 0.0096, 0.0067],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.5573e-06, 8.6458e-06, 8.6548e-06, 8.5687e-06, 8.8458e-06, 8.6908e-06,
+        8.7530e-06, 8.7242e-06], device='cuda:0')
+2026-01-13 09:41:22,552 INFO [zipformer.py:2441] attn_weights_entropy = tensor([1.8265, 1.8260, 1.8262, 1.8234, 1.8246, 1.8255, 1.8255, 1.8256],
+       device='cuda:0'), covar=tensor([0.0322, 0.0257, 0.0258, 0.0232, 0.0242, 0.0275, 0.0276, 0.0327],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7107e-06, 8.6163e-06, 8.9108e-06, 8.6139e-06, 8.8207e-06, 8.7166e-06,
+        8.7750e-06, 8.7847e-06], device='cuda:0')
+2026-01-13 09:41:23,283 INFO [zipformer.py:2441] attn_weights_entropy = tensor([1.6762, 1.6758, 1.6760, 1.6734, 1.6745, 1.6751, 1.6754, 1.6755],
+       device='cuda:0'), covar=tensor([0.0395, 0.0321, 0.0324, 0.0269, 0.0302, 0.0365, 0.0328, 0.0471],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7107e-06, 8.6162e-06, 8.9105e-06, 8.6139e-06, 8.8209e-06, 8.7166e-06,
+        8.7750e-06, 8.7847e-06], device='cuda:0')
+2026-01-13 09:41:26,322 INFO [train.py:929] Epoch 1, validation: loss=8.293, simple_loss=7.536, pruned_loss=7.553, over 1639044.00 frames.
+2026-01-13 09:41:26,323 INFO [train.py:930] Maximum memory allocated so far is 3020MB
+2026-01-13 09:41:27,531 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=5.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 09:41:28,530 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=79.25 vs. limit=5.0
+2026-01-13 09:41:32,768 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=23.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:41:41,398 INFO [train.py:895] Epoch 1, batch 50, loss[loss=1.013, simple_loss=0.9012, pruned_loss=1.006, over 2380.00 frames. ], tot_loss[loss=2.137, simple_loss=1.941, pruned_loss=1.892, over 101034.02 frames. ], batch size: 6, lr: 2.75e-02, grad_scale: 2.0
+2026-01-13 09:41:51,000 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=83.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:41:53,990 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=59.66 vs. limit=5.0
+2026-01-13 09:41:56,109 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.818e+01 2.693e+01 4.002e+01 1.599e+02 1.649e+03, threshold=8.004e+01, percent-clipped=0.0
+2026-01-13 09:41:56,149 INFO [train.py:895] Epoch 1, batch 100, loss[loss=1.077, simple_loss=0.9404, pruned_loss=1.1, over 2169.00 frames. ], tot_loss[loss=1.562, simple_loss=1.403, pruned_loss=1.448, over 178952.89 frames. ], batch size: 6, lr: 3.00e-02, grad_scale: 2.0
+2026-01-13 09:42:09,042 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=144.0, num_to_drop=2, layers_to_drop={1, 3}
+2026-01-13 09:42:11,221 INFO [train.py:895] Epoch 1, batch 150, loss[loss=0.9299, simple_loss=0.7989, pruned_loss=0.9596, over 2374.00 frames. ], tot_loss[loss=1.341, simple_loss=1.19, pruned_loss=1.288, over 241574.68 frames. ], batch size: 6, lr: 3.25e-02, grad_scale: 2.0
+2026-01-13 09:42:18,782 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.5431, 4.5462, 4.5454, 4.5350, 4.5452, 4.5414, 4.5458, 4.5460],
+       device='cuda:0'), covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.9723e-06, 8.9389e-06, 8.8992e-06, 9.0146e-06, 8.9093e-06, 9.1135e-06,
+        8.9242e-06, 9.0929e-06], device='cuda:0')
+2026-01-13 09:42:27,341 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.079e+01 2.605e+01 3.127e+01 3.866e+01 7.758e+01, threshold=6.254e+01, percent-clipped=0.0
+2026-01-13 09:42:27,380 INFO [train.py:895] Epoch 1, batch 200, loss[loss=1.111, simple_loss=0.9477, pruned_loss=1.107, over 2206.00 frames. ], tot_loss[loss=1.217, simple_loss=1.067, pruned_loss=1.196, over 289242.18 frames. ], batch size: 13, lr: 3.50e-02, grad_scale: 2.0
+2026-01-13 09:42:32,960 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=3.76 vs. limit=2.0
+2026-01-13 09:42:35,865 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=38.69 vs. limit=5.0
+2026-01-13 09:42:42,305 INFO [train.py:895] Epoch 1, batch 250, loss[loss=1.007, simple_loss=0.8392, pruned_loss=1.036, over 2247.00 frames. ], tot_loss[loss=1.136, simple_loss=0.9856, pruned_loss=1.13, over 326643.59 frames. ], batch size: 9, lr: 3.75e-02, grad_scale: 2.0
+2026-01-13 09:42:45,709 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=62.66 vs. limit=5.0
+2026-01-13 09:42:53,670 INFO [zipformer.py:2441] attn_weights_entropy = tensor([5.0704, 5.0704, 5.0704, 5.0704, 5.0704, 5.0704, 5.0704, 5.0704],
+       device='cuda:0'), covar=tensor([3.2139e-05, 1.0774e-05, 1.5964e-05, 1.5737e-05, 1.1412e-05, 1.8972e-05,
+        1.2123e-05, 1.1349e-05], device='cuda:0'), in_proj_covar=tensor([0.0008, 0.0008, 0.0008, 0.0008, 0.0007, 0.0008, 0.0008, 0.0007],
+       device='cuda:0'), out_proj_covar=tensor([7.8729e-06, 7.9249e-06, 7.8565e-06, 7.7460e-06, 7.9483e-06, 7.8248e-06,
+        7.6244e-06, 7.7748e-06], device='cuda:0')
+2026-01-13 09:42:54,351 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=4.62 vs. limit=2.0
+2026-01-13 09:42:55,783 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=296.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:42:56,888 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=300.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 09:42:57,106 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.604e+01 3.256e+01 4.117e+01 5.033e+01 1.037e+02, threshold=8.234e+01, percent-clipped=8.0
+2026-01-13 09:42:57,142 INFO [train.py:895] Epoch 1, batch 300, loss[loss=0.987, simple_loss=0.8183, pruned_loss=0.9782, over 2330.00 frames. ], tot_loss[loss=1.078, simple_loss=0.925, pruned_loss=1.077, over 354852.99 frames. ], batch size: 8, lr: 4.00e-02, grad_scale: 2.0
+2026-01-13 09:43:00,299 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=47.75 vs. limit=5.0
+2026-01-13 09:43:01,193 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.32 vs. limit=2.0
+2026-01-13 09:43:02,601 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=3.60 vs. limit=2.0
+2026-01-13 09:43:11,925 INFO [train.py:895] Epoch 1, batch 350, loss[loss=0.8632, simple_loss=0.7046, pruned_loss=0.8543, over 2315.00 frames. ], tot_loss[loss=1.035, simple_loss=0.8795, pruned_loss=1.033, over 377251.63 frames. ], batch size: 6, lr: 4.25e-02, grad_scale: 2.0
+2026-01-13 09:43:13,787 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=357.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 09:43:18,900 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=46.42 vs. limit=5.0
+2026-01-13 09:43:22,106 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.63 vs. limit=2.0
+2026-01-13 09:43:22,608 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=387.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:43:23,048 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=3.81 vs. limit=2.0
+2026-01-13 09:43:24,218 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=29.38 vs. limit=5.0
+2026-01-13 09:43:27,086 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.616e+01 3.100e+01 3.626e+01 4.338e+01 9.138e+01, threshold=7.251e+01, percent-clipped=1.0
+2026-01-13 09:43:27,122 INFO [train.py:895] Epoch 1, batch 400, loss[loss=0.9037, simple_loss=0.7319, pruned_loss=0.8742, over 2369.00 frames. ], tot_loss[loss=1.001, simple_loss=0.8426, pruned_loss=0.9903, over 395631.04 frames. ], batch size: 7, lr: 4.50e-02, grad_scale: 4.0
+2026-01-13 09:43:32,183 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=29.92 vs. limit=5.0
+2026-01-13 09:43:36,098 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.43 vs. limit=2.0
+2026-01-13 09:43:38,302 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=439.0, num_to_drop=2, layers_to_drop={0, 3}
+2026-01-13 09:43:39,300 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.70 vs. limit=2.0
+2026-01-13 09:43:40,707 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.5933, 2.5865, 2.5947, 2.5947, 2.5950, 2.5951, 2.5953, 2.5942],
+       device='cuda:0'), covar=tensor([0.0013, 0.0020, 0.0018, 0.0018, 0.0021, 0.0016, 0.0016, 0.0015],
+       device='cuda:0'), in_proj_covar=tensor([0.0008, 0.0008, 0.0008, 0.0008, 0.0009, 0.0008, 0.0008, 0.0008],
+       device='cuda:0'), out_proj_covar=tensor([8.2654e-06, 7.9629e-06, 8.3062e-06, 8.0001e-06, 8.3341e-06, 8.1874e-06,
+        8.1496e-06, 8.3140e-06], device='cuda:0')
+2026-01-13 09:43:41,023 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=448.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 09:43:41,835 INFO [train.py:895] Epoch 1, batch 450, loss[loss=0.9395, simple_loss=0.7559, pruned_loss=0.8873, over 2344.00 frames. ], tot_loss[loss=0.9793, simple_loss=0.8177, pruned_loss=0.9582, over 408289.93 frames. ], batch size: 7, lr: 4.75e-02, grad_scale: 4.0
+2026-01-13 09:43:46,909 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.23 vs. limit=2.0
+2026-01-13 09:43:56,117 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.1839, 4.1946, 4.1719, 4.2045, 4.1976, 4.1819, 4.1965, 4.1874],
+       device='cuda:0'), covar=tensor([0.0057, 0.0040, 0.0062, 0.0071, 0.0071, 0.0062, 0.0044, 0.0062],
+       device='cuda:0'), in_proj_covar=tensor([0.0008, 0.0008, 0.0008, 0.0008, 0.0009, 0.0008, 0.0008, 0.0008],
+       device='cuda:0'), out_proj_covar=tensor([8.0830e-06, 8.4915e-06, 8.0144e-06, 8.1578e-06, 8.3533e-06, 8.0091e-06,
+        8.0887e-06, 8.3024e-06], device='cuda:0')
+2026-01-13 09:43:56,616 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.677e+01 3.262e+01 3.704e+01 4.217e+01 2.894e+02, threshold=7.407e+01, percent-clipped=2.0
+2026-01-13 09:43:56,652 INFO [train.py:895] Epoch 1, batch 500, loss[loss=0.8784, simple_loss=0.7029, pruned_loss=0.8098, over 2259.00 frames. ], tot_loss[loss=0.9674, simple_loss=0.8003, pruned_loss=0.9367, over 419371.59 frames. ], batch size: 8, lr: 4.99e-02, grad_scale: 4.0
+2026-01-13 09:44:05,968 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.40 vs. limit=2.0
+2026-01-13 09:44:11,365 INFO [train.py:895] Epoch 1, batch 550, loss[loss=0.9381, simple_loss=0.745, pruned_loss=0.8499, over 2290.00 frames. ], tot_loss[loss=0.9573, simple_loss=0.7846, pruned_loss=0.9164, over 427131.39 frames. ], batch size: 10, lr: 4.98e-02, grad_scale: 4.0
+2026-01-13 09:44:14,914 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=562.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:44:16,581 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=568.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:44:23,033 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=590.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:44:26,483 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=600.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 09:44:26,678 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.774e+01 3.348e+01 3.655e+01 4.119e+01 1.479e+02, threshold=7.311e+01, percent-clipped=2.0
+2026-01-13 09:44:26,715 INFO [train.py:895] Epoch 1, batch 600, loss[loss=0.843, simple_loss=0.6637, pruned_loss=0.7531, over 2279.00 frames. ], tot_loss[loss=0.946, simple_loss=0.7685, pruned_loss=0.8949, over 434274.41 frames. ], batch size: 9, lr: 4.98e-02, grad_scale: 4.0
+2026-01-13 09:44:27,166 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.38 vs. limit=2.0
+2026-01-13 09:44:33,320 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=623.0, num_to_drop=2, layers_to_drop={2, 3}
+2026-01-13 09:44:35,180 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=629.0, num_to_drop=2, layers_to_drop={1, 3}
+2026-01-13 09:44:36,455 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.3829, 4.3479, 4.2036, 3.7323, 3.9239, 4.3881, 4.3071, 3.9026],
+       device='cuda:0'), covar=tensor([0.0327, 0.0871, 0.0503, 0.3171, 0.3102, 0.0416, 0.0333, 0.4048],
+       device='cuda:0'), in_proj_covar=tensor([0.0015, 0.0017, 0.0015, 0.0021, 0.0017, 0.0016, 0.0014, 0.0016],
+       device='cuda:0'), out_proj_covar=tensor([1.3055e-05, 1.5864e-05, 1.2996e-05, 1.9293e-05, 1.4758e-05, 1.2554e-05,
+        1.2267e-05, 1.3596e-05], device='cuda:0')
+2026-01-13 09:44:36,514 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=13.57 vs. limit=2.0
+2026-01-13 09:44:40,835 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=648.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:44:41,736 INFO [train.py:895] Epoch 1, batch 650, loss[loss=0.875, simple_loss=0.6743, pruned_loss=0.7901, over 2405.00 frames. ], tot_loss[loss=0.9412, simple_loss=0.7572, pruned_loss=0.8803, over 440352.52 frames. ], batch size: 7, lr: 4.98e-02, grad_scale: 4.0
+2026-01-13 09:44:41,827 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=651.0, num_to_drop=2, layers_to_drop={2, 3}
+2026-01-13 09:44:42,098 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=652.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 09:44:44,092 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=64.62 vs. limit=5.0
+2026-01-13 09:44:44,780 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=12.02 vs. limit=5.0
+2026-01-13 09:44:52,970 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.98 vs. limit=2.0
+2026-01-13 09:44:56,854 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.978e+01 3.549e+01 3.892e+01 4.312e+01 1.243e+02, threshold=7.784e+01, percent-clipped=2.0
+2026-01-13 09:44:56,891 INFO [train.py:895] Epoch 1, batch 700, loss[loss=0.925, simple_loss=0.7123, pruned_loss=0.8124, over 2171.00 frames. ], tot_loss[loss=0.9397, simple_loss=0.7489, pruned_loss=0.8687, over 443805.35 frames. ], batch size: 6, lr: 4.98e-02, grad_scale: 4.0
+2026-01-13 09:45:02,096 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=14.77 vs. limit=2.0
+2026-01-13 09:45:08,243 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=6.70 vs. limit=2.0
+2026-01-13 09:45:08,387 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=739.0, num_to_drop=2, layers_to_drop={0, 2}
+2026-01-13 09:45:09,558 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=743.0, num_to_drop=2, layers_to_drop={1, 3}
+2026-01-13 09:45:11,977 INFO [train.py:895] Epoch 1, batch 750, loss[loss=0.8599, simple_loss=0.6573, pruned_loss=0.7445, over 2367.00 frames. ], tot_loss[loss=0.9381, simple_loss=0.7412, pruned_loss=0.856, over 448175.63 frames. ], batch size: 7, lr: 4.97e-02, grad_scale: 4.0
+2026-01-13 09:45:13,018 INFO [zipformer.py:2441] attn_weights_entropy = tensor([6.3220, 6.3644, 6.4193, 6.3498, 6.0585, 6.3995, 6.3733, 6.3806],
+       device='cuda:0'), covar=tensor([0.0301, 0.0121, 0.0082, 0.0241, 0.3057, 0.0181, 0.0211, 0.0271],
+       device='cuda:0'), in_proj_covar=tensor([0.0011, 0.0010, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0011],
+       device='cuda:0'), out_proj_covar=tensor([1.1219e-05, 9.8383e-06, 1.1192e-05, 1.0935e-05, 1.2783e-05, 1.1378e-05,
+        1.1662e-05, 1.1140e-05], device='cuda:0')
+2026-01-13 09:45:22,965 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=787.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:45:27,477 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 3.088e+01 3.664e+01 4.289e+01 4.703e+01 6.734e+01, threshold=8.577e+01, percent-clipped=0.0
+2026-01-13 09:45:27,514 INFO [train.py:895] Epoch 1, batch 800, loss[loss=0.934, simple_loss=0.7043, pruned_loss=0.8053, over 2276.00 frames. ], tot_loss[loss=0.9419, simple_loss=0.7373, pruned_loss=0.8496, over 448512.04 frames. ], batch size: 6, lr: 4.97e-02, grad_scale: 8.0
+2026-01-13 09:45:41,806 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=847.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:45:43,029 INFO [train.py:895] Epoch 1, batch 850, loss[loss=0.9701, simple_loss=0.7146, pruned_loss=0.8437, over 2440.00 frames. ], tot_loss[loss=0.9431, simple_loss=0.7322, pruned_loss=0.8396, over 451605.10 frames. ], batch size: 8, lr: 4.96e-02, grad_scale: 8.0
+2026-01-13 09:45:55,298 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=14.89 vs. limit=5.0
+2026-01-13 09:45:58,313 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 3.338e+01 3.938e+01 4.476e+01 5.274e+01 8.302e+01, threshold=8.953e+01, percent-clipped=0.0
+2026-01-13 09:45:58,350 INFO [train.py:895] Epoch 1, batch 900, loss[loss=0.9333, simple_loss=0.6945, pruned_loss=0.7819, over 2270.00 frames. ], tot_loss[loss=0.9477, simple_loss=0.7301, pruned_loss=0.8326, over 451613.21 frames. ], batch size: 6, lr: 4.96e-02, grad_scale: 8.0
+2026-01-13 09:46:00,583 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=908.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 09:46:03,854 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=918.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 09:46:05,671 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=924.0, num_to_drop=2, layers_to_drop={2, 3}
+2026-01-13 09:46:07,051 WARNING [optim.py:385] Scaling gradients by 0.02290330082178116, model_norm_threshold=89.527587890625
+2026-01-13 09:46:07,134 INFO [optim.py:446] Parameter Dominanting tot_sumsq encoder.encoders.2.out_combiner.weight1 with proportion 0.69, where dominant_sumsq=(grad_sumsq*orig_rms_sq)=1.052e+07, grad_sumsq = 1.052e+07, orig_rms_sq=1.000e+00
+2026-01-13 09:46:12,518 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=946.0, num_to_drop=2, layers_to_drop={0, 3}
+2026-01-13 09:46:14,182 INFO [train.py:895] Epoch 1, batch 950, loss[loss=0.974, simple_loss=0.7228, pruned_loss=0.8013, over 2237.00 frames. ], tot_loss[loss=0.9471, simple_loss=0.7245, pruned_loss=0.8207, over 452647.09 frames. ], batch size: 6, lr: 4.96e-02, grad_scale: 8.0
+2026-01-13 09:46:14,544 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=952.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 09:46:18,240 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=9.11 vs. limit=2.0
+2026-01-13 09:46:29,348 INFO [checkpoint.py:74] Saving checkpoint to /kaggle/working/amharic_training/exp_amharic_streaming/checkpoint-1000.pt
+2026-01-13 09:46:31,055 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1000.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:46:31,323 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 3.225e+01 4.239e+01 4.740e+01 5.717e+01 3.909e+03, threshold=9.481e+01, percent-clipped=4.0
+2026-01-13 09:46:31,360 INFO [train.py:895] Epoch 1, batch 1000, loss[loss=0.9992, simple_loss=0.7254, pruned_loss=0.8273, over 2138.00 frames. ], tot_loss[loss=0.9529, simple_loss=0.7237, pruned_loss=0.8148, over 452027.12 frames. ], batch size: 6, lr: 4.95e-02, grad_scale: 8.0
+2026-01-13 09:46:32,192 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=7.36 vs. limit=2.0
+2026-01-13 09:46:35,315 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.19 vs. limit=2.0
+2026-01-13 09:46:44,085 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=6.18 vs. limit=2.0
+2026-01-13 09:46:45,018 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=3.73 vs. limit=2.0
+2026-01-13 09:46:45,220 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1043.0, num_to_drop=2, layers_to_drop={0, 2}
+2026-01-13 09:46:47,702 INFO [train.py:895] Epoch 1, batch 1050, loss[loss=0.8471, simple_loss=0.6468, pruned_loss=0.6461, over 2257.00 frames. ], tot_loss[loss=0.9489, simple_loss=0.7186, pruned_loss=0.7973, over 453061.93 frames. ], batch size: 7, lr: 4.95e-02, grad_scale: 8.0
+2026-01-13 09:46:59,085 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=3.39 vs. limit=2.0
+2026-01-13 09:47:00,751 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.75 vs. limit=2.0
+2026-01-13 09:47:00,832 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=4.42 vs. limit=2.0
+2026-01-13 09:47:00,933 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1091.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:47:01,422 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.40 vs. limit=2.0
+2026-01-13 09:47:02,013 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.7871, 3.3411, 3.1725, 3.3538, 3.1468, 2.7462, 3.3241, 4.1834],
+       device='cuda:0'), covar=tensor([0.5168, 0.2776, 0.4386, 0.2887, 0.4230, 0.4194, 0.2773, 0.1286],
+       device='cuda:0'), in_proj_covar=tensor([0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010],
+       device='cuda:0'), out_proj_covar=tensor([9.9681e-06, 9.8241e-06, 9.7288e-06, 9.2153e-06, 1.0065e-05, 1.0017e-05,
+        9.3450e-06, 9.7447e-06], device='cuda:0')
+2026-01-13 09:47:04,412 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 3.947e+01 4.937e+01 5.799e+01 6.625e+01 2.691e+02, threshold=1.160e+02, percent-clipped=9.0
+2026-01-13 09:47:04,448 INFO [train.py:895] Epoch 1, batch 1100, loss[loss=0.9168, simple_loss=0.7179, pruned_loss=0.6659, over 2463.00 frames. ], tot_loss[loss=0.9442, simple_loss=0.7155, pruned_loss=0.777, over 453995.58 frames. ], batch size: 7, lr: 4.94e-02, grad_scale: 8.0
+2026-01-13 09:47:04,977 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=44.13 vs. limit=5.0
+2026-01-13 09:47:21,206 INFO [train.py:895] Epoch 1, batch 1150, loss[loss=0.8858, simple_loss=0.6845, pruned_loss=0.6444, over 2471.00 frames. ], tot_loss[loss=0.9409, simple_loss=0.7135, pruned_loss=0.7586, over 454837.72 frames. ], batch size: 7, lr: 4.94e-02, grad_scale: 8.0
+2026-01-13 09:47:21,923 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1153.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:47:25,103 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.4813, 2.6394, 2.9559, 2.4652, 2.4085, 2.5175, 2.5225, 3.3136],
+       device='cuda:0'), covar=tensor([0.2658, 0.2703, 0.1351, 0.2667, 0.2957, 0.3443, 0.3015, 0.1583],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0008, 0.0008, 0.0009, 0.0008, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([7.9596e-06, 7.8302e-06, 7.8867e-06, 7.6591e-06, 8.3952e-06, 7.5446e-06,
+        8.2542e-06, 7.9533e-06], device='cuda:0')
+2026-01-13 09:47:29,464 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.74 vs. limit=2.0
+2026-01-13 09:47:37,278 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 4.803e+01 6.248e+01 8.274e+01 9.946e+01 3.148e+02, threshold=1.655e+02, percent-clipped=16.0
+2026-01-13 09:47:37,314 INFO [train.py:895] Epoch 1, batch 1200, loss[loss=0.7617, simple_loss=0.595, pruned_loss=0.5393, over 2259.00 frames. ], tot_loss[loss=0.9339, simple_loss=0.7101, pruned_loss=0.7372, over 454163.66 frames. ], batch size: 7, lr: 4.93e-02, grad_scale: 8.0
+2026-01-13 09:47:38,065 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=1203.0, num_to_drop=2, layers_to_drop={2, 3}
+2026-01-13 09:47:41,691 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1214.0, num_to_drop=2, layers_to_drop={2, 3}
+2026-01-13 09:47:43,027 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1218.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 09:47:44,899 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1224.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:47:48,187 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=18.86 vs. limit=5.0
+2026-01-13 09:47:52,090 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1246.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:47:53,669 INFO [train.py:895] Epoch 1, batch 1250, loss[loss=0.8348, simple_loss=0.6625, pruned_loss=0.5725, over 2319.00 frames. ], tot_loss[loss=0.9119, simple_loss=0.6973, pruned_loss=0.7037, over 455507.43 frames. ], batch size: 8, lr: 4.92e-02, grad_scale: 8.0
+2026-01-13 09:47:58,426 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1266.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:48:00,215 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=4.19 vs. limit=2.0
+2026-01-13 09:48:00,320 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1272.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:48:01,919 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=3.88 vs. limit=2.0
+2026-01-13 09:48:04,757 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=4.28 vs. limit=2.0
+2026-01-13 09:48:08,089 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1294.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:48:10,425 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 5.801e+01 8.651e+01 1.054e+02 1.337e+02 2.832e+02, threshold=2.109e+02, percent-clipped=10.0
+2026-01-13 09:48:10,462 INFO [train.py:895] Epoch 1, batch 1300, loss[loss=1.188, simple_loss=0.8983, pruned_loss=0.8486, over 2158.00 frames. ], tot_loss[loss=0.8911, simple_loss=0.6852, pruned_loss=0.6728, over 454448.31 frames. ], batch size: 11, lr: 4.92e-02, grad_scale: 8.0
+2026-01-13 09:48:22,074 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.47 vs. limit=2.0
+2026-01-13 09:48:25,431 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=3.42 vs. limit=2.0
+2026-01-13 09:48:28,155 INFO [train.py:895] Epoch 1, batch 1350, loss[loss=0.7584, simple_loss=0.6027, pruned_loss=0.5076, over 2399.00 frames. ], tot_loss[loss=0.8633, simple_loss=0.668, pruned_loss=0.6379, over 454592.91 frames. ], batch size: 7, lr: 4.91e-02, grad_scale: 8.0
+2026-01-13 09:48:46,298 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 5.250e+01 1.099e+02 1.429e+02 2.120e+02 4.530e+02, threshold=2.858e+02, percent-clipped=25.0
+2026-01-13 09:48:46,335 INFO [train.py:895] Epoch 1, batch 1400, loss[loss=0.7066, simple_loss=0.5769, pruned_loss=0.4541, over 2443.00 frames. ], tot_loss[loss=0.847, simple_loss=0.6578, pruned_loss=0.614, over 455370.88 frames. ], batch size: 6, lr: 4.91e-02, grad_scale: 8.0
+2026-01-13 09:48:49,948 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.89 vs. limit=2.0
+2026-01-13 09:48:53,267 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.50 vs. limit=2.0
+2026-01-13 09:48:53,956 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=3.45 vs. limit=2.0
+2026-01-13 09:48:57,931 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=3.51 vs. limit=2.0
+2026-01-13 09:48:58,113 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1434.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:49:04,093 INFO [train.py:895] Epoch 1, batch 1450, loss[loss=0.8106, simple_loss=0.6392, pruned_loss=0.5356, over 2244.00 frames. ], tot_loss[loss=0.8415, simple_loss=0.6549, pruned_loss=0.5997, over 455410.58 frames. ], batch size: 8, lr: 4.90e-02, grad_scale: 8.0
+2026-01-13 09:49:11,403 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=11.56 vs. limit=5.0
+2026-01-13 09:49:16,155 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.12 vs. limit=2.0
+2026-01-13 09:49:19,849 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1495.0, num_to_drop=2, layers_to_drop={0, 2}
+2026-01-13 09:49:21,803 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 7.048e+01 1.247e+02 1.643e+02 2.341e+02 7.445e+02, threshold=3.286e+02, percent-clipped=10.0
+2026-01-13 09:49:21,842 INFO [train.py:895] Epoch 1, batch 1500, loss[loss=0.6875, simple_loss=0.5587, pruned_loss=0.4364, over 2236.00 frames. ], tot_loss[loss=0.8262, simple_loss=0.6459, pruned_loss=0.5784, over 455144.27 frames. ], batch size: 7, lr: 4.89e-02, grad_scale: 8.0
+2026-01-13 09:49:22,634 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1503.0, num_to_drop=1, layers_to_drop={2}
+2026-01-13 09:49:24,775 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=1509.0, num_to_drop=2, layers_to_drop={1, 3}
+2026-01-13 09:49:31,675 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=11.32 vs. limit=5.0
+2026-01-13 09:49:33,976 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=3.45 vs. limit=2.0
+2026-01-13 09:49:38,887 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=3.08 vs. limit=2.0
+2026-01-13 09:49:40,450 INFO [train.py:895] Epoch 1, batch 1550, loss[loss=0.7783, simple_loss=0.6287, pruned_loss=0.493, over 2390.00 frames. ], tot_loss[loss=0.8172, simple_loss=0.6406, pruned_loss=0.5632, over 454433.80 frames. ], batch size: 8, lr: 4.89e-02, grad_scale: 8.0
+2026-01-13 09:49:40,481 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1551.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:49:46,056 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1566.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:49:52,453 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.21 vs. limit=2.0
+2026-01-13 09:49:59,272 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 8.336e+01 1.422e+02 1.858e+02 2.468e+02 4.982e+02, threshold=3.715e+02, percent-clipped=15.0
+2026-01-13 09:49:59,309 INFO [train.py:895] Epoch 1, batch 1600, loss[loss=0.6818, simple_loss=0.5446, pruned_loss=0.4329, over 2261.00 frames. ], tot_loss[loss=0.8009, simple_loss=0.6302, pruned_loss=0.5435, over 454032.56 frames. ], batch size: 7, lr: 4.88e-02, grad_scale: 8.0
+2026-01-13 09:49:59,309 INFO [train.py:920] Computing validation loss
+2026-01-13 09:50:04,794 INFO [zipformer.py:2441] attn_weights_entropy = tensor([0.9485, 1.0214, 1.0466, 0.9932, 1.1770, 1.0263, 1.0004, 0.9625],
+       device='cuda:0'), covar=tensor([0.5975, 0.4278, 0.3709, 0.4989, 0.3976, 0.4152, 0.4645, 0.5407],
+       device='cuda:0'), in_proj_covar=tensor([0.0015, 0.0014, 0.0013, 0.0014, 0.0014, 0.0013, 0.0015, 0.0015],
+       device='cuda:0'), out_proj_covar=tensor([1.1172e-05, 1.2094e-05, 9.8530e-06, 1.0434e-05, 1.0908e-05, 1.0459e-05,
+        1.1226e-05, 1.1101e-05], device='cuda:0')
+2026-01-13 09:50:07,418 INFO [zipformer.py:2441] attn_weights_entropy = tensor([0.6509, 0.7616, 0.7264, 0.6654, 0.8065, 0.7020, 0.7154, 0.6604],
+       device='cuda:0'), covar=tensor([0.5922, 0.4539, 0.4135, 0.5331, 0.4574, 0.4358, 0.4921, 0.6024],
+       device='cuda:0'), in_proj_covar=tensor([0.0015, 0.0014, 0.0013, 0.0014, 0.0014, 0.0013, 0.0015, 0.0015],
+       device='cuda:0'), out_proj_covar=tensor([1.1172e-05, 1.2094e-05, 9.8530e-06, 1.0434e-05, 1.0908e-05, 1.0459e-05,
+        1.1226e-05, 1.1101e-05], device='cuda:0')
+2026-01-13 09:50:41,279 INFO [zipformer.py:2441] attn_weights_entropy = tensor([1.1372, 2.3905, 2.0286, 1.1676, 1.5905, 2.8965, 2.8251, 2.2906],
+       device='cuda:0'), covar=tensor([1.2944, 0.6857, 0.7795, 2.2906, 1.1652, 0.4459, 0.3884, 0.6072],
+       device='cuda:0'), in_proj_covar=tensor([0.0016, 0.0016, 0.0015, 0.0024, 0.0019, 0.0015, 0.0012, 0.0014],
+       device='cuda:0'), out_proj_covar=tensor([1.7762e-05, 1.3191e-05, 1.3544e-05, 2.9703e-05, 1.8251e-05, 1.1722e-05,
+        1.0617e-05, 1.0359e-05], device='cuda:0')
+2026-01-13 09:51:02,260 INFO [zipformer.py:2441] attn_weights_entropy = tensor([1.1776, 0.8942, 0.8937, 1.0881, 1.0727, 1.0311, 1.0130, 0.9743],
+       device='cuda:0'), covar=tensor([0.4991, 0.5299, 0.8790, 0.5863, 0.6438, 0.6071, 0.7429, 0.7961],
+       device='cuda:0'), in_proj_covar=tensor([0.0012, 0.0013, 0.0014, 0.0011, 0.0013, 0.0013, 0.0014, 0.0013],
+       device='cuda:0'), out_proj_covar=tensor([9.1695e-06, 9.5475e-06, 9.6757e-06, 8.6551e-06, 8.3471e-06, 8.6966e-06,
+        9.7825e-06, 9.2441e-06], device='cuda:0')
+2026-01-13 09:51:08,319 INFO [zipformer.py:2441] attn_weights_entropy = tensor([1.7615, 1.3277, 1.4572, 1.8445, 1.5323, 1.4364, 1.7330, 1.6826],
+       device='cuda:0'), covar=tensor([0.3174, 0.3797, 0.3159, 0.2289, 0.2958, 0.3382, 0.2717, 0.2685],
+       device='cuda:0'), in_proj_covar=tensor([0.0008, 0.0008, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007, 0.0007],
+       device='cuda:0'), out_proj_covar=tensor([5.8173e-06, 6.3159e-06, 5.7286e-06, 4.7694e-06, 5.4294e-06, 5.4778e-06,
+        6.3024e-06, 5.1621e-06], device='cuda:0')
+2026-01-13 09:51:23,138 INFO [zipformer.py:2441] attn_weights_entropy = tensor([1.0185, 0.9183, 0.9436, 1.0427, 1.0311, 1.1100, 0.9328, 1.1403],
+       device='cuda:0'), covar=tensor([0.4380, 0.6846, 0.4444, 0.3839, 0.3990, 0.3684, 0.4812, 0.3465],
+       device='cuda:0'), in_proj_covar=tensor([0.0012, 0.0013, 0.0012, 0.0013, 0.0011, 0.0012, 0.0012, 0.0012],
+       device='cuda:0'), out_proj_covar=tensor([1.0313e-05, 1.1725e-05, 1.0247e-05, 1.0120e-05, 8.9623e-06, 9.7133e-06,
+        1.0096e-05, 9.1179e-06], device='cuda:0')
+2026-01-13 09:51:31,567 INFO [zipformer.py:2441] attn_weights_entropy = tensor([0.7309, 0.8547, 0.8271, 0.7734, 0.9467, 0.8852, 0.8127, 0.7781],
+       device='cuda:0'), covar=tensor([0.6571, 0.5323, 0.4511, 0.5943, 0.5395, 0.4595, 0.5631, 0.6575],
+       device='cuda:0'), in_proj_covar=tensor([0.0015, 0.0014, 0.0013, 0.0014, 0.0014, 0.0013, 0.0015, 0.0015],
+       device='cuda:0'), out_proj_covar=tensor([1.1172e-05, 1.2094e-05, 9.8530e-06, 1.0434e-05, 1.0908e-05, 1.0459e-05,
+        1.1226e-05, 1.1101e-05], device='cuda:0')
+2026-01-13 09:51:49,665 INFO [zipformer.py:2441] attn_weights_entropy = tensor([0.6173, 0.5222, 0.5937, 0.5901, 0.6267, 0.5477, 0.6054, 0.5795],
+       device='cuda:0'), covar=tensor([0.2362, 0.3030, 0.2300, 0.2101, 0.2639, 0.2161, 0.2632, 0.2306],
+       device='cuda:0'), in_proj_covar=tensor([0.0007, 0.0007, 0.0006, 0.0006, 0.0008, 0.0006, 0.0007, 0.0007],
+       device='cuda:0'), out_proj_covar=tensor([5.7955e-06, 5.8694e-06, 5.4043e-06, 5.6972e-06, 6.5964e-06, 5.6070e-06,
+        6.3476e-06, 5.3398e-06], device='cuda:0')
+2026-01-13 09:52:01,649 INFO [zipformer.py:2441] attn_weights_entropy = tensor([0.7619, 0.7391, 0.7737, 0.6055, 0.7114, 0.7327, 0.6225, 0.6765],
+       device='cuda:0'), covar=tensor([0.5252, 0.5068, 0.6815, 0.7808, 0.5537, 0.6139, 0.5133, 0.5908],
+       device='cuda:0'), in_proj_covar=tensor([0.0011, 0.0011, 0.0012, 0.0011, 0.0011, 0.0011, 0.0010, 0.0011],
+       device='cuda:0'), out_proj_covar=tensor([8.6005e-06, 8.7409e-06, 9.4261e-06, 9.2131e-06, 8.4794e-06, 9.1113e-06,
+        8.7320e-06, 9.2104e-06], device='cuda:0')
+2026-01-13 09:52:06,854 INFO [zipformer.py:2441] attn_weights_entropy = tensor([0.7252, 0.6511, 0.7103, 0.6278, 0.7067, 0.6907, 0.6260, 0.6583],
+       device='cuda:0'), covar=tensor([0.5315, 0.5400, 0.6805, 0.7501, 0.5915, 0.6420, 0.5023, 0.6269],
+       device='cuda:0'), in_proj_covar=tensor([0.0011, 0.0011, 0.0012, 0.0011, 0.0011, 0.0011, 0.0010, 0.0011],
+       device='cuda:0'), out_proj_covar=tensor([8.6005e-06, 8.7409e-06, 9.4261e-06, 9.2131e-06, 8.4794e-06, 9.1113e-06,
+        8.7320e-06, 9.2104e-06], device='cuda:0')
+2026-01-13 09:52:07,289 INFO [zipformer.py:2441] attn_weights_entropy = tensor([1.0412, 0.8934, 0.9503, 0.8047, 0.9606, 0.9714, 0.7580, 0.8332],
+       device='cuda:0'), covar=tensor([0.3802, 0.3549, 0.4864, 0.5985, 0.4014, 0.4819, 0.4034, 0.5168],
+       device='cuda:0'), in_proj_covar=tensor([0.0011, 0.0011, 0.0012, 0.0011, 0.0011, 0.0011, 0.0010, 0.0011],
+       device='cuda:0'), out_proj_covar=tensor([8.6005e-06, 8.7409e-06, 9.4261e-06, 9.2131e-06, 8.4794e-06, 9.1113e-06,
+        8.7320e-06, 9.2104e-06], device='cuda:0')
+2026-01-13 09:52:35,650 INFO [train.py:929] Epoch 1, validation: loss=1.398, simple_loss=1.091, pruned_loss=0.9058, over 1639044.00 frames.
+2026-01-13 09:52:35,651 INFO [train.py:930] Maximum memory allocated so far is 3020MB
+2026-01-13 09:52:39,553 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1611.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:52:45,633 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1627.0, num_to_drop=2, layers_to_drop={1, 3}
+2026-01-13 09:52:54,704 INFO [train.py:895] Epoch 1, batch 1650, loss[loss=0.7192, simple_loss=0.5969, pruned_loss=0.4374, over 2442.00 frames. ], tot_loss[loss=0.7982, simple_loss=0.6286, pruned_loss=0.5347, over 453599.94 frames. ], batch size: 8, lr: 4.87e-02, grad_scale: 8.0
+2026-01-13 09:52:57,694 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=12.14 vs. limit=5.0
+2026-01-13 09:52:59,396 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.00 vs. limit=2.0
+2026-01-13 09:52:59,701 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.07 vs. limit=2.0
+2026-01-13 09:53:00,833 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.10 vs. limit=2.0
+2026-01-13 09:53:03,096 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1672.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 09:53:06,415 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.04 vs. limit=2.0
+2026-01-13 09:53:12,317 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=8.96 vs. limit=5.0
+2026-01-13 09:53:13,516 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=2.40 vs. limit=2.0
+2026-01-13 09:53:14,421 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 8.619e+01 1.501e+02 1.846e+02 2.535e+02 6.490e+02, threshold=3.693e+02, percent-clipped=7.0
+2026-01-13 09:53:14,458 INFO [train.py:895] Epoch 1, batch 1700, loss[loss=0.6405, simple_loss=0.5307, pruned_loss=0.3876, over 2438.00 frames. ], tot_loss[loss=0.7794, simple_loss=0.6169, pruned_loss=0.5144, over 454813.30 frames. ], batch size: 6, lr: 4.86e-02, grad_scale: 8.0
+2026-01-13 09:53:17,559 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=1.92 vs. limit=2.0
+2026-01-13 09:53:31,569 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=2.85 vs. limit=2.0
+2026-01-13 09:53:34,203 INFO [train.py:895] Epoch 1, batch 1750, loss[loss=0.5803, simple_loss=0.5001, pruned_loss=0.3369, over 2123.00 frames. ], tot_loss[loss=0.7662, simple_loss=0.609, pruned_loss=0.4988, over 455720.24 frames. ], batch size: 5, lr: 4.86e-02, grad_scale: 8.0
+2026-01-13 09:53:35,326 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0
+2026-01-13 09:53:35,983 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0
+2026-01-13 09:53:42,399 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=10.78 vs. limit=5.0
+2026-01-13 09:53:49,299 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=1790.0, num_to_drop=1, layers_to_drop={3}
+2026-01-13 09:53:53,681 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.077e+02 1.778e+02 2.102e+02 2.903e+02 5.195e+02, threshold=4.204e+02, percent-clipped=14.0
+2026-01-13 09:53:53,718 INFO [train.py:895] Epoch 1, batch 1800, loss[loss=0.5619, simple_loss=0.485, pruned_loss=0.3243, over 2335.00 frames. ], tot_loss[loss=0.7618, simple_loss=0.6062, pruned_loss=0.4904, over 455635.28 frames. ], batch size: 7, lr: 4.85e-02, grad_scale: 8.0
+2026-01-13 09:53:56,955 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1809.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:54:02,456 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.49 vs. limit=2.0
+2026-01-13 09:54:05,241 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.10 vs. limit=2.0
+2026-01-13 09:54:08,383 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1838.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:54:13,316 INFO [train.py:895] Epoch 1, batch 1850, loss[loss=0.6067, simple_loss=0.5128, pruned_loss=0.355, over 2220.00 frames. ], tot_loss[loss=0.7509, simple_loss=0.5993, pruned_loss=0.4779, over 454670.71 frames. ], batch size: 7, lr: 4.84e-02, grad_scale: 8.0
+2026-01-13 09:54:15,731 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1857.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:54:15,783 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1857.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:54:18,262 INFO [zipformer.py:2441] attn_weights_entropy = tensor([1.1113, 3.6336, 2.1818, 1.1288, 2.2892, 4.2194, 4.1625, 3.0072],
+       device='cuda:0'), covar=tensor([2.4718, 0.5397, 1.0978, 3.3853, 1.8217, 0.2623, 0.3167, 0.7466],
+       device='cuda:0'), in_proj_covar=tensor([0.0039, 0.0036, 0.0030, 0.0050, 0.0042, 0.0034, 0.0030, 0.0033],
+       device='cuda:0'), out_proj_covar=tensor([4.0907e-05, 2.6241e-05, 2.4574e-05, 7.0151e-05, 4.1970e-05, 1.9934e-05,
+        1.8683e-05, 2.1075e-05], device='cuda:0')
+2026-01-13 09:54:20,204 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=2.18 vs. limit=2.0
+2026-01-13 09:54:21,226 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1870.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 09:54:32,872 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1899.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 09:54:33,537 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 9.977e+01 1.660e+02 2.307e+02 3.099e+02 6.701e+02, threshold=4.613e+02, percent-clipped=10.0
+2026-01-13 09:54:33,574 INFO [train.py:895] Epoch 1, batch 1900, loss[loss=0.5432, simple_loss=0.4714, pruned_loss=0.3096, over 2443.00 frames. ], tot_loss[loss=0.7371, simple_loss=0.5909, pruned_loss=0.4636, over 455659.63 frames. ], batch size: 8, lr: 4.83e-02, grad_scale: 8.0
+2026-01-13 09:54:40,531 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1918.0, num_to_drop=2, layers_to_drop={0, 3}
+2026-01-13 09:54:41,926 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=1922.0, num_to_drop=1, layers_to_drop={3}
+2026-01-13 09:54:45,411 INFO [zipformer.py:2441] attn_weights_entropy = tensor([1.6616, 3.2021, 2.3614, 1.2860, 2.6946, 3.6912, 3.7353, 2.8585],
+       device='cuda:0'), covar=tensor([1.8893, 0.5348, 0.7731, 2.7319, 1.0534, 0.3048, 0.2561, 0.6686],
+       device='cuda:0'), in_proj_covar=tensor([0.0048, 0.0043, 0.0035, 0.0059, 0.0048, 0.0038, 0.0035, 0.0038],
+       device='cuda:0'), out_proj_covar=tensor([4.9263e-05, 3.1043e-05, 2.8444e-05, 8.1894e-05, 4.7748e-05, 2.1732e-05,
+        2.1507e-05, 2.4959e-05], device='cuda:0')
+2026-01-13 09:54:45,465 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1931.0, num_to_drop=2, layers_to_drop={0, 2}
+2026-01-13 09:54:53,246 INFO [train.py:895] Epoch 1, batch 1950, loss[loss=0.6972, simple_loss=0.5634, pruned_loss=0.4178, over 2471.00 frames. ], tot_loss[loss=0.7256, simple_loss=0.5842, pruned_loss=0.4513, over 456451.07 frames. ], batch size: 7, lr: 4.83e-02, grad_scale: 8.0
+2026-01-13 09:54:56,530 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=2.28 vs. limit=2.0
+2026-01-13 09:54:59,431 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=1967.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:55:01,201 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=6.59 vs. limit=5.0
+2026-01-13 09:55:08,310 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0
+2026-01-13 09:55:12,763 INFO [checkpoint.py:74] Saving checkpoint to /kaggle/working/amharic_training/exp_amharic_streaming/checkpoint-2000.pt
+2026-01-13 09:55:14,778 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 9.039e+01 1.671e+02 2.317e+02 3.153e+02 6.405e+02, threshold=4.633e+02, percent-clipped=7.0
+2026-01-13 09:55:14,814 INFO [train.py:895] Epoch 1, batch 2000, loss[loss=0.5465, simple_loss=0.4921, pruned_loss=0.3005, over 2125.00 frames. ], tot_loss[loss=0.7253, simple_loss=0.5841, pruned_loss=0.4474, over 455320.95 frames. ], batch size: 5, lr: 4.82e-02, grad_scale: 16.0
+2026-01-13 09:55:16,442 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=2.40 vs. limit=2.0
+2026-01-13 09:55:17,659 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=2.13 vs. limit=2.0
+2026-01-13 09:55:18,448 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.18 vs. limit=2.0
+2026-01-13 09:55:23,753 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=7.56 vs. limit=5.0
+2026-01-13 09:55:24,025 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.1333, 2.3017, 2.0932, 2.2134, 2.2112, 1.9894, 2.1984, 2.2906],
+       device='cuda:0'), covar=tensor([0.1327, 0.1453, 0.1580, 0.1033, 0.1359, 0.1716, 0.1560, 0.1169],
+       device='cuda:0'), in_proj_covar=tensor([0.0006, 0.0007, 0.0008, 0.0005, 0.0006, 0.0007, 0.0007, 0.0006],
+       device='cuda:0'), out_proj_covar=tensor([4.1862e-06, 4.7623e-06, 4.9548e-06, 3.7227e-06, 3.8090e-06, 4.3603e-06,
+        4.4149e-06, 4.2052e-06], device='cuda:0')
+2026-01-13 09:55:34,066 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.11 vs. limit=2.0
+2026-01-13 09:55:35,305 INFO [train.py:895] Epoch 1, batch 2050, loss[loss=0.5633, simple_loss=0.4974, pruned_loss=0.3146, over 2371.00 frames. ], tot_loss[loss=0.7169, simple_loss=0.5798, pruned_loss=0.438, over 453890.85 frames. ], batch size: 6, lr: 4.81e-02, grad_scale: 16.0
+2026-01-13 09:55:51,871 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=2090.0, num_to_drop=2, layers_to_drop={0, 2}
+2026-01-13 09:55:53,767 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=7.28 vs. limit=5.0
+2026-01-13 09:55:56,470 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 9.468e+01 1.728e+02 2.144e+02 3.084e+02 6.957e+02, threshold=4.289e+02, percent-clipped=5.0
+2026-01-13 09:55:56,507 INFO [train.py:895] Epoch 1, batch 2100, loss[loss=0.5611, simple_loss=0.4876, pruned_loss=0.3173, over 2394.00 frames. ], tot_loss[loss=0.6991, simple_loss=0.5704, pruned_loss=0.4225, over 455451.10 frames. ], batch size: 7, lr: 4.80e-02, grad_scale: 16.0
+2026-01-13 09:56:11,487 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=2138.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:56:17,009 INFO [train.py:895] Epoch 1, batch 2150, loss[loss=0.6109, simple_loss=0.5232, pruned_loss=0.3493, over 2469.00 frames. ], tot_loss[loss=0.6897, simple_loss=0.5653, pruned_loss=0.4137, over 455600.73 frames. ], batch size: 7, lr: 4.79e-02, grad_scale: 16.0
+2026-01-13 09:56:27,375 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=4.12 vs. limit=2.0
+2026-01-13 09:56:34,541 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=2194.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:56:37,560 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.176e+02 1.799e+02 2.463e+02 3.181e+02 8.966e+02, threshold=4.926e+02, percent-clipped=8.0
+2026-01-13 09:56:37,596 INFO [train.py:895] Epoch 1, batch 2200, loss[loss=0.5524, simple_loss=0.5131, pruned_loss=0.2958, over 2229.00 frames. ], tot_loss[loss=0.6803, simple_loss=0.5597, pruned_loss=0.4056, over 455746.98 frames. ], batch size: 7, lr: 4.78e-02, grad_scale: 16.0
+2026-01-13 09:56:42,689 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=2213.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:56:44,879 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.7435, 2.7314, 2.9277, 2.7587, 2.2228, 2.5740, 2.8544, 2.9037],
+       device='cuda:0'), covar=tensor([0.0763, 0.0766, 0.0699, 0.0614, 0.1204, 0.0944, 0.0875, 0.0843],
+       device='cuda:0'), in_proj_covar=tensor([0.0005, 0.0005, 0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0005],
+       device='cuda:0'), out_proj_covar=tensor([3.2826e-06, 3.3741e-06, 3.4254e-06, 2.7983e-06, 3.0287e-06, 3.3280e-06,
+        3.3705e-06, 3.3298e-06], device='cuda:0')
+2026-01-13 09:56:46,448 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=2222.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 09:56:48,095 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=2226.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:56:58,612 INFO [train.py:895] Epoch 1, batch 2250, loss[loss=0.7146, simple_loss=0.5876, pruned_loss=0.4208, over 2367.00 frames. ], tot_loss[loss=0.6685, simple_loss=0.5542, pruned_loss=0.3954, over 457285.38 frames. ], batch size: 7, lr: 4.77e-02, grad_scale: 16.0
+2026-01-13 09:56:59,844 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.0555, 3.5513, 2.4385, 2.8331, 2.4610, 2.8623, 2.3547, 3.4015],
+       device='cuda:0'), covar=tensor([0.1854, 0.1543, 0.2826, 0.1857, 0.1965, 0.1528, 0.2516, 0.1202],
+       device='cuda:0'), in_proj_covar=tensor([0.0011, 0.0013, 0.0014, 0.0013, 0.0011, 0.0012, 0.0014, 0.0011],
+       device='cuda:0'), out_proj_covar=tensor([1.0297e-05, 1.2401e-05, 1.3388e-05, 1.2189e-05, 1.0600e-05, 1.0475e-05,
+        1.3766e-05, 9.5187e-06], device='cuda:0')
+2026-01-13 09:57:05,437 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=2267.0, num_to_drop=1, layers_to_drop={2}
+2026-01-13 09:57:06,628 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=2270.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:57:18,184 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=6.92 vs. limit=5.0
+2026-01-13 09:57:19,625 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.237e+02 2.269e+02 2.713e+02 3.933e+02 1.891e+03, threshold=5.426e+02, percent-clipped=14.0
+2026-01-13 09:57:19,662 INFO [train.py:895] Epoch 1, batch 2300, loss[loss=0.5302, simple_loss=0.4778, pruned_loss=0.2913, over 2321.00 frames. ], tot_loss[loss=0.6621, simple_loss=0.5505, pruned_loss=0.3899, over 456527.54 frames. ], batch size: 6, lr: 4.77e-02, grad_scale: 16.0
+2026-01-13 09:57:25,673 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=2315.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:57:31,014 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0
+2026-01-13 09:57:33,241 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.12 vs. limit=2.0
+2026-01-13 09:57:35,935 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=2340.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:57:40,387 INFO [train.py:895] Epoch 1, batch 2350, loss[loss=0.6283, simple_loss=0.5412, pruned_loss=0.3577, over 2336.00 frames. ], tot_loss[loss=0.6554, simple_loss=0.5478, pruned_loss=0.3839, over 456258.93 frames. ], batch size: 7, lr: 4.76e-02, grad_scale: 16.0
+2026-01-13 09:57:49,814 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=6.27 vs. limit=5.0
+2026-01-13 09:57:53,538 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=5.96 vs. limit=5.0
+2026-01-13 09:57:59,714 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=2.89 vs. limit=2.0
+2026-01-13 09:58:00,714 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=7.29 vs. limit=5.0
+2026-01-13 09:58:00,860 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.163e+02 2.073e+02 2.848e+02 3.857e+02 1.950e+03, threshold=5.696e+02, percent-clipped=7.0
+2026-01-13 09:58:00,896 INFO [train.py:895] Epoch 1, batch 2400, loss[loss=0.5903, simple_loss=0.5055, pruned_loss=0.3376, over 2167.00 frames. ], tot_loss[loss=0.6528, simple_loss=0.5472, pruned_loss=0.3811, over 456506.04 frames. ], batch size: 6, lr: 4.75e-02, grad_scale: 16.0
+2026-01-13 09:58:01,033 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=2401.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 09:58:01,545 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.37 vs. limit=2.0
+2026-01-13 09:58:05,178 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=3.54 vs. limit=2.0
+2026-01-13 09:58:17,764 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=6.40 vs. limit=5.0
+2026-01-13 09:58:21,056 INFO [train.py:895] Epoch 1, batch 2450, loss[loss=0.6489, simple_loss=0.5358, pruned_loss=0.381, over 2376.00 frames. ], tot_loss[loss=0.6565, simple_loss=0.5506, pruned_loss=0.3826, over 454657.62 frames. ], batch size: 7, lr: 4.74e-02, grad_scale: 16.0
+2026-01-13 09:58:21,195 INFO [zipformer.py:2441] attn_weights_entropy = tensor([1.2201, 1.1724, 1.0803, 0.8796, 0.9595, 1.1776, 0.8044, 1.0004],
+       device='cuda:0'), covar=tensor([0.0516, 0.0439, 0.0381, 0.0544, 0.0723, 0.0745, 0.0713, 0.0787],
+       device='cuda:0'), in_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004],
+       device='cuda:0'), out_proj_covar=tensor([2.4676e-06, 2.0972e-06, 2.4137e-06, 2.4490e-06, 2.9016e-06, 2.9526e-06,
+        3.0142e-06, 2.6793e-06], device='cuda:0')
+2026-01-13 09:58:25,497 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.32 vs. limit=2.0
+2026-01-13 09:58:34,719 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.4525, 3.0001, 2.9165, 2.9846, 2.3264, 3.0305, 3.1386, 3.2133],
+       device='cuda:0'), covar=tensor([0.0626, 0.0293, 0.0410, 0.0345, 0.0682, 0.0360, 0.0435, 0.0264],
+       device='cuda:0'), in_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004],
+       device='cuda:0'), out_proj_covar=tensor([2.6914e-06, 2.1113e-06, 2.5280e-06, 2.3105e-06, 2.5720e-06, 2.4078e-06,
+        2.5207e-06, 2.3794e-06], device='cuda:0')
+2026-01-13 09:58:39,264 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=2494.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 09:58:42,190 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.203e+02 2.126e+02 2.914e+02 3.815e+02 7.467e+02, threshold=5.828e+02, percent-clipped=8.0
+2026-01-13 09:58:42,227 INFO [train.py:895] Epoch 1, batch 2500, loss[loss=0.6273, simple_loss=0.5255, pruned_loss=0.3645, over 2236.00 frames. ], tot_loss[loss=0.6445, simple_loss=0.5436, pruned_loss=0.3738, over 455670.51 frames. ], batch size: 7, lr: 4.73e-02, grad_scale: 16.0
+2026-01-13 09:58:42,538 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=7.06 vs. limit=5.0
+2026-01-13 09:58:47,294 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=2513.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:58:48,643 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=6.91 vs. limit=5.0
+2026-01-13 09:58:52,670 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=2526.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:58:59,575 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=2542.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:59:03,255 INFO [train.py:895] Epoch 1, batch 2550, loss[loss=0.6106, simple_loss=0.5296, pruned_loss=0.3458, over 2116.00 frames. ], tot_loss[loss=0.6401, simple_loss=0.5424, pruned_loss=0.3698, over 454832.38 frames. ], batch size: 5, lr: 4.72e-02, grad_scale: 16.0
+2026-01-13 09:59:07,478 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=2561.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:59:12,783 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=2574.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 09:59:14,363 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=2.27 vs. limit=2.0
+2026-01-13 09:59:24,372 INFO [train.py:1204] Saving batch to /kaggle/working/amharic_training/exp_amharic_streaming/batch-bdd640fb-0667-1ad1-1c80-317fa3b1799d.pt
+2026-01-13 09:59:24,376 INFO [train.py:1210] features shape: torch.Size([7, 1417, 80])
+2026-01-13 09:59:24,376 INFO [train.py:1214] num tokens: 394

log/log-train-2026-01-13-09-59-44 ADDED Viewed

	@@ -0,0 +1,153 @@

+2026-01-13 09:59:44,226 INFO [train.py:967] Training started
+2026-01-13 09:59:44,227 INFO [train.py:977] Device: cuda:0
+2026-01-13 09:59:44,230 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": false,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-dirty",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 60,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 09:59:44,230 INFO [train.py:988] About to create model
+2026-01-13 09:59:44,827 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 09:59:44,844 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 09:59:46,879 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 09:59:46,880 INFO [asr_datamodule.py:239] Disable MUSAN
+2026-01-13 09:59:46,880 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 09:59:46,880 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 09:59:46,881 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 09:59:46,881 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 09:59:46,881 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 09:59:47,172 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 09:59:47,173 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 09:59:47,173 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 09:59:47,174 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 09:59:47,358 INFO [asr_datamodule.py:372] About to create dev dataloader
+2026-01-13 09:59:50,693 INFO [train.py:895] Epoch 1, batch 0, loss[loss=8.266, simple_loss=7.527, pruned_loss=7.369, over 1133.00 frames. ], tot_loss[loss=8.266, simple_loss=7.527, pruned_loss=7.369, over 1133.00 frames. ], batch size: 3, lr: 2.50e-02, grad_scale: 2.0
+2026-01-13 09:59:50,694 INFO [train.py:920] Computing validation loss
+2026-01-13 10:00:20,727 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.9221, 2.9223, 2.9224, 2.9208, 2.9222, 2.9222, 2.9224, 2.9219],
+       device='cuda:0'), covar=tensor([0.0289, 0.0429, 0.0275, 0.0194, 0.0285, 0.0507, 0.0181, 0.0234],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.5572e-06, 8.6463e-06, 8.6548e-06, 8.5693e-06, 8.8456e-06, 8.6909e-06,
+        8.7530e-06, 8.7243e-06], device='cuda:0')
+2026-01-13 10:00:40,775 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.2735, 4.2735, 4.2735, 4.2735, 4.2735, 4.2735, 4.2735, 4.2735],
+       device='cuda:0'), covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0001, 0.0003, 0.0001, 0.0004],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0008],
+       device='cuda:0'), out_proj_covar=tensor([8.8449e-06, 8.8559e-06, 8.7936e-06, 8.6492e-06, 8.7990e-06, 8.7099e-06,
+        8.5965e-06, 8.7138e-06], device='cuda:0')
+2026-01-13 10:00:48,266 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.5746, 3.5782, 3.5779, 3.5786, 3.5763, 3.5779, 3.5773, 3.5790],
+       device='cuda:0'), covar=tensor([0.0053, 0.0063, 0.0067, 0.0063, 0.0046, 0.0067, 0.0048, 0.0077],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7006e-06, 8.7711e-06, 8.6197e-06, 8.7972e-06, 8.6466e-06, 8.7046e-06,
+        8.6999e-06, 8.8221e-06], device='cuda:0')
+2026-01-13 10:01:00,127 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.9242, 2.9246, 2.9249, 2.9209, 2.9236, 2.9246, 2.9248, 2.9249],
+       device='cuda:0'), covar=tensor([0.0038, 0.0050, 0.0055, 0.0038, 0.0034, 0.0054, 0.0064, 0.0048],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.5573e-06, 8.6462e-06, 8.6548e-06, 8.5692e-06, 8.8457e-06, 8.6909e-06,
+        8.7529e-06, 8.7243e-06], device='cuda:0')
+2026-01-13 10:01:24,207 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.1306, 2.1304, 2.1301, 2.1287, 2.1291, 2.1297, 2.1296, 2.1297],
+       device='cuda:0'), covar=tensor([0.0316, 0.0136, 0.0202, 0.0123, 0.0172, 0.0243, 0.0188, 0.0208],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7112e-06, 8.6172e-06, 8.9107e-06, 8.6145e-06, 8.8203e-06, 8.7171e-06,
+        8.7750e-06, 8.7847e-06], device='cuda:0')
+2026-01-13 10:01:24,713 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.1635, 2.1630, 2.1631, 2.1599, 2.1612, 2.1618, 2.1615, 2.1627],
+       device='cuda:0'), covar=tensor([0.0131, 0.0082, 0.0122, 0.0081, 0.0099, 0.0123, 0.0113, 0.0135],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7110e-06, 8.6162e-06, 8.9107e-06, 8.6146e-06, 8.8203e-06, 8.7170e-06,
+        8.7749e-06, 8.7846e-06], device='cuda:0')
+2026-01-13 10:01:28,975 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.5961, 3.6034, 3.6028, 3.6037, 3.6003, 3.6029, 3.6022, 3.6039],
+       device='cuda:0'), covar=tensor([0.0042, 0.0059, 0.0067, 0.0086, 0.0057, 0.0092, 0.0050, 0.0103],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7006e-06, 8.7711e-06, 8.6196e-06, 8.7972e-06, 8.6465e-06, 8.7046e-06,
+        8.7002e-06, 8.8221e-06], device='cuda:0')
+2026-01-13 10:01:31,718 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.1828, 4.1828, 4.1828, 4.1828, 4.1828, 4.1828, 4.1828, 4.1828],
+       device='cuda:0'), covar=tensor([0.0005, 0.0003, 0.0004, 0.0003, 0.0002, 0.0005, 0.0001, 0.0006],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0008],
+       device='cuda:0'), out_proj_covar=tensor([8.8449e-06, 8.8559e-06, 8.7936e-06, 8.6492e-06, 8.7990e-06, 8.7099e-06,
+        8.5965e-06, 8.7138e-06], device='cuda:0')
+2026-01-13 10:02:01,735 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.6188, 3.6187, 3.6187, 3.6187, 3.6186, 3.6188, 3.6188, 3.6187],
+       device='cuda:0'), covar=tensor([0.0008, 0.0007, 0.0008, 0.0007, 0.0008, 0.0007, 0.0006, 0.0006],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0008, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.9692e-06, 8.7182e-06, 8.7382e-06, 8.6883e-06, 8.7636e-06, 8.6839e-06,
+        8.6905e-06, 8.8196e-06], device='cuda:0')

log/log-train-2026-01-13-10-02-58 ADDED Viewed

	@@ -0,0 +1,544 @@

+2026-01-13 10:02:58,748 INFO [train.py:967] Training started
+2026-01-13 10:02:58,749 INFO [train.py:977] Device: cuda:0
+2026-01-13 10:02:58,751 INFO [train.py:986] {
+  "am_scale": 0.0,
+  "attention_dims": "192,192,192,192,192",
+  "average_period": 200,
+  "base_lr": 0.05,
+  "batch_idx_train": 0,
+  "best_train_epoch": -1,
+  "best_train_loss": Infinity,
+  "best_valid_epoch": -1,
+  "best_valid_loss": Infinity,
+  "blank_id": 0,
+  "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
+  "bucketing_sampler": true,
+  "cnn_module_kernels": "31,31,31,31,31",
+  "concatenate_cuts": false,
+  "context_size": 2,
+  "decode_chunk_len": 32,
+  "decoder_dim": 512,
+  "drop_last": true,
+  "duration_factor": 1.0,
+  "enable_musan": false,
+  "enable_spec_aug": true,
+  "encoder_dims": "384,384,384,384,384",
+  "encoder_unmasked_dims": "256,256,256,256,256",
+  "env_info": {
+    "IP address": "172.19.2.2",
+    "hostname": "8e64ffbd666a",
+    "icefall-git-branch": "master",
+    "icefall-git-date": "Fri Nov 28 03:42:20 2025",
+    "icefall-git-sha1": "0904e490-dirty",
+    "icefall-path": "/kaggle/working/icefall",
+    "k2-build-type": "Release",
+    "k2-git-date": "Thu Jul 25 03:34:26 2024",
+    "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
+    "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
+    "k2-version": "1.24.4",
+    "k2-with-cuda": true,
+    "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
+    "lhotse-version": "1.32.1",
+    "python-version": "3.12",
+    "torch-cuda-available": true,
+    "torch-cuda-version": "12.1",
+    "torch-version": "2.4.0+cu121"
+  },
+  "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
+  "feature_dim": 80,
+  "feedforward_dims": "1024,1024,2048,2048,1024",
+  "full_libri": false,
+  "gap": 1.0,
+  "inf_check": false,
+  "input_strategy": "PrecomputedFeatures",
+  "joiner_dim": 512,
+  "keep_last_k": 5,
+  "lm_scale": 0.25,
+  "log_interval": 50,
+  "lr_batches": 5000,
+  "lr_epochs": 3.5,
+  "manifest_dir": "/kaggle/working/amharic_training/manifests",
+  "master_port": 12354,
+  "max_duration": 60,
+  "mini_libri": false,
+  "nhead": "8,8,8,8,8",
+  "num_buckets": 30,
+  "num_encoder_layers": "2,4,3,2,4",
+  "num_epochs": 50,
+  "num_left_chunks": 4,
+  "num_workers": 2,
+  "on_the_fly_feats": false,
+  "print_diagnostics": false,
+  "prune_range": 5,
+  "reset_interval": 200,
+  "return_cuts": true,
+  "save_every_n": 1000,
+  "seed": 42,
+  "short_chunk_size": 50,
+  "shuffle": true,
+  "simple_loss_scale": 0.5,
+  "spec_aug_time_warp_factor": 80,
+  "start_batch": 0,
+  "start_epoch": 1,
+  "subsampling_factor": 4,
+  "tensorboard": true,
+  "use_fp16": true,
+  "valid_interval": 1600,
+  "vocab_size": 1000,
+  "warm_step": 2000,
+  "world_size": 1,
+  "zipformer_downsampling_factors": "1,2,4,8,2"
+}
+2026-01-13 10:02:58,752 INFO [train.py:988] About to create model
+2026-01-13 10:02:59,338 INFO [zipformer.py:405] At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
+2026-01-13 10:02:59,355 INFO [train.py:992] Number of model parameters: 71330891
+2026-01-13 10:03:01,414 INFO [asr_datamodule.py:422] About to get train-clean-100 cuts
+2026-01-13 10:03:01,415 INFO [asr_datamodule.py:239] Disable MUSAN
+2026-01-13 10:03:01,415 INFO [asr_datamodule.py:257] Enable SpecAugment
+2026-01-13 10:03:01,415 INFO [asr_datamodule.py:258] Time warp factor: 80
+2026-01-13 10:03:01,415 INFO [asr_datamodule.py:268] Num frame mask: 10
+2026-01-13 10:03:01,415 INFO [asr_datamodule.py:281] About to create train dataset
+2026-01-13 10:03:01,415 INFO [asr_datamodule.py:308] Using DynamicBucketingSampler.
+2026-01-13 10:03:01,702 INFO [asr_datamodule.py:324] About to create train dataloader
+2026-01-13 10:03:01,703 INFO [asr_datamodule.py:460] About to get dev-clean cuts
+2026-01-13 10:03:01,703 INFO [asr_datamodule.py:467] About to get dev-other cuts
+2026-01-13 10:03:01,704 INFO [asr_datamodule.py:355] About to create dev dataset
+2026-01-13 10:03:01,891 INFO [asr_datamodule.py:372] About to create dev dataloader
+2026-01-13 10:03:05,307 INFO [train.py:895] Epoch 1, batch 0, loss[loss=8.266, simple_loss=7.527, pruned_loss=7.369, over 1133.00 frames. ], tot_loss[loss=8.266, simple_loss=7.527, pruned_loss=7.369, over 1133.00 frames. ], batch size: 3, lr: 2.50e-02, grad_scale: 2.0
+2026-01-13 10:03:05,308 INFO [train.py:920] Computing validation loss
+2026-01-13 10:03:35,030 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.9221, 2.9223, 2.9224, 2.9208, 2.9222, 2.9222, 2.9224, 2.9219],
+       device='cuda:0'), covar=tensor([0.0289, 0.0429, 0.0275, 0.0194, 0.0285, 0.0507, 0.0181, 0.0234],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.5572e-06, 8.6463e-06, 8.6548e-06, 8.5693e-06, 8.8456e-06, 8.6909e-06,
+        8.7530e-06, 8.7243e-06], device='cuda:0')
+2026-01-13 10:03:55,014 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.2735, 4.2735, 4.2735, 4.2735, 4.2735, 4.2735, 4.2735, 4.2735],
+       device='cuda:0'), covar=tensor([0.0003, 0.0002, 0.0003, 0.0002, 0.0001, 0.0003, 0.0001, 0.0004],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0008],
+       device='cuda:0'), out_proj_covar=tensor([8.8449e-06, 8.8559e-06, 8.7936e-06, 8.6492e-06, 8.7990e-06, 8.7099e-06,
+        8.5965e-06, 8.7138e-06], device='cuda:0')
+2026-01-13 10:04:02,610 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.5746, 3.5782, 3.5779, 3.5786, 3.5763, 3.5779, 3.5773, 3.5790],
+       device='cuda:0'), covar=tensor([0.0053, 0.0063, 0.0067, 0.0063, 0.0046, 0.0067, 0.0048, 0.0077],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7006e-06, 8.7711e-06, 8.6197e-06, 8.7972e-06, 8.6466e-06, 8.7046e-06,
+        8.6999e-06, 8.8221e-06], device='cuda:0')
+2026-01-13 10:04:14,629 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.9242, 2.9246, 2.9249, 2.9209, 2.9236, 2.9246, 2.9248, 2.9249],
+       device='cuda:0'), covar=tensor([0.0038, 0.0050, 0.0055, 0.0038, 0.0034, 0.0054, 0.0064, 0.0048],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.5573e-06, 8.6462e-06, 8.6548e-06, 8.5692e-06, 8.8457e-06, 8.6909e-06,
+        8.7529e-06, 8.7243e-06], device='cuda:0')
+2026-01-13 10:04:38,796 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.1306, 2.1304, 2.1301, 2.1287, 2.1291, 2.1297, 2.1296, 2.1297],
+       device='cuda:0'), covar=tensor([0.0316, 0.0136, 0.0202, 0.0123, 0.0172, 0.0243, 0.0188, 0.0208],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7112e-06, 8.6172e-06, 8.9107e-06, 8.6145e-06, 8.8203e-06, 8.7171e-06,
+        8.7750e-06, 8.7847e-06], device='cuda:0')
+2026-01-13 10:04:39,312 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.1635, 2.1630, 2.1631, 2.1599, 2.1612, 2.1618, 2.1615, 2.1627],
+       device='cuda:0'), covar=tensor([0.0131, 0.0082, 0.0122, 0.0081, 0.0099, 0.0123, 0.0113, 0.0135],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7110e-06, 8.6162e-06, 8.9107e-06, 8.6146e-06, 8.8203e-06, 8.7170e-06,
+        8.7749e-06, 8.7846e-06], device='cuda:0')
+2026-01-13 10:04:43,601 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.5961, 3.6034, 3.6028, 3.6037, 3.6003, 3.6029, 3.6022, 3.6039],
+       device='cuda:0'), covar=tensor([0.0042, 0.0059, 0.0067, 0.0086, 0.0057, 0.0092, 0.0050, 0.0103],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7006e-06, 8.7711e-06, 8.6196e-06, 8.7972e-06, 8.6465e-06, 8.7046e-06,
+        8.7002e-06, 8.8221e-06], device='cuda:0')
+2026-01-13 10:04:46,335 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.1828, 4.1828, 4.1828, 4.1828, 4.1828, 4.1828, 4.1828, 4.1828],
+       device='cuda:0'), covar=tensor([0.0005, 0.0003, 0.0004, 0.0003, 0.0002, 0.0005, 0.0001, 0.0006],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0008],
+       device='cuda:0'), out_proj_covar=tensor([8.8449e-06, 8.8559e-06, 8.7936e-06, 8.6492e-06, 8.7990e-06, 8.7099e-06,
+        8.5965e-06, 8.7138e-06], device='cuda:0')
+2026-01-13 10:05:16,390 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.6188, 3.6187, 3.6187, 3.6187, 3.6186, 3.6188, 3.6188, 3.6187],
+       device='cuda:0'), covar=tensor([0.0008, 0.0007, 0.0008, 0.0007, 0.0008, 0.0007, 0.0006, 0.0006],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0008, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.9692e-06, 8.7182e-06, 8.7382e-06, 8.6883e-06, 8.7636e-06, 8.6839e-06,
+        8.6905e-06, 8.8196e-06], device='cuda:0')
+2026-01-13 10:05:46,073 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.1632, 2.1624, 2.1626, 2.1565, 2.1608, 2.1604, 2.1616, 2.1622],
+       device='cuda:0'), covar=tensor([0.0160, 0.0075, 0.0116, 0.0057, 0.0119, 0.0080, 0.0140, 0.0076],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.7110e-06, 8.6162e-06, 8.9107e-06, 8.6146e-06, 8.8203e-06, 8.7170e-06,
+        8.7749e-06, 8.7846e-06], device='cuda:0')
+2026-01-13 10:05:50,142 INFO [train.py:929] Epoch 1, validation: loss=8.287, simple_loss=7.531, pruned_loss=7.546, over 1639044.00 frames.
+2026-01-13 10:05:50,143 INFO [train.py:930] Maximum memory allocated so far is 2030MB
+2026-01-13 10:05:51,382 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=5.0, num_to_drop=2, layers_to_drop={2, 3}
+2026-01-13 10:05:56,066 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=23.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:05:57,397 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.2354, 3.2356, 3.2346, 3.2231, 3.2300, 3.2323, 3.2290, 3.2346],
+       device='cuda:0'), covar=tensor([0.0024, 0.0016, 0.0019, 0.0024, 0.0016, 0.0019, 0.0029, 0.0019],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
+       device='cuda:0'), out_proj_covar=tensor([8.4937e-06, 8.5913e-06, 8.5835e-06, 8.5034e-06, 8.7562e-06, 8.6215e-06,
+        8.6786e-06, 8.6622e-06], device='cuda:0')
+2026-01-13 10:06:03,306 INFO [train.py:895] Epoch 1, batch 50, loss[loss=1.123, simple_loss=0.9985, pruned_loss=1.118, over 1193.00 frames. ], tot_loss[loss=2.117, simple_loss=1.925, pruned_loss=1.847, over 59730.87 frames. ], batch size: 3, lr: 2.75e-02, grad_scale: 2.0
+2026-01-13 10:06:05,784 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=6.93 vs. limit=2.0
+2026-01-13 10:06:11,404 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=83.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:06:15,697 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.247e+01 1.688e+01 2.420e+01 1.018e+02 9.593e+02, threshold=4.839e+01, percent-clipped=0.0
+2026-01-13 10:06:15,737 INFO [train.py:895] Epoch 1, batch 100, loss[loss=1.128, simple_loss=0.9891, pruned_loss=1.125, over 1446.00 frames. ], tot_loss[loss=1.551, simple_loss=1.395, pruned_loss=1.424, over 105044.27 frames. ], batch size: 4, lr: 3.00e-02, grad_scale: 2.0
+2026-01-13 10:06:18,412 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=6.89 vs. limit=2.0
+2026-01-13 10:06:26,856 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=144.0, num_to_drop=2, layers_to_drop={2, 3}
+2026-01-13 10:06:28,641 INFO [train.py:895] Epoch 1, batch 150, loss[loss=1.008, simple_loss=0.8689, pruned_loss=1.024, over 1183.00 frames. ], tot_loss[loss=1.356, simple_loss=1.205, pruned_loss=1.291, over 138555.34 frames. ], batch size: 3, lr: 3.25e-02, grad_scale: 2.0
+2026-01-13 10:06:33,546 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.94 vs. limit=2.0
+2026-01-13 10:06:36,797 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=39.25 vs. limit=5.0
+2026-01-13 10:06:42,258 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.485e+01 1.810e+01 2.094e+01 2.492e+01 6.264e+01, threshold=4.188e+01, percent-clipped=1.0
+2026-01-13 10:06:42,297 INFO [train.py:895] Epoch 1, batch 200, loss[loss=1.301, simple_loss=1.113, pruned_loss=1.279, over 1355.00 frames. ], tot_loss[loss=1.241, simple_loss=1.091, pruned_loss=1.208, over 166013.53 frames. ], batch size: 8, lr: 3.50e-02, grad_scale: 2.0
+2026-01-13 10:06:43,406 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.06 vs. limit=2.0
+2026-01-13 10:06:48,267 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=36.28 vs. limit=5.0
+2026-01-13 10:06:48,926 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.97 vs. limit=2.0
+2026-01-13 10:06:54,966 INFO [train.py:895] Epoch 1, batch 250, loss[loss=1.005, simple_loss=0.8386, pruned_loss=1.028, over 1267.00 frames. ], tot_loss[loss=1.163, simple_loss=1.011, pruned_loss=1.143, over 187714.69 frames. ], batch size: 5, lr: 3.75e-02, grad_scale: 2.0
+2026-01-13 10:07:03,312 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.32 vs. limit=2.0
+2026-01-13 10:07:06,058 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=296.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:07:07,082 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=300.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 10:07:07,260 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.720e+01 2.137e+01 2.520e+01 3.008e+01 4.658e+01, threshold=5.040e+01, percent-clipped=3.0
+2026-01-13 10:07:07,297 INFO [train.py:895] Epoch 1, batch 300, loss[loss=0.9649, simple_loss=0.7989, pruned_loss=0.9603, over 1446.00 frames. ], tot_loss[loss=1.1, simple_loss=0.9458, pruned_loss=1.087, over 204992.57 frames. ], batch size: 5, lr: 4.00e-02, grad_scale: 2.0
+2026-01-13 10:07:15,182 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=2.11 vs. limit=2.0
+2026-01-13 10:07:15,347 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.4091, 4.4137, 4.4072, 4.4138, 4.4079, 4.4135, 4.4141, 4.4132],
+       device='cuda:0'), covar=tensor([0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0002],
+       device='cuda:0'), in_proj_covar=tensor([0.0008, 0.0008, 0.0007, 0.0008, 0.0008, 0.0008, 0.0008, 0.0007],
+       device='cuda:0'), out_proj_covar=tensor([7.4896e-06, 7.4002e-06, 7.0811e-06, 7.3262e-06, 7.2432e-06, 7.2198e-06,
+        7.2014e-06, 7.2145e-06], device='cuda:0')
+2026-01-13 10:07:19,349 INFO [train.py:895] Epoch 1, batch 350, loss[loss=0.7895, simple_loss=0.6479, pruned_loss=0.7695, over 1158.00 frames. ], tot_loss[loss=1.057, simple_loss=0.9004, pruned_loss=1.042, over 217504.64 frames. ], batch size: 3, lr: 4.25e-02, grad_scale: 2.0
+2026-01-13 10:07:20,855 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=357.0, num_to_drop=2, layers_to_drop={0, 3}
+2026-01-13 10:07:28,253 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=387.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:07:30,338 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=64.46 vs. limit=5.0
+2026-01-13 10:07:31,907 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.850e+01 2.341e+01 2.632e+01 3.206e+01 6.190e+01, threshold=5.264e+01, percent-clipped=1.0
+2026-01-13 10:07:31,944 INFO [train.py:895] Epoch 1, batch 400, loss[loss=0.9079, simple_loss=0.744, pruned_loss=0.8502, over 1355.00 frames. ], tot_loss[loss=1.027, simple_loss=0.8671, pruned_loss=1.006, over 226931.44 frames. ], batch size: 4, lr: 4.50e-02, grad_scale: 4.0
+2026-01-13 10:07:40,135 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=49.41 vs. limit=5.0
+2026-01-13 10:07:41,414 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=439.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 10:07:43,729 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=448.0, num_to_drop=2, layers_to_drop={0, 3}
+2026-01-13 10:07:43,818 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.90 vs. limit=2.0
+2026-01-13 10:07:44,370 INFO [train.py:895] Epoch 1, batch 450, loss[loss=0.9968, simple_loss=0.8023, pruned_loss=0.9403, over 1337.00 frames. ], tot_loss[loss=1.009, simple_loss=0.8434, pruned_loss=0.9798, over 234986.57 frames. ], batch size: 4, lr: 4.75e-02, grad_scale: 4.0
+2026-01-13 10:07:48,323 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=68.73 vs. limit=5.0
+2026-01-13 10:07:52,076 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.6090, 3.6094, 3.6071, 3.6109, 3.6119, 3.6011, 3.6134, 3.5858],
+       device='cuda:0'), covar=tensor([0.0019, 0.0009, 0.0016, 0.0015, 0.0011, 0.0020, 0.0012, 0.0036],
+       device='cuda:0'), in_proj_covar=tensor([0.0008, 0.0007, 0.0008, 0.0008, 0.0008, 0.0008, 0.0008, 0.0008],
+       device='cuda:0'), out_proj_covar=tensor([7.7488e-06, 7.7514e-06, 7.7011e-06, 7.8076e-06, 7.6817e-06, 7.7351e-06,
+        7.6346e-06, 7.8219e-06], device='cuda:0')
+2026-01-13 10:07:56,565 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.097e+01 2.438e+01 2.785e+01 3.159e+01 1.015e+02, threshold=5.571e+01, percent-clipped=4.0
+2026-01-13 10:07:56,602 INFO [train.py:895] Epoch 1, batch 500, loss[loss=0.9108, simple_loss=0.7262, pruned_loss=0.8468, over 1408.00 frames. ], tot_loss[loss=0.9969, simple_loss=0.8253, pruned_loss=0.9603, over 241851.11 frames. ], batch size: 5, lr: 4.99e-02, grad_scale: 4.0
+2026-01-13 10:08:02,301 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.47 vs. limit=2.0
+2026-01-13 10:08:09,066 INFO [train.py:895] Epoch 1, batch 550, loss[loss=0.8607, simple_loss=0.6779, pruned_loss=0.7939, over 1355.00 frames. ], tot_loss[loss=0.992, simple_loss=0.8132, pruned_loss=0.9464, over 245979.79 frames. ], batch size: 6, lr: 4.98e-02, grad_scale: 4.0
+2026-01-13 10:08:11,762 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=562.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:08:12,067 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=7.86 vs. limit=2.0
+2026-01-13 10:08:13,184 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=568.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:08:18,724 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=590.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:08:21,695 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=600.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 10:08:21,787 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=41.78 vs. limit=5.0
+2026-01-13 10:08:21,843 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.809e+01 2.308e+01 2.527e+01 2.916e+01 6.883e+01, threshold=5.054e+01, percent-clipped=3.0
+2026-01-13 10:08:21,880 INFO [train.py:895] Epoch 1, batch 600, loss[loss=1.063, simple_loss=0.8337, pruned_loss=0.9579, over 1234.00 frames. ], tot_loss[loss=0.9855, simple_loss=0.7998, pruned_loss=0.9315, over 249445.36 frames. ], batch size: 5, lr: 4.98e-02, grad_scale: 4.0
+2026-01-13 10:08:27,490 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=623.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 10:08:28,973 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=629.0, num_to_drop=2, layers_to_drop={2, 3}
+2026-01-13 10:08:31,749 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=54.76 vs. limit=5.0
+2026-01-13 10:08:33,615 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=648.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:08:34,393 INFO [train.py:895] Epoch 1, batch 650, loss[loss=1.003, simple_loss=0.7781, pruned_loss=0.8958, over 1369.00 frames. ], tot_loss[loss=0.9839, simple_loss=0.7912, pruned_loss=0.9193, over 251720.18 frames. ], batch size: 4, lr: 4.98e-02, grad_scale: 4.0
+2026-01-13 10:08:34,462 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=651.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 10:08:34,657 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=652.0, num_to_drop=2, layers_to_drop={0, 3}
+2026-01-13 10:08:46,615 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.990e+01 2.372e+01 2.559e+01 2.978e+01 1.187e+02, threshold=5.118e+01, percent-clipped=4.0
+2026-01-13 10:08:46,652 INFO [train.py:895] Epoch 1, batch 700, loss[loss=0.8626, simple_loss=0.6567, pruned_loss=0.7726, over 1445.00 frames. ], tot_loss[loss=0.981, simple_loss=0.7808, pruned_loss=0.9074, over 255009.77 frames. ], batch size: 4, lr: 4.98e-02, grad_scale: 4.0
+2026-01-13 10:08:55,332 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=14.27 vs. limit=2.0
+2026-01-13 10:08:56,178 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=739.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 10:08:57,158 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=743.0, num_to_drop=2, layers_to_drop={1, 3}
+2026-01-13 10:08:59,055 INFO [train.py:895] Epoch 1, batch 750, loss[loss=0.8587, simple_loss=0.651, pruned_loss=0.7534, over 1357.00 frames. ], tot_loss[loss=0.979, simple_loss=0.7716, pruned_loss=0.8964, over 256216.05 frames. ], batch size: 4, lr: 4.97e-02, grad_scale: 4.0
+2026-01-13 10:09:00,067 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=5.90 vs. limit=2.0
+2026-01-13 10:09:01,510 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=37.01 vs. limit=5.0
+2026-01-13 10:09:02,956 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=6.22 vs. limit=2.0
+2026-01-13 10:09:05,870 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=6.01 vs. limit=2.0
+2026-01-13 10:09:08,071 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=787.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:09:10,418 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=14.11 vs. limit=2.0
+2026-01-13 10:09:11,952 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.998e+01 2.404e+01 2.668e+01 3.048e+01 5.163e+01, threshold=5.337e+01, percent-clipped=1.0
+2026-01-13 10:09:11,989 INFO [train.py:895] Epoch 1, batch 800, loss[loss=0.988, simple_loss=0.7377, pruned_loss=0.8645, over 1133.00 frames. ], tot_loss[loss=0.9872, simple_loss=0.7703, pruned_loss=0.8936, over 257800.44 frames. ], batch size: 3, lr: 4.97e-02, grad_scale: 8.0
+2026-01-13 10:09:20,261 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.7030, 3.7067, 3.7088, 3.6986, 3.7102, 3.7126, 3.6756, 3.7025],
+       device='cuda:0'), covar=tensor([0.0004, 0.0008, 0.0014, 0.0007, 0.0011, 0.0009, 0.0015, 0.0008],
+       device='cuda:0'), in_proj_covar=tensor([0.0010, 0.0010, 0.0011, 0.0011, 0.0011, 0.0010, 0.0010, 0.0010],
+       device='cuda:0'), out_proj_covar=tensor([9.3656e-06, 9.3906e-06, 9.9855e-06, 9.8552e-06, 9.9021e-06, 9.4255e-06,
+        9.0756e-06, 1.0160e-05], device='cuda:0')
+2026-01-13 10:09:20,332 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=3.08 vs. limit=2.0
+2026-01-13 10:09:20,883 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=6.93 vs. limit=2.0
+2026-01-13 10:09:23,601 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=847.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:09:24,437 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=152.18 vs. limit=5.0
+2026-01-13 10:09:24,546 INFO [train.py:895] Epoch 1, batch 850, loss[loss=0.9253, simple_loss=0.6843, pruned_loss=0.8004, over 1231.00 frames. ], tot_loss[loss=0.9913, simple_loss=0.766, pruned_loss=0.8882, over 257809.83 frames. ], batch size: 4, lr: 4.96e-02, grad_scale: 8.0
+2026-01-13 10:09:28,453 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=14.29 vs. limit=2.0
+2026-01-13 10:09:33,205 INFO [zipformer.py:2441] attn_weights_entropy = tensor([5.2196, 5.2179, 5.2174, 5.2236, 5.2138, 5.2153, 5.2245, 5.2213],
+       device='cuda:0'), covar=tensor([0.0019, 0.0011, 0.0018, 0.0010, 0.0014, 0.0007, 0.0011, 0.0009],
+       device='cuda:0'), in_proj_covar=tensor([0.0013, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0011, 0.0012],
+       device='cuda:0'), out_proj_covar=tensor([1.2493e-05, 1.1813e-05, 1.1621e-05, 1.1113e-05, 1.2048e-05, 1.1489e-05,
+        1.1255e-05, 1.1284e-05], device='cuda:0')
+2026-01-13 10:09:35,589 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=18.32 vs. limit=2.0
+2026-01-13 10:09:37,114 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.037e+01 2.613e+01 2.883e+01 3.426e+01 9.998e+01, threshold=5.766e+01, percent-clipped=7.0
+2026-01-13 10:09:37,151 INFO [train.py:895] Epoch 1, batch 900, loss[loss=0.9903, simple_loss=0.7299, pruned_loss=0.8404, over 1141.00 frames. ], tot_loss[loss=0.9974, simple_loss=0.7632, pruned_loss=0.884, over 258728.63 frames. ], batch size: 3, lr: 4.96e-02, grad_scale: 8.0
+2026-01-13 10:09:38,864 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=908.0, num_to_drop=2, layers_to_drop={0, 2}
+2026-01-13 10:09:40,513 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=19.47 vs. limit=2.0
+2026-01-13 10:09:41,382 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=918.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 10:09:41,954 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=10.46 vs. limit=2.0
+2026-01-13 10:09:42,796 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.3084, 3.3084, 3.3085, 3.3084, 3.3084, 3.3084, 3.3085, 3.3084],
+       device='cuda:0'), covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002],
+       device='cuda:0'), in_proj_covar=tensor([0.0013, 0.0012, 0.0012, 0.0013, 0.0012, 0.0012, 0.0013, 0.0012],
+       device='cuda:0'), out_proj_covar=tensor([1.1693e-05, 1.1105e-05, 1.2054e-05, 1.1608e-05, 1.1282e-05, 1.1547e-05,
+        1.1444e-05, 1.1760e-05], device='cuda:0')
+2026-01-13 10:09:42,801 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=924.0, num_to_drop=2, layers_to_drop={0, 2}
+2026-01-13 10:09:44,841 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=3.57 vs. limit=2.0
+2026-01-13 10:09:48,505 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=946.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 10:09:49,822 INFO [train.py:895] Epoch 1, batch 950, loss[loss=1.058, simple_loss=0.7608, pruned_loss=0.9063, over 1495.00 frames. ], tot_loss[loss=1.006, simple_loss=0.762, pruned_loss=0.8827, over 259119.81 frames. ], batch size: 4, lr: 4.96e-02, grad_scale: 8.0
+2026-01-13 10:09:50,149 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=952.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 10:09:56,565 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=17.46 vs. limit=2.0
+2026-01-13 10:09:59,519 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.4382, 3.4383, 3.4387, 3.4382, 3.4379, 3.4394, 3.4361, 3.4381],
+       device='cuda:0'), covar=tensor([0.0008, 0.0007, 0.0016, 0.0012, 0.0010, 0.0011, 0.0012, 0.0007],
+       device='cuda:0'), in_proj_covar=tensor([0.0013, 0.0013, 0.0014, 0.0014, 0.0014, 0.0012, 0.0014, 0.0014],
+       device='cuda:0'), out_proj_covar=tensor([1.2693e-05, 1.2520e-05, 1.2723e-05, 1.2939e-05, 1.3418e-05, 1.2239e-05,
+        1.2641e-05, 1.3351e-05], device='cuda:0')
+2026-01-13 10:10:02,269 INFO [checkpoint.py:74] Saving checkpoint to /kaggle/working/amharic_training/exp_amharic_streaming/checkpoint-1000.pt
+2026-01-13 10:10:04,913 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1000.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:10:05,051 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=5.07 vs. limit=2.0
+2026-01-13 10:10:05,146 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.175e+01 2.790e+01 3.110e+01 3.778e+01 1.444e+02, threshold=6.220e+01, percent-clipped=6.0
+2026-01-13 10:10:05,183 INFO [train.py:895] Epoch 1, batch 1000, loss[loss=1.172, simple_loss=0.839, pruned_loss=0.9861, over 1426.00 frames. ], tot_loss[loss=1.019, simple_loss=0.7639, pruned_loss=0.8855, over 259760.95 frames. ], batch size: 4, lr: 4.95e-02, grad_scale: 8.0
+2026-01-13 10:10:16,397 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1043.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 10:10:18,440 INFO [train.py:895] Epoch 1, batch 1050, loss[loss=0.9753, simple_loss=0.6946, pruned_loss=0.8085, over 1292.00 frames. ], tot_loss[loss=1.025, simple_loss=0.761, pruned_loss=0.8813, over 260310.75 frames. ], batch size: 4, lr: 4.95e-02, grad_scale: 8.0
+2026-01-13 10:10:27,813 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=9.53 vs. limit=2.0
+2026-01-13 10:10:28,996 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1091.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:10:31,602 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.215e+01 2.826e+01 3.196e+01 3.616e+01 1.158e+02, threshold=6.392e+01, percent-clipped=1.0
+2026-01-13 10:10:31,638 INFO [train.py:895] Epoch 1, batch 1100, loss[loss=1.154, simple_loss=0.8107, pruned_loss=0.9521, over 1405.00 frames. ], tot_loss[loss=1.031, simple_loss=0.7581, pruned_loss=0.8778, over 260956.50 frames. ], batch size: 4, lr: 4.94e-02, grad_scale: 8.0
+2026-01-13 10:10:44,895 INFO [train.py:895] Epoch 1, batch 1150, loss[loss=0.9918, simple_loss=0.6809, pruned_loss=0.8201, over 1409.00 frames. ], tot_loss[loss=1.044, simple_loss=0.7594, pruned_loss=0.8793, over 262138.72 frames. ], batch size: 4, lr: 4.94e-02, grad_scale: 8.0
+2026-01-13 10:10:45,489 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1153.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:10:47,685 WARNING [optim.py:385] Scaling gradients by 0.024549107998609543, model_norm_threshold=63.92420196533203
+2026-01-13 10:10:47,769 INFO [optim.py:446] Parameter Dominanting tot_sumsq encoder.encoder_embed.conv.0.weight with proportion 1.00, where dominant_sumsq=(grad_sumsq*orig_rms_sq)=6.756e+06, grad_sumsq = 1.826e+08, orig_rms_sq=3.700e-02
+2026-01-13 10:10:50,904 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.42 vs. limit=2.0
+2026-01-13 10:10:57,865 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.460e+01 3.058e+01 3.453e+01 3.971e+01 2.604e+03, threshold=6.905e+01, percent-clipped=3.0
+2026-01-13 10:10:57,902 INFO [train.py:895] Epoch 1, batch 1200, loss[loss=1.216, simple_loss=0.855, pruned_loss=0.9645, over 1295.00 frames. ], tot_loss[loss=1.055, simple_loss=0.7604, pruned_loss=0.8798, over 262412.46 frames. ], batch size: 4, lr: 4.93e-02, grad_scale: 8.0
+2026-01-13 10:10:58,498 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=1203.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 10:11:01,444 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1214.0, num_to_drop=2, layers_to_drop={0, 2}
+2026-01-13 10:11:02,434 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1218.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 10:11:03,045 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=5.32 vs. limit=2.0
+2026-01-13 10:11:04,028 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1224.0, num_to_drop=1, layers_to_drop={2}
+2026-01-13 10:11:07,160 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=47.79 vs. limit=5.0
+2026-01-13 10:11:09,732 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1246.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:11:11,115 INFO [train.py:895] Epoch 1, batch 1250, loss[loss=1.079, simple_loss=0.7345, pruned_loss=0.8655, over 1447.00 frames. ], tot_loss[loss=1.061, simple_loss=0.757, pruned_loss=0.8765, over 264262.36 frames. ], batch size: 5, lr: 4.92e-02, grad_scale: 8.0
+2026-01-13 10:11:14,998 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1266.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:11:16,448 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1272.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:11:22,784 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1294.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:11:24,586 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.560e+01 3.171e+01 3.537e+01 4.222e+01 1.088e+02, threshold=7.073e+01, percent-clipped=7.0
+2026-01-13 10:11:24,623 INFO [train.py:895] Epoch 1, batch 1300, loss[loss=1.353, simple_loss=0.9442, pruned_loss=1.044, over 1206.00 frames. ], tot_loss[loss=1.066, simple_loss=0.7542, pruned_loss=0.8725, over 264169.34 frames. ], batch size: 6, lr: 4.92e-02, grad_scale: 8.0
+2026-01-13 10:11:28,934 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=5.23 vs. limit=2.0
+2026-01-13 10:11:38,249 INFO [train.py:895] Epoch 1, batch 1350, loss[loss=0.8738, simple_loss=0.5912, pruned_loss=0.6812, over 1366.00 frames. ], tot_loss[loss=1.065, simple_loss=0.7453, pruned_loss=0.8648, over 263085.89 frames. ], batch size: 4, lr: 4.91e-02, grad_scale: 8.0
+2026-01-13 10:11:45,024 INFO [zipformer.py:2441] attn_weights_entropy = tensor([5.7861, 5.7600, 5.7878, 5.7891, 5.7587, 5.7874, 5.7876, 5.7847],
+       device='cuda:0'), covar=tensor([0.0010, 0.0011, 0.0007, 0.0010, 0.0011, 0.0013, 0.0009, 0.0014],
+       device='cuda:0'), in_proj_covar=tensor([0.0020, 0.0018, 0.0018, 0.0020, 0.0021, 0.0020, 0.0020, 0.0020],
+       device='cuda:0'), out_proj_covar=tensor([1.7714e-05, 1.6661e-05, 1.6202e-05, 1.8405e-05, 1.9219e-05, 1.7417e-05,
+        1.8047e-05, 1.8374e-05], device='cuda:0')
+2026-01-13 10:11:46,209 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.9230, 2.9206, 2.9227, 2.9206, 2.9232, 2.9232, 2.9212, 2.9229],
+       device='cuda:0'), covar=tensor([0.0008, 0.0010, 0.0009, 0.0008, 0.0006, 0.0005, 0.0004, 0.0006],
+       device='cuda:0'), in_proj_covar=tensor([0.0014, 0.0015, 0.0015, 0.0016, 0.0015, 0.0014, 0.0015, 0.0015],
+       device='cuda:0'), out_proj_covar=tensor([1.3425e-05, 1.3684e-05, 1.3399e-05, 1.4391e-05, 1.4306e-05, 1.3227e-05,
+        1.3075e-05, 1.4138e-05], device='cuda:0')
+2026-01-13 10:11:52,512 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.402e+01 3.276e+01 3.721e+01 4.511e+01 1.995e+02, threshold=7.442e+01, percent-clipped=4.0
+2026-01-13 10:11:52,548 INFO [train.py:895] Epoch 1, batch 1400, loss[loss=1.087, simple_loss=0.7172, pruned_loss=0.8499, over 1217.00 frames. ], tot_loss[loss=1.078, simple_loss=0.7472, pruned_loss=0.8665, over 262784.45 frames. ], batch size: 3, lr: 4.91e-02, grad_scale: 8.0
+2026-01-13 10:12:02,134 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1434.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:12:07,353 INFO [train.py:895] Epoch 1, batch 1450, loss[loss=1.166, simple_loss=0.7715, pruned_loss=0.896, over 1387.00 frames. ], tot_loss[loss=1.093, simple_loss=0.7508, pruned_loss=0.8696, over 262172.00 frames. ], batch size: 5, lr: 4.90e-02, grad_scale: 8.0
+2026-01-13 10:12:15,283 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=34.18 vs. limit=5.0
+2026-01-13 10:12:16,903 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=11.00 vs. limit=2.0
+2026-01-13 10:12:18,861 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=7.50 vs. limit=2.0
+2026-01-13 10:12:20,301 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1495.0, num_to_drop=2, layers_to_drop={1, 3}
+2026-01-13 10:12:21,964 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=22.89 vs. limit=2.0
+2026-01-13 10:12:21,993 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 2.785e+01 3.460e+01 3.813e+01 4.554e+01 1.272e+02, threshold=7.626e+01, percent-clipped=7.0
+2026-01-13 10:12:22,032 INFO [train.py:895] Epoch 1, batch 1500, loss[loss=1.219, simple_loss=0.7655, pruned_loss=0.9557, over 1271.00 frames. ], tot_loss[loss=1.103, simple_loss=0.7513, pruned_loss=0.8699, over 261697.61 frames. ], batch size: 4, lr: 4.89e-02, grad_scale: 8.0
+2026-01-13 10:12:22,677 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1503.0, num_to_drop=1, layers_to_drop={2}
+2026-01-13 10:12:24,404 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=1509.0, num_to_drop=2, layers_to_drop={0, 3}
+2026-01-13 10:12:28,035 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.5991, 3.5964, 3.5961, 3.5017, 3.4602, 3.5972, 3.6072, 3.1779],
+       device='cuda:0'), covar=tensor([0.0047, 0.0160, 0.0137, 0.0124, 0.0109, 0.0073, 0.0123, 0.0664],
+       device='cuda:0'), in_proj_covar=tensor([0.0023, 0.0024, 0.0024, 0.0022, 0.0023, 0.0028, 0.0024, 0.0021],
+       device='cuda:0'), out_proj_covar=tensor([2.0695e-05, 2.1413e-05, 2.0099e-05, 2.0677e-05, 2.0075e-05, 2.0114e-05,
+        2.1548e-05, 2.1711e-05], device='cuda:0')
+2026-01-13 10:12:30,505 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=5.28 vs. limit=2.0
+2026-01-13 10:12:36,253 INFO [train.py:895] Epoch 1, batch 1550, loss[loss=1.134, simple_loss=0.7439, pruned_loss=0.8507, over 1192.00 frames. ], tot_loss[loss=1.113, simple_loss=0.7519, pruned_loss=0.8688, over 262336.01 frames. ], batch size: 4, lr: 4.89e-02, grad_scale: 8.0
+2026-01-13 10:12:36,282 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1551.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:12:40,545 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1566.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:12:46,340 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=8.38 vs. limit=2.0
+2026-01-13 10:12:46,355 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=19.82 vs. limit=2.0
+2026-01-13 10:12:47,396 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=3.83 vs. limit=2.0
+2026-01-13 10:12:50,554 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 3.131e+01 4.182e+01 5.089e+01 6.042e+01 2.314e+02, threshold=1.018e+02, percent-clipped=12.0
+2026-01-13 10:12:50,590 INFO [train.py:895] Epoch 1, batch 1600, loss[loss=1.05, simple_loss=0.6882, pruned_loss=0.7769, over 1293.00 frames. ], tot_loss[loss=1.124, simple_loss=0.753, pruned_loss=0.8684, over 262369.94 frames. ], batch size: 4, lr: 4.88e-02, grad_scale: 8.0
+2026-01-13 10:12:50,591 INFO [train.py:920] Computing validation loss
+2026-01-13 10:12:52,620 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.7601, 2.7673, 2.7676, 2.7663, 2.7669, 2.7657, 2.7626, 2.7567],
+       device='cuda:0'), covar=tensor([0.0013, 0.0010, 0.0014, 0.0022, 0.0020, 0.0017, 0.0018, 0.0017],
+       device='cuda:0'), in_proj_covar=tensor([0.0017, 0.0017, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0017],
+       device='cuda:0'), out_proj_covar=tensor([1.4038e-05, 1.4443e-05, 1.4773e-05, 1.5643e-05, 1.4625e-05, 1.4863e-05,
+        1.4202e-05, 1.4594e-05], device='cuda:0')
+2026-01-13 10:12:56,000 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.9211, 2.9178, 2.9206, 2.9227, 2.9227, 2.9215, 2.9212, 2.9198],
+       device='cuda:0'), covar=tensor([0.0011, 0.0014, 0.0009, 0.0012, 0.0015, 0.0011, 0.0013, 0.0014],
+       device='cuda:0'), in_proj_covar=tensor([0.0021, 0.0022, 0.0020, 0.0018, 0.0023, 0.0020, 0.0021, 0.0020],
+       device='cuda:0'), out_proj_covar=tensor([2.1139e-05, 2.0467e-05, 1.9723e-05, 1.8041e-05, 2.2014e-05, 1.9838e-05,
+        1.8900e-05, 2.0187e-05], device='cuda:0')
+2026-01-13 10:13:05,874 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.5916, 3.5896, 3.5916, 3.5888, 3.5917, 3.5908, 3.5912, 3.5853],
+       device='cuda:0'), covar=tensor([0.0026, 0.0022, 0.0025, 0.0039, 0.0024, 0.0026, 0.0025, 0.0038],
+       device='cuda:0'), in_proj_covar=tensor([0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018],
+       device='cuda:0'), out_proj_covar=tensor([1.8250e-05, 1.7821e-05, 1.7170e-05, 1.9563e-05, 1.7093e-05, 1.8057e-05,
+        1.8479e-05, 1.7928e-05], device='cuda:0')
+2026-01-13 10:13:17,278 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.5668, 3.5779, 3.5552, 3.5855, 3.5516, 3.4835, 3.5827, 3.5708],
+       device='cuda:0'), covar=tensor([0.0045, 0.0022, 0.0018, 0.0021, 0.0021, 0.0184, 0.0019, 0.0012],
+       device='cuda:0'), in_proj_covar=tensor([0.0025, 0.0028, 0.0025, 0.0025, 0.0025, 0.0026, 0.0024, 0.0022],
+       device='cuda:0'), out_proj_covar=tensor([2.5765e-05, 2.9468e-05, 2.5595e-05, 2.5864e-05, 2.4105e-05, 2.5646e-05,
+        2.3893e-05, 2.0834e-05], device='cuda:0')
+2026-01-13 10:13:25,256 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.8853, 2.8993, 2.8902, 2.8983, 2.9001, 2.8961, 2.8912, 2.8864],
+       device='cuda:0'), covar=tensor([0.0010, 0.0008, 0.0016, 0.0019, 0.0015, 0.0011, 0.0011, 0.0008],
+       device='cuda:0'), in_proj_covar=tensor([0.0017, 0.0017, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0017],
+       device='cuda:0'), out_proj_covar=tensor([1.4038e-05, 1.4443e-05, 1.4773e-05, 1.5643e-05, 1.4625e-05, 1.4863e-05,
+        1.4202e-05, 1.4594e-05], device='cuda:0')
+2026-01-13 10:14:19,366 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.9151, 2.9242, 2.9216, 2.9215, 2.9229, 2.9208, 2.9187, 2.9145],
+       device='cuda:0'), covar=tensor([0.0006, 0.0006, 0.0010, 0.0012, 0.0012, 0.0009, 0.0008, 0.0009],
+       device='cuda:0'), in_proj_covar=tensor([0.0017, 0.0017, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0017],
+       device='cuda:0'), out_proj_covar=tensor([1.4038e-05, 1.4443e-05, 1.4773e-05, 1.5643e-05, 1.4625e-05, 1.4863e-05,
+        1.4202e-05, 1.4594e-05], device='cuda:0')
+2026-01-13 10:14:26,041 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.4728, 3.5375, 3.3813, 3.4304, 3.5188, 3.5247, 3.5280, 3.4937],
+       device='cuda:0'), covar=tensor([0.0041, 0.0029, 0.0046, 0.0027, 0.0038, 0.0020, 0.0023, 0.0022],
+       device='cuda:0'), in_proj_covar=tensor([0.0019, 0.0019, 0.0018, 0.0020, 0.0019, 0.0018, 0.0018, 0.0019],
+       device='cuda:0'), out_proj_covar=tensor([1.7211e-05, 1.6775e-05, 1.5699e-05, 1.8109e-05, 1.7291e-05, 1.6719e-05,
+        1.6329e-05, 1.6459e-05], device='cuda:0')
+2026-01-13 10:14:29,292 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.3260, 3.3318, 3.3251, 3.3337, 3.3211, 3.3002, 3.3339, 3.3302],
+       device='cuda:0'), covar=tensor([0.0036, 0.0034, 0.0053, 0.0030, 0.0053, 0.0042, 0.0035, 0.0032],
+       device='cuda:0'), in_proj_covar=tensor([0.0025, 0.0028, 0.0025, 0.0025, 0.0025, 0.0026, 0.0024, 0.0022],
+       device='cuda:0'), out_proj_covar=tensor([2.5765e-05, 2.9468e-05, 2.5595e-05, 2.5864e-05, 2.4105e-05, 2.5646e-05,
+        2.3893e-05, 2.0834e-05], device='cuda:0')
+2026-01-13 10:14:45,534 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.5849, 3.5740, 3.5863, 3.5850, 3.5689, 3.5844, 3.5837, 3.5854],
+       device='cuda:0'), covar=tensor([0.0090, 0.0045, 0.0052, 0.0055, 0.0134, 0.0064, 0.0063, 0.0118],
+       device='cuda:0'), in_proj_covar=tensor([0.0022, 0.0019, 0.0019, 0.0020, 0.0025, 0.0022, 0.0020, 0.0024],
+       device='cuda:0'), out_proj_covar=tensor([1.7656e-05, 1.7133e-05, 1.5791e-05, 1.6920e-05, 2.1786e-05, 1.7446e-05,
+        1.6499e-05, 2.0183e-05], device='cuda:0')
+2026-01-13 10:14:57,723 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.9083, 2.9198, 2.9198, 2.9174, 2.9190, 2.9155, 2.9153, 2.9044],
+       device='cuda:0'), covar=tensor([0.0005, 0.0006, 0.0009, 0.0009, 0.0007, 0.0006, 0.0007, 0.0005],
+       device='cuda:0'), in_proj_covar=tensor([0.0017, 0.0017, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0017],
+       device='cuda:0'), out_proj_covar=tensor([1.4038e-05, 1.4443e-05, 1.4773e-05, 1.5643e-05, 1.4625e-05, 1.4863e-05,
+        1.4202e-05, 1.4594e-05], device='cuda:0')
+2026-01-13 10:15:18,711 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.5838, 3.5843, 3.5834, 3.5824, 3.5847, 3.5773, 3.5810, 3.5842],
+       device='cuda:0'), covar=tensor([0.0009, 0.0010, 0.0013, 0.0016, 0.0011, 0.0010, 0.0011, 0.0011],
+       device='cuda:0'), in_proj_covar=tensor([0.0014, 0.0014, 0.0014, 0.0016, 0.0014, 0.0014, 0.0014, 0.0015],
+       device='cuda:0'), out_proj_covar=tensor([1.5035e-05, 1.3810e-05, 1.5331e-05, 1.5576e-05, 1.4998e-05, 1.5178e-05,
+        1.4159e-05, 1.5214e-05], device='cuda:0')
+2026-01-13 10:15:24,772 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.2063, 2.2184, 2.2103, 2.1938, 2.2143, 2.2154, 2.2090, 2.2121],
+       device='cuda:0'), covar=tensor([0.0004, 0.0006, 0.0005, 0.0006, 0.0010, 0.0004, 0.0007, 0.0005],
+       device='cuda:0'), in_proj_covar=tensor([0.0014, 0.0014, 0.0013, 0.0013, 0.0014, 0.0013, 0.0013, 0.0014],
+       device='cuda:0'), out_proj_covar=tensor([1.4020e-05, 1.3997e-05, 1.3386e-05, 1.3399e-05, 1.3895e-05, 1.4047e-05,
+        1.3130e-05, 1.4403e-05], device='cuda:0')
+2026-01-13 10:15:36,636 INFO [train.py:929] Epoch 1, validation: loss=1.813, simple_loss=1.191, pruned_loss=1.34, over 1639044.00 frames.
+2026-01-13 10:15:36,637 INFO [train.py:930] Maximum memory allocated so far is 2169MB
+2026-01-13 10:15:39,825 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1611.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:15:44,705 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1627.0, num_to_drop=2, layers_to_drop={1, 3}
+2026-01-13 10:15:47,705 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=73.72 vs. limit=5.0
+2026-01-13 10:15:51,578 INFO [train.py:895] Epoch 1, batch 1650, loss[loss=1.208, simple_loss=0.7772, pruned_loss=0.8918, over 1227.00 frames. ], tot_loss[loss=1.143, simple_loss=0.7596, pruned_loss=0.8746, over 261972.70 frames. ], batch size: 4, lr: 4.87e-02, grad_scale: 8.0
+2026-01-13 10:15:52,373 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=6.62 vs. limit=2.0
+2026-01-13 10:15:53,676 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=5.45 vs. limit=2.0
+2026-01-13 10:15:57,985 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1672.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 10:15:59,277 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.63 vs. limit=2.0
+2026-01-13 10:16:01,378 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=18.42 vs. limit=2.0
+2026-01-13 10:16:01,875 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.9962, 3.9966, 3.9956, 3.9943, 3.9949, 3.9927, 3.9961, 3.9961],
+       device='cuda:0'), covar=tensor([0.0004, 0.0004, 0.0003, 0.0003, 0.0002, 0.0004, 0.0002, 0.0003],
+       device='cuda:0'), in_proj_covar=tensor([0.0015, 0.0016, 0.0015, 0.0015, 0.0015, 0.0016, 0.0015, 0.0015],
+       device='cuda:0'), out_proj_covar=tensor([1.5789e-05, 1.6422e-05, 1.5300e-05, 1.6000e-05, 1.5459e-05, 1.5918e-05,
+        1.6051e-05, 1.5201e-05], device='cuda:0')
+2026-01-13 10:16:05,390 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=16.65 vs. limit=2.0
+2026-01-13 10:16:06,377 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 3.619e+01 5.029e+01 5.993e+01 7.363e+01 1.979e+02, threshold=1.199e+02, percent-clipped=9.0
+2026-01-13 10:16:06,413 INFO [train.py:895] Epoch 1, batch 1700, loss[loss=1.117, simple_loss=0.7132, pruned_loss=0.8166, over 1225.00 frames. ], tot_loss[loss=1.147, simple_loss=0.7559, pruned_loss=0.8697, over 262329.65 frames. ], batch size: 3, lr: 4.86e-02, grad_scale: 8.0
+2026-01-13 10:16:21,303 INFO [train.py:895] Epoch 1, batch 1750, loss[loss=1.123, simple_loss=0.7239, pruned_loss=0.8064, over 1279.00 frames. ], tot_loss[loss=1.147, simple_loss=0.7507, pruned_loss=0.8618, over 262923.08 frames. ], batch size: 3, lr: 4.86e-02, grad_scale: 8.0
+2026-01-13 10:16:25,937 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=18.26 vs. limit=2.0
+2026-01-13 10:16:26,273 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=5.75 vs. limit=2.0
+2026-01-13 10:16:32,929 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=1790.0, num_to_drop=1, layers_to_drop={2}
+2026-01-13 10:16:35,651 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=74.01 vs. limit=5.0
+2026-01-13 10:16:36,330 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 4.256e+01 6.493e+01 7.386e+01 8.806e+01 3.288e+02, threshold=1.477e+02, percent-clipped=6.0
+2026-01-13 10:16:36,367 INFO [train.py:895] Epoch 1, batch 1800, loss[loss=1.172, simple_loss=0.7241, pruned_loss=0.8499, over 1332.00 frames. ], tot_loss[loss=1.155, simple_loss=0.7508, pruned_loss=0.86, over 262589.97 frames. ], batch size: 4, lr: 4.85e-02, grad_scale: 8.0
+2026-01-13 10:16:37,506 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=41.80 vs. limit=5.0
+2026-01-13 10:16:38,813 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=1809.0, num_to_drop=1, layers_to_drop={2}
+2026-01-13 10:16:43,310 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=307.58 vs. limit=5.0
+2026-01-13 10:16:47,375 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1838.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:16:47,518 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.28 vs. limit=2.0
+2026-01-13 10:16:51,330 INFO [train.py:895] Epoch 1, batch 1850, loss[loss=1.099, simple_loss=0.6944, pruned_loss=0.7776, over 1272.00 frames. ], tot_loss[loss=1.161, simple_loss=0.7484, pruned_loss=0.8559, over 262777.50 frames. ], batch size: 4, lr: 4.84e-02, grad_scale: 8.0
+2026-01-13 10:16:53,215 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=1857.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:16:53,280 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1857.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:16:55,267 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=17.17 vs. limit=2.0
+2026-01-13 10:16:56,522 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.7388, 3.7466, 3.7227, 3.7289, 3.7463, 3.7390, 3.7298, 3.7440],
+       device='cuda:0'), covar=tensor([0.0068, 0.0126, 0.0118, 0.0107, 0.0096, 0.0155, 0.0120, 0.0068],
+       device='cuda:0'), in_proj_covar=tensor([0.0022, 0.0024, 0.0023, 0.0022, 0.0025, 0.0024, 0.0025, 0.0022],
+       device='cuda:0'), out_proj_covar=tensor([2.2779e-05, 2.4041e-05, 2.3752e-05, 2.1944e-05, 2.4110e-05, 2.3980e-05,
+        2.2859e-05, 2.2726e-05], device='cuda:0')
+2026-01-13 10:16:57,440 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=1870.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:17:00,689 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=5.06 vs. limit=2.0
+2026-01-13 10:17:00,949 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=16.60 vs. limit=2.0
+2026-01-13 10:17:01,278 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=80.56 vs. limit=5.0
+2026-01-13 10:17:04,579 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=5.17 vs. limit=2.0
+2026-01-13 10:17:06,255 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1899.0, num_to_drop=2, layers_to_drop={0, 2}
+2026-01-13 10:17:06,773 INFO [train.py:895] Epoch 1, batch 1900, loss[loss=1.195, simple_loss=0.7055, pruned_loss=0.8634, over 1217.00 frames. ], tot_loss[loss=1.169, simple_loss=0.748, pruned_loss=0.8541, over 262195.30 frames. ], batch size: 4, lr: 4.83e-02, grad_scale: 4.0
+2026-01-13 10:17:07,053 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 6.581e+01 8.305e+01 9.547e+01 1.269e+02 6.810e+02, threshold=1.909e+02, percent-clipped=17.0
+2026-01-13 10:17:08,471 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=48.31 vs. limit=5.0
+2026-01-13 10:17:11,996 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1918.0, num_to_drop=2, layers_to_drop={2, 3}
+2026-01-13 10:17:13,107 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=1922.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:17:15,825 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=1931.0, num_to_drop=2, layers_to_drop={2, 3}
+2026-01-13 10:17:18,272 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=265.26 vs. limit=5.0
+2026-01-13 10:17:21,622 INFO [train.py:895] Epoch 1, batch 1950, loss[loss=1.232, simple_loss=0.7522, pruned_loss=0.8662, over 1404.00 frames. ], tot_loss[loss=1.184, simple_loss=0.7512, pruned_loss=0.8574, over 262249.73 frames. ], batch size: 4, lr: 4.83e-02, grad_scale: 4.0
+2026-01-13 10:17:21,780 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=18.13 vs. limit=2.0
+2026-01-13 10:17:26,433 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=1967.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:17:36,528 INFO [checkpoint.py:74] Saving checkpoint to /kaggle/working/amharic_training/exp_amharic_streaming/checkpoint-2000.pt
+2026-01-13 10:17:39,449 INFO [train.py:895] Epoch 1, batch 2000, loss[loss=1.069, simple_loss=0.671, pruned_loss=0.7337, over 1281.00 frames. ], tot_loss[loss=1.197, simple_loss=0.7547, pruned_loss=0.8591, over 262600.55 frames. ], batch size: 3, lr: 4.82e-02, grad_scale: 8.0
+2026-01-13 10:17:39,735 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 7.132e+01 9.948e+01 1.244e+02 1.656e+02 5.242e+02, threshold=2.489e+02, percent-clipped=13.0
+2026-01-13 10:17:40,586 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=58.63 vs. limit=5.0
+2026-01-13 10:17:47,342 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.51 vs. limit=2.0
+2026-01-13 10:17:55,171 INFO [train.py:895] Epoch 1, batch 2050, loss[loss=1.114, simple_loss=0.6795, pruned_loss=0.7745, over 1187.00 frames. ], tot_loss[loss=1.212, simple_loss=0.7581, pruned_loss=0.8637, over 262132.15 frames. ], batch size: 3, lr: 4.81e-02, grad_scale: 8.0
+2026-01-13 10:17:58,485 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=16.89 vs. limit=2.0
+2026-01-13 10:17:59,762 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=11.16 vs. limit=2.0
+2026-01-13 10:17:59,997 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=222.08 vs. limit=5.0
+2026-01-13 10:18:07,555 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=2090.0, num_to_drop=2, layers_to_drop={1, 2}
+2026-01-13 10:18:10,967 INFO [train.py:895] Epoch 1, batch 2100, loss[loss=1.21, simple_loss=0.7296, pruned_loss=0.8457, over 1371.00 frames. ], tot_loss[loss=1.211, simple_loss=0.7538, pruned_loss=0.8583, over 262004.15 frames. ], batch size: 4, lr: 4.80e-02, grad_scale: 8.0
+2026-01-13 10:18:11,286 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 7.850e+01 1.355e+02 1.839e+02 2.543e+02 5.511e+02, threshold=3.679e+02, percent-clipped=28.0
+2026-01-13 10:18:12,037 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.8808, 2.9389, 2.9541, 2.8155, 2.6688, 2.9340, 2.6561, 2.9043],
+       device='cuda:0'), covar=tensor([0.0067, 0.0050, 0.0051, 0.0036, 0.0031, 0.0050, 0.0066, 0.0028],
+       device='cuda:0'), in_proj_covar=tensor([0.0022, 0.0021, 0.0021, 0.0019, 0.0021, 0.0026, 0.0022, 0.0020],
+       device='cuda:0'), out_proj_covar=tensor([2.0010e-05, 2.1677e-05, 1.8984e-05, 1.7412e-05, 1.8111e-05, 2.3035e-05,
+        2.0033e-05, 1.9341e-05], device='cuda:0')
+2026-01-13 10:18:22,420 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=2138.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:18:26,574 INFO [train.py:895] Epoch 1, batch 2150, loss[loss=1.141, simple_loss=0.6621, pruned_loss=0.8097, over 1406.00 frames. ], tot_loss[loss=1.214, simple_loss=0.7518, pruned_loss=0.8563, over 262633.77 frames. ], batch size: 4, lr: 4.79e-02, grad_scale: 8.0
+2026-01-13 10:18:27,397 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=271.27 vs. limit=5.0
+2026-01-13 10:18:30,558 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=3.94 vs. limit=2.0
+2026-01-13 10:18:31,925 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=24.64 vs. limit=5.0
+2026-01-13 10:18:32,509 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=63.54 vs. limit=5.0
+2026-01-13 10:18:32,570 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=6.62 vs. limit=2.0
+2026-01-13 10:18:37,450 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=3.45 vs. limit=2.0
+2026-01-13 10:18:40,228 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=2194.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:18:42,596 INFO [train.py:895] Epoch 1, batch 2200, loss[loss=1.103, simple_loss=0.682, pruned_loss=0.7624, over 1272.00 frames. ], tot_loss[loss=1.217, simple_loss=0.7528, pruned_loss=0.8547, over 263067.45 frames. ], batch size: 4, lr: 4.78e-02, grad_scale: 8.0
+2026-01-13 10:18:42,860 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 9.630e+01 1.336e+02 1.799e+02 2.582e+02 5.343e+02, threshold=3.598e+02, percent-clipped=7.0
+2026-01-13 10:18:46,353 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=2213.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:18:49,121 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=2222.0, num_to_drop=2, layers_to_drop={0, 1}
+2026-01-13 10:18:50,116 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=7.82 vs. limit=2.0
+2026-01-13 10:18:50,291 INFO [zipformer.py:1188] warmup_begin=1333.3, warmup_end=2000.0, batch_count=2226.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:18:56,689 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=5.06 vs. limit=2.0
+2026-01-13 10:18:58,063 INFO [train.py:895] Epoch 1, batch 2250, loss[loss=1.162, simple_loss=0.7047, pruned_loss=0.81, over 1357.00 frames. ], tot_loss[loss=1.213, simple_loss=0.7504, pruned_loss=0.8493, over 261953.42 frames. ], batch size: 4, lr: 4.77e-02, grad_scale: 8.0
+2026-01-13 10:19:02,726 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=12.35 vs. limit=2.0
+2026-01-13 10:19:03,295 INFO [zipformer.py:1188] warmup_begin=2000.0, warmup_end=2666.7, batch_count=2267.0, num_to_drop=1, layers_to_drop={1}
+2026-01-13 10:19:04,191 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=2270.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:19:14,167 INFO [train.py:895] Epoch 1, batch 2300, loss[loss=1.257, simple_loss=0.7597, pruned_loss=0.8769, over 1162.00 frames. ], tot_loss[loss=1.214, simple_loss=0.7491, pruned_loss=0.848, over 260889.29 frames. ], batch size: 3, lr: 4.77e-02, grad_scale: 8.0
+2026-01-13 10:19:14,441 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 9.117e+01 1.366e+02 1.885e+02 2.522e+02 5.269e+02, threshold=3.770e+02, percent-clipped=4.0
+2026-01-13 10:19:18,742 INFO [zipformer.py:1188] warmup_begin=666.7, warmup_end=1333.3, batch_count=2315.0, num_to_drop=0, layers_to_drop=set()
+2026-01-13 10:19:26,691 INFO [zipformer.py:1188] warmup_begin=2666.7, warmup_end=3333.3, batch_count=2340.0, num_to_drop=1, layers_to_drop={0}
+2026-01-13 10:19:28,481 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.33 vs. limit=2.0
+2026-01-13 10:19:29,966 INFO [zipformer.py:2441] attn_weights_entropy = tensor([2.2617, 2.2608, 2.2616, 2.2597, 2.2623, 2.2622, 2.2620, 2.2615],
+       device='cuda:0'), covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005],
+       device='cuda:0'), in_proj_covar=tensor([0.0019, 0.0019, 0.0024, 0.0021, 0.0018, 0.0018, 0.0020, 0.0019],
+       device='cuda:0'), out_proj_covar=tensor([1.3575e-05, 1.3013e-05, 1.6175e-05, 1.4561e-05, 1.3200e-05, 1.3202e-05,
+        1.3315e-05, 1.3222e-05], device='cuda:0')
+2026-01-13 10:19:30,176 INFO [train.py:895] Epoch 1, batch 2350, loss[loss=1.235, simple_loss=0.7679, pruned_loss=0.851, over 1335.00 frames. ], tot_loss[loss=1.213, simple_loss=0.7481, pruned_loss=0.846, over 261631.10 frames. ], batch size: 4, lr: 4.76e-02, grad_scale: 8.0
+2026-01-13 10:19:34,610 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=6.10 vs. limit=2.0
+2026-01-13 10:19:38,913 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.6639, 3.9629, 5.1465, 3.5459, 3.6214, 5.6243, 6.0597, 5.7808],
+       device='cuda:0'), covar=tensor([0.0231, 0.0162, 0.0082, 0.0143, 0.0475, 0.0095, 0.0027, 0.0070],
+       device='cuda:0'), in_proj_covar=tensor([0.0027, 0.0034, 0.0026, 0.0033, 0.0027, 0.0025, 0.0025, 0.0029],
+       device='cuda:0'), out_proj_covar=tensor([1.8913e-05, 2.2221e-05, 1.7289e-05, 2.1213e-05, 2.2825e-05, 1.6488e-05,
+        1.9128e-05, 1.9237e-05], device='cuda:0')
+2026-01-13 10:19:42,073 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=6.17 vs. limit=2.0
+2026-01-13 10:19:42,786 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=12.20 vs. limit=2.0
+2026-01-13 10:19:46,899 INFO [train.py:895] Epoch 1, batch 2400, loss[loss=1.297, simple_loss=0.8001, pruned_loss=0.8968, over 1441.00 frames. ], tot_loss[loss=1.217, simple_loss=0.7511, pruned_loss=0.8469, over 262401.53 frames. ], batch size: 4, lr: 4.75e-02, grad_scale: 8.0
+2026-01-13 10:19:47,050 INFO [zipformer.py:2441] attn_weights_entropy = tensor([4.2410, 4.2487, 4.2052, 4.1792, 4.2328, 4.1485, 4.2444, 4.2303],
+       device='cuda:0'), covar=tensor([0.0003, 0.0004, 0.0006, 0.0007, 0.0011, 0.0008, 0.0005, 0.0006],
+       device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0008, 0.0008, 0.0009, 0.0008, 0.0008, 0.0008],
+       device='cuda:0'), out_proj_covar=tensor([7.7324e-06, 8.2165e-06, 7.5260e-06, 7.5356e-06, 7.9658e-06, 8.3004e-06,
+        8.3068e-06, 6.7754e-06], device='cuda:0')
+2026-01-13 10:19:47,055 INFO [zipformer.py:1188] warmup_begin=3333.3, warmup_end=4000.0, batch_count=2401.0, num_to_drop=2, layers_to_drop={1, 3}
+2026-01-13 10:19:47,305 INFO [optim.py:365] Clipping_scale=2.0, grad-norm quartiles 1.082e+02 1.449e+02 1.811e+02 2.178e+02 9.591e+02, threshold=3.623e+02, percent-clipped=5.0
+2026-01-13 10:19:53,160 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=4.72 vs. limit=2.0
+2026-01-13 10:19:54,130 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=6.43 vs. limit=2.0
+2026-01-13 10:19:54,661 INFO [zipformer.py:2441] attn_weights_entropy = tensor([3.8877, 3.9654, 3.9519, 3.9890, 3.9118, 3.9912, 3.9828, 3.9785],
+       device='cuda:0'), covar=tensor([0.0012, 0.0009, 0.0007, 0.0006, 0.0010, 0.0007, 0.0007, 0.0007],
+       device='cuda:0'), in_proj_covar=tensor([0.0024, 0.0024, 0.0024, 0.0020, 0.0024, 0.0025, 0.0025, 0.0024],
+       device='cuda:0'), out_proj_covar=tensor([2.0002e-05, 1.9449e-05, 1.9479e-05, 1.7335e-05, 2.0748e-05, 2.0089e-05,
+        2.0606e-05, 2.0826e-05], device='cuda:0')
+2026-01-13 10:19:57,293 INFO [scaling.py:681] Whitening: num_groups=1, num_channels=384, metric=50.99 vs. limit=5.0
+2026-01-13 10:19:57,351 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=96, metric=9.24 vs. limit=2.0
+2026-01-13 10:19:58,347 INFO [scaling.py:681] Whitening: num_groups=8, num_channels=192, metric=15.89 vs. limit=2.0

tensorboard/events.out.tfevents.1768292246.8e64ffbd666a.2056.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:596ce76c7fb0bc16a6f1789c6071a66101442ef11548e5420a7bd1dad790f1b2
+size 88

tensorboard/events.out.tfevents.1768293012.8e64ffbd666a.2141.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:568fecfd3b38d99a8ee4a3b7c0e92d9cff9e406d347aad053c84b2ec9fb0a953
+size 88

tensorboard/events.out.tfevents.1768293150.8e64ffbd666a.2188.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9e1daa22e87ba971000a015ec211c305456b18d27d8bd62a0668ba77041c38e
+size 88

tensorboard/events.out.tfevents.1768293284.8e64ffbd666a.2227.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3feeb7f856d111f84ef8d3d3abf917da689f20ccc08e3ec7838626c48d53a70
+size 88

tensorboard/events.out.tfevents.1768293391.8e64ffbd666a.2263.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ed2836056f63c3074226d6465894590fa6f873d6f31a9e451fd6da8454f5c53
+size 135

tensorboard/events.out.tfevents.1768293626.8e64ffbd666a.2307.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe6b9e6d76c1050023063ead1a6339ef1c3aaa932c09b57762266a4c405a19ae
+size 88

tensorboard/events.out.tfevents.1768293892.8e64ffbd666a.2350.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f3e4fc9c11d150b418d3de3019f3732ee6a46dca71cde825e94af9592f1163d
+size 135

tensorboard/events.out.tfevents.1768295636.8e64ffbd666a.3204.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:df88e58da6e01797f3cc717827fbb1ec09c2066dfbe79dfa634cd646250eaa52
+size 135

tensorboard/events.out.tfevents.1768295850.8e64ffbd666a.3255.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d9696260fc5a1376c9d17421ae31aa00f3d28d4074d42641803d9c3da5141f1
+size 135

tensorboard/events.out.tfevents.1768296052.8e64ffbd666a.3309.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f0749099e5b586e24ef6f75021c99a0c195466b7f20928d603207bf50c32d065
+size 135

tensorboard/events.out.tfevents.1768296226.8e64ffbd666a.3355.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b84e54db18aa8bc78e9ea8911af3dbd4b42f9e04f5cdbc9b74d34242191afb0
+size 135

tensorboard/events.out.tfevents.1768296401.8e64ffbd666a.3410.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d59d4d7a7a2c94e4b834ff65d7a5b99590f5450c2db57c123309fbddd3095bb0
+size 135

tensorboard/events.out.tfevents.1768296618.8e64ffbd666a.3456.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:56adce167b08b758d1c116e9e3f8eee2a135a90353cfd94795e2797cdb35daa4
+size 135

tensorboard/events.out.tfevents.1768296977.8e64ffbd666a.3504.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c942a66e25e7447a03bf3794718775f84fb06fa56feea1245cc1846686d8f13a
+size 88

tensorboard/events.out.tfevents.1768297128.8e64ffbd666a.3540.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab0443deef5a623a03b25887f74f040ee12ad824222b4ecb090f868113301042
+size 24487

tensorboard/events.out.tfevents.1768298384.8e64ffbd666a.19994.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:49ae8e86347a90e116fd67c17a18ed5022d0de043273a6dc066830ec47ea7df1
+size 598

tensorboard/events.out.tfevents.1768298578.8e64ffbd666a.24203.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bf5b45bc742fa9c60e565d2884832c6541b1c06067cbd154442e6e637b8efd4c
+size 22132