Upload folder using huggingface_hub
Browse files- log/log-train-2026-01-13-16-52-16-0 +96 -0
- log/log-train-2026-01-13-16-52-16-1 +96 -0
- log/log-train-2026-01-13-16-53-36-0 +107 -0
- log/log-train-2026-01-13-16-53-36-1 +107 -0
- log/log-train-2026-01-13-16-54-14-0 +107 -0
- log/log-train-2026-01-13-16-54-14-1 +107 -0
- log/log-train-2026-01-13-17-00-38-0 +107 -0
- log/log-train-2026-01-13-17-00-38-1 +107 -0
- log/log-train-2026-01-13-17-01-14-0 +107 -0
- log/log-train-2026-01-13-17-01-14-1 +107 -0
- log/log-train-2026-01-13-17-06-37-0 +169 -0
- log/log-train-2026-01-13-17-06-37-1 +171 -0
- tensorboard/events.out.tfevents.1768323136.6ec37ec2ba95.217.0 +3 -0
- tensorboard/events.out.tfevents.1768323216.6ec37ec2ba95.324.0 +3 -0
- tensorboard/events.out.tfevents.1768323254.6ec37ec2ba95.501.0 +3 -0
- tensorboard/events.out.tfevents.1768323638.6ec37ec2ba95.678.0 +3 -0
- tensorboard/events.out.tfevents.1768323674.6ec37ec2ba95.851.0 +3 -0
- tensorboard/events.out.tfevents.1768323997.6ec37ec2ba95.1021.0 +3 -0
log/log-train-2026-01-13-16-52-16-0
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 16:52:16,624 INFO [train.py:967] (0/2) Training started
|
| 2 |
+
2026-01-13 16:52:16,625 INFO [train.py:977] (0/2) Device: cuda:0
|
| 3 |
+
2026-01-13 16:52:16,632 INFO [train.py:986] (0/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 16:52:16,633 INFO [train.py:988] (0/2) About to create model
|
| 93 |
+
2026-01-13 16:52:17,282 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 16:52:17,300 INFO [train.py:992] (0/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 16:52:18,071 INFO [train.py:1007] (0/2) Using DDP
|
| 96 |
+
2026-01-13 16:52:22,679 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
|
log/log-train-2026-01-13-16-52-16-1
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 16:52:16,766 INFO [train.py:967] (1/2) Training started
|
| 2 |
+
2026-01-13 16:52:16,766 INFO [train.py:977] (1/2) Device: cuda:1
|
| 3 |
+
2026-01-13 16:52:16,768 INFO [train.py:986] (1/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 16:52:16,769 INFO [train.py:988] (1/2) About to create model
|
| 93 |
+
2026-01-13 16:52:17,360 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 16:52:17,377 INFO [train.py:992] (1/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 16:52:17,486 INFO [train.py:1007] (1/2) Using DDP
|
| 96 |
+
2026-01-13 16:52:22,679 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
|
log/log-train-2026-01-13-16-53-36-0
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 16:53:36,680 INFO [train.py:967] (0/2) Training started
|
| 2 |
+
2026-01-13 16:53:36,681 INFO [train.py:977] (0/2) Device: cuda:0
|
| 3 |
+
2026-01-13 16:53:36,684 INFO [train.py:986] (0/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 16:53:36,684 INFO [train.py:988] (0/2) About to create model
|
| 93 |
+
2026-01-13 16:53:37,260 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 16:53:37,277 INFO [train.py:992] (0/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 16:53:38,042 INFO [train.py:1007] (0/2) Using DDP
|
| 96 |
+
2026-01-13 16:53:39,351 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
|
| 97 |
+
2026-01-13 16:53:39,352 INFO [asr_datamodule.py:239] (0/2) Disable MUSAN
|
| 98 |
+
2026-01-13 16:53:39,352 INFO [asr_datamodule.py:257] (0/2) Enable SpecAugment
|
| 99 |
+
2026-01-13 16:53:39,352 INFO [asr_datamodule.py:258] (0/2) Time warp factor: 80
|
| 100 |
+
2026-01-13 16:53:39,352 INFO [asr_datamodule.py:268] (0/2) Num frame mask: 10
|
| 101 |
+
2026-01-13 16:53:39,352 INFO [asr_datamodule.py:281] (0/2) About to create train dataset
|
| 102 |
+
2026-01-13 16:53:39,352 INFO [asr_datamodule.py:308] (0/2) Using DynamicBucketingSampler.
|
| 103 |
+
2026-01-13 16:53:39,695 INFO [asr_datamodule.py:324] (0/2) About to create train dataloader
|
| 104 |
+
2026-01-13 16:53:39,696 INFO [asr_datamodule.py:460] (0/2) About to get dev-clean cuts
|
| 105 |
+
2026-01-13 16:53:39,696 INFO [asr_datamodule.py:467] (0/2) About to get dev-other cuts
|
| 106 |
+
2026-01-13 16:53:39,697 INFO [asr_datamodule.py:355] (0/2) About to create dev dataset
|
| 107 |
+
2026-01-13 16:53:39,923 INFO [asr_datamodule.py:372] (0/2) About to create dev dataloader
|
log/log-train-2026-01-13-16-53-36-1
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 16:53:36,791 INFO [train.py:967] (1/2) Training started
|
| 2 |
+
2026-01-13 16:53:36,791 INFO [train.py:977] (1/2) Device: cuda:1
|
| 3 |
+
2026-01-13 16:53:36,794 INFO [train.py:986] (1/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 16:53:36,794 INFO [train.py:988] (1/2) About to create model
|
| 93 |
+
2026-01-13 16:53:37,379 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 16:53:37,398 INFO [train.py:992] (1/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 16:53:37,505 INFO [train.py:1007] (1/2) Using DDP
|
| 96 |
+
2026-01-13 16:53:39,347 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
|
| 97 |
+
2026-01-13 16:53:39,348 INFO [asr_datamodule.py:239] (1/2) Disable MUSAN
|
| 98 |
+
2026-01-13 16:53:39,348 INFO [asr_datamodule.py:257] (1/2) Enable SpecAugment
|
| 99 |
+
2026-01-13 16:53:39,348 INFO [asr_datamodule.py:258] (1/2) Time warp factor: 80
|
| 100 |
+
2026-01-13 16:53:39,349 INFO [asr_datamodule.py:268] (1/2) Num frame mask: 10
|
| 101 |
+
2026-01-13 16:53:39,349 INFO [asr_datamodule.py:281] (1/2) About to create train dataset
|
| 102 |
+
2026-01-13 16:53:39,349 INFO [asr_datamodule.py:308] (1/2) Using DynamicBucketingSampler.
|
| 103 |
+
2026-01-13 16:53:39,691 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
|
| 104 |
+
2026-01-13 16:53:39,692 INFO [asr_datamodule.py:460] (1/2) About to get dev-clean cuts
|
| 105 |
+
2026-01-13 16:53:39,692 INFO [asr_datamodule.py:467] (1/2) About to get dev-other cuts
|
| 106 |
+
2026-01-13 16:53:39,693 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
|
| 107 |
+
2026-01-13 16:53:39,908 INFO [asr_datamodule.py:372] (1/2) About to create dev dataloader
|
log/log-train-2026-01-13-16-54-14-0
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 16:54:14,567 INFO [train.py:967] (0/2) Training started
|
| 2 |
+
2026-01-13 16:54:14,568 INFO [train.py:977] (0/2) Device: cuda:0
|
| 3 |
+
2026-01-13 16:54:14,571 INFO [train.py:986] (0/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 16:54:14,572 INFO [train.py:988] (0/2) About to create model
|
| 93 |
+
2026-01-13 16:54:15,171 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 16:54:15,189 INFO [train.py:992] (0/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 16:54:15,951 INFO [train.py:1007] (0/2) Using DDP
|
| 96 |
+
2026-01-13 16:54:17,256 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
|
| 97 |
+
2026-01-13 16:54:17,257 INFO [asr_datamodule.py:239] (0/2) Disable MUSAN
|
| 98 |
+
2026-01-13 16:54:17,257 INFO [asr_datamodule.py:257] (0/2) Enable SpecAugment
|
| 99 |
+
2026-01-13 16:54:17,258 INFO [asr_datamodule.py:258] (0/2) Time warp factor: 80
|
| 100 |
+
2026-01-13 16:54:17,258 INFO [asr_datamodule.py:268] (0/2) Num frame mask: 10
|
| 101 |
+
2026-01-13 16:54:17,258 INFO [asr_datamodule.py:281] (0/2) About to create train dataset
|
| 102 |
+
2026-01-13 16:54:17,258 INFO [asr_datamodule.py:308] (0/2) Using DynamicBucketingSampler.
|
| 103 |
+
2026-01-13 16:54:17,617 INFO [asr_datamodule.py:324] (0/2) About to create train dataloader
|
| 104 |
+
2026-01-13 16:54:17,618 INFO [asr_datamodule.py:460] (0/2) About to get dev-clean cuts
|
| 105 |
+
2026-01-13 16:54:17,618 INFO [asr_datamodule.py:467] (0/2) About to get dev-other cuts
|
| 106 |
+
2026-01-13 16:54:17,619 INFO [asr_datamodule.py:355] (0/2) About to create dev dataset
|
| 107 |
+
2026-01-13 16:54:17,834 INFO [asr_datamodule.py:372] (0/2) About to create dev dataloader
|
log/log-train-2026-01-13-16-54-14-1
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 16:54:14,677 INFO [train.py:967] (1/2) Training started
|
| 2 |
+
2026-01-13 16:54:14,677 INFO [train.py:977] (1/2) Device: cuda:1
|
| 3 |
+
2026-01-13 16:54:14,680 INFO [train.py:986] (1/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 16:54:14,680 INFO [train.py:988] (1/2) About to create model
|
| 93 |
+
2026-01-13 16:54:15,275 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 16:54:15,293 INFO [train.py:992] (1/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 16:54:15,400 INFO [train.py:1007] (1/2) Using DDP
|
| 96 |
+
2026-01-13 16:54:17,262 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
|
| 97 |
+
2026-01-13 16:54:17,263 INFO [asr_datamodule.py:239] (1/2) Disable MUSAN
|
| 98 |
+
2026-01-13 16:54:17,264 INFO [asr_datamodule.py:257] (1/2) Enable SpecAugment
|
| 99 |
+
2026-01-13 16:54:17,264 INFO [asr_datamodule.py:258] (1/2) Time warp factor: 80
|
| 100 |
+
2026-01-13 16:54:17,264 INFO [asr_datamodule.py:268] (1/2) Num frame mask: 10
|
| 101 |
+
2026-01-13 16:54:17,264 INFO [asr_datamodule.py:281] (1/2) About to create train dataset
|
| 102 |
+
2026-01-13 16:54:17,264 INFO [asr_datamodule.py:308] (1/2) Using DynamicBucketingSampler.
|
| 103 |
+
2026-01-13 16:54:17,631 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
|
| 104 |
+
2026-01-13 16:54:17,632 INFO [asr_datamodule.py:460] (1/2) About to get dev-clean cuts
|
| 105 |
+
2026-01-13 16:54:17,632 INFO [asr_datamodule.py:467] (1/2) About to get dev-other cuts
|
| 106 |
+
2026-01-13 16:54:17,633 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
|
| 107 |
+
2026-01-13 16:54:17,869 INFO [asr_datamodule.py:372] (1/2) About to create dev dataloader
|
log/log-train-2026-01-13-17-00-38-0
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 17:00:38,644 INFO [train.py:967] (0/2) Training started
|
| 2 |
+
2026-01-13 17:00:38,646 INFO [train.py:977] (0/2) Device: cuda:0
|
| 3 |
+
2026-01-13 17:00:38,650 INFO [train.py:986] (0/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 17:00:38,651 INFO [train.py:988] (0/2) About to create model
|
| 93 |
+
2026-01-13 17:00:39,258 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 17:00:39,275 INFO [train.py:992] (0/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 17:00:40,039 INFO [train.py:1007] (0/2) Using DDP
|
| 96 |
+
2026-01-13 17:00:41,419 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
|
| 97 |
+
2026-01-13 17:00:41,420 INFO [asr_datamodule.py:239] (0/2) Disable MUSAN
|
| 98 |
+
2026-01-13 17:00:41,420 INFO [asr_datamodule.py:257] (0/2) Enable SpecAugment
|
| 99 |
+
2026-01-13 17:00:41,420 INFO [asr_datamodule.py:258] (0/2) Time warp factor: 80
|
| 100 |
+
2026-01-13 17:00:41,420 INFO [asr_datamodule.py:268] (0/2) Num frame mask: 10
|
| 101 |
+
2026-01-13 17:00:41,420 INFO [asr_datamodule.py:281] (0/2) About to create train dataset
|
| 102 |
+
2026-01-13 17:00:41,420 INFO [asr_datamodule.py:308] (0/2) Using DynamicBucketingSampler.
|
| 103 |
+
2026-01-13 17:00:41,760 INFO [asr_datamodule.py:324] (0/2) About to create train dataloader
|
| 104 |
+
2026-01-13 17:00:41,761 INFO [asr_datamodule.py:460] (0/2) About to get dev-clean cuts
|
| 105 |
+
2026-01-13 17:00:41,761 INFO [asr_datamodule.py:467] (0/2) About to get dev-other cuts
|
| 106 |
+
2026-01-13 17:00:41,762 INFO [asr_datamodule.py:355] (0/2) About to create dev dataset
|
| 107 |
+
2026-01-13 17:00:41,982 INFO [asr_datamodule.py:372] (0/2) About to create dev dataloader
|
log/log-train-2026-01-13-17-00-38-1
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 17:00:38,739 INFO [train.py:967] (1/2) Training started
|
| 2 |
+
2026-01-13 17:00:38,739 INFO [train.py:977] (1/2) Device: cuda:1
|
| 3 |
+
2026-01-13 17:00:38,742 INFO [train.py:986] (1/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 17:00:38,742 INFO [train.py:988] (1/2) About to create model
|
| 93 |
+
2026-01-13 17:00:39,358 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 17:00:39,375 INFO [train.py:992] (1/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 17:00:39,482 INFO [train.py:1007] (1/2) Using DDP
|
| 96 |
+
2026-01-13 17:00:41,419 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
|
| 97 |
+
2026-01-13 17:00:41,421 INFO [asr_datamodule.py:239] (1/2) Disable MUSAN
|
| 98 |
+
2026-01-13 17:00:41,421 INFO [asr_datamodule.py:257] (1/2) Enable SpecAugment
|
| 99 |
+
2026-01-13 17:00:41,421 INFO [asr_datamodule.py:258] (1/2) Time warp factor: 80
|
| 100 |
+
2026-01-13 17:00:41,421 INFO [asr_datamodule.py:268] (1/2) Num frame mask: 10
|
| 101 |
+
2026-01-13 17:00:41,421 INFO [asr_datamodule.py:281] (1/2) About to create train dataset
|
| 102 |
+
2026-01-13 17:00:41,421 INFO [asr_datamodule.py:308] (1/2) Using DynamicBucketingSampler.
|
| 103 |
+
2026-01-13 17:00:41,757 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
|
| 104 |
+
2026-01-13 17:00:41,757 INFO [asr_datamodule.py:460] (1/2) About to get dev-clean cuts
|
| 105 |
+
2026-01-13 17:00:41,758 INFO [asr_datamodule.py:467] (1/2) About to get dev-other cuts
|
| 106 |
+
2026-01-13 17:00:41,759 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
|
| 107 |
+
2026-01-13 17:00:41,980 INFO [asr_datamodule.py:372] (1/2) About to create dev dataloader
|
log/log-train-2026-01-13-17-01-14-0
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 17:01:14,363 INFO [train.py:967] (0/2) Training started
|
| 2 |
+
2026-01-13 17:01:14,364 INFO [train.py:977] (0/2) Device: cuda:0
|
| 3 |
+
2026-01-13 17:01:14,367 INFO [train.py:986] (0/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 17:01:14,367 INFO [train.py:988] (0/2) About to create model
|
| 93 |
+
2026-01-13 17:01:14,952 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 17:01:14,971 INFO [train.py:992] (0/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 17:01:15,734 INFO [train.py:1007] (0/2) Using DDP
|
| 96 |
+
2026-01-13 17:01:17,024 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
|
| 97 |
+
2026-01-13 17:01:17,025 INFO [asr_datamodule.py:239] (0/2) Disable MUSAN
|
| 98 |
+
2026-01-13 17:01:17,025 INFO [asr_datamodule.py:257] (0/2) Enable SpecAugment
|
| 99 |
+
2026-01-13 17:01:17,025 INFO [asr_datamodule.py:258] (0/2) Time warp factor: 80
|
| 100 |
+
2026-01-13 17:01:17,025 INFO [asr_datamodule.py:268] (0/2) Num frame mask: 10
|
| 101 |
+
2026-01-13 17:01:17,025 INFO [asr_datamodule.py:281] (0/2) About to create train dataset
|
| 102 |
+
2026-01-13 17:01:17,026 INFO [asr_datamodule.py:308] (0/2) Using DynamicBucketingSampler.
|
| 103 |
+
2026-01-13 17:01:17,366 INFO [asr_datamodule.py:324] (0/2) About to create train dataloader
|
| 104 |
+
2026-01-13 17:01:17,366 INFO [asr_datamodule.py:460] (0/2) About to get dev-clean cuts
|
| 105 |
+
2026-01-13 17:01:17,367 INFO [asr_datamodule.py:467] (0/2) About to get dev-other cuts
|
| 106 |
+
2026-01-13 17:01:17,367 INFO [asr_datamodule.py:355] (0/2) About to create dev dataset
|
| 107 |
+
2026-01-13 17:01:17,579 INFO [asr_datamodule.py:372] (0/2) About to create dev dataloader
|
log/log-train-2026-01-13-17-01-14-1
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 17:01:14,470 INFO [train.py:967] (1/2) Training started
|
| 2 |
+
2026-01-13 17:01:14,471 INFO [train.py:977] (1/2) Device: cuda:1
|
| 3 |
+
2026-01-13 17:01:14,473 INFO [train.py:986] (1/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 17:01:14,473 INFO [train.py:988] (1/2) About to create model
|
| 93 |
+
2026-01-13 17:01:15,050 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 17:01:15,068 INFO [train.py:992] (1/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 17:01:15,174 INFO [train.py:1007] (1/2) Using DDP
|
| 96 |
+
2026-01-13 17:01:17,037 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
|
| 97 |
+
2026-01-13 17:01:17,038 INFO [asr_datamodule.py:239] (1/2) Disable MUSAN
|
| 98 |
+
2026-01-13 17:01:17,038 INFO [asr_datamodule.py:257] (1/2) Enable SpecAugment
|
| 99 |
+
2026-01-13 17:01:17,038 INFO [asr_datamodule.py:258] (1/2) Time warp factor: 80
|
| 100 |
+
2026-01-13 17:01:17,038 INFO [asr_datamodule.py:268] (1/2) Num frame mask: 10
|
| 101 |
+
2026-01-13 17:01:17,038 INFO [asr_datamodule.py:281] (1/2) About to create train dataset
|
| 102 |
+
2026-01-13 17:01:17,038 INFO [asr_datamodule.py:308] (1/2) Using DynamicBucketingSampler.
|
| 103 |
+
2026-01-13 17:01:17,377 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
|
| 104 |
+
2026-01-13 17:01:17,378 INFO [asr_datamodule.py:460] (1/2) About to get dev-clean cuts
|
| 105 |
+
2026-01-13 17:01:17,378 INFO [asr_datamodule.py:467] (1/2) About to get dev-other cuts
|
| 106 |
+
2026-01-13 17:01:17,379 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
|
| 107 |
+
2026-01-13 17:01:17,591 INFO [asr_datamodule.py:372] (1/2) About to create dev dataloader
|
log/log-train-2026-01-13-17-06-37-0
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 17:06:37,678 INFO [train.py:967] (0/2) Training started
|
| 2 |
+
2026-01-13 17:06:37,679 INFO [train.py:977] (0/2) Device: cuda:0
|
| 3 |
+
2026-01-13 17:06:37,681 INFO [train.py:986] (0/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 17:06:37,682 INFO [train.py:988] (0/2) About to create model
|
| 93 |
+
2026-01-13 17:06:38,266 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 17:06:38,285 INFO [train.py:992] (0/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 17:06:39,101 INFO [train.py:1007] (0/2) Using DDP
|
| 96 |
+
2026-01-13 17:06:40,454 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
|
| 97 |
+
2026-01-13 17:06:40,456 INFO [asr_datamodule.py:239] (0/2) Disable MUSAN
|
| 98 |
+
2026-01-13 17:06:40,456 INFO [asr_datamodule.py:257] (0/2) Enable SpecAugment
|
| 99 |
+
2026-01-13 17:06:40,456 INFO [asr_datamodule.py:258] (0/2) Time warp factor: 80
|
| 100 |
+
2026-01-13 17:06:40,456 INFO [asr_datamodule.py:268] (0/2) Num frame mask: 10
|
| 101 |
+
2026-01-13 17:06:40,456 INFO [asr_datamodule.py:281] (0/2) About to create train dataset
|
| 102 |
+
2026-01-13 17:06:40,456 INFO [asr_datamodule.py:308] (0/2) Using DynamicBucketingSampler.
|
| 103 |
+
2026-01-13 17:06:40,855 INFO [asr_datamodule.py:324] (0/2) About to create train dataloader
|
| 104 |
+
2026-01-13 17:06:40,855 INFO [asr_datamodule.py:460] (0/2) About to get dev-clean cuts
|
| 105 |
+
2026-01-13 17:06:40,856 INFO [asr_datamodule.py:467] (0/2) About to get dev-other cuts
|
| 106 |
+
2026-01-13 17:06:40,856 INFO [asr_datamodule.py:355] (0/2) About to create dev dataset
|
| 107 |
+
2026-01-13 17:06:41,074 INFO [asr_datamodule.py:372] (0/2) About to create dev dataloader
|
| 108 |
+
2026-01-13 17:06:56,066 INFO [train.py:895] (0/2) Epoch 1, batch 0, loss[loss=8.165, simple_loss=7.427, pruned_loss=7.363, over 2638.00 frames. ], tot_loss[loss=8.165, simple_loss=7.427, pruned_loss=7.363, over 2638.00 frames. ], batch size: 7, lr: 2.50e-02, grad_scale: 2.0
|
| 109 |
+
2026-01-13 17:06:56,066 INFO [train.py:920] (0/2) Computing validation loss
|
| 110 |
+
2026-01-13 17:08:00,243 INFO [zipformer.py:2441] (0/2) attn_weights_entropy = tensor([2.9147, 2.9149, 2.9154, 2.9121, 2.9146, 2.9150, 2.9150, 2.9151],
|
| 111 |
+
device='cuda:0'), covar=tensor([0.0029, 0.0041, 0.0048, 0.0025, 0.0032, 0.0035, 0.0052, 0.0034],
|
| 112 |
+
device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
|
| 113 |
+
device='cuda:0'), out_proj_covar=tensor([8.5573e-06, 8.6460e-06, 8.6547e-06, 8.5689e-06, 8.8456e-06, 8.6908e-06,
|
| 114 |
+
8.7531e-06, 8.7239e-06], device='cuda:0')
|
| 115 |
+
2026-01-13 17:08:21,491 INFO [train.py:929] (0/2) Epoch 1, validation: loss=8.291, simple_loss=7.534, pruned_loss=7.553, over 1639044.00 frames.
|
| 116 |
+
2026-01-13 17:08:21,492 INFO [train.py:930] (0/2) Maximum memory allocated so far is 2796MB
|
| 117 |
+
2026-01-13 17:08:23,199 INFO [zipformer.py:1188] (0/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5.0, num_to_drop=2, layers_to_drop={1, 3}
|
| 118 |
+
2026-01-13 17:08:29,961 INFO [zipformer.py:1188] (0/2) warmup_begin=666.7, warmup_end=1333.3, batch_count=23.0, num_to_drop=1, layers_to_drop={1}
|
| 119 |
+
2026-01-13 17:08:32,511 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=6.07 vs. limit=2.0
|
| 120 |
+
2026-01-13 17:08:33,581 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=5.73 vs. limit=2.0
|
| 121 |
+
2026-01-13 17:08:37,070 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=14.43 vs. limit=2.0
|
| 122 |
+
2026-01-13 17:08:40,738 INFO [train.py:895] (0/2) Epoch 1, batch 50, loss[loss=1.1, simple_loss=0.9771, pruned_loss=1.103, over 2768.00 frames. ], tot_loss[loss=2.142, simple_loss=1.948, pruned_loss=1.876, over 122563.73 frames. ], batch size: 7, lr: 2.75e-02, grad_scale: 2.0
|
| 123 |
+
2026-01-13 17:08:52,490 INFO [zipformer.py:2441] (0/2) attn_weights_entropy = tensor([5.1111, 5.1111, 5.1051, 5.1033, 5.1106, 5.1110, 5.1105, 5.1110],
|
| 124 |
+
device='cuda:0'), covar=tensor([0.0017, 0.0046, 0.0033, 0.0029, 0.0021, 0.0033, 0.0023, 0.0020],
|
| 125 |
+
device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
|
| 126 |
+
device='cuda:0'), out_proj_covar=tensor([8.7029e-06, 8.8104e-06, 8.8122e-06, 8.6125e-06, 9.0121e-06, 8.7908e-06,
|
| 127 |
+
8.8471e-06, 8.8093e-06], device='cuda:0')
|
| 128 |
+
2026-01-13 17:08:53,265 INFO [zipformer.py:1188] (0/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83.0, num_to_drop=1, layers_to_drop={1}
|
| 129 |
+
2026-01-13 17:08:57,962 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=3.07 vs. limit=2.0
|
| 130 |
+
2026-01-13 17:09:00,026 INFO [optim.py:365] (0/2) Clipping_scale=2.0, grad-norm quartiles 1.726e+01 2.698e+01 5.079e+01 1.890e+02 2.214e+03, threshold=1.016e+02, percent-clipped=0.0
|
| 131 |
+
2026-01-13 17:09:00,065 INFO [train.py:895] (0/2) Epoch 1, batch 100, loss[loss=0.9872, simple_loss=0.8589, pruned_loss=1.031, over 2891.00 frames. ], tot_loss[loss=1.553, simple_loss=1.394, pruned_loss=1.445, over 216716.41 frames. ], batch size: 8, lr: 3.00e-02, grad_scale: 2.0
|
| 132 |
+
2026-01-13 17:09:13,046 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=5.61 vs. limit=2.0
|
| 133 |
+
2026-01-13 17:09:17,396 INFO [zipformer.py:1188] (0/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=144.0, num_to_drop=2, layers_to_drop={1, 3}
|
| 134 |
+
2026-01-13 17:09:20,086 INFO [train.py:895] (0/2) Epoch 1, batch 150, loss[loss=0.8253, simple_loss=0.7036, pruned_loss=0.8828, over 2774.00 frames. ], tot_loss[loss=1.313, simple_loss=1.163, pruned_loss=1.277, over 290051.50 frames. ], batch size: 7, lr: 3.25e-02, grad_scale: 2.0
|
| 135 |
+
2026-01-13 17:09:40,653 INFO [optim.py:365] (0/2) Clipping_scale=2.0, grad-norm quartiles 1.991e+01 2.552e+01 2.903e+01 3.376e+01 6.929e+01, threshold=5.806e+01, percent-clipped=0.0
|
| 136 |
+
2026-01-13 17:09:40,692 INFO [train.py:895] (0/2) Epoch 1, batch 200, loss[loss=1.141, simple_loss=0.9689, pruned_loss=1.16, over 2637.00 frames. ], tot_loss[loss=1.177, simple_loss=1.03, pruned_loss=1.172, over 347964.82 frames. ], batch size: 16, lr: 3.50e-02, grad_scale: 2.0
|
| 137 |
+
2026-01-13 17:09:43,454 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=4.43 vs. limit=2.0
|
| 138 |
+
2026-01-13 17:09:46,242 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=37.43 vs. limit=5.0
|
| 139 |
+
2026-01-13 17:09:56,985 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=42.78 vs. limit=5.0
|
| 140 |
+
2026-01-13 17:09:59,109 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=14.33 vs. limit=2.0
|
| 141 |
+
2026-01-13 17:10:00,956 INFO [train.py:895] (0/2) Epoch 1, batch 250, loss[loss=0.8344, simple_loss=0.6973, pruned_loss=0.8492, over 2755.00 frames. ], tot_loss[loss=1.088, simple_loss=0.942, pruned_loss=1.093, over 392480.77 frames. ], batch size: 11, lr: 3.75e-02, grad_scale: 2.0
|
| 142 |
+
2026-01-13 17:10:19,125 INFO [zipformer.py:1188] (0/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=296.0, num_to_drop=1, layers_to_drop={1}
|
| 143 |
+
2026-01-13 17:10:20,733 INFO [zipformer.py:1188] (0/2) warmup_begin=1333.3, warmup_end=2000.0, batch_count=300.0, num_to_drop=2, layers_to_drop={1, 3}
|
| 144 |
+
2026-01-13 17:10:21,012 INFO [optim.py:365] (0/2) Clipping_scale=2.0, grad-norm quartiles 2.628e+01 3.406e+01 3.889e+01 4.977e+01 1.495e+02, threshold=7.778e+01, percent-clipped=13.0
|
| 145 |
+
2026-01-13 17:10:21,048 INFO [train.py:895] (0/2) Epoch 1, batch 300, loss[loss=0.8336, simple_loss=0.6967, pruned_loss=0.8042, over 2892.00 frames. ], tot_loss[loss=1.026, simple_loss=0.8803, pruned_loss=1.03, over 428733.71 frames. ], batch size: 10, lr: 4.00e-02, grad_scale: 2.0
|
| 146 |
+
2026-01-13 17:10:26,809 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=2.54 vs. limit=2.0
|
| 147 |
+
2026-01-13 17:10:39,996 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0
|
| 148 |
+
2026-01-13 17:10:40,639 INFO [train.py:895] (0/2) Epoch 1, batch 350, loss[loss=0.8286, simple_loss=0.6797, pruned_loss=0.8085, over 2696.00 frames. ], tot_loss[loss=0.9781, simple_loss=0.832, pruned_loss=0.9753, over 455853.48 frames. ], batch size: 7, lr: 4.25e-02, grad_scale: 2.0
|
| 149 |
+
2026-01-13 17:10:43,170 INFO [zipformer.py:1188] (0/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=357.0, num_to_drop=2, layers_to_drop={1, 3}
|
| 150 |
+
2026-01-13 17:10:48,892 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=2.92 vs. limit=2.0
|
| 151 |
+
2026-01-13 17:10:53,144 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=14.35 vs. limit=5.0
|
| 152 |
+
2026-01-13 17:10:54,931 INFO [zipformer.py:1188] (0/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=387.0, num_to_drop=1, layers_to_drop={1}
|
| 153 |
+
2026-01-13 17:10:55,070 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=12.67 vs. limit=2.0
|
| 154 |
+
2026-01-13 17:10:56,683 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=11.14 vs. limit=5.0
|
| 155 |
+
2026-01-13 17:11:00,683 INFO [optim.py:365] (0/2) Clipping_scale=2.0, grad-norm quartiles 2.792e+01 3.460e+01 4.419e+01 5.164e+01 2.002e+02, threshold=8.837e+01, percent-clipped=7.0
|
| 156 |
+
2026-01-13 17:11:00,720 INFO [train.py:895] (0/2) Epoch 1, batch 400, loss[loss=0.8411, simple_loss=0.6884, pruned_loss=0.7904, over 2705.00 frames. ], tot_loss[loss=0.9489, simple_loss=0.8001, pruned_loss=0.9381, over 475942.16 frames. ], batch size: 8, lr: 4.50e-02, grad_scale: 4.0
|
| 157 |
+
2026-01-13 17:11:07,553 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=10.28 vs. limit=2.0
|
| 158 |
+
2026-01-13 17:11:11,067 INFO [zipformer.py:2441] (0/2) attn_weights_entropy = tensor([4.6921, 5.1287, 4.3867, 4.6206, 3.9530, 5.0709, 4.4906, 4.6190],
|
| 159 |
+
device='cuda:0'), covar=tensor([0.0421, 0.0065, 0.0737, 0.0372, 0.1756, 0.0083, 0.0905, 0.0393],
|
| 160 |
+
device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0010, 0.0009, 0.0009, 0.0009],
|
| 161 |
+
device='cuda:0'), out_proj_covar=tensor([9.0232e-06, 8.7080e-06, 9.0746e-06, 8.3539e-06, 1.0025e-05, 8.5254e-06,
|
| 162 |
+
8.9379e-06, 8.6380e-06], device='cuda:0')
|
| 163 |
+
2026-01-13 17:11:15,769 INFO [zipformer.py:1188] (0/2) warmup_begin=1333.3, warmup_end=2000.0, batch_count=439.0, num_to_drop=2, layers_to_drop={1, 2}
|
| 164 |
+
2026-01-13 17:11:19,201 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=10.70 vs. limit=5.0
|
| 165 |
+
2026-01-13 17:11:19,449 INFO [zipformer.py:1188] (0/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=448.0, num_to_drop=2, layers_to_drop={0, 3}
|
| 166 |
+
2026-01-13 17:11:20,503 INFO [train.py:895] (0/2) Epoch 1, batch 450, loss[loss=0.7667, simple_loss=0.6296, pruned_loss=0.6868, over 2660.00 frames. ], tot_loss[loss=0.9287, simple_loss=0.7768, pruned_loss=0.9064, over 492502.80 frames. ], batch size: 8, lr: 4.75e-02, grad_scale: 4.0
|
| 167 |
+
2026-01-13 17:11:26,921 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=10.20 vs. limit=5.0
|
| 168 |
+
2026-01-13 17:11:40,428 INFO [optim.py:365] (0/2) Clipping_scale=2.0, grad-norm quartiles 2.871e+01 3.474e+01 4.145e+01 5.434e+01 1.454e+02, threshold=8.291e+01, percent-clipped=4.0
|
| 169 |
+
2026-01-13 17:11:40,465 INFO [train.py:895] (0/2) Epoch 1, batch 500, loss[loss=0.8643, simple_loss=0.6974, pruned_loss=0.7811, over 2804.00 frames. ], tot_loss[loss=0.9117, simple_loss=0.7566, pruned_loss=0.8773, over 506680.85 frames. ], batch size: 10, lr: 4.99e-02, grad_scale: 4.0
|
log/log-train-2026-01-13-17-06-37-1
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-01-13 17:06:37,791 INFO [train.py:967] (1/2) Training started
|
| 2 |
+
2026-01-13 17:06:37,791 INFO [train.py:977] (1/2) Device: cuda:1
|
| 3 |
+
2026-01-13 17:06:37,793 INFO [train.py:986] (1/2) {
|
| 4 |
+
"am_scale": 0.0,
|
| 5 |
+
"attention_dims": "192,192,192,192,192",
|
| 6 |
+
"average_period": 200,
|
| 7 |
+
"base_lr": 0.05,
|
| 8 |
+
"batch_idx_train": 0,
|
| 9 |
+
"best_train_epoch": -1,
|
| 10 |
+
"best_train_loss": Infinity,
|
| 11 |
+
"best_valid_epoch": -1,
|
| 12 |
+
"best_valid_loss": Infinity,
|
| 13 |
+
"blank_id": 0,
|
| 14 |
+
"bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
|
| 15 |
+
"bucketing_sampler": true,
|
| 16 |
+
"cnn_module_kernels": "31,31,31,31,31",
|
| 17 |
+
"concatenate_cuts": false,
|
| 18 |
+
"context_size": 2,
|
| 19 |
+
"decode_chunk_len": 32,
|
| 20 |
+
"decoder_dim": 512,
|
| 21 |
+
"drop_last": true,
|
| 22 |
+
"duration_factor": 1.0,
|
| 23 |
+
"enable_musan": false,
|
| 24 |
+
"enable_spec_aug": true,
|
| 25 |
+
"encoder_dims": "384,384,384,384,384",
|
| 26 |
+
"encoder_unmasked_dims": "256,256,256,256,256",
|
| 27 |
+
"env_info": {
|
| 28 |
+
"IP address": "172.19.2.2",
|
| 29 |
+
"hostname": "6ec37ec2ba95",
|
| 30 |
+
"icefall-git-branch": "master",
|
| 31 |
+
"icefall-git-date": "Fri Nov 28 03:42:20 2025",
|
| 32 |
+
"icefall-git-sha1": "0904e490-clean",
|
| 33 |
+
"icefall-path": "/kaggle/working/icefall",
|
| 34 |
+
"k2-build-type": "Release",
|
| 35 |
+
"k2-git-date": "Thu Jul 25 03:34:26 2024",
|
| 36 |
+
"k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
|
| 37 |
+
"k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
|
| 38 |
+
"k2-version": "1.24.4",
|
| 39 |
+
"k2-with-cuda": true,
|
| 40 |
+
"lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
|
| 41 |
+
"lhotse-version": "1.32.1",
|
| 42 |
+
"python-version": "3.12",
|
| 43 |
+
"torch-cuda-available": true,
|
| 44 |
+
"torch-cuda-version": "12.1",
|
| 45 |
+
"torch-version": "2.4.0+cu121"
|
| 46 |
+
},
|
| 47 |
+
"exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
|
| 48 |
+
"feature_dim": 80,
|
| 49 |
+
"feedforward_dims": "1024,1024,2048,2048,1024",
|
| 50 |
+
"full_libri": false,
|
| 51 |
+
"gap": 1.0,
|
| 52 |
+
"inf_check": false,
|
| 53 |
+
"input_strategy": "PrecomputedFeatures",
|
| 54 |
+
"joiner_dim": 512,
|
| 55 |
+
"keep_last_k": 5,
|
| 56 |
+
"lm_scale": 0.25,
|
| 57 |
+
"log_interval": 50,
|
| 58 |
+
"lr_batches": 5000,
|
| 59 |
+
"lr_epochs": 3.5,
|
| 60 |
+
"manifest_dir": "/kaggle/working/amharic_training/manifests",
|
| 61 |
+
"master_port": 12354,
|
| 62 |
+
"max_duration": 120,
|
| 63 |
+
"mini_libri": false,
|
| 64 |
+
"nhead": "8,8,8,8,8",
|
| 65 |
+
"num_buckets": 30,
|
| 66 |
+
"num_encoder_layers": "2,4,3,2,4",
|
| 67 |
+
"num_epochs": 50,
|
| 68 |
+
"num_left_chunks": 4,
|
| 69 |
+
"num_workers": 2,
|
| 70 |
+
"on_the_fly_feats": false,
|
| 71 |
+
"print_diagnostics": false,
|
| 72 |
+
"prune_range": 5,
|
| 73 |
+
"reset_interval": 200,
|
| 74 |
+
"return_cuts": true,
|
| 75 |
+
"save_every_n": 1000,
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"short_chunk_size": 50,
|
| 78 |
+
"shuffle": true,
|
| 79 |
+
"simple_loss_scale": 0.5,
|
| 80 |
+
"spec_aug_time_warp_factor": 80,
|
| 81 |
+
"start_batch": 0,
|
| 82 |
+
"start_epoch": 1,
|
| 83 |
+
"subsampling_factor": 4,
|
| 84 |
+
"tensorboard": true,
|
| 85 |
+
"use_fp16": true,
|
| 86 |
+
"valid_interval": 1600,
|
| 87 |
+
"vocab_size": 1000,
|
| 88 |
+
"warm_step": 2000,
|
| 89 |
+
"world_size": 2,
|
| 90 |
+
"zipformer_downsampling_factors": "1,2,4,8,2"
|
| 91 |
+
}
|
| 92 |
+
2026-01-13 17:06:37,794 INFO [train.py:988] (1/2) About to create model
|
| 93 |
+
2026-01-13 17:06:38,385 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
|
| 94 |
+
2026-01-13 17:06:38,403 INFO [train.py:992] (1/2) Number of model parameters: 71330891
|
| 95 |
+
2026-01-13 17:06:38,512 INFO [train.py:1007] (1/2) Using DDP
|
| 96 |
+
2026-01-13 17:06:40,469 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
|
| 97 |
+
2026-01-13 17:06:40,470 INFO [asr_datamodule.py:239] (1/2) Disable MUSAN
|
| 98 |
+
2026-01-13 17:06:40,470 INFO [asr_datamodule.py:257] (1/2) Enable SpecAugment
|
| 99 |
+
2026-01-13 17:06:40,471 INFO [asr_datamodule.py:258] (1/2) Time warp factor: 80
|
| 100 |
+
2026-01-13 17:06:40,471 INFO [asr_datamodule.py:268] (1/2) Num frame mask: 10
|
| 101 |
+
2026-01-13 17:06:40,471 INFO [asr_datamodule.py:281] (1/2) About to create train dataset
|
| 102 |
+
2026-01-13 17:06:40,471 INFO [asr_datamodule.py:308] (1/2) Using DynamicBucketingSampler.
|
| 103 |
+
2026-01-13 17:06:40,872 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
|
| 104 |
+
2026-01-13 17:06:40,872 INFO [asr_datamodule.py:460] (1/2) About to get dev-clean cuts
|
| 105 |
+
2026-01-13 17:06:40,873 INFO [asr_datamodule.py:467] (1/2) About to get dev-other cuts
|
| 106 |
+
2026-01-13 17:06:40,873 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
|
| 107 |
+
2026-01-13 17:06:41,102 INFO [asr_datamodule.py:372] (1/2) About to create dev dataloader
|
| 108 |
+
2026-01-13 17:06:56,061 INFO [train.py:895] (1/2) Epoch 1, batch 0, loss[loss=8.191, simple_loss=7.455, pruned_loss=7.342, over 2645.00 frames. ], tot_loss[loss=8.191, simple_loss=7.455, pruned_loss=7.342, over 2645.00 frames. ], batch size: 7, lr: 2.50e-02, grad_scale: 2.0
|
| 109 |
+
2026-01-13 17:06:56,062 INFO [train.py:920] (1/2) Computing validation loss
|
| 110 |
+
2026-01-13 17:08:00,436 INFO [zipformer.py:2441] (1/2) attn_weights_entropy = tensor([2.9155, 2.9157, 2.9161, 2.9129, 2.9154, 2.9159, 2.9159, 2.9159],
|
| 111 |
+
device='cuda:1'), covar=tensor([0.0037, 0.0062, 0.0063, 0.0034, 0.0040, 0.0046, 0.0067, 0.0041],
|
| 112 |
+
device='cuda:1'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
|
| 113 |
+
device='cuda:1'), out_proj_covar=tensor([8.5573e-06, 8.6460e-06, 8.6547e-06, 8.5689e-06, 8.8456e-06, 8.6908e-06,
|
| 114 |
+
8.7531e-06, 8.7239e-06], device='cuda:1')
|
| 115 |
+
2026-01-13 17:08:21,491 INFO [train.py:929] (1/2) Epoch 1, validation: loss=8.291, simple_loss=7.534, pruned_loss=7.553, over 1639044.00 frames.
|
| 116 |
+
2026-01-13 17:08:21,492 INFO [train.py:930] (1/2) Maximum memory allocated so far is 2801MB
|
| 117 |
+
2026-01-13 17:08:23,182 INFO [zipformer.py:1188] (1/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5.0, num_to_drop=2, layers_to_drop={0, 1}
|
| 118 |
+
2026-01-13 17:08:29,961 INFO [zipformer.py:1188] (1/2) warmup_begin=666.7, warmup_end=1333.3, batch_count=23.0, num_to_drop=1, layers_to_drop={1}
|
| 119 |
+
2026-01-13 17:08:32,502 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=4.95 vs. limit=2.0
|
| 120 |
+
2026-01-13 17:08:33,581 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=5.26 vs. limit=2.0
|
| 121 |
+
2026-01-13 17:08:37,075 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=192, metric=13.56 vs. limit=2.0
|
| 122 |
+
2026-01-13 17:08:40,737 INFO [train.py:895] (1/2) Epoch 1, batch 50, loss[loss=1.051, simple_loss=0.9344, pruned_loss=1.049, over 2766.00 frames. ], tot_loss[loss=2.147, simple_loss=1.951, pruned_loss=1.882, over 122589.82 frames. ], batch size: 7, lr: 2.75e-02, grad_scale: 2.0
|
| 123 |
+
2026-01-13 17:08:52,476 INFO [zipformer.py:2441] (1/2) attn_weights_entropy = tensor([5.1140, 5.1141, 5.1095, 5.1045, 5.1129, 5.1137, 5.1126, 5.1135],
|
| 124 |
+
device='cuda:1'), covar=tensor([0.0009, 0.0023, 0.0019, 0.0016, 0.0006, 0.0018, 0.0013, 0.0012],
|
| 125 |
+
device='cuda:1'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
|
| 126 |
+
device='cuda:1'), out_proj_covar=tensor([8.5574e-06, 8.6733e-06, 8.6879e-06, 8.5465e-06, 8.8501e-06, 8.6969e-06,
|
| 127 |
+
8.7308e-06, 8.6867e-06], device='cuda:1')
|
| 128 |
+
2026-01-13 17:08:53,257 INFO [zipformer.py:1188] (1/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83.0, num_to_drop=1, layers_to_drop={1}
|
| 129 |
+
2026-01-13 17:08:57,912 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=3.48 vs. limit=2.0
|
| 130 |
+
2026-01-13 17:09:00,027 INFO [optim.py:365] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.726e+01 2.698e+01 5.079e+01 1.890e+02 2.214e+03, threshold=1.016e+02, percent-clipped=0.0
|
| 131 |
+
2026-01-13 17:09:00,066 INFO [train.py:895] (1/2) Epoch 1, batch 100, loss[loss=0.9553, simple_loss=0.8285, pruned_loss=1.015, over 2893.00 frames. ], tot_loss[loss=1.55, simple_loss=1.391, pruned_loss=1.443, over 216705.88 frames. ], batch size: 8, lr: 3.00e-02, grad_scale: 2.0
|
| 132 |
+
2026-01-13 17:09:17,393 INFO [zipformer.py:1188] (1/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=144.0, num_to_drop=2, layers_to_drop={1, 2}
|
| 133 |
+
2026-01-13 17:09:20,085 INFO [train.py:895] (1/2) Epoch 1, batch 150, loss[loss=0.8435, simple_loss=0.7125, pruned_loss=0.9396, over 2786.00 frames. ], tot_loss[loss=1.311, simple_loss=1.161, pruned_loss=1.277, over 290274.69 frames. ], batch size: 7, lr: 3.25e-02, grad_scale: 2.0
|
| 134 |
+
2026-01-13 17:09:39,620 INFO [zipformer.py:2441] (1/2) attn_weights_entropy = tensor([4.2447, 4.2507, 4.2467, 4.2488, 4.2510, 4.2508, 4.2486, 4.2512],
|
| 135 |
+
device='cuda:1'), covar=tensor([0.0020, 0.0018, 0.0026, 0.0025, 0.0023, 0.0022, 0.0015, 0.0020],
|
| 136 |
+
device='cuda:1'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
|
| 137 |
+
device='cuda:1'), out_proj_covar=tensor([8.8484e-06, 8.8488e-06, 8.6189e-06, 8.8502e-06, 8.6395e-06, 8.7008e-06,
|
| 138 |
+
8.7248e-06, 8.7738e-06], device='cuda:1')
|
| 139 |
+
2026-01-13 17:09:40,650 INFO [optim.py:365] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.991e+01 2.552e+01 2.903e+01 3.376e+01 6.929e+01, threshold=5.806e+01, percent-clipped=0.0
|
| 140 |
+
2026-01-13 17:09:40,689 INFO [train.py:895] (1/2) Epoch 1, batch 200, loss[loss=1.119, simple_loss=0.947, pruned_loss=1.155, over 2665.00 frames. ], tot_loss[loss=1.179, simple_loss=1.032, pruned_loss=1.176, over 347390.37 frames. ], batch size: 16, lr: 3.50e-02, grad_scale: 2.0
|
| 141 |
+
2026-01-13 17:09:53,935 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=5.48 vs. limit=2.0
|
| 142 |
+
2026-01-13 17:10:00,956 INFO [train.py:895] (1/2) Epoch 1, batch 250, loss[loss=1.031, simple_loss=0.8593, pruned_loss=1.061, over 2763.00 frames. ], tot_loss[loss=1.091, simple_loss=0.9446, pruned_loss=1.099, over 391985.25 frames. ], batch size: 11, lr: 3.75e-02, grad_scale: 2.0
|
| 143 |
+
2026-01-13 17:10:01,151 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=3.75 vs. limit=2.0
|
| 144 |
+
2026-01-13 17:10:01,977 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=2.53 vs. limit=2.0
|
| 145 |
+
2026-01-13 17:10:12,105 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0
|
| 146 |
+
2026-01-13 17:10:19,140 INFO [zipformer.py:1188] (1/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=296.0, num_to_drop=1, layers_to_drop={1}
|
| 147 |
+
2026-01-13 17:10:20,730 INFO [zipformer.py:1188] (1/2) warmup_begin=1333.3, warmup_end=2000.0, batch_count=300.0, num_to_drop=2, layers_to_drop={0, 1}
|
| 148 |
+
2026-01-13 17:10:21,011 INFO [optim.py:365] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.628e+01 3.406e+01 3.889e+01 4.977e+01 1.495e+02, threshold=7.778e+01, percent-clipped=13.0
|
| 149 |
+
2026-01-13 17:10:21,048 INFO [train.py:895] (1/2) Epoch 1, batch 300, loss[loss=0.8037, simple_loss=0.6637, pruned_loss=0.807, over 2885.00 frames. ], tot_loss[loss=1.029, simple_loss=0.8823, pruned_loss=1.034, over 428005.97 frames. ], batch size: 10, lr: 4.00e-02, grad_scale: 2.0
|
| 150 |
+
2026-01-13 17:10:26,867 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0
|
| 151 |
+
2026-01-13 17:10:31,380 INFO [zipformer.py:2441] (1/2) attn_weights_entropy = tensor([3.6023, 3.6043, 3.6010, 3.6077, 3.6021, 3.6072, 3.6062, 3.6057],
|
| 152 |
+
device='cuda:1'), covar=tensor([0.0033, 0.0036, 0.0040, 0.0028, 0.0029, 0.0039, 0.0052, 0.0038],
|
| 153 |
+
device='cuda:1'), in_proj_covar=tensor([0.0009, 0.0008, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
|
| 154 |
+
device='cuda:1'), out_proj_covar=tensor([9.0297e-06, 8.5946e-06, 8.6648e-06, 8.7116e-06, 8.8798e-06, 8.5990e-06,
|
| 155 |
+
8.8286e-06, 8.7837e-06], device='cuda:1')
|
| 156 |
+
2026-01-13 17:10:33,433 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=192, metric=19.84 vs. limit=2.0
|
| 157 |
+
2026-01-13 17:10:33,517 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=2.32 vs. limit=2.0
|
| 158 |
+
2026-01-13 17:10:40,638 INFO [train.py:895] (1/2) Epoch 1, batch 350, loss[loss=0.9178, simple_loss=0.7621, pruned_loss=0.863, over 2690.00 frames. ], tot_loss[loss=0.9843, simple_loss=0.8369, pruned_loss=0.9828, over 455041.83 frames. ], batch size: 7, lr: 4.25e-02, grad_scale: 2.0
|
| 159 |
+
2026-01-13 17:10:43,192 INFO [zipformer.py:1188] (1/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=357.0, num_to_drop=2, layers_to_drop={2, 3}
|
| 160 |
+
2026-01-13 17:10:51,167 INFO [scaling.py:681] (1/2) Whitening: num_groups=1, num_channels=384, metric=15.06 vs. limit=5.0
|
| 161 |
+
2026-01-13 17:10:54,931 INFO [zipformer.py:1188] (1/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=387.0, num_to_drop=1, layers_to_drop={0}
|
| 162 |
+
2026-01-13 17:11:00,683 INFO [optim.py:365] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.792e+01 3.460e+01 4.419e+01 5.164e+01 2.002e+02, threshold=8.837e+01, percent-clipped=7.0
|
| 163 |
+
2026-01-13 17:11:00,720 INFO [train.py:895] (1/2) Epoch 1, batch 400, loss[loss=0.7745, simple_loss=0.6329, pruned_loss=0.7313, over 2716.00 frames. ], tot_loss[loss=0.9508, simple_loss=0.8014, pruned_loss=0.9415, over 474747.91 frames. ], batch size: 8, lr: 4.50e-02, grad_scale: 4.0
|
| 164 |
+
2026-01-13 17:11:14,445 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=192, metric=15.28 vs. limit=2.0
|
| 165 |
+
2026-01-13 17:11:15,775 INFO [zipformer.py:1188] (1/2) warmup_begin=1333.3, warmup_end=2000.0, batch_count=439.0, num_to_drop=2, layers_to_drop={0, 2}
|
| 166 |
+
2026-01-13 17:11:19,454 INFO [zipformer.py:1188] (1/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=448.0, num_to_drop=2, layers_to_drop={0, 1}
|
| 167 |
+
2026-01-13 17:11:20,503 INFO [train.py:895] (1/2) Epoch 1, batch 450, loss[loss=0.8531, simple_loss=0.6946, pruned_loss=0.7819, over 2672.00 frames. ], tot_loss[loss=0.9284, simple_loss=0.7761, pruned_loss=0.9079, over 491498.11 frames. ], batch size: 8, lr: 4.75e-02, grad_scale: 4.0
|
| 168 |
+
2026-01-13 17:11:23,205 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=192, metric=14.11 vs. limit=2.0
|
| 169 |
+
2026-01-13 17:11:30,442 INFO [scaling.py:681] (1/2) Whitening: num_groups=1, num_channels=384, metric=9.21 vs. limit=5.0
|
| 170 |
+
2026-01-13 17:11:40,428 INFO [optim.py:365] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.871e+01 3.474e+01 4.145e+01 5.434e+01 1.454e+02, threshold=8.291e+01, percent-clipped=4.0
|
| 171 |
+
2026-01-13 17:11:40,465 INFO [train.py:895] (1/2) Epoch 1, batch 500, loss[loss=0.8659, simple_loss=0.6927, pruned_loss=0.7988, over 2806.00 frames. ], tot_loss[loss=0.9103, simple_loss=0.7551, pruned_loss=0.8777, over 504851.47 frames. ], batch size: 10, lr: 4.99e-02, grad_scale: 4.0
|
tensorboard/events.out.tfevents.1768323136.6ec37ec2ba95.217.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da79842ce2109414f3d22d8d057a74fa6834d0564f65cefc7609348b0bbb6050
|
| 3 |
+
size 88
|
tensorboard/events.out.tfevents.1768323216.6ec37ec2ba95.324.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6eab36efde4b857cc7e5d1e931baac6568e9d7b3d2b83a2b0663cd07690cb99
|
| 3 |
+
size 135
|
tensorboard/events.out.tfevents.1768323254.6ec37ec2ba95.501.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c51558b37d7235f244e95101572a614ddd83dcc0da82cc68be39db0f2974c45
|
| 3 |
+
size 135
|
tensorboard/events.out.tfevents.1768323638.6ec37ec2ba95.678.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b9722bfba8665b11089af7438d0d28f3e412a917f90090109dd167e2c7825351
|
| 3 |
+
size 135
|
tensorboard/events.out.tfevents.1768323674.6ec37ec2ba95.851.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ccef400d06e7a278057a3931d6e1289fc8bab3f66e2544c741334ce1519f65c
|
| 3 |
+
size 135
|
tensorboard/events.out.tfevents.1768323997.6ec37ec2ba95.1021.0
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56ed9a581c980c0d759e87e48e3006fdccbf4840ae4dc9ae582adb49b0c5ace8
|
| 3 |
+
size 3584
|