projecti7 commited on
Commit
fc58ffd
·
verified ·
1 Parent(s): 3c3c0db

Upload folder using huggingface_hub

Browse files
log/log-train-2026-01-13-16-52-16-0 ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 16:52:16,624 INFO [train.py:967] (0/2) Training started
2
+ 2026-01-13 16:52:16,625 INFO [train.py:977] (0/2) Device: cuda:0
3
+ 2026-01-13 16:52:16,632 INFO [train.py:986] (0/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 16:52:16,633 INFO [train.py:988] (0/2) About to create model
93
+ 2026-01-13 16:52:17,282 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 16:52:17,300 INFO [train.py:992] (0/2) Number of model parameters: 71330891
95
+ 2026-01-13 16:52:18,071 INFO [train.py:1007] (0/2) Using DDP
96
+ 2026-01-13 16:52:22,679 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
log/log-train-2026-01-13-16-52-16-1 ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 16:52:16,766 INFO [train.py:967] (1/2) Training started
2
+ 2026-01-13 16:52:16,766 INFO [train.py:977] (1/2) Device: cuda:1
3
+ 2026-01-13 16:52:16,768 INFO [train.py:986] (1/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 16:52:16,769 INFO [train.py:988] (1/2) About to create model
93
+ 2026-01-13 16:52:17,360 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 16:52:17,377 INFO [train.py:992] (1/2) Number of model parameters: 71330891
95
+ 2026-01-13 16:52:17,486 INFO [train.py:1007] (1/2) Using DDP
96
+ 2026-01-13 16:52:22,679 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
log/log-train-2026-01-13-16-53-36-0 ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 16:53:36,680 INFO [train.py:967] (0/2) Training started
2
+ 2026-01-13 16:53:36,681 INFO [train.py:977] (0/2) Device: cuda:0
3
+ 2026-01-13 16:53:36,684 INFO [train.py:986] (0/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 16:53:36,684 INFO [train.py:988] (0/2) About to create model
93
+ 2026-01-13 16:53:37,260 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 16:53:37,277 INFO [train.py:992] (0/2) Number of model parameters: 71330891
95
+ 2026-01-13 16:53:38,042 INFO [train.py:1007] (0/2) Using DDP
96
+ 2026-01-13 16:53:39,351 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
97
+ 2026-01-13 16:53:39,352 INFO [asr_datamodule.py:239] (0/2) Disable MUSAN
98
+ 2026-01-13 16:53:39,352 INFO [asr_datamodule.py:257] (0/2) Enable SpecAugment
99
+ 2026-01-13 16:53:39,352 INFO [asr_datamodule.py:258] (0/2) Time warp factor: 80
100
+ 2026-01-13 16:53:39,352 INFO [asr_datamodule.py:268] (0/2) Num frame mask: 10
101
+ 2026-01-13 16:53:39,352 INFO [asr_datamodule.py:281] (0/2) About to create train dataset
102
+ 2026-01-13 16:53:39,352 INFO [asr_datamodule.py:308] (0/2) Using DynamicBucketingSampler.
103
+ 2026-01-13 16:53:39,695 INFO [asr_datamodule.py:324] (0/2) About to create train dataloader
104
+ 2026-01-13 16:53:39,696 INFO [asr_datamodule.py:460] (0/2) About to get dev-clean cuts
105
+ 2026-01-13 16:53:39,696 INFO [asr_datamodule.py:467] (0/2) About to get dev-other cuts
106
+ 2026-01-13 16:53:39,697 INFO [asr_datamodule.py:355] (0/2) About to create dev dataset
107
+ 2026-01-13 16:53:39,923 INFO [asr_datamodule.py:372] (0/2) About to create dev dataloader
log/log-train-2026-01-13-16-53-36-1 ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 16:53:36,791 INFO [train.py:967] (1/2) Training started
2
+ 2026-01-13 16:53:36,791 INFO [train.py:977] (1/2) Device: cuda:1
3
+ 2026-01-13 16:53:36,794 INFO [train.py:986] (1/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 16:53:36,794 INFO [train.py:988] (1/2) About to create model
93
+ 2026-01-13 16:53:37,379 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 16:53:37,398 INFO [train.py:992] (1/2) Number of model parameters: 71330891
95
+ 2026-01-13 16:53:37,505 INFO [train.py:1007] (1/2) Using DDP
96
+ 2026-01-13 16:53:39,347 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
97
+ 2026-01-13 16:53:39,348 INFO [asr_datamodule.py:239] (1/2) Disable MUSAN
98
+ 2026-01-13 16:53:39,348 INFO [asr_datamodule.py:257] (1/2) Enable SpecAugment
99
+ 2026-01-13 16:53:39,348 INFO [asr_datamodule.py:258] (1/2) Time warp factor: 80
100
+ 2026-01-13 16:53:39,349 INFO [asr_datamodule.py:268] (1/2) Num frame mask: 10
101
+ 2026-01-13 16:53:39,349 INFO [asr_datamodule.py:281] (1/2) About to create train dataset
102
+ 2026-01-13 16:53:39,349 INFO [asr_datamodule.py:308] (1/2) Using DynamicBucketingSampler.
103
+ 2026-01-13 16:53:39,691 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
104
+ 2026-01-13 16:53:39,692 INFO [asr_datamodule.py:460] (1/2) About to get dev-clean cuts
105
+ 2026-01-13 16:53:39,692 INFO [asr_datamodule.py:467] (1/2) About to get dev-other cuts
106
+ 2026-01-13 16:53:39,693 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
107
+ 2026-01-13 16:53:39,908 INFO [asr_datamodule.py:372] (1/2) About to create dev dataloader
log/log-train-2026-01-13-16-54-14-0 ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 16:54:14,567 INFO [train.py:967] (0/2) Training started
2
+ 2026-01-13 16:54:14,568 INFO [train.py:977] (0/2) Device: cuda:0
3
+ 2026-01-13 16:54:14,571 INFO [train.py:986] (0/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 16:54:14,572 INFO [train.py:988] (0/2) About to create model
93
+ 2026-01-13 16:54:15,171 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 16:54:15,189 INFO [train.py:992] (0/2) Number of model parameters: 71330891
95
+ 2026-01-13 16:54:15,951 INFO [train.py:1007] (0/2) Using DDP
96
+ 2026-01-13 16:54:17,256 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
97
+ 2026-01-13 16:54:17,257 INFO [asr_datamodule.py:239] (0/2) Disable MUSAN
98
+ 2026-01-13 16:54:17,257 INFO [asr_datamodule.py:257] (0/2) Enable SpecAugment
99
+ 2026-01-13 16:54:17,258 INFO [asr_datamodule.py:258] (0/2) Time warp factor: 80
100
+ 2026-01-13 16:54:17,258 INFO [asr_datamodule.py:268] (0/2) Num frame mask: 10
101
+ 2026-01-13 16:54:17,258 INFO [asr_datamodule.py:281] (0/2) About to create train dataset
102
+ 2026-01-13 16:54:17,258 INFO [asr_datamodule.py:308] (0/2) Using DynamicBucketingSampler.
103
+ 2026-01-13 16:54:17,617 INFO [asr_datamodule.py:324] (0/2) About to create train dataloader
104
+ 2026-01-13 16:54:17,618 INFO [asr_datamodule.py:460] (0/2) About to get dev-clean cuts
105
+ 2026-01-13 16:54:17,618 INFO [asr_datamodule.py:467] (0/2) About to get dev-other cuts
106
+ 2026-01-13 16:54:17,619 INFO [asr_datamodule.py:355] (0/2) About to create dev dataset
107
+ 2026-01-13 16:54:17,834 INFO [asr_datamodule.py:372] (0/2) About to create dev dataloader
log/log-train-2026-01-13-16-54-14-1 ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 16:54:14,677 INFO [train.py:967] (1/2) Training started
2
+ 2026-01-13 16:54:14,677 INFO [train.py:977] (1/2) Device: cuda:1
3
+ 2026-01-13 16:54:14,680 INFO [train.py:986] (1/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 16:54:14,680 INFO [train.py:988] (1/2) About to create model
93
+ 2026-01-13 16:54:15,275 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 16:54:15,293 INFO [train.py:992] (1/2) Number of model parameters: 71330891
95
+ 2026-01-13 16:54:15,400 INFO [train.py:1007] (1/2) Using DDP
96
+ 2026-01-13 16:54:17,262 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
97
+ 2026-01-13 16:54:17,263 INFO [asr_datamodule.py:239] (1/2) Disable MUSAN
98
+ 2026-01-13 16:54:17,264 INFO [asr_datamodule.py:257] (1/2) Enable SpecAugment
99
+ 2026-01-13 16:54:17,264 INFO [asr_datamodule.py:258] (1/2) Time warp factor: 80
100
+ 2026-01-13 16:54:17,264 INFO [asr_datamodule.py:268] (1/2) Num frame mask: 10
101
+ 2026-01-13 16:54:17,264 INFO [asr_datamodule.py:281] (1/2) About to create train dataset
102
+ 2026-01-13 16:54:17,264 INFO [asr_datamodule.py:308] (1/2) Using DynamicBucketingSampler.
103
+ 2026-01-13 16:54:17,631 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
104
+ 2026-01-13 16:54:17,632 INFO [asr_datamodule.py:460] (1/2) About to get dev-clean cuts
105
+ 2026-01-13 16:54:17,632 INFO [asr_datamodule.py:467] (1/2) About to get dev-other cuts
106
+ 2026-01-13 16:54:17,633 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
107
+ 2026-01-13 16:54:17,869 INFO [asr_datamodule.py:372] (1/2) About to create dev dataloader
log/log-train-2026-01-13-17-00-38-0 ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 17:00:38,644 INFO [train.py:967] (0/2) Training started
2
+ 2026-01-13 17:00:38,646 INFO [train.py:977] (0/2) Device: cuda:0
3
+ 2026-01-13 17:00:38,650 INFO [train.py:986] (0/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 17:00:38,651 INFO [train.py:988] (0/2) About to create model
93
+ 2026-01-13 17:00:39,258 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 17:00:39,275 INFO [train.py:992] (0/2) Number of model parameters: 71330891
95
+ 2026-01-13 17:00:40,039 INFO [train.py:1007] (0/2) Using DDP
96
+ 2026-01-13 17:00:41,419 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
97
+ 2026-01-13 17:00:41,420 INFO [asr_datamodule.py:239] (0/2) Disable MUSAN
98
+ 2026-01-13 17:00:41,420 INFO [asr_datamodule.py:257] (0/2) Enable SpecAugment
99
+ 2026-01-13 17:00:41,420 INFO [asr_datamodule.py:258] (0/2) Time warp factor: 80
100
+ 2026-01-13 17:00:41,420 INFO [asr_datamodule.py:268] (0/2) Num frame mask: 10
101
+ 2026-01-13 17:00:41,420 INFO [asr_datamodule.py:281] (0/2) About to create train dataset
102
+ 2026-01-13 17:00:41,420 INFO [asr_datamodule.py:308] (0/2) Using DynamicBucketingSampler.
103
+ 2026-01-13 17:00:41,760 INFO [asr_datamodule.py:324] (0/2) About to create train dataloader
104
+ 2026-01-13 17:00:41,761 INFO [asr_datamodule.py:460] (0/2) About to get dev-clean cuts
105
+ 2026-01-13 17:00:41,761 INFO [asr_datamodule.py:467] (0/2) About to get dev-other cuts
106
+ 2026-01-13 17:00:41,762 INFO [asr_datamodule.py:355] (0/2) About to create dev dataset
107
+ 2026-01-13 17:00:41,982 INFO [asr_datamodule.py:372] (0/2) About to create dev dataloader
log/log-train-2026-01-13-17-00-38-1 ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 17:00:38,739 INFO [train.py:967] (1/2) Training started
2
+ 2026-01-13 17:00:38,739 INFO [train.py:977] (1/2) Device: cuda:1
3
+ 2026-01-13 17:00:38,742 INFO [train.py:986] (1/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 17:00:38,742 INFO [train.py:988] (1/2) About to create model
93
+ 2026-01-13 17:00:39,358 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 17:00:39,375 INFO [train.py:992] (1/2) Number of model parameters: 71330891
95
+ 2026-01-13 17:00:39,482 INFO [train.py:1007] (1/2) Using DDP
96
+ 2026-01-13 17:00:41,419 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
97
+ 2026-01-13 17:00:41,421 INFO [asr_datamodule.py:239] (1/2) Disable MUSAN
98
+ 2026-01-13 17:00:41,421 INFO [asr_datamodule.py:257] (1/2) Enable SpecAugment
99
+ 2026-01-13 17:00:41,421 INFO [asr_datamodule.py:258] (1/2) Time warp factor: 80
100
+ 2026-01-13 17:00:41,421 INFO [asr_datamodule.py:268] (1/2) Num frame mask: 10
101
+ 2026-01-13 17:00:41,421 INFO [asr_datamodule.py:281] (1/2) About to create train dataset
102
+ 2026-01-13 17:00:41,421 INFO [asr_datamodule.py:308] (1/2) Using DynamicBucketingSampler.
103
+ 2026-01-13 17:00:41,757 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
104
+ 2026-01-13 17:00:41,757 INFO [asr_datamodule.py:460] (1/2) About to get dev-clean cuts
105
+ 2026-01-13 17:00:41,758 INFO [asr_datamodule.py:467] (1/2) About to get dev-other cuts
106
+ 2026-01-13 17:00:41,759 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
107
+ 2026-01-13 17:00:41,980 INFO [asr_datamodule.py:372] (1/2) About to create dev dataloader
log/log-train-2026-01-13-17-01-14-0 ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 17:01:14,363 INFO [train.py:967] (0/2) Training started
2
+ 2026-01-13 17:01:14,364 INFO [train.py:977] (0/2) Device: cuda:0
3
+ 2026-01-13 17:01:14,367 INFO [train.py:986] (0/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 17:01:14,367 INFO [train.py:988] (0/2) About to create model
93
+ 2026-01-13 17:01:14,952 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 17:01:14,971 INFO [train.py:992] (0/2) Number of model parameters: 71330891
95
+ 2026-01-13 17:01:15,734 INFO [train.py:1007] (0/2) Using DDP
96
+ 2026-01-13 17:01:17,024 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
97
+ 2026-01-13 17:01:17,025 INFO [asr_datamodule.py:239] (0/2) Disable MUSAN
98
+ 2026-01-13 17:01:17,025 INFO [asr_datamodule.py:257] (0/2) Enable SpecAugment
99
+ 2026-01-13 17:01:17,025 INFO [asr_datamodule.py:258] (0/2) Time warp factor: 80
100
+ 2026-01-13 17:01:17,025 INFO [asr_datamodule.py:268] (0/2) Num frame mask: 10
101
+ 2026-01-13 17:01:17,025 INFO [asr_datamodule.py:281] (0/2) About to create train dataset
102
+ 2026-01-13 17:01:17,026 INFO [asr_datamodule.py:308] (0/2) Using DynamicBucketingSampler.
103
+ 2026-01-13 17:01:17,366 INFO [asr_datamodule.py:324] (0/2) About to create train dataloader
104
+ 2026-01-13 17:01:17,366 INFO [asr_datamodule.py:460] (0/2) About to get dev-clean cuts
105
+ 2026-01-13 17:01:17,367 INFO [asr_datamodule.py:467] (0/2) About to get dev-other cuts
106
+ 2026-01-13 17:01:17,367 INFO [asr_datamodule.py:355] (0/2) About to create dev dataset
107
+ 2026-01-13 17:01:17,579 INFO [asr_datamodule.py:372] (0/2) About to create dev dataloader
log/log-train-2026-01-13-17-01-14-1 ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 17:01:14,470 INFO [train.py:967] (1/2) Training started
2
+ 2026-01-13 17:01:14,471 INFO [train.py:977] (1/2) Device: cuda:1
3
+ 2026-01-13 17:01:14,473 INFO [train.py:986] (1/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 17:01:14,473 INFO [train.py:988] (1/2) About to create model
93
+ 2026-01-13 17:01:15,050 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 17:01:15,068 INFO [train.py:992] (1/2) Number of model parameters: 71330891
95
+ 2026-01-13 17:01:15,174 INFO [train.py:1007] (1/2) Using DDP
96
+ 2026-01-13 17:01:17,037 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
97
+ 2026-01-13 17:01:17,038 INFO [asr_datamodule.py:239] (1/2) Disable MUSAN
98
+ 2026-01-13 17:01:17,038 INFO [asr_datamodule.py:257] (1/2) Enable SpecAugment
99
+ 2026-01-13 17:01:17,038 INFO [asr_datamodule.py:258] (1/2) Time warp factor: 80
100
+ 2026-01-13 17:01:17,038 INFO [asr_datamodule.py:268] (1/2) Num frame mask: 10
101
+ 2026-01-13 17:01:17,038 INFO [asr_datamodule.py:281] (1/2) About to create train dataset
102
+ 2026-01-13 17:01:17,038 INFO [asr_datamodule.py:308] (1/2) Using DynamicBucketingSampler.
103
+ 2026-01-13 17:01:17,377 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
104
+ 2026-01-13 17:01:17,378 INFO [asr_datamodule.py:460] (1/2) About to get dev-clean cuts
105
+ 2026-01-13 17:01:17,378 INFO [asr_datamodule.py:467] (1/2) About to get dev-other cuts
106
+ 2026-01-13 17:01:17,379 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
107
+ 2026-01-13 17:01:17,591 INFO [asr_datamodule.py:372] (1/2) About to create dev dataloader
log/log-train-2026-01-13-17-06-37-0 ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 17:06:37,678 INFO [train.py:967] (0/2) Training started
2
+ 2026-01-13 17:06:37,679 INFO [train.py:977] (0/2) Device: cuda:0
3
+ 2026-01-13 17:06:37,681 INFO [train.py:986] (0/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 17:06:37,682 INFO [train.py:988] (0/2) About to create model
93
+ 2026-01-13 17:06:38,266 INFO [zipformer.py:405] (0/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 17:06:38,285 INFO [train.py:992] (0/2) Number of model parameters: 71330891
95
+ 2026-01-13 17:06:39,101 INFO [train.py:1007] (0/2) Using DDP
96
+ 2026-01-13 17:06:40,454 INFO [asr_datamodule.py:422] (0/2) About to get train-clean-100 cuts
97
+ 2026-01-13 17:06:40,456 INFO [asr_datamodule.py:239] (0/2) Disable MUSAN
98
+ 2026-01-13 17:06:40,456 INFO [asr_datamodule.py:257] (0/2) Enable SpecAugment
99
+ 2026-01-13 17:06:40,456 INFO [asr_datamodule.py:258] (0/2) Time warp factor: 80
100
+ 2026-01-13 17:06:40,456 INFO [asr_datamodule.py:268] (0/2) Num frame mask: 10
101
+ 2026-01-13 17:06:40,456 INFO [asr_datamodule.py:281] (0/2) About to create train dataset
102
+ 2026-01-13 17:06:40,456 INFO [asr_datamodule.py:308] (0/2) Using DynamicBucketingSampler.
103
+ 2026-01-13 17:06:40,855 INFO [asr_datamodule.py:324] (0/2) About to create train dataloader
104
+ 2026-01-13 17:06:40,855 INFO [asr_datamodule.py:460] (0/2) About to get dev-clean cuts
105
+ 2026-01-13 17:06:40,856 INFO [asr_datamodule.py:467] (0/2) About to get dev-other cuts
106
+ 2026-01-13 17:06:40,856 INFO [asr_datamodule.py:355] (0/2) About to create dev dataset
107
+ 2026-01-13 17:06:41,074 INFO [asr_datamodule.py:372] (0/2) About to create dev dataloader
108
+ 2026-01-13 17:06:56,066 INFO [train.py:895] (0/2) Epoch 1, batch 0, loss[loss=8.165, simple_loss=7.427, pruned_loss=7.363, over 2638.00 frames. ], tot_loss[loss=8.165, simple_loss=7.427, pruned_loss=7.363, over 2638.00 frames. ], batch size: 7, lr: 2.50e-02, grad_scale: 2.0
109
+ 2026-01-13 17:06:56,066 INFO [train.py:920] (0/2) Computing validation loss
110
+ 2026-01-13 17:08:00,243 INFO [zipformer.py:2441] (0/2) attn_weights_entropy = tensor([2.9147, 2.9149, 2.9154, 2.9121, 2.9146, 2.9150, 2.9150, 2.9151],
111
+ device='cuda:0'), covar=tensor([0.0029, 0.0041, 0.0048, 0.0025, 0.0032, 0.0035, 0.0052, 0.0034],
112
+ device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
113
+ device='cuda:0'), out_proj_covar=tensor([8.5573e-06, 8.6460e-06, 8.6547e-06, 8.5689e-06, 8.8456e-06, 8.6908e-06,
114
+ 8.7531e-06, 8.7239e-06], device='cuda:0')
115
+ 2026-01-13 17:08:21,491 INFO [train.py:929] (0/2) Epoch 1, validation: loss=8.291, simple_loss=7.534, pruned_loss=7.553, over 1639044.00 frames.
116
+ 2026-01-13 17:08:21,492 INFO [train.py:930] (0/2) Maximum memory allocated so far is 2796MB
117
+ 2026-01-13 17:08:23,199 INFO [zipformer.py:1188] (0/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5.0, num_to_drop=2, layers_to_drop={1, 3}
118
+ 2026-01-13 17:08:29,961 INFO [zipformer.py:1188] (0/2) warmup_begin=666.7, warmup_end=1333.3, batch_count=23.0, num_to_drop=1, layers_to_drop={1}
119
+ 2026-01-13 17:08:32,511 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=6.07 vs. limit=2.0
120
+ 2026-01-13 17:08:33,581 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=5.73 vs. limit=2.0
121
+ 2026-01-13 17:08:37,070 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=14.43 vs. limit=2.0
122
+ 2026-01-13 17:08:40,738 INFO [train.py:895] (0/2) Epoch 1, batch 50, loss[loss=1.1, simple_loss=0.9771, pruned_loss=1.103, over 2768.00 frames. ], tot_loss[loss=2.142, simple_loss=1.948, pruned_loss=1.876, over 122563.73 frames. ], batch size: 7, lr: 2.75e-02, grad_scale: 2.0
123
+ 2026-01-13 17:08:52,490 INFO [zipformer.py:2441] (0/2) attn_weights_entropy = tensor([5.1111, 5.1111, 5.1051, 5.1033, 5.1106, 5.1110, 5.1105, 5.1110],
124
+ device='cuda:0'), covar=tensor([0.0017, 0.0046, 0.0033, 0.0029, 0.0021, 0.0033, 0.0023, 0.0020],
125
+ device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
126
+ device='cuda:0'), out_proj_covar=tensor([8.7029e-06, 8.8104e-06, 8.8122e-06, 8.6125e-06, 9.0121e-06, 8.7908e-06,
127
+ 8.8471e-06, 8.8093e-06], device='cuda:0')
128
+ 2026-01-13 17:08:53,265 INFO [zipformer.py:1188] (0/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83.0, num_to_drop=1, layers_to_drop={1}
129
+ 2026-01-13 17:08:57,962 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=3.07 vs. limit=2.0
130
+ 2026-01-13 17:09:00,026 INFO [optim.py:365] (0/2) Clipping_scale=2.0, grad-norm quartiles 1.726e+01 2.698e+01 5.079e+01 1.890e+02 2.214e+03, threshold=1.016e+02, percent-clipped=0.0
131
+ 2026-01-13 17:09:00,065 INFO [train.py:895] (0/2) Epoch 1, batch 100, loss[loss=0.9872, simple_loss=0.8589, pruned_loss=1.031, over 2891.00 frames. ], tot_loss[loss=1.553, simple_loss=1.394, pruned_loss=1.445, over 216716.41 frames. ], batch size: 8, lr: 3.00e-02, grad_scale: 2.0
132
+ 2026-01-13 17:09:13,046 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=5.61 vs. limit=2.0
133
+ 2026-01-13 17:09:17,396 INFO [zipformer.py:1188] (0/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=144.0, num_to_drop=2, layers_to_drop={1, 3}
134
+ 2026-01-13 17:09:20,086 INFO [train.py:895] (0/2) Epoch 1, batch 150, loss[loss=0.8253, simple_loss=0.7036, pruned_loss=0.8828, over 2774.00 frames. ], tot_loss[loss=1.313, simple_loss=1.163, pruned_loss=1.277, over 290051.50 frames. ], batch size: 7, lr: 3.25e-02, grad_scale: 2.0
135
+ 2026-01-13 17:09:40,653 INFO [optim.py:365] (0/2) Clipping_scale=2.0, grad-norm quartiles 1.991e+01 2.552e+01 2.903e+01 3.376e+01 6.929e+01, threshold=5.806e+01, percent-clipped=0.0
136
+ 2026-01-13 17:09:40,692 INFO [train.py:895] (0/2) Epoch 1, batch 200, loss[loss=1.141, simple_loss=0.9689, pruned_loss=1.16, over 2637.00 frames. ], tot_loss[loss=1.177, simple_loss=1.03, pruned_loss=1.172, over 347964.82 frames. ], batch size: 16, lr: 3.50e-02, grad_scale: 2.0
137
+ 2026-01-13 17:09:43,454 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=4.43 vs. limit=2.0
138
+ 2026-01-13 17:09:46,242 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=37.43 vs. limit=5.0
139
+ 2026-01-13 17:09:56,985 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=42.78 vs. limit=5.0
140
+ 2026-01-13 17:09:59,109 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=14.33 vs. limit=2.0
141
+ 2026-01-13 17:10:00,956 INFO [train.py:895] (0/2) Epoch 1, batch 250, loss[loss=0.8344, simple_loss=0.6973, pruned_loss=0.8492, over 2755.00 frames. ], tot_loss[loss=1.088, simple_loss=0.942, pruned_loss=1.093, over 392480.77 frames. ], batch size: 11, lr: 3.75e-02, grad_scale: 2.0
142
+ 2026-01-13 17:10:19,125 INFO [zipformer.py:1188] (0/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=296.0, num_to_drop=1, layers_to_drop={1}
143
+ 2026-01-13 17:10:20,733 INFO [zipformer.py:1188] (0/2) warmup_begin=1333.3, warmup_end=2000.0, batch_count=300.0, num_to_drop=2, layers_to_drop={1, 3}
144
+ 2026-01-13 17:10:21,012 INFO [optim.py:365] (0/2) Clipping_scale=2.0, grad-norm quartiles 2.628e+01 3.406e+01 3.889e+01 4.977e+01 1.495e+02, threshold=7.778e+01, percent-clipped=13.0
145
+ 2026-01-13 17:10:21,048 INFO [train.py:895] (0/2) Epoch 1, batch 300, loss[loss=0.8336, simple_loss=0.6967, pruned_loss=0.8042, over 2892.00 frames. ], tot_loss[loss=1.026, simple_loss=0.8803, pruned_loss=1.03, over 428733.71 frames. ], batch size: 10, lr: 4.00e-02, grad_scale: 2.0
146
+ 2026-01-13 17:10:26,809 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=2.54 vs. limit=2.0
147
+ 2026-01-13 17:10:39,996 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0
148
+ 2026-01-13 17:10:40,639 INFO [train.py:895] (0/2) Epoch 1, batch 350, loss[loss=0.8286, simple_loss=0.6797, pruned_loss=0.8085, over 2696.00 frames. ], tot_loss[loss=0.9781, simple_loss=0.832, pruned_loss=0.9753, over 455853.48 frames. ], batch size: 7, lr: 4.25e-02, grad_scale: 2.0
149
+ 2026-01-13 17:10:43,170 INFO [zipformer.py:1188] (0/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=357.0, num_to_drop=2, layers_to_drop={1, 3}
150
+ 2026-01-13 17:10:48,892 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=96, metric=2.92 vs. limit=2.0
151
+ 2026-01-13 17:10:53,144 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=14.35 vs. limit=5.0
152
+ 2026-01-13 17:10:54,931 INFO [zipformer.py:1188] (0/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=387.0, num_to_drop=1, layers_to_drop={1}
153
+ 2026-01-13 17:10:55,070 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=12.67 vs. limit=2.0
154
+ 2026-01-13 17:10:56,683 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=11.14 vs. limit=5.0
155
+ 2026-01-13 17:11:00,683 INFO [optim.py:365] (0/2) Clipping_scale=2.0, grad-norm quartiles 2.792e+01 3.460e+01 4.419e+01 5.164e+01 2.002e+02, threshold=8.837e+01, percent-clipped=7.0
156
+ 2026-01-13 17:11:00,720 INFO [train.py:895] (0/2) Epoch 1, batch 400, loss[loss=0.8411, simple_loss=0.6884, pruned_loss=0.7904, over 2705.00 frames. ], tot_loss[loss=0.9489, simple_loss=0.8001, pruned_loss=0.9381, over 475942.16 frames. ], batch size: 8, lr: 4.50e-02, grad_scale: 4.0
157
+ 2026-01-13 17:11:07,553 INFO [scaling.py:681] (0/2) Whitening: num_groups=8, num_channels=192, metric=10.28 vs. limit=2.0
158
+ 2026-01-13 17:11:11,067 INFO [zipformer.py:2441] (0/2) attn_weights_entropy = tensor([4.6921, 5.1287, 4.3867, 4.6206, 3.9530, 5.0709, 4.4906, 4.6190],
159
+ device='cuda:0'), covar=tensor([0.0421, 0.0065, 0.0737, 0.0372, 0.1756, 0.0083, 0.0905, 0.0393],
160
+ device='cuda:0'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0010, 0.0009, 0.0009, 0.0009],
161
+ device='cuda:0'), out_proj_covar=tensor([9.0232e-06, 8.7080e-06, 9.0746e-06, 8.3539e-06, 1.0025e-05, 8.5254e-06,
162
+ 8.9379e-06, 8.6380e-06], device='cuda:0')
163
+ 2026-01-13 17:11:15,769 INFO [zipformer.py:1188] (0/2) warmup_begin=1333.3, warmup_end=2000.0, batch_count=439.0, num_to_drop=2, layers_to_drop={1, 2}
164
+ 2026-01-13 17:11:19,201 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=10.70 vs. limit=5.0
165
+ 2026-01-13 17:11:19,449 INFO [zipformer.py:1188] (0/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=448.0, num_to_drop=2, layers_to_drop={0, 3}
166
+ 2026-01-13 17:11:20,503 INFO [train.py:895] (0/2) Epoch 1, batch 450, loss[loss=0.7667, simple_loss=0.6296, pruned_loss=0.6868, over 2660.00 frames. ], tot_loss[loss=0.9287, simple_loss=0.7768, pruned_loss=0.9064, over 492502.80 frames. ], batch size: 8, lr: 4.75e-02, grad_scale: 4.0
167
+ 2026-01-13 17:11:26,921 INFO [scaling.py:681] (0/2) Whitening: num_groups=1, num_channels=384, metric=10.20 vs. limit=5.0
168
+ 2026-01-13 17:11:40,428 INFO [optim.py:365] (0/2) Clipping_scale=2.0, grad-norm quartiles 2.871e+01 3.474e+01 4.145e+01 5.434e+01 1.454e+02, threshold=8.291e+01, percent-clipped=4.0
169
+ 2026-01-13 17:11:40,465 INFO [train.py:895] (0/2) Epoch 1, batch 500, loss[loss=0.8643, simple_loss=0.6974, pruned_loss=0.7811, over 2804.00 frames. ], tot_loss[loss=0.9117, simple_loss=0.7566, pruned_loss=0.8773, over 506680.85 frames. ], batch size: 10, lr: 4.99e-02, grad_scale: 4.0
log/log-train-2026-01-13-17-06-37-1 ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-13 17:06:37,791 INFO [train.py:967] (1/2) Training started
2
+ 2026-01-13 17:06:37,791 INFO [train.py:977] (1/2) Device: cuda:1
3
+ 2026-01-13 17:06:37,793 INFO [train.py:986] (1/2) {
4
+ "am_scale": 0.0,
5
+ "attention_dims": "192,192,192,192,192",
6
+ "average_period": 200,
7
+ "base_lr": 0.05,
8
+ "batch_idx_train": 0,
9
+ "best_train_epoch": -1,
10
+ "best_train_loss": Infinity,
11
+ "best_valid_epoch": -1,
12
+ "best_valid_loss": Infinity,
13
+ "blank_id": 0,
14
+ "bpe_model": "/kaggle/working/amharic_training/bpe/bpe.model",
15
+ "bucketing_sampler": true,
16
+ "cnn_module_kernels": "31,31,31,31,31",
17
+ "concatenate_cuts": false,
18
+ "context_size": 2,
19
+ "decode_chunk_len": 32,
20
+ "decoder_dim": 512,
21
+ "drop_last": true,
22
+ "duration_factor": 1.0,
23
+ "enable_musan": false,
24
+ "enable_spec_aug": true,
25
+ "encoder_dims": "384,384,384,384,384",
26
+ "encoder_unmasked_dims": "256,256,256,256,256",
27
+ "env_info": {
28
+ "IP address": "172.19.2.2",
29
+ "hostname": "6ec37ec2ba95",
30
+ "icefall-git-branch": "master",
31
+ "icefall-git-date": "Fri Nov 28 03:42:20 2025",
32
+ "icefall-git-sha1": "0904e490-clean",
33
+ "icefall-path": "/kaggle/working/icefall",
34
+ "k2-build-type": "Release",
35
+ "k2-git-date": "Thu Jul 25 03:34:26 2024",
36
+ "k2-git-sha1": "40e8d1676f6062e46458dc32ad21229c93cc9c50",
37
+ "k2-path": "/usr/local/lib/python3.12/dist-packages/k2/__init__.py",
38
+ "k2-version": "1.24.4",
39
+ "k2-with-cuda": true,
40
+ "lhotse-path": "/usr/local/lib/python3.12/dist-packages/lhotse/__init__.py",
41
+ "lhotse-version": "1.32.1",
42
+ "python-version": "3.12",
43
+ "torch-cuda-available": true,
44
+ "torch-cuda-version": "12.1",
45
+ "torch-version": "2.4.0+cu121"
46
+ },
47
+ "exp_dir": "/kaggle/working/amharic_training/exp_amharic_streaming",
48
+ "feature_dim": 80,
49
+ "feedforward_dims": "1024,1024,2048,2048,1024",
50
+ "full_libri": false,
51
+ "gap": 1.0,
52
+ "inf_check": false,
53
+ "input_strategy": "PrecomputedFeatures",
54
+ "joiner_dim": 512,
55
+ "keep_last_k": 5,
56
+ "lm_scale": 0.25,
57
+ "log_interval": 50,
58
+ "lr_batches": 5000,
59
+ "lr_epochs": 3.5,
60
+ "manifest_dir": "/kaggle/working/amharic_training/manifests",
61
+ "master_port": 12354,
62
+ "max_duration": 120,
63
+ "mini_libri": false,
64
+ "nhead": "8,8,8,8,8",
65
+ "num_buckets": 30,
66
+ "num_encoder_layers": "2,4,3,2,4",
67
+ "num_epochs": 50,
68
+ "num_left_chunks": 4,
69
+ "num_workers": 2,
70
+ "on_the_fly_feats": false,
71
+ "print_diagnostics": false,
72
+ "prune_range": 5,
73
+ "reset_interval": 200,
74
+ "return_cuts": true,
75
+ "save_every_n": 1000,
76
+ "seed": 42,
77
+ "short_chunk_size": 50,
78
+ "shuffle": true,
79
+ "simple_loss_scale": 0.5,
80
+ "spec_aug_time_warp_factor": 80,
81
+ "start_batch": 0,
82
+ "start_epoch": 1,
83
+ "subsampling_factor": 4,
84
+ "tensorboard": true,
85
+ "use_fp16": true,
86
+ "valid_interval": 1600,
87
+ "vocab_size": 1000,
88
+ "warm_step": 2000,
89
+ "world_size": 2,
90
+ "zipformer_downsampling_factors": "1,2,4,8,2"
91
+ }
92
+ 2026-01-13 17:06:37,794 INFO [train.py:988] (1/2) About to create model
93
+ 2026-01-13 17:06:38,385 INFO [zipformer.py:405] (1/2) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8.
94
+ 2026-01-13 17:06:38,403 INFO [train.py:992] (1/2) Number of model parameters: 71330891
95
+ 2026-01-13 17:06:38,512 INFO [train.py:1007] (1/2) Using DDP
96
+ 2026-01-13 17:06:40,469 INFO [asr_datamodule.py:422] (1/2) About to get train-clean-100 cuts
97
+ 2026-01-13 17:06:40,470 INFO [asr_datamodule.py:239] (1/2) Disable MUSAN
98
+ 2026-01-13 17:06:40,470 INFO [asr_datamodule.py:257] (1/2) Enable SpecAugment
99
+ 2026-01-13 17:06:40,471 INFO [asr_datamodule.py:258] (1/2) Time warp factor: 80
100
+ 2026-01-13 17:06:40,471 INFO [asr_datamodule.py:268] (1/2) Num frame mask: 10
101
+ 2026-01-13 17:06:40,471 INFO [asr_datamodule.py:281] (1/2) About to create train dataset
102
+ 2026-01-13 17:06:40,471 INFO [asr_datamodule.py:308] (1/2) Using DynamicBucketingSampler.
103
+ 2026-01-13 17:06:40,872 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
104
+ 2026-01-13 17:06:40,872 INFO [asr_datamodule.py:460] (1/2) About to get dev-clean cuts
105
+ 2026-01-13 17:06:40,873 INFO [asr_datamodule.py:467] (1/2) About to get dev-other cuts
106
+ 2026-01-13 17:06:40,873 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
107
+ 2026-01-13 17:06:41,102 INFO [asr_datamodule.py:372] (1/2) About to create dev dataloader
108
+ 2026-01-13 17:06:56,061 INFO [train.py:895] (1/2) Epoch 1, batch 0, loss[loss=8.191, simple_loss=7.455, pruned_loss=7.342, over 2645.00 frames. ], tot_loss[loss=8.191, simple_loss=7.455, pruned_loss=7.342, over 2645.00 frames. ], batch size: 7, lr: 2.50e-02, grad_scale: 2.0
109
+ 2026-01-13 17:06:56,062 INFO [train.py:920] (1/2) Computing validation loss
110
+ 2026-01-13 17:08:00,436 INFO [zipformer.py:2441] (1/2) attn_weights_entropy = tensor([2.9155, 2.9157, 2.9161, 2.9129, 2.9154, 2.9159, 2.9159, 2.9159],
111
+ device='cuda:1'), covar=tensor([0.0037, 0.0062, 0.0063, 0.0034, 0.0040, 0.0046, 0.0067, 0.0041],
112
+ device='cuda:1'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
113
+ device='cuda:1'), out_proj_covar=tensor([8.5573e-06, 8.6460e-06, 8.6547e-06, 8.5689e-06, 8.8456e-06, 8.6908e-06,
114
+ 8.7531e-06, 8.7239e-06], device='cuda:1')
115
+ 2026-01-13 17:08:21,491 INFO [train.py:929] (1/2) Epoch 1, validation: loss=8.291, simple_loss=7.534, pruned_loss=7.553, over 1639044.00 frames.
116
+ 2026-01-13 17:08:21,492 INFO [train.py:930] (1/2) Maximum memory allocated so far is 2801MB
117
+ 2026-01-13 17:08:23,182 INFO [zipformer.py:1188] (1/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5.0, num_to_drop=2, layers_to_drop={0, 1}
118
+ 2026-01-13 17:08:29,961 INFO [zipformer.py:1188] (1/2) warmup_begin=666.7, warmup_end=1333.3, batch_count=23.0, num_to_drop=1, layers_to_drop={1}
119
+ 2026-01-13 17:08:32,502 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=4.95 vs. limit=2.0
120
+ 2026-01-13 17:08:33,581 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=5.26 vs. limit=2.0
121
+ 2026-01-13 17:08:37,075 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=192, metric=13.56 vs. limit=2.0
122
+ 2026-01-13 17:08:40,737 INFO [train.py:895] (1/2) Epoch 1, batch 50, loss[loss=1.051, simple_loss=0.9344, pruned_loss=1.049, over 2766.00 frames. ], tot_loss[loss=2.147, simple_loss=1.951, pruned_loss=1.882, over 122589.82 frames. ], batch size: 7, lr: 2.75e-02, grad_scale: 2.0
123
+ 2026-01-13 17:08:52,476 INFO [zipformer.py:2441] (1/2) attn_weights_entropy = tensor([5.1140, 5.1141, 5.1095, 5.1045, 5.1129, 5.1137, 5.1126, 5.1135],
124
+ device='cuda:1'), covar=tensor([0.0009, 0.0023, 0.0019, 0.0016, 0.0006, 0.0018, 0.0013, 0.0012],
125
+ device='cuda:1'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
126
+ device='cuda:1'), out_proj_covar=tensor([8.5574e-06, 8.6733e-06, 8.6879e-06, 8.5465e-06, 8.8501e-06, 8.6969e-06,
127
+ 8.7308e-06, 8.6867e-06], device='cuda:1')
128
+ 2026-01-13 17:08:53,257 INFO [zipformer.py:1188] (1/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83.0, num_to_drop=1, layers_to_drop={1}
129
+ 2026-01-13 17:08:57,912 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=3.48 vs. limit=2.0
130
+ 2026-01-13 17:09:00,027 INFO [optim.py:365] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.726e+01 2.698e+01 5.079e+01 1.890e+02 2.214e+03, threshold=1.016e+02, percent-clipped=0.0
131
+ 2026-01-13 17:09:00,066 INFO [train.py:895] (1/2) Epoch 1, batch 100, loss[loss=0.9553, simple_loss=0.8285, pruned_loss=1.015, over 2893.00 frames. ], tot_loss[loss=1.55, simple_loss=1.391, pruned_loss=1.443, over 216705.88 frames. ], batch size: 8, lr: 3.00e-02, grad_scale: 2.0
132
+ 2026-01-13 17:09:17,393 INFO [zipformer.py:1188] (1/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=144.0, num_to_drop=2, layers_to_drop={1, 2}
133
+ 2026-01-13 17:09:20,085 INFO [train.py:895] (1/2) Epoch 1, batch 150, loss[loss=0.8435, simple_loss=0.7125, pruned_loss=0.9396, over 2786.00 frames. ], tot_loss[loss=1.311, simple_loss=1.161, pruned_loss=1.277, over 290274.69 frames. ], batch size: 7, lr: 3.25e-02, grad_scale: 2.0
134
+ 2026-01-13 17:09:39,620 INFO [zipformer.py:2441] (1/2) attn_weights_entropy = tensor([4.2447, 4.2507, 4.2467, 4.2488, 4.2510, 4.2508, 4.2486, 4.2512],
135
+ device='cuda:1'), covar=tensor([0.0020, 0.0018, 0.0026, 0.0025, 0.0023, 0.0022, 0.0015, 0.0020],
136
+ device='cuda:1'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
137
+ device='cuda:1'), out_proj_covar=tensor([8.8484e-06, 8.8488e-06, 8.6189e-06, 8.8502e-06, 8.6395e-06, 8.7008e-06,
138
+ 8.7248e-06, 8.7738e-06], device='cuda:1')
139
+ 2026-01-13 17:09:40,650 INFO [optim.py:365] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.991e+01 2.552e+01 2.903e+01 3.376e+01 6.929e+01, threshold=5.806e+01, percent-clipped=0.0
140
+ 2026-01-13 17:09:40,689 INFO [train.py:895] (1/2) Epoch 1, batch 200, loss[loss=1.119, simple_loss=0.947, pruned_loss=1.155, over 2665.00 frames. ], tot_loss[loss=1.179, simple_loss=1.032, pruned_loss=1.176, over 347390.37 frames. ], batch size: 16, lr: 3.50e-02, grad_scale: 2.0
141
+ 2026-01-13 17:09:53,935 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=5.48 vs. limit=2.0
142
+ 2026-01-13 17:10:00,956 INFO [train.py:895] (1/2) Epoch 1, batch 250, loss[loss=1.031, simple_loss=0.8593, pruned_loss=1.061, over 2763.00 frames. ], tot_loss[loss=1.091, simple_loss=0.9446, pruned_loss=1.099, over 391985.25 frames. ], batch size: 11, lr: 3.75e-02, grad_scale: 2.0
143
+ 2026-01-13 17:10:01,151 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=3.75 vs. limit=2.0
144
+ 2026-01-13 17:10:01,977 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=2.53 vs. limit=2.0
145
+ 2026-01-13 17:10:12,105 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0
146
+ 2026-01-13 17:10:19,140 INFO [zipformer.py:1188] (1/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=296.0, num_to_drop=1, layers_to_drop={1}
147
+ 2026-01-13 17:10:20,730 INFO [zipformer.py:1188] (1/2) warmup_begin=1333.3, warmup_end=2000.0, batch_count=300.0, num_to_drop=2, layers_to_drop={0, 1}
148
+ 2026-01-13 17:10:21,011 INFO [optim.py:365] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.628e+01 3.406e+01 3.889e+01 4.977e+01 1.495e+02, threshold=7.778e+01, percent-clipped=13.0
149
+ 2026-01-13 17:10:21,048 INFO [train.py:895] (1/2) Epoch 1, batch 300, loss[loss=0.8037, simple_loss=0.6637, pruned_loss=0.807, over 2885.00 frames. ], tot_loss[loss=1.029, simple_loss=0.8823, pruned_loss=1.034, over 428005.97 frames. ], batch size: 10, lr: 4.00e-02, grad_scale: 2.0
150
+ 2026-01-13 17:10:26,867 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0
151
+ 2026-01-13 17:10:31,380 INFO [zipformer.py:2441] (1/2) attn_weights_entropy = tensor([3.6023, 3.6043, 3.6010, 3.6077, 3.6021, 3.6072, 3.6062, 3.6057],
152
+ device='cuda:1'), covar=tensor([0.0033, 0.0036, 0.0040, 0.0028, 0.0029, 0.0039, 0.0052, 0.0038],
153
+ device='cuda:1'), in_proj_covar=tensor([0.0009, 0.0008, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009],
154
+ device='cuda:1'), out_proj_covar=tensor([9.0297e-06, 8.5946e-06, 8.6648e-06, 8.7116e-06, 8.8798e-06, 8.5990e-06,
155
+ 8.8286e-06, 8.7837e-06], device='cuda:1')
156
+ 2026-01-13 17:10:33,433 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=192, metric=19.84 vs. limit=2.0
157
+ 2026-01-13 17:10:33,517 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=96, metric=2.32 vs. limit=2.0
158
+ 2026-01-13 17:10:40,638 INFO [train.py:895] (1/2) Epoch 1, batch 350, loss[loss=0.9178, simple_loss=0.7621, pruned_loss=0.863, over 2690.00 frames. ], tot_loss[loss=0.9843, simple_loss=0.8369, pruned_loss=0.9828, over 455041.83 frames. ], batch size: 7, lr: 4.25e-02, grad_scale: 2.0
159
+ 2026-01-13 17:10:43,192 INFO [zipformer.py:1188] (1/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=357.0, num_to_drop=2, layers_to_drop={2, 3}
160
+ 2026-01-13 17:10:51,167 INFO [scaling.py:681] (1/2) Whitening: num_groups=1, num_channels=384, metric=15.06 vs. limit=5.0
161
+ 2026-01-13 17:10:54,931 INFO [zipformer.py:1188] (1/2) warmup_begin=2666.7, warmup_end=3333.3, batch_count=387.0, num_to_drop=1, layers_to_drop={0}
162
+ 2026-01-13 17:11:00,683 INFO [optim.py:365] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.792e+01 3.460e+01 4.419e+01 5.164e+01 2.002e+02, threshold=8.837e+01, percent-clipped=7.0
163
+ 2026-01-13 17:11:00,720 INFO [train.py:895] (1/2) Epoch 1, batch 400, loss[loss=0.7745, simple_loss=0.6329, pruned_loss=0.7313, over 2716.00 frames. ], tot_loss[loss=0.9508, simple_loss=0.8014, pruned_loss=0.9415, over 474747.91 frames. ], batch size: 8, lr: 4.50e-02, grad_scale: 4.0
164
+ 2026-01-13 17:11:14,445 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=192, metric=15.28 vs. limit=2.0
165
+ 2026-01-13 17:11:15,775 INFO [zipformer.py:1188] (1/2) warmup_begin=1333.3, warmup_end=2000.0, batch_count=439.0, num_to_drop=2, layers_to_drop={0, 2}
166
+ 2026-01-13 17:11:19,454 INFO [zipformer.py:1188] (1/2) warmup_begin=3333.3, warmup_end=4000.0, batch_count=448.0, num_to_drop=2, layers_to_drop={0, 1}
167
+ 2026-01-13 17:11:20,503 INFO [train.py:895] (1/2) Epoch 1, batch 450, loss[loss=0.8531, simple_loss=0.6946, pruned_loss=0.7819, over 2672.00 frames. ], tot_loss[loss=0.9284, simple_loss=0.7761, pruned_loss=0.9079, over 491498.11 frames. ], batch size: 8, lr: 4.75e-02, grad_scale: 4.0
168
+ 2026-01-13 17:11:23,205 INFO [scaling.py:681] (1/2) Whitening: num_groups=8, num_channels=192, metric=14.11 vs. limit=2.0
169
+ 2026-01-13 17:11:30,442 INFO [scaling.py:681] (1/2) Whitening: num_groups=1, num_channels=384, metric=9.21 vs. limit=5.0
170
+ 2026-01-13 17:11:40,428 INFO [optim.py:365] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.871e+01 3.474e+01 4.145e+01 5.434e+01 1.454e+02, threshold=8.291e+01, percent-clipped=4.0
171
+ 2026-01-13 17:11:40,465 INFO [train.py:895] (1/2) Epoch 1, batch 500, loss[loss=0.8659, simple_loss=0.6927, pruned_loss=0.7988, over 2806.00 frames. ], tot_loss[loss=0.9103, simple_loss=0.7551, pruned_loss=0.8777, over 504851.47 frames. ], batch size: 10, lr: 4.99e-02, grad_scale: 4.0
tensorboard/events.out.tfevents.1768323136.6ec37ec2ba95.217.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da79842ce2109414f3d22d8d057a74fa6834d0564f65cefc7609348b0bbb6050
3
+ size 88
tensorboard/events.out.tfevents.1768323216.6ec37ec2ba95.324.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6eab36efde4b857cc7e5d1e931baac6568e9d7b3d2b83a2b0663cd07690cb99
3
+ size 135
tensorboard/events.out.tfevents.1768323254.6ec37ec2ba95.501.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c51558b37d7235f244e95101572a614ddd83dcc0da82cc68be39db0f2974c45
3
+ size 135
tensorboard/events.out.tfevents.1768323638.6ec37ec2ba95.678.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9722bfba8665b11089af7438d0d28f3e412a917f90090109dd167e2c7825351
3
+ size 135
tensorboard/events.out.tfevents.1768323674.6ec37ec2ba95.851.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ccef400d06e7a278057a3931d6e1289fc8bab3f66e2544c741334ce1519f65c
3
+ size 135
tensorboard/events.out.tfevents.1768323997.6ec37ec2ba95.1021.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56ed9a581c980c0d759e87e48e3006fdccbf4840ae4dc9ae582adb49b0c5ace8
3
+ size 3584