odg123 commited on
Commit
e8237cb
·
verified ·
1 Parent(s): d8d48c5

Upload 30 files

Browse files
Files changed (31) hide show
  1. .gitattributes +1 -0
  2. egs/ami/ASR/xlsr_transducer/inference_results/.ipynb_checkpoints/log-inference-ihm-2026-01-26-04-45-26-checkpoint +264 -0
  3. egs/ami/ASR/xlsr_transducer/inference_results/.ipynb_checkpoints/log-inference-ihm-2026-01-26-05-05-16-checkpoint +111 -0
  4. egs/ami/ASR/xlsr_transducer/inference_results/.ipynb_checkpoints/log-inference-ihm-2026-01-26-05-20-04-checkpoint +22 -0
  5. egs/ami/ASR/xlsr_transducer/inference_results/.ipynb_checkpoints/log-inference-ihm-2026-01-26-05-29-29-checkpoint +22 -0
  6. egs/ami/ASR/xlsr_transducer/inference_results/hyp-ihm.txt +0 -0
  7. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-25-15-47-40 +32 -0
  8. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-43-42 +45 -0
  9. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-44-36 +45 -0
  10. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-45-26 +0 -0
  11. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-57-24 +57 -0
  12. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-58-20 +32 -0
  13. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-59-21 +111 -0
  14. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-02-37 +32 -0
  15. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-03-42 +32 -0
  16. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-05-16 +529 -0
  17. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-13-05 +22 -0
  18. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-14-59 +22 -0
  19. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-17-40 +22 -0
  20. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-20-04 +22 -0
  21. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-29-29 +22 -0
  22. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-48-19 +22 -0
  23. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-50-10 +22 -0
  24. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-54-32 +28 -0
  25. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-06-02-34 +77 -0
  26. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-06-04-30 +72 -0
  27. egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-06-07-36 +0 -0
  28. egs/ami/ASR/xlsr_transducer/inference_results/metrics-ihm.txt +16 -0
  29. egs/ami/ASR/xlsr_transducer/inference_results/ref-ihm.txt +0 -0
  30. egs/ami/ASR/xlsr_transducer/log/log-train-2026-01-25-02-57-28 +3 -0
  31. egs/ami/ASR/xlsr_transducer/tensorboard/events.out.tfevents.1769309848.3edaabdb707c.1028020.0 +3 -0
.gitattributes CHANGED
@@ -55,3 +55,4 @@ egs/libricss/SURT/heat.png filter=lfs diff=lfs merge=lfs -text
55
  egs/libricss/SURT/surt.png filter=lfs diff=lfs merge=lfs -text
56
  egs/librispeech/WSASR/figures/otc_training_graph.drawio.png filter=lfs diff=lfs merge=lfs -text
57
  egs/speech_llm/ASR_LLM/assets/framework.png filter=lfs diff=lfs merge=lfs -text
 
 
55
  egs/libricss/SURT/surt.png filter=lfs diff=lfs merge=lfs -text
56
  egs/librispeech/WSASR/figures/otc_training_graph.drawio.png filter=lfs diff=lfs merge=lfs -text
57
  egs/speech_llm/ASR_LLM/assets/framework.png filter=lfs diff=lfs merge=lfs -text
58
+ egs/ami/ASR/xlsr_transducer/log/log-train-2026-01-25-02-57-28 filter=lfs diff=lfs merge=lfs -text
egs/ami/ASR/xlsr_transducer/inference_results/.ipynb_checkpoints/log-inference-ihm-2026-01-26-04-45-26-checkpoint ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 04:45:26,876 INFO [inference.py:419] ================================================================================
2
+ 2026-01-26 04:45:26,877 INFO [inference.py:420] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 04:45:26,877 INFO [inference.py:421] ================================================================================
4
+ 2026-01-26 04:45:26,877 INFO [inference.py:422] Experiment dir: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 04:45:26,877 INFO [inference.py:423] Output dir: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 04:45:26,877 INFO [inference.py:424] Test set: ihm
7
+ 2026-01-26 04:45:26,877 INFO [inference.py:425] Decoding method: greedy_search
8
+ 2026-01-26 04:45:26,877 INFO [inference.py:431] Device: cpu
9
+ 2026-01-26 04:45:26,877 INFO [inference.py:434] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-26 04:45:26,878 INFO [inference.py:442] Vocabulary size: 500
11
+ 2026-01-26 04:45:26,879 INFO [inference.py:443] Blank ID: 0
12
+ 2026-01-26 04:45:26,879 INFO [inference.py:446] Creating model
13
+ 2026-01-26 04:45:28,435 INFO [inference.py:453] Loading checkpoint: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/best-train-loss.pt
14
+ 2026-01-26 04:45:28,436 INFO [checkpoint.py:111] Loading checkpoint from /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/best-train-loss.pt
15
+ 2026-01-26 04:45:34,301 INFO [inference.py:482] Number of model parameters: 317,511,772
16
+ 2026-01-26 04:45:34,302 INFO [inference.py:485] Loading test data
17
+ 2026-01-26 04:45:34,302 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-26 04:45:35,453 INFO [inference.py:496] Number of test utterances: 6676
19
+ 2026-01-26 04:45:35,453 INFO [inference.py:499] Starting inference...
20
+ 2026-01-26 04:45:36,710 INFO [inference.py:318] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-26 04:45:36,713 INFO [inference.py:319] Audio range: [-0.090, 0.104]
22
+ 2026-01-26 04:45:36,715 INFO [inference.py:320] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-26 04:45:46,614 INFO [inference.py:341] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-26 04:45:46,615 INFO [inference.py:342] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-26 04:45:46,615 INFO [inference.py:343] Encoder out range: [-13.684, 12.764]
26
+ 2026-01-26 04:45:55,306 INFO [inference.py:353] Number of hypotheses: 6
27
+ 2026-01-26 04:45:55,307 INFO [inference.py:355] First hypothesis: [290, 289, 20, 262, 20, 262, 20, 262, 20, 262, 20, 262, 20, 262, 240, 199, 35, 8, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 13, 77, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 15, 83, 7, 8, 56, 12, 10, 74, 74, 19, 46, 74, 19, 46, 74, 19, 46, 74, 19, 190, 162, 5, 14, 43, 8, 119, 5, 14, 43, 8, 5, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 23, 4, 2, 48, 33, 48, 33, 48, 33, 48, 33, 48, 33, 10, 33, 10, 33, 10, 33, 10, 33, 10, 33, 10, 38, 14, 29, 5, 52, 15, 7, 27, 154, 38, 4, 185, 16, 95, 10, 10, 118, 231, 10, 13, 160, 202]
28
+ 2026-01-26 04:45:55,323 INFO [inference.py:318] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-26 04:45:55,324 INFO [inference.py:319] Audio range: [-0.401, 0.443]
30
+ 2026-01-26 04:45:55,325 INFO [inference.py:320] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
33
+ 2026-01-26 04:46:02,522 INFO [inference.py:341] Encoder out shape: torch.Size([23, 209, 1024])
34
+ 2026-01-26 04:46:02,524 INFO [inference.py:342] Encoder out lens: tensor([209, 207, 207, 200, 198, 196, 190, 190, 185, 184, 182, 174, 170, 163,
35
+ 162, 157, 157, 156, 151, 149, 148, 139, 137])
36
+ 2026-01-26 04:46:02,524 INFO [inference.py:343] Encoder out range: [-12.514, 12.004]
37
+ 2026-01-26 04:46:14,119 INFO [inference.py:353] Number of hypotheses: 23
38
+ 2026-01-26 04:46:14,119 INFO [inference.py:355] First hypothesis: [10, 7, 5, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 4, 5, 21, 306, 210, 96, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 156, 6, 4, 24, 9, 102, 9, 102, 9, 102, 9, 102, 9, 102, 9, 102, 130, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 18, 18, 27, 60, 30, 14, 60, 60, 70, 60, 60, 28, 60, 60, 28, 60, 60, 28, 60, 60, 28, 24, 14, 36, 75, 41, 86, 97, 25, 75, 14, 43, 8, 4, 14, 4, 7, 197, 4, 7, 197, 4, 7, 197, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 17, 169, 17, 169, 17, 169, 17, 169, 17, 169, 17, 147, 169, 147, 169, 147, 169, 147, 169, 147, 169, 25, 147, 147, 147, 147, 147, 147, 147, 147, 147, 4, 7, 197, 112, 4, 7, 197, 112, 4, 7, 197, 112, 4, 7, 197, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 17, 169, 17, 169, 17, 169, 17, 169, 17, 169, 25, 75, 8, 54, 44, 24, 363, 328, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 199, 205, 199, 199, 92, 14, 199, 199, 92, 14, 8, 67, 67, 32, 26, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 27, 8, 8, 119, 8, 19, 201, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 234, 16, 16, 234, 16, 16, 234, 16, 16, 234, 58, 55, 43, 30, 55, 43, 30, 55, 43, 30, 55, 19, 55, 19, 55, 19, 55, 19, 55, 19, 55, 19, 55, 19, 55, 19, 55, 19, 55, 19, 36, 22, 41, 80, 219, 18, 77, 171, 18, 77, 171, 18, 77, 171, 18, 77, 171, 18, 77, 171, 18, 77, 171, 18, 77, 171, 15, 72]
39
+ 2026-01-26 04:46:14,135 INFO [inference.py:318] Audio shape: torch.Size([39, 40640]), dtype: torch.float32
40
+ 2026-01-26 04:46:14,136 INFO [inference.py:319] Audio range: [-0.439, 0.480]
41
+ 2026-01-26 04:46:14,137 INFO [inference.py:320] Audio lengths: tensor([40640, 37279, 36799, 36480, 36480, 33280, 33279, 32320, 32159, 30400,
42
+ 28800, 28480, 28160, 23520, 23039, 22880, 22400, 21920, 21920, 20960,
43
+ 20160, 20000, 19200, 19040, 18880, 18240, 16480, 14720, 13600, 12960,
44
+ 12320, 11680, 11520, 10880, 9440, 9120, 7840, 5920, 5760],
45
+ dtype=torch.int32)
46
+ 2026-01-26 04:46:21,443 INFO [inference.py:341] Encoder out shape: torch.Size([39, 126, 1024])
47
+ 2026-01-26 04:46:21,499 INFO [inference.py:342] Encoder out lens: tensor([126, 116, 114, 113, 113, 103, 103, 100, 100, 94, 89, 88, 87, 73,
48
+ 71, 71, 69, 68, 68, 65, 62, 62, 59, 59, 58, 56, 51, 45,
49
+ 42, 40, 38, 36, 35, 33, 29, 28, 24, 18, 17])
50
+ 2026-01-26 04:46:21,500 INFO [inference.py:343] Encoder out range: [-11.444, 10.811]
51
+ 2026-01-26 04:46:29,134 INFO [inference.py:353] Number of hypotheses: 39
52
+ 2026-01-26 04:46:29,134 INFO [inference.py:355] First hypothesis: [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 4, 2, 64, 10, 7, 5, 51, 13, 227, 211, 120, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 17, 7, 5, 17, 113, 9, 113, 9, 113, 9, 113, 13, 4, 5, 8, 5, 136, 40, 5, 16, 136, 40, 30, 5, 8, 136, 30, 8, 14, 5, 310, 23, 177, 63, 54, 136, 40, 30, 8, 14, 5, 16, 272, 16, 272, 16, 272, 16, 272]
53
+ 2026-01-26 04:46:29,199 INFO [inference.py:318] Audio shape: torch.Size([23, 68000]), dtype: torch.float32
54
+ 2026-01-26 04:46:29,200 INFO [inference.py:319] Audio range: [-0.314, 0.332]
55
+ 2026-01-26 04:46:29,201 INFO [inference.py:320] Audio lengths: tensor([68000, 65920, 65599, 64799, 64160, 63520, 62400, 61600, 59040, 58239,
56
+ 56480, 55840, 55520, 55359, 54719, 53440, 52800, 52640, 47200, 46239,
57
+ 46079, 45280, 44960], dtype=torch.int32)
58
+ 2026-01-26 04:46:36,615 INFO [inference.py:341] Encoder out shape: torch.Size([23, 212, 1024])
59
+ 2026-01-26 04:46:36,617 INFO [inference.py:342] Encoder out lens: tensor([212, 205, 204, 202, 200, 198, 194, 192, 184, 181, 176, 174, 173, 172,
60
+ 170, 166, 164, 164, 147, 144, 143, 141, 140])
61
+ 2026-01-26 04:46:36,617 INFO [inference.py:343] Encoder out range: [-13.261, 11.090]
62
+ 2026-01-26 04:46:45,499 INFO [inference.py:353] Number of hypotheses: 23
63
+ 2026-01-26 04:46:45,500 INFO [inference.py:355] First hypothesis: [31, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 20, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 32, 28, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 130, 4, 2, 11]
64
+ 2026-01-26 04:46:45,516 INFO [inference.py:318] Audio shape: torch.Size([5, 317280]), dtype: torch.float32
65
+ 2026-01-26 04:46:45,516 INFO [inference.py:319] Audio range: [-0.323, 0.414]
66
+ 2026-01-26 04:46:45,517 INFO [inference.py:320] Audio lengths: tensor([317280, 298079, 298080, 294559, 292480], dtype=torch.int32)
67
+ 2026-01-26 04:46:53,506 INFO [inference.py:341] Encoder out shape: torch.Size([5, 991, 1024])
68
+ 2026-01-26 04:46:53,507 INFO [inference.py:342] Encoder out lens: tensor([991, 931, 931, 920, 913])
69
+ 2026-01-26 04:46:53,507 INFO [inference.py:343] Encoder out range: [-14.241, 14.344]
70
+ 2026-01-26 04:47:07,903 INFO [inference.py:353] Number of hypotheses: 5
71
+ 2026-01-26 04:47:07,903 INFO [inference.py:355] First hypothesis: [37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 4, 2, 12, 23, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 7, 69, 7, 69, 7, 69, 7, 69, 7, 69, 7, 5, 116, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 82, 98, 27, 267, 63, 137, 27, 267, 63, 137, 27, 267, 63, 137, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 19, 5, 14, 5, 12, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 31, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 4, 257, 4, 257, 4, 257, 4, 257, 4, 257, 4, 257, 4, 257, 4, 257, 4, 257, 4, 257, 4, 257, 24, 28, 257, 24, 28, 257, 24, 28, 257, 24, 28, 24, 14, 24, 24, 5, 147, 17, 147, 61, 4, 28, 30, 8, 25, 4, 28, 30, 8, 25, 4, 28, 28, 30, 8, 25, 4, 28, 28, 30, 8, 25, 4, 28, 28, 159, 13, 30, 8, 28, 28, 30, 8, 28, 28, 30, 24, 14, 8, 28, 24, 14, 8, 28, 24, 14, 24, 24, 21, 24, 24, 21, 24, 24, 21, 24, 24, 21, 24, 21, 24, 21, 24, 21, 24, 21, 24, 21, 8, 21, 5, 14, 21, 5, 14, 21, 5, 14, 5, 5, 71, 14, 43, 8, 5, 119, 55, 14, 43, 8, 119, 55, 80, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 96, 40, 275, 32, 5, 275, 32, 5, 275, 32, 5, 275, 32, 5, 24, 325, 5, 43, 40, 43, 40, 43, 40, 43, 40, 43, 40, 43, 40, 43, 40, 19, 190, 40, 137, 43, 40, 19, 40, 43, 40, 19, 40, 43, 30, 27, 14, 29, 14, 43, 19, 29, 14, 43, 19, 29, 8, 103, 5, 103, 309, 5, 103, 309, 5, 103, 309, 5, 103, 309, 5, 103, 309, 5, 103, 309, 5, 103, 309, 5, 103, 309, 5, 103, 309, 5, 103, 309, 20, 98, 30, 82, 82, 4, 223, 82, 82, 4, 223, 82, 4, 223, 82, 4, 223, 82, 4, 223, 82, 4, 223, 82, 4, 223, 82, 4, 223, 82, 4, 223, 28, 4, 223, 28, 4, 223, 28, 4, 223, 190, 5, 20, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 106, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 119, 53, 21, 29, 29, 119, 53, 21, 29, 29, 119, 5, 14, 190, 8, 5, 20, 20, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 18, 7, 27, 220, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 6, 24, 138, 67, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 328, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 47, 7, 8, 4, 5, 23, 36, 63, 46, 12, 6, 4, 5, 93, 4, 5, 93, 4, 5, 93, 4, 5, 93, 93, 210, 4, 5, 93, 93, 210, 4, 5, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 91, 5, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 98, 98, 30, 82, 82, 82, 82, 82, 82, 82, 82, 98, 30, 82, 98, 30, 82, 98, 30, 82, 98, 30, 82, 98, 30, 82, 98, 30, 82, 98, 30, 82, 4, 223, 82, 4, 223, 82, 4, 223, 82, 82, 30, 60, 30, 36, 82, 30, 60, 30, 36, 82, 30, 36, 63, 79, 14, 190, 8, 30, 43, 8, 30, 43, 8, 30, 43, 8, 5, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 53, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 14, 43, 8, 5, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 13, 4, 28, 28, 4, 28, 28, 4, 28, 28, 4, 28, 28, 4, 28, 28, 4, 28, 28, 4, 28, 28, 4, 28, 28, 4, 28, 28, 4, 28, 28, 4, 28, 28, 30, 36, 63, 4, 28, 28, 30, 36, 63, 22, 30, 30, 22, 19, 28, 22, 30, 30, 22, 19, 28, 30, 30, 22, 19, 28, 30, 30, 22, 19, 28, 30, 8, 28, 28, 30, 8, 28, 28, 30, 8, 28, 28, 22, 70, 25, 13, 30, 8, 28, 29, 14, 25, 29, 29, 21, 19, 21, 42, 41, 19, 21, 19, 40, 19, 41, 4, 2, 45, 78, 62]
72
+ 2026-01-26 04:47:08,602 INFO [inference.py:318] Audio shape: torch.Size([40, 39360]), dtype: torch.float32
73
+ 2026-01-26 04:47:08,603 INFO [inference.py:319] Audio range: [-0.274, 0.362]
74
+ 2026-01-26 04:47:08,604 INFO [inference.py:320] Audio lengths: tensor([39359, 39199, 39039, 38080, 36000, 35200, 34880, 34880, 33760, 33760,
75
+ 33600, 33120, 29440, 29280, 27360, 24960, 24960, 23680, 21760, 21600,
76
+ 20800, 16800, 16320, 16160, 16000, 15679, 15040, 13440, 12320, 7040,
77
+ 6560, 6400, 5760, 5760, 5120, 4800, 4800, 4640, 4480, 3360],
78
+ dtype=torch.int32)
79
+ 2026-01-26 04:47:15,213 INFO [inference.py:341] Encoder out shape: torch.Size([40, 122, 1024])
80
+ 2026-01-26 04:47:15,214 INFO [inference.py:342] Encoder out lens: tensor([122, 122, 121, 118, 112, 109, 108, 108, 105, 105, 104, 103, 91, 91,
81
+ 85, 77, 77, 73, 67, 67, 64, 52, 50, 50, 49, 48, 46, 41,
82
+ 38, 21, 20, 19, 17, 17, 15, 14, 14, 14, 13, 10])
83
+ 2026-01-26 04:47:15,214 INFO [inference.py:343] Encoder out range: [-11.784, 11.570]
84
+ 2026-01-26 04:47:22,413 INFO [inference.py:353] Number of hypotheses: 40
85
+ 2026-01-26 04:47:22,414 INFO [inference.py:355] First hypothesis: [45, 78, 62, 4, 2, 45, 78, 62, 4, 2, 31, 4, 2, 11, 4, 2, 11]
86
+ 2026-01-26 04:47:22,425 INFO [inference.py:318] Audio shape: torch.Size([23, 66880]), dtype: torch.float32
87
+ 2026-01-26 04:47:22,426 INFO [inference.py:319] Audio range: [-0.514, 0.393]
88
+ 2026-01-26 04:47:22,427 INFO [inference.py:320] Audio lengths: tensor([66880, 65439, 60799, 60320, 59520, 58240, 57280, 56320, 55520, 54080,
89
+ 51840, 51520, 50720, 49920, 49600, 48319, 48320, 47999, 46880, 46079,
90
+ 44640, 44320, 44160], dtype=torch.int32)
91
+ 2026-01-26 04:47:29,228 INFO [inference.py:341] Encoder out shape: torch.Size([23, 208, 1024])
92
+ 2026-01-26 04:47:29,229 INFO [inference.py:342] Encoder out lens: tensor([208, 204, 189, 188, 185, 181, 178, 175, 173, 168, 161, 160, 158, 155,
93
+ 154, 150, 150, 149, 146, 143, 139, 138, 137])
94
+ 2026-01-26 04:47:29,229 INFO [inference.py:343] Encoder out range: [-12.152, 11.060]
95
+ 2026-01-26 04:47:36,938 INFO [inference.py:353] Number of hypotheses: 23
96
+ 2026-01-26 04:47:36,938 INFO [inference.py:355] First hypothesis: [4, 5, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 4, 2, 11, 15, 11, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 58, 40, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 21, 30, 36, 19, 5, 30, 14, 222, 5, 26, 5, 26, 5, 26, 5, 26, 5, 26, 5, 26, 58, 30, 36, 63, 22, 54, 240, 20, 20, 4, 5, 8, 136, 21, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 347, 265, 147, 265, 20, 10, 7, 5, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 22, 104, 108, 8, 183, 25, 10, 33, 10, 7, 5, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 113, 5, 14, 5, 5, 46, 4, 96, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 17, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 68, 71, 26, 5]
97
+ 2026-01-26 04:47:37,011 INFO [inference.py:318] Audio shape: torch.Size([24, 65600]), dtype: torch.float32
98
+ 2026-01-26 04:47:37,011 INFO [inference.py:319] Audio range: [-0.416, 0.458]
99
+ 2026-01-26 04:47:37,012 INFO [inference.py:320] Audio lengths: tensor([65600, 64000, 63680, 61280, 60000, 58080, 55200, 52960, 51359, 51200,
100
+ 50720, 50720, 50080, 49280, 48639, 47840, 47360, 46880, 46400, 46240,
101
+ 45920, 44640, 43040, 42720], dtype=torch.int32)
102
+ 2026-01-26 04:47:44,400 INFO [inference.py:341] Encoder out shape: torch.Size([24, 204, 1024])
103
+ 2026-01-26 04:47:44,401 INFO [inference.py:342] Encoder out lens: tensor([204, 199, 198, 191, 187, 181, 172, 165, 160, 159, 158, 158, 156, 153,
104
+ 151, 149, 147, 146, 144, 144, 143, 139, 134, 133])
105
+ 2026-01-26 04:47:44,401 INFO [inference.py:343] Encoder out range: [-12.007, 11.624]
106
+ 2026-01-26 04:47:51,732 INFO [inference.py:353] Number of hypotheses: 24
107
+ 2026-01-26 04:47:51,732 INFO [inference.py:355] First hypothesis: [11, 4, 2, 11, 17, 7, 5, 59, 19, 75, 8, 164, 21, 27, 4, 2, 9, 49, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 15, 7, 27, 154, 34, 16, 125, 10, 13, 24, 19, 5, 8, 41, 5, 92, 55, 490, 86, 97, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 5, 19, 24, 14, 5, 20, 84, 18, 7, 85, 34, 13, 227, 211, 120, 4, 2, 64, 18, 7, 85, 116, 73, 13, 227, 211, 120, 46, 13, 227, 211, 120, 46, 13, 227, 211, 120, 46, 13, 227, 211, 120, 46, 13, 227, 211, 120, 46, 66, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 18, 34, 16, 34, 13, 227, 211, 120, 25, 6, 24, 9, 7, 24, 68, 218, 52, 15, 34, 16, 34, 13, 74, 19, 5, 8, 41, 19, 86, 97, 25, 75, 41, 86, 97, 25, 75, 41, 86, 97, 25, 75, 41, 86, 97, 25, 75, 41, 86, 97, 25, 75, 41, 86, 97, 25, 75, 41, 86, 97, 25, 75, 41, 86, 97, 25, 75, 41, 86, 97, 25, 75, 41, 86, 97, 369]
108
+ 2026-01-26 04:47:51,743 INFO [inference.py:318] Audio shape: torch.Size([9, 176960]), dtype: torch.float32
109
+ 2026-01-26 04:47:51,744 INFO [inference.py:319] Audio range: [-0.135, 0.191]
110
+ 2026-01-26 04:47:51,745 INFO [inference.py:320] Audio lengths: tensor([176960, 170720, 164480, 155840, 154559, 151839, 151840, 151360, 147040],
111
+ dtype=torch.int32)
112
+ 2026-01-26 04:48:00,501 INFO [inference.py:341] Encoder out shape: torch.Size([9, 552, 1024])
113
+ 2026-01-26 04:48:00,502 INFO [inference.py:342] Encoder out lens: tensor([552, 533, 513, 486, 482, 474, 474, 472, 459])
114
+ 2026-01-26 04:48:00,502 INFO [inference.py:343] Encoder out range: [-13.325, 12.083]
115
+ 2026-01-26 04:48:10,617 INFO [inference.py:353] Number of hypotheses: 9
116
+ 2026-01-26 04:48:10,617 INFO [inference.py:355] First hypothesis: [89, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 21, 94, 221, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 17, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 33, 113, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 4, 5, 21, 5, 97, 5, 5, 19, 5, 14, 379, 5, 292, 379, 5, 292, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 379, 130, 101, 101, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 106, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 29, 29, 119, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 106, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 29, 21, 29, 29, 28, 29, 29, 28, 29, 29, 119, 149, 29, 29, 119, 149, 29, 29, 119, 149, 29, 14, 43, 8, 21, 29, 29, 119, 149, 29, 14, 43, 8, 119, 55, 18, 77, 77, 18, 77, 171, 18, 77, 171, 18, 77, 171, 18, 77, 171, 18, 77, 171, 18, 77, 171, 18, 77, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 81, 7, 27, 12, 52, 10, 52, 10, 52, 10, 52, 10, 52, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 324, 115, 54, 10, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 71, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 53, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 134, 36, 58, 140, 36, 58, 140, 36, 58, 140, 36, 119, 149, 22, 18, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 4, 63, 19, 41, 4, 63, 19, 41, 4, 63, 21, 82, 19, 70, 4, 185, 4, 185, 4, 185, 4, 185, 82, 185, 88, 100, 16, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 36, 58, 58, 36, 58, 58, 36, 58, 58, 36, 58, 140, 36, 58, 140, 36, 58, 140, 36, 58, 134, 36, 121, 46, 74, 58, 140, 36, 19, 36, 19, 36, 19, 36, 19, 36, 19, 36, 19, 36, 19, 36, 19, 36, 19, 36, 19, 36, 79, 14, 43, 8, 119, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 6, 106, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 29, 29, 53, 21, 29, 29, 119, 53, 21, 29, 29, 119, 53, 21, 29, 29, 14, 53, 21, 29, 29, 14, 53, 21, 29, 29, 21, 29, 29, 21, 29, 29, 21, 79, 14, 29, 14, 21, 29, 14, 21, 29, 14, 43, 8, 119, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 84, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 81, 7, 27, 10, 10, 53, 28, 29, 29, 90, 53, 28, 29, 29, 90, 53, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 28, 30, 29, 28, 30, 29, 28, 30, 29, 28, 28, 29, 29, 28, 28, 29, 29, 28, 28, 29, 29, 14, 43, 8, 30, 29, 29, 14, 43, 8, 5, 29, 14, 43, 8, 5, 29, 14, 43, 8, 5, 23, 17, 7, 5, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 205, 205, 5, 17, 13, 43, 43, 28, 133, 180, 43, 30, 24, 180, 8, 86, 180, 8, 142, 19, 21, 42, 26, 35, 87, 17, 174, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 12]
117
+ 2026-01-26 04:48:10,629 INFO [inference.py:318] Audio shape: torch.Size([14, 112320]), dtype: torch.float32
118
+ 2026-01-26 04:48:10,630 INFO [inference.py:319] Audio range: [-0.469, 0.457]
119
+ 2026-01-26 04:48:10,630 INFO [inference.py:320] Audio lengths: tensor([112320, 105920, 105439, 104000, 103840, 101920, 98720, 98400, 96960,
120
+ 96800, 96320, 95680, 93760, 93600], dtype=torch.int32)
121
+ 2026-01-26 04:48:18,040 INFO [inference.py:341] Encoder out shape: torch.Size([14, 350, 1024])
122
+ 2026-01-26 04:48:18,040 INFO [inference.py:342] Encoder out lens: tensor([350, 330, 329, 324, 324, 318, 308, 307, 302, 302, 300, 298, 292, 292])
123
+ 2026-01-26 04:48:18,041 INFO [inference.py:343] Encoder out range: [-14.286, 11.940]
124
+ 2026-01-26 04:48:29,123 INFO [inference.py:353] Number of hypotheses: 14
125
+ 2026-01-26 04:48:29,123 INFO [inference.py:355] First hypothesis: [39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 11, 31, 4, 24, 4, 32, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 12, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 22, 142, 19, 22, 22, 142, 19, 22, 22, 142, 19, 22, 262, 20, 35, 32, 28, 67, 8, 26, 35, 32, 28, 8, 86, 22, 142, 117, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 22, 21, 8, 8, 21, 8, 8, 21, 8, 8, 142, 86, 22, 142, 67, 22, 142, 67, 22, 142, 67, 22, 142, 67, 22, 108, 22, 142, 67, 22, 108, 22, 142, 67, 32, 67, 14, 86, 32, 67, 14, 86, 32, 67, 14, 86, 32, 67, 14, 86, 32, 67, 14, 86, 32, 67, 14, 32, 67, 14, 32, 67, 14, 32, 67, 14, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 85, 13, 13, 13, 13, 13, 13, 13, 13, 22, 22, 4, 224, 4, 224, 4, 224, 4, 224, 4, 224, 30, 60, 30, 183, 51, 4, 224, 30, 60, 30, 183, 56, 15, 51, 56, 15, 51, 56, 15, 51, 56, 15, 56, 15, 56, 15, 56, 15, 56, 15, 56, 56, 15, 56, 56, 15, 56, 56, 15, 56, 15, 56, 15, 56, 15, 56, 15, 56, 15, 56, 148, 4, 27, 148, 4, 27, 148, 4, 27, 148, 4, 27, 148, 4, 27, 21, 104, 19, 22, 14, 54, 19, 149, 22, 54, 54, 80, 10, 191, 32, 7, 8, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 4, 5, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 26, 52, 18, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 15, 49, 26, 130, 15, 7, 27, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 82, 4, 28, 82, 4, 28, 82, 4, 28, 82, 82, 70, 25, 4, 28, 82, 82, 70, 25, 4, 223, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 104, 19, 82, 104, 19, 82, 104, 19, 82, 104, 19, 104, 19, 104, 19, 104, 19, 104, 19, 104, 19, 263, 80, 48, 33, 48, 33, 262, 446, 446, 446, 446, 446, 446, 446, 446, 446, 446, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 20, 262, 20, 262, 20, 262, 20, 262, 20, 262, 31, 20, 31, 20, 31, 20, 31, 20, 31, 20, 31, 262, 31, 262, 31, 262, 31, 262, 31, 262, 31]
126
+ 2026-01-26 04:48:29,127 INFO [inference.py:544] Processed 206 utterances in 10 batches
127
+ 2026-01-26 04:48:29,135 INFO [inference.py:318] Audio shape: torch.Size([38, 41440]), dtype: torch.float32
128
+ 2026-01-26 04:48:29,136 INFO [inference.py:319] Audio range: [-0.272, 0.322]
129
+ 2026-01-26 04:48:29,137 INFO [inference.py:320] Audio lengths: tensor([41440, 41120, 40160, 35680, 33120, 32960, 32800, 31520, 31040, 30880,
130
+ 30239, 29920, 29120, 27360, 25279, 24480, 23520, 22720, 22720, 21600,
131
+ 20800, 20320, 19840, 19840, 17600, 15520, 13120, 12480, 12320, 11040,
132
+ 10560, 9600, 8640, 7520, 5440, 5120, 5120, 4640],
133
+ dtype=torch.int32)
134
+ 2026-01-26 04:48:35,710 INFO [inference.py:341] Encoder out shape: torch.Size([38, 129, 1024])
135
+ 2026-01-26 04:48:35,710 INFO [inference.py:342] Encoder out lens: tensor([129, 128, 125, 111, 103, 102, 102, 98, 96, 96, 94, 93, 90, 85,
136
+ 78, 76, 73, 70, 70, 67, 64, 63, 61, 61, 54, 48, 40, 38,
137
+ 38, 34, 32, 29, 26, 23, 16, 15, 15, 14])
138
+ 2026-01-26 04:48:35,711 INFO [inference.py:343] Encoder out range: [-13.512, 11.822]
139
+ 2026-01-26 04:48:43,120 INFO [inference.py:353] Number of hypotheses: 38
140
+ 2026-01-26 04:48:43,121 INFO [inference.py:355] First hypothesis: [9, 7, 85, 13, 4, 27, 5, 30, 27, 4, 27, 5, 30, 27, 4, 27, 5, 30, 5, 5, 19, 97, 5, 5, 19, 97, 5, 5, 19, 97, 5, 5, 19, 97, 5, 5, 19, 97, 5, 5, 19, 5, 14, 43, 19, 5, 14, 43, 19, 5, 43, 19, 5, 43, 19, 5, 43, 19, 5, 43, 21, 29, 14, 43, 19, 21, 43, 8, 19, 201, 14, 190, 19, 201, 14, 190, 19, 201, 14, 190, 8, 19, 201, 14, 8, 119, 55, 80, 118, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 10, 4, 14, 199, 199, 202, 199, 199, 202, 199, 199, 202, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 71, 14, 14, 75, 75, 14, 14, 75, 75, 14, 43, 8, 26, 130, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11, 4, 2, 11]
141
+ 2026-01-26 04:48:43,132 INFO [inference.py:318] Audio shape: torch.Size([38, 41280]), dtype: torch.float32
142
+ 2026-01-26 04:48:43,133 INFO [inference.py:319] Audio range: [-0.080, 0.105]
143
+ 2026-01-26 04:48:43,133 INFO [inference.py:320] Audio lengths: tensor([41280, 40320, 36800, 35680, 34880, 34879, 34080, 34080, 32000, 30400,
144
+ 29280, 29280, 28320, 24000, 23040, 20960, 20960, 20960, 20160, 16960,
145
+ 14080, 13280, 12640, 12160, 10720, 9440, 8640, 6240, 6080, 5440,
146
+ 5440, 5120, 4800, 4800, 4640, 4480, 4320, 4160],
147
+ dtype=torch.int32)
148
+ 2026-01-26 04:48:50,029 INFO [inference.py:341] Encoder out shape: torch.Size([38, 128, 1024])
149
+ 2026-01-26 04:48:50,030 INFO [inference.py:342] Encoder out lens: tensor([128, 125, 114, 111, 108, 108, 106, 106, 99, 94, 91, 91, 88, 74,
150
+ 71, 65, 65, 65, 62, 52, 43, 41, 39, 37, 33, 29, 26, 19,
151
+ 18, 16, 16, 15, 14, 14, 14, 13, 13, 12])
152
+ 2026-01-26 04:48:50,030 INFO [inference.py:343] Encoder out range: [-11.071, 11.522]
153
+ 2026-01-26 04:48:56,638 INFO [inference.py:353] Number of hypotheses: 38
154
+ 2026-01-26 04:48:56,638 INFO [inference.py:355] First hypothesis: [4, 62, 4, 2, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 68, 4, 2, 20, 4, 32, 4, 32, 4, 32, 4, 32, 4, 32, 4, 32, 4, 32, 4, 32, 4, 32, 4, 2, 20, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 4, 2, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 7, 5, 65, 19, 36, 63, 54, 17, 6, 124, 32, 28, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 77, 71, 19, 70, 77, 71, 19, 70, 77, 71, 19, 70, 22, 22, 70, 22, 22, 70, 22, 22, 70, 22, 22, 148, 148, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 24, 185, 25, 13, 82, 185, 64]
155
+ 2026-01-26 04:48:56,707 INFO [inference.py:318] Audio shape: torch.Size([38, 41760]), dtype: torch.float32
156
+ 2026-01-26 04:48:56,707 INFO [inference.py:319] Audio range: [-0.246, 0.340]
157
+ 2026-01-26 04:48:56,708 INFO [inference.py:320] Audio lengths: tensor([41760, 39680, 38880, 36799, 36639, 36000, 34559, 34240, 33120, 31840,
158
+ 30720, 30560, 29760, 29280, 24640, 24160, 22720, 21759, 21600, 20960,
159
+ 16320, 14400, 13600, 11360, 10880, 10399, 10400, 9760, 9440, 9280,
160
+ 8320, 8320, 7680, 7360, 6880, 6880, 6240, 6240],
161
+ dtype=torch.int32)
162
+ 2026-01-26 04:49:03,412 INFO [inference.py:341] Encoder out shape: torch.Size([38, 130, 1024])
163
+ 2026-01-26 04:49:03,412 INFO [inference.py:342] Encoder out lens: tensor([130, 123, 121, 114, 114, 112, 107, 106, 103, 99, 95, 95, 92, 91,
164
+ 76, 75, 70, 67, 67, 65, 50, 44, 42, 35, 33, 32, 32, 30,
165
+ 29, 28, 25, 25, 23, 22, 21, 21, 19, 19])
166
+ 2026-01-26 04:49:03,413 INFO [inference.py:343] Encoder out range: [-11.967, 11.229]
167
+ 2026-01-26 04:49:09,503 INFO [inference.py:353] Number of hypotheses: 38
168
+ 2026-01-26 04:49:09,503 INFO [inference.py:355] First hypothesis: [145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 145, 4, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 154, 44, 13, 24, 28, 27, 170, 205, 5, 130, 4, 2, 11, 39]
169
+ 2026-01-26 04:49:09,512 INFO [inference.py:318] Audio shape: torch.Size([9, 170400]), dtype: torch.float32
170
+ 2026-01-26 04:49:09,513 INFO [inference.py:319] Audio range: [-0.370, 0.393]
171
+ 2026-01-26 04:49:09,513 INFO [inference.py:320] Audio lengths: tensor([170400, 166559, 165919, 164800, 156800, 152480, 147520, 146559, 145759],
172
+ dtype=torch.int32)
173
+ 2026-01-26 04:49:17,349 INFO [inference.py:341] Encoder out shape: torch.Size([9, 532, 1024])
174
+ 2026-01-26 04:49:17,349 INFO [inference.py:342] Encoder out lens: tensor([532, 520, 518, 514, 489, 476, 460, 457, 455])
175
+ 2026-01-26 04:49:17,350 INFO [inference.py:343] Encoder out range: [-12.221, 14.348]
176
+ 2026-01-26 04:49:28,539 INFO [inference.py:353] Number of hypotheses: 9
177
+ 2026-01-26 04:49:28,539 INFO [inference.py:355] First hypothesis: [37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 4, 2, 4, 7, 4, 2, 4, 7, 4, 2, 4, 7, 4, 2, 4, 7, 4, 2, 4, 7, 4, 2, 4, 7, 4, 2, 4, 7, 4, 2, 4, 7, 197, 4, 2, 4, 7, 197, 4, 2, 4, 7, 197, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 18, 7, 69, 4, 2, 23, 17, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 51, 13, 43, 8, 232, 80, 4, 2, 23, 57, 18, 7, 85, 116, 55, 5, 28, 36, 30, 55, 80, 17, 17, 7, 5, 13, 43, 8, 5, 17, 7, 5, 13, 43, 8, 5, 17, 7, 5, 13, 43, 8, 5, 25, 6, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 39, 9, 46, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 6, 74, 67, 14, 233, 6, 74, 67, 14, 233, 6, 74, 67, 14, 9, 24, 29, 70, 173, 17, 81, 7, 27, 154, 38, 13, 227, 211, 25, 6, 4, 27, 43, 8, 232, 4, 2, 274, 5, 156, 6, 222, 298, 157, 41, 5, 92, 55, 490, 86, 97, 369, 6, 222, 298, 157, 41, 5, 92, 55, 490, 86, 97, 369, 4, 5, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 4, 2, 23, 15, 47, 122, 6, 226, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 4, 2, 127, 33, 4, 27, 5, 30, 27, 5, 30, 27, 5, 30, 27, 5, 30, 27, 5, 5, 19, 5, 8, 40, 19, 5, 8, 40, 19, 5, 8, 40, 19, 5, 8, 40, 19, 5, 8, 5, 19, 5, 19, 5, 19, 5, 19, 5, 19, 5, 8, 19, 24, 19, 5, 8, 19, 24, 19, 5, 14, 69, 40, 183, 25, 6, 222, 298, 157, 41, 5, 92, 55, 490, 86, 97, 369, 31, 20, 31, 20, 31, 20, 31, 20, 31, 20, 31, 9, 83, 7, 8, 34, 13, 31, 13, 239, 25, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 4, 2, 64, 18, 7, 85, 116, 123, 64, 18, 123, 64, 18, 123, 64, 18, 123, 64, 9, 47, 7, 8, 72, 52, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 4, 2, 64, 4, 2, 64, 4, 2, 64, 4, 2, 64, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 100, 57, 100, 57, 100, 57, 100, 57, 100, 57, 371, 208, 387, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 12, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 74, 67, 14, 12]
178
+ 2026-01-26 04:49:28,610 INFO [inference.py:318] Audio shape: torch.Size([5, 315520]), dtype: torch.float32
179
+ 2026-01-26 04:49:28,611 INFO [inference.py:319] Audio range: [-0.297, 0.334]
180
+ 2026-01-26 04:49:28,612 INFO [inference.py:320] Audio lengths: tensor([315520, 301440, 294399, 292480, 289919], dtype=torch.int32)
181
+ 2026-01-26 04:49:36,428 INFO [inference.py:341] Encoder out shape: torch.Size([5, 985, 1024])
182
+ 2026-01-26 04:49:36,429 INFO [inference.py:342] Encoder out lens: tensor([985, 941, 919, 913, 905])
183
+ 2026-01-26 04:49:36,429 INFO [inference.py:343] Encoder out range: [-12.260, 13.635]
184
+ 2026-01-26 04:49:53,832 INFO [inference.py:353] Number of hypotheses: 5
185
+ 2026-01-26 04:49:53,833 INFO [inference.py:355] First hypothesis: [31, 53, 27, 8, 119, 55, 80, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 24, 113, 5, 113, 5, 14, 5, 5, 23, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 15, 315, 15, 7, 69, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 25, 6, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 7, 5, 87, 7, 5, 87, 7, 5, 87, 7, 5, 51, 272, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 6, 17, 17, 106, 21, 96, 204, 9, 204, 204, 204, 204, 204, 204, 204, 204, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 204, 9, 204, 9, 204, 9, 204, 9, 204, 9, 258, 10, 7, 5, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 7, 5, 154, 231, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 8, 54, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 106, 40, 275, 67, 279, 359, 275, 67, 279, 359, 275, 67, 279, 359, 275, 67, 279, 359, 275, 67, 279, 359, 275, 67, 172, 359, 42, 26, 170, 24, 24, 170, 8, 19, 86, 22, 142, 19, 24, 19, 36, 108, 32, 19, 24, 19, 24, 19, 24, 19, 24, 19, 24, 19, 24, 19, 24, 19, 24, 19, 24, 19, 24, 14, 24, 19, 24, 14, 24, 19, 24, 14, 24, 19, 24, 24, 19, 24, 24, 19, 24, 24, 19, 24, 19, 24, 19, 24, 19, 24, 19, 24, 19, 5, 19, 5, 14, 36, 19, 5, 14, 36, 19, 5, 14, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 14, 15, 4, 42, 15, 4, 42, 15, 4, 27, 4, 14, 4, 14, 4, 14, 4, 14, 4, 24, 30, 24, 24, 92, 173, 115, 54, 16, 25, 4, 96, 4, 96, 4, 96, 4, 96, 4, 96, 4, 96, 108, 209, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 297, 5, 25, 6, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 125, 10, 35, 6, 35, 6, 35, 6, 35, 6, 35, 6, 35, 6, 35, 6, 35, 6, 35, 6, 35, 6, 24, 20, 84, 9, 24, 24, 20, 9, 24, 24, 20, 9, 24, 24, 98, 27, 267, 153, 267, 40, 30, 27, 267, 153, 267, 153, 267, 153, 267, 153, 5, 41, 5, 92, 5, 41, 5, 92, 5, 41, 5, 90, 42, 41, 26, 20, 18, 7, 27, 18, 7, 27, 18, 7, 27, 18, 18, 27, 18, 18, 27, 18, 18, 27, 18, 18, 27, 18, 18, 27, 18, 18, 27, 18, 18, 27, 113, 10, 100, 10, 33, 10, 114, 32, 7, 8, 72, 15, 72, 15, 72, 15, 72, 32, 26, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 30, 96, 244, 20, 38, 53, 38, 53, 38, 53, 38, 53, 38, 53, 38, 53, 38, 53, 38, 53, 38, 53, 38, 53, 38, 53, 21, 29, 35, 369, 38, 86, 38, 35, 32, 28, 104, 108, 8, 26, 189, 19, 5, 162, 33, 10, 33, 5, 30, 5, 71, 19, 5, 71, 19, 5, 71, 19, 5, 71, 19, 5, 71, 19, 5, 71, 19, 5, 71, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 52, 15, 7, 27, 177, 28, 30, 96, 71, 19, 29, 71, 21, 29, 14, 25, 75, 25, 10, 7, 5, 13, 29, 29, 14, 43, 8, 232, 4, 2, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 10, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 16, 8, 16, 16, 16, 16, 16, 16, 16, 16, 136, 19, 22, 54, 16, 56, 136, 54, 16, 56, 249, 249, 249, 249, 249, 249, 249, 249, 249, 249, 10, 7, 5, 13, 16, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 36, 58, 58, 36, 58, 58, 36, 58, 58, 36, 58, 36, 58, 36, 58, 36, 58, 36, 58, 36, 19, 36, 58, 140, 21, 43, 36, 19, 36, 58, 63, 40, 19, 41, 69, 40, 69, 75, 75, 14, 131, 13, 9, 13, 9, 13, 9, 13, 9, 13, 9, 7, 85, 13, 4, 14, 70, 13, 13, 74, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 341, 18, 7, 27, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 47, 154, 154, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 259, 259, 259, 259, 259, 259, 259, 259, 259, 259, 101, 6, 24, 35, 6, 35, 82, 70, 182, 182, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 4, 5, 23, 195, 25, 13, 22, 104, 19, 22, 14, 10, 7, 5, 10, 7, 5, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 4, 27, 5, 30, 27, 5, 30, 27, 5, 30, 27, 5, 5, 19, 5, 5, 19, 5, 5, 19, 5, 5, 19, 5, 5, 30, 5, 159, 31, 46, 46, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 18, 18, 34, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 29, 28, 28, 29, 28, 28, 29, 28, 28, 29, 29, 28, 29, 29, 28, 29, 29, 28, 29, 29, 119, 5, 14, 190, 8, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 159, 33, 4, 96, 40, 275, 32, 5, 24, 86, 22, 142, 5, 43, 40, 19, 60, 40, 79, 137, 43, 40, 19, 60, 137, 43, 40, 19, 60, 137, 43, 40, 19, 60, 137, 43, 40, 19, 60, 137, 19, 60, 137, 19, 60, 137, 19, 60, 137, 19, 82, 19, 40, 19, 82, 19, 40, 19, 82, 19, 29, 14, 21, 29, 14, 21, 29, 14, 21, 29, 29, 14, 43, 8, 5, 29, 14, 29, 29, 14, 29, 29, 14, 29, 5, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 159, 33, 5, 4, 2, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 16, 22, 16, 394, 333, 172, 20, 9, 7, 85, 34, 199, 25, 6, 35, 22, 19, 28, 24, 14, 8, 28, 24, 14, 8, 28, 24, 25, 6, 24, 5, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 24, 24, 63, 42, 33, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5]
186
+ 2026-01-26 04:49:53,846 INFO [inference.py:318] Audio shape: torch.Size([6, 239520]), dtype: torch.float32
187
+ 2026-01-26 04:49:53,847 INFO [inference.py:319] Audio range: [-0.116, 0.111]
188
+ 2026-01-26 04:49:53,848 INFO [inference.py:320] Audio lengths: tensor([239519, 234240, 223840, 223360, 219679, 215680], dtype=torch.int32)
189
+ 2026-01-26 04:50:02,225 INFO [inference.py:341] Encoder out shape: torch.Size([6, 748, 1024])
190
+ 2026-01-26 04:50:02,226 INFO [inference.py:342] Encoder out lens: tensor([748, 731, 699, 697, 686, 673])
191
+ 2026-01-26 04:50:02,226 INFO [inference.py:343] Encoder out range: [-13.591, 10.919]
192
+ 2026-01-26 04:50:10,299 INFO [inference.py:353] Number of hypotheses: 6
193
+ 2026-01-26 04:50:10,299 INFO [inference.py:355] First hypothesis: [49, 4, 2, 20, 84, 18, 7, 27, 154, 38, 13, 227, 211, 120, 412, 23, 23, 4, 2, 31, 20, 84, 18, 7, 27, 154, 34, 16, 34, 13, 4, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 80, 4, 2, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 4, 2, 9, 49, 9, 7, 24, 68, 218, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 9, 7, 24, 68, 218, 52, 18, 7, 27, 154, 34, 16, 34, 13, 4, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 5, 5, 19, 82, 70, 25, 321, 18, 7, 85, 116, 334, 6, 24, 61, 110, 46, 17, 88, 38, 120, 289, 39, 18, 7, 69, 38, 86, 98, 30, 22, 233, 6, 221, 18, 18, 241, 13, 160, 202, 25, 6, 24, 5, 25, 6, 24, 5, 25, 6, 24, 5, 25, 6, 24, 33, 10, 13, 239, 25, 31]
194
+ 2026-01-26 04:50:10,309 INFO [inference.py:318] Audio shape: torch.Size([5, 315200]), dtype: torch.float32
195
+ 2026-01-26 04:50:10,310 INFO [inference.py:319] Audio range: [-0.082, 0.158]
196
+ 2026-01-26 04:50:10,311 INFO [inference.py:320] Audio lengths: tensor([315200, 310560, 300000, 299680, 296959], dtype=torch.int32)
197
+ 2026-01-26 04:50:18,933 INFO [inference.py:341] Encoder out shape: torch.Size([5, 984, 1024])
198
+ 2026-01-26 04:50:18,933 INFO [inference.py:342] Encoder out lens: tensor([984, 970, 937, 936, 927])
199
+ 2026-01-26 04:50:18,934 INFO [inference.py:343] Encoder out range: [-14.589, 11.647]
200
+ 2026-01-26 04:50:32,710 INFO [inference.py:353] Number of hypotheses: 5
201
+ 2026-01-26 04:50:32,710 INFO [inference.py:355] First hypothesis: [59, 4, 7, 197, 4, 7, 197, 4, 7, 197, 4, 7, 197, 4, 7, 197, 4, 7, 197, 4, 7, 197, 23, 15, 7, 27, 249, 56, 15, 249, 56, 15, 249, 56, 15, 249, 56, 15, 7, 27, 310, 310, 310, 310, 310, 310, 310, 18, 7, 27, 38, 26, 35, 331, 4, 32, 67, 32, 67, 32, 67, 32, 67, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 264, 205, 264, 264, 264, 264, 264, 264, 264, 264, 128, 23, 9, 34, 64, 64, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 106, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 56, 217, 217, 36, 16, 259, 16, 56, 217, 36, 19, 55, 5, 14, 36, 5, 14, 36, 5, 14, 36, 5, 14, 36, 5, 14, 36, 5, 14, 36, 5, 14, 36, 5, 14, 36, 5, 14, 36, 5, 14, 36, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 81, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 76, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 245, 36, 245, 134, 46, 46, 131, 34, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 14, 21, 4, 28, 4, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 30, 8, 28, 28, 30, 8, 28, 28, 30, 8, 28, 29, 29, 8, 8, 137, 8, 40, 19, 60, 137, 8, 96, 63, 209, 8, 137, 8, 96, 63, 60, 41, 149, 60, 137, 8, 96, 63, 60, 119, 8, 30, 201, 14, 8, 14, 43, 8, 155, 30, 24, 443, 155, 22, 30, 36, 8, 19, 36, 22, 41, 55, 70, 22, 36, 30, 36, 63, 105, 50, 20, 50, 20, 20, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 152, 25, 274, 5, 20, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 74, 46, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 36, 29, 29, 36, 53, 36, 29, 29, 36, 53, 36, 30, 36, 53, 36, 30, 36, 53, 36, 30, 36, 30, 36, 30, 36, 30, 36, 30, 36, 30, 60, 60, 70, 32, 26, 35, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 36, 53, 36, 29, 80, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 26, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 6, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 76, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 49, 26, 33, 35, 13, 30, 27, 20, 4, 27, 55, 19, 60, 30, 183, 20, 9, 100, 51, 51, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 106, 21, 96, 14, 14, 96, 455, 106, 14, 96, 455, 455, 455, 455, 455, 455, 455, 455, 455, 455, 455, 189, 43, 21, 8, 96, 14, 14, 43, 8, 204, 80, 95, 33, 13, 33, 13, 33, 13, 33, 13, 33, 13, 33, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 4, 5, 19, 5, 5, 19, 5, 5, 19, 5, 5, 19, 5, 5, 19, 5, 5, 19, 5, 5, 19, 5, 5, 19, 228, 19, 228, 19, 228, 19, 228, 19, 228, 19, 228, 19, 228, 19, 228, 19, 228, 19, 5, 19, 5, 19, 5, 19, 5, 19, 5, 19, 5, 8, 5, 8, 5, 8, 5, 8, 5, 8, 5, 8, 5, 8, 8, 28, 8, 8, 28, 8, 8, 28, 8, 8, 28, 24, 8, 28, 24, 8, 28, 24, 8, 28, 24, 14, 8, 28, 24, 14, 8, 28, 24, 14, 8, 28, 24, 14, 8, 28, 24, 14, 8, 28, 24, 14, 24, 24, 29, 36, 8, 14, 29, 29, 54, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 76, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 22, 41, 54, 117, 6, 16, 6, 16, 6, 16, 6, 16, 6, 16, 6, 16, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 71, 132, 21, 132, 19, 131, 370, 132, 21, 370, 132, 21, 370, 132, 21, 370, 132, 19, 131, 370, 41, 13, 13, 4, 70, 4, 140, 82, 4, 70, 69, 36, 4, 140, 82, 4, 70, 69, 36, 69, 185, 267, 153, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 158, 30, 70, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 194, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 47, 152, 25, 124, 32, 28, 8, 86, 32, 28, 8, 86, 13, 32, 28, 8, 21, 24, 14, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 63, 4, 28, 4, 27, 21, 13, 4, 27, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 29, 29, 28, 29, 14, 13, 29, 29, 28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 36, 53, 36, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 383, 26, 4, 27, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 403, 50, 73, 59, 59, 19, 75, 19, 286, 25, 59, 28, 59, 28, 59, 28, 36, 25, 105, 25, 105, 25, 105, 25, 105, 25, 105, 25, 6, 74, 153, 54, 17, 6, 39, 39, 10, 7, 5, 13, 22, 22, 54, 16, 6, 4, 499, 30, 28, 43, 43, 90, 188, 22, 14, 36, 22, 22, 70, 22, 22, 70, 22, 22, 70, 22, 22, 86, 22, 86, 20, 6, 24, 4, 24, 4, 24, 4, 24, 4, 24, 4, 24, 30, 24, 155, 8, 30, 24, 63, 8, 173, 214, 101, 214, 101, 215, 255, 215, 255, 215, 255, 215, 255, 215, 255, 215, 255, 215, 255, 215, 255, 215, 255, 215, 255, 215, 255, 26, 35, 6, 35, 32, 28, 42, 26, 15, 87, 81, 81, 76, 87, 76, 87, 76, 87, 76, 87, 76, 87, 76, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 43, 43, 90, 188, 22, 14, 70, 43, 36, 30, 22, 14, 70, 43, 8, 54, 38, 56, 18, 56, 18, 56, 18, 56, 18, 56, 18, 56, 10, 35, 5, 19, 5, 93, 170, 24, 24, 19, 24, 24, 19, 24, 24, 19, 24, 24, 19, 24, 24, 63, 19, 24, 14, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 29, 29, 14, 43, 8, 38, 53, 30, 82, 38, 38, 38, 38, 38, 38, 38, 38, 38, 35, 6, 165, 32, 30, 24, 67, 32, 153, 32, 226, 331, 33, 59, 19, 75, 75, 19, 5, 75, 75, 19, 5, 75, 75, 19, 104, 19, 75, 75, 19, 104, 19, 75, 75, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 141, 19, 104, 19, 75, 19, 104, 19, 75, 19, 104, 19, 104, 19, 104, 19, 104, 19, 104, 19, 104, 19, 22, 30, 97, 15, 51, 95, 15, 51, 95, 15, 51, 95, 15, 51, 95, 10, 33, 51, 33, 51, 33, 51, 33, 51, 44, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 6, 4, 5, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 122, 15, 122, 32, 135, 225, 225, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 6, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 7, 27, 56, 15, 56, 15, 56, 15, 56, 15, 56, 15, 56, 15, 72, 35, 5, 8, 5, 93, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 24, 9, 7, 24, 9, 7, 24, 9, 7, 24, 125, 10, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 30, 29, 28, 28, 29, 14, 147, 147, 147, 147, 31, 279, 20, 279, 20, 279, 20, 279, 20, 279, 20, 279, 20, 279, 20, 279, 20, 279, 20, 279, 20, 279, 26, 16, 269, 30, 24, 29, 36, 108, 79, 243, 243, 391, 274, 156, 180, 4, 257, 4, 257, 4, 257, 4, 257, 4, 257, 24, 28, 24, 4, 24, 4, 24, 4, 24, 4, 24, 29, 36, 5, 29, 29, 5, 19, 97, 5, 5, 19, 97, 5, 5, 19, 97, 29, 19, 97, 29, 19, 97, 29, 19, 97, 29, 19, 19, 29, 19, 19, 29, 19, 19, 29, 19, 19, 29, 19, 19, 29, 19, 19, 29, 19, 19, 29, 71, 19, 29, 71, 19, 29, 71, 19, 29, 8, 54, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 7, 5, 31, 12, 31]
202
+ 2026-01-26 04:50:32,722 INFO [inference.py:318] Audio shape: torch.Size([6, 237280]), dtype: torch.float32
203
+ 2026-01-26 04:50:32,723 INFO [inference.py:319] Audio range: [-0.130, 0.131]
204
+ 2026-01-26 04:50:32,723 INFO [inference.py:320] Audio lengths: tensor([237280, 228159, 220639, 220480, 219359, 213119], dtype=torch.int32)
205
+ 2026-01-26 04:50:40,430 INFO [inference.py:341] Encoder out shape: torch.Size([6, 741, 1024])
206
+ 2026-01-26 04:50:40,430 INFO [inference.py:342] Encoder out lens: tensor([741, 712, 689, 688, 685, 665])
207
+ 2026-01-26 04:50:40,435 INFO [inference.py:343] Encoder out range: [-13.120, 12.506]
208
+ 2026-01-26 04:50:52,401 INFO [inference.py:353] Number of hypotheses: 6
209
+ 2026-01-26 04:50:52,402 INFO [inference.py:355] First hypothesis: [105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 4, 257, 24, 92, 60, 92, 7, 8, 326, 116, 4, 257, 24, 92, 60, 4, 257, 24, 92, 60, 4, 257, 24, 92, 60, 28, 37, 4, 7, 295, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 31, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 2, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 6, 91, 18, 7, 27, 154, 259, 16, 16, 16, 16, 16, 16, 16, 16, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 204, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 245, 14, 43, 8, 349, 349, 349, 349, 349, 349, 349, 349, 349, 349, 349, 349, 349, 349, 349, 349, 349, 112, 283, 283, 283, 283, 283, 283, 283, 283, 283, 18, 7, 27, 13, 43, 43, 90, 58, 21, 58, 21, 58, 21, 58, 21, 58, 21, 43, 43, 90, 22, 41, 275, 32, 41, 275, 32, 41, 275, 32, 41, 275, 32, 41, 327, 4, 27, 5, 30, 27, 5, 19, 5, 162, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 18, 7, 85, 18, 18, 204, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 43, 8, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 14, 5, 5, 19, 5, 14, 5, 5, 19, 5, 14, 5, 113, 5, 14, 5, 5, 26, 4, 5, 26, 4, 5, 26, 4, 5, 26, 248, 130, 128, 26, 122, 32, 248, 130, 128, 26, 130, 4, 223, 75, 59, 28, 29, 59, 28, 29, 59, 28, 29, 59, 28, 36, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 329, 159, 33, 16, 136, 19, 22, 54, 16, 136, 19, 22, 54, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 323, 16, 323, 16, 323, 16, 323, 16, 323, 16, 6, 157, 33, 13, 196, 5, 94, 271, 16, 6, 4, 5, 29, 29, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 4, 5, 4, 2, 11, 17, 7, 5, 13, 4, 5, 4, 2, 64, 9, 7, 85, 34, 13, 239, 25, 105, 25, 6, 91, 109, 5, 61, 335, 15, 7, 27, 105, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 2, 31, 23, 17, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 52, 10, 52, 10, 52, 10, 52, 10, 52, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 49, 193, 81, 49, 49, 101, 220, 50, 31, 50, 6, 98, 21, 22, 332, 243, 243, 5, 25, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 98, 30, 82, 82, 40, 98, 30, 82, 40, 30, 82, 40, 30, 82, 40, 30, 82, 30, 36, 63, 113, 366, 113, 64, 113, 366, 113, 64, 113, 10, 113, 13, 113, 13, 113, 13, 113, 79, 14, 5, 5, 183, 5, 25, 6, 4, 5, 21, 97, 49, 193, 49, 193, 49, 193, 49, 193, 49, 193, 49, 193, 49, 10, 7, 5, 10, 7, 5, 154, 38, 35, 6, 35, 22, 14, 5, 35, 6, 35, 22, 19, 28, 168, 94, 14, 4, 2, 23, 98, 30, 42, 233, 98, 30, 42, 233, 98, 30, 42, 215, 165, 32, 22, 233, 98, 36, 67, 98, 36, 67, 98, 36, 67, 98, 36, 67, 98, 36, 67, 14, 9, 102, 9, 7, 24, 154, 34, 124, 202, 20, 101, 101, 31, 9, 7, 85, 34, 13, 169, 25, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 73, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 98, 30, 82, 82, 70, 101, 150, 101, 17, 101, 17, 101, 17, 101, 17, 101, 17, 101, 17, 101, 6, 150, 17, 52, 15, 234, 16, 6, 106, 141, 19, 5, 14, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 13, 211, 25, 75, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 329, 185, 57, 18, 7, 85, 184, 105, 6, 329, 123, 38, 13, 227, 211, 25, 75, 90, 8, 325, 25, 6, 222, 298, 157, 41, 5, 92, 55, 490, 86, 97, 369, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 27, 60, 30, 27, 5, 30, 27, 5, 30, 27, 5, 30, 27, 5, 30, 27, 5, 30, 27, 5, 30, 27, 5, 30, 24, 29, 70, 173, 5, 15, 47, 56, 15, 49, 18, 114, 38, 35, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 296, 380, 5, 11]
210
+ 2026-01-26 04:50:52,416 INFO [inference.py:318] Audio shape: torch.Size([17, 91040]), dtype: torch.float32
211
+ 2026-01-26 04:50:52,417 INFO [inference.py:319] Audio range: [-0.574, 0.629]
212
+ 2026-01-26 04:50:52,418 INFO [inference.py:320] Audio lengths: tensor([91040, 90240, 89119, 88480, 87520, 86079, 83680, 82880, 81120, 79520,
213
+ 79520, 78079, 76800, 76480, 73760, 73600, 73599], dtype=torch.int32)
214
+ 2026-01-26 04:50:59,721 INFO [inference.py:341] Encoder out shape: torch.Size([17, 284, 1024])
215
+ 2026-01-26 04:50:59,722 INFO [inference.py:342] Encoder out lens: tensor([284, 281, 278, 276, 273, 268, 261, 258, 253, 248, 248, 243, 239, 238,
216
+ 230, 229, 229])
217
+ 2026-01-26 04:50:59,722 INFO [inference.py:343] Encoder out range: [-13.703, 11.821]
218
+ 2026-01-26 04:51:09,014 INFO [inference.py:353] Number of hypotheses: 17
219
+ 2026-01-26 04:51:09,014 INFO [inference.py:355] First hypothesis: [50, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 16, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 214, 98, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 24, 67, 205, 205, 5, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 16, 259, 259, 16, 259, 259, 16, 259, 259, 16, 259, 259, 259, 259, 259, 259, 259, 259, 259, 259, 259, 101, 15, 72, 15, 72, 15, 72, 15, 72, 15, 72, 101, 6, 4, 5, 8, 136, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 265, 147, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 205, 5, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 15, 34, 13, 4, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 4, 5, 19, 5, 14, 5, 5, 19, 5, 14, 5, 5, 19, 5, 14, 5, 5, 19, 5, 14, 5, 5, 19, 5, 14, 5, 5, 19, 5, 8, 41, 5, 92, 5, 8, 41, 5, 92, 55, 490, 86, 97, 5, 5, 19, 24, 19, 5, 8, 41, 5, 92, 55, 490, 86, 97, 5, 5, 19, 5, 8, 41, 5, 92, 55, 490, 86, 97, 5, 5, 19, 22, 14, 5, 5, 19, 22, 14, 5, 5, 19, 22, 14, 5, 5, 19, 24, 94, 5, 113, 9, 113, 9, 113, 9, 113, 9, 113, 9, 113, 9, 113, 9, 113, 9, 113, 9, 113, 9, 113, 79, 14, 5, 5, 19, 5, 71, 19, 5, 71, 19, 5, 71, 19, 5, 71, 19, 5, 71, 19, 5, 71, 19, 5, 71, 19, 29, 71, 19, 29, 71, 19, 29, 71, 21, 29, 5, 8, 40, 19, 29, 71, 21, 29, 5, 4, 2, 9, 49, 23, 9, 49, 9, 7, 85, 34, 13, 74, 19, 5, 14, 36, 75, 19, 80]
220
+ 2026-01-26 04:51:09,027 INFO [inference.py:318] Audio shape: torch.Size([23, 68960]), dtype: torch.float32
221
+ 2026-01-26 04:51:09,030 INFO [inference.py:319] Audio range: [-0.269, 0.266]
222
+ 2026-01-26 04:51:09,031 INFO [inference.py:320] Audio lengths: tensor([68959, 66880, 64800, 64479, 61920, 59680, 54400, 53440, 52479, 52319,
223
+ 51840, 46880, 46559, 45120, 44480, 43360, 43360, 43360, 43040, 43040,
224
+ 43040, 42880, 42560], dtype=torch.int32)
225
+ 2026-01-26 04:51:16,407 INFO [inference.py:341] Encoder out shape: torch.Size([23, 215, 1024])
226
+ 2026-01-26 04:51:16,408 INFO [inference.py:342] Encoder out lens: tensor([215, 208, 202, 201, 193, 186, 169, 166, 163, 163, 161, 146, 145, 140,
227
+ 138, 135, 135, 135, 134, 134, 134, 133, 132])
228
+ 2026-01-26 04:51:16,408 INFO [inference.py:343] Encoder out range: [-13.477, 12.445]
229
+ 2026-01-26 04:51:24,735 INFO [inference.py:353] Number of hypotheses: 23
230
+ 2026-01-26 04:51:24,735 INFO [inference.py:355] First hypothesis: [4, 7, 295, 4, 2, 11, 4, 2, 9, 7, 85, 151, 4, 28, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 225, 4, 2, 9, 7, 24, 51, 34, 13, 4, 5, 21, 69, 5, 8, 41, 5, 92, 55, 490, 86, 97, 4, 2, 31, 20, 4, 133, 133, 22, 5, 51, 4, 2, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]
231
+ 2026-01-26 04:51:24,800 INFO [inference.py:544] Processed 391 utterances in 20 batches
232
+ 2026-01-26 04:51:24,806 INFO [inference.py:318] Audio shape: torch.Size([17, 92320]), dtype: torch.float32
233
+ 2026-01-26 04:51:24,807 INFO [inference.py:319] Audio range: [-0.234, 0.300]
234
+ 2026-01-26 04:51:24,808 INFO [inference.py:320] Audio lengths: tensor([92320, 91200, 91200, 90560, 89120, 84000, 83840, 83360, 82880, 82079,
235
+ 79840, 79520, 76800, 73760, 73280, 70079, 69600], dtype=torch.int32)
236
+ 2026-01-26 04:51:32,739 INFO [inference.py:341] Encoder out shape: torch.Size([17, 288, 1024])
237
+ 2026-01-26 04:51:32,741 INFO [inference.py:342] Encoder out lens: tensor([288, 284, 284, 282, 278, 262, 261, 260, 258, 256, 249, 248, 239, 230,
238
+ 228, 218, 217])
239
+ 2026-01-26 04:51:32,741 INFO [inference.py:343] Encoder out range: [-13.483, 12.297]
240
+ 2026-01-26 04:51:45,129 INFO [inference.py:353] Number of hypotheses: 17
241
+ 2026-01-26 04:51:45,129 INFO [inference.py:355] First hypothesis: [39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 191, 56, 56, 196, 104, 19, 22, 30, 70, 22, 435, 55, 185, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 36, 58, 36, 58, 36, 58, 36, 58, 36, 58, 36, 19, 36, 58, 36, 19, 36, 58, 36, 19, 36, 121, 54, 121, 5, 121, 54, 121, 5, 121, 54, 121, 5, 15, 7, 85, 274, 50, 180, 43, 92, 7, 8, 274, 50, 180, 43, 92, 50, 8, 28, 24, 5, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 93, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 83, 14, 84, 84, 84, 84, 84, 84, 84, 84, 84, 83, 14, 83, 84, 84, 83, 14, 83, 84, 84, 83, 7, 8, 83, 14, 84, 83, 7, 8, 83, 14, 83, 14, 83, 14, 83, 14, 83, 14, 83, 7, 8, 217, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 56, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 43, 30, 5, 14, 190, 189, 43, 30, 5, 14, 190, 4, 190, 189, 97, 5, 5, 29, 14, 245, 14, 43, 8, 245, 21, 43, 245, 14, 43, 8, 349, 245, 21, 43, 8, 349, 245, 21, 43, 8, 119, 21, 29, 14, 43, 19, 201, 14, 190, 8, 119, 8, 54, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 34, 150, 20, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84]
242
+ 2026-01-26 04:51:45,142 INFO [inference.py:318] Audio shape: torch.Size([23, 68800]), dtype: torch.float32
243
+ 2026-01-26 04:51:45,143 INFO [inference.py:319] Audio range: [-0.321, 0.370]
244
+ 2026-01-26 04:51:45,144 INFO [inference.py:320] Audio lengths: tensor([68799, 66720, 62560, 62240, 61919, 60160, 59840, 58080, 57920, 57280,
245
+ 53920, 52960, 51040, 50080, 49920, 49280, 48160, 48160, 47680, 47200,
246
+ 44800, 44000, 42560], dtype=torch.int32)
247
+ 2026-01-26 04:51:52,920 INFO [inference.py:341] Encoder out shape: torch.Size([23, 214, 1024])
248
+ 2026-01-26 04:51:52,921 INFO [inference.py:342] Encoder out lens: tensor([214, 208, 195, 194, 193, 187, 186, 181, 180, 178, 168, 165, 159, 156,
249
+ 155, 153, 150, 150, 148, 147, 139, 137, 132])
250
+ 2026-01-26 04:51:52,921 INFO [inference.py:343] Encoder out range: [-11.273, 12.003]
251
+ 2026-01-26 04:52:00,812 INFO [inference.py:353] Number of hypotheses: 23
252
+ 2026-01-26 04:52:00,813 INFO [inference.py:355] First hypothesis: [218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 4, 2, 11, 17, 7, 5, 13, 4, 5, 21, 69, 5, 8, 41, 5, 92, 55, 490, 86, 97, 5, 5, 19, 5, 8, 40, 19, 29, 8, 5, 17, 7, 5, 13, 211, 120, 412, 20, 265]
253
+ 2026-01-26 04:52:00,823 INFO [inference.py:318] Audio shape: torch.Size([38, 42080]), dtype: torch.float32
254
+ 2026-01-26 04:52:00,824 INFO [inference.py:319] Audio range: [-0.400, 0.452]
255
+ 2026-01-26 04:52:00,825 INFO [inference.py:320] Audio lengths: tensor([42080, 39200, 37439, 36960, 35520, 34560, 34079, 33599, 33600, 33280,
256
+ 31520, 31200, 29760, 28160, 28000, 27200, 26720, 25600, 25120, 23200,
257
+ 22880, 21280, 20800, 20000, 19680, 19520, 19200, 18080, 17600, 17600,
258
+ 16320, 13120, 12320, 11680, 8000, 6400, 5120, 3840],
259
+ dtype=torch.int32)
260
+ 2026-01-26 04:52:07,931 INFO [inference.py:341] Encoder out shape: torch.Size([38, 131, 1024])
261
+ 2026-01-26 04:52:07,932 INFO [inference.py:342] Encoder out lens: tensor([131, 122, 116, 115, 110, 107, 106, 104, 104, 103, 98, 97, 92, 87,
262
+ 87, 84, 83, 79, 78, 72, 71, 66, 64, 62, 61, 60, 59, 56,
263
+ 54, 54, 50, 40, 38, 36, 24, 19, 15, 11])
264
+ 2026-01-26 04:52:07,932 INFO [inference.py:343] Encoder out range: [-11.872, 11.798]
egs/ami/ASR/xlsr_transducer/inference_results/.ipynb_checkpoints/log-inference-ihm-2026-01-26-05-05-16-checkpoint ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:05:16,888 INFO [inference.py:410] ================================================================================
2
+ 2026-01-26 05:05:16,888 INFO [inference.py:411] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:05:16,888 INFO [inference.py:412] ================================================================================
4
+ 2026-01-26 05:05:16,888 INFO [inference.py:413] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:05:16,888 INFO [inference.py:414] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:05:16,888 INFO [inference.py:415] Test set: ihm
7
+ 2026-01-26 05:05:16,888 INFO [inference.py:416] Decoding method: modified_beam_search
8
+ 2026-01-26 05:05:16,888 INFO [inference.py:422] Device: cpu
9
+ 2026-01-26 05:05:16,888 INFO [inference.py:425] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-26 05:05:16,890 INFO [inference.py:433] Vocabulary size: 500
11
+ 2026-01-26 05:05:16,890 INFO [inference.py:434] Blank ID: 0
12
+ 2026-01-26 05:05:16,890 INFO [inference.py:437] Creating model
13
+ 2026-01-26 05:05:18,544 INFO [inference.py:444] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-train-loss.pt
14
+ 2026-01-26 05:05:18,544 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-train-loss.pt
15
+ 2026-01-26 05:05:23,319 INFO [inference.py:473] Number of model parameters: 317,511,772
16
+ 2026-01-26 05:05:23,320 INFO [inference.py:476] Loading test data
17
+ 2026-01-26 05:05:23,320 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-26 05:05:24,403 INFO [inference.py:487] Number of test utterances: 6676
19
+ 2026-01-26 05:05:24,403 INFO [inference.py:490] Starting inference...
20
+ 2026-01-26 05:05:25,573 INFO [inference.py:309] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-26 05:05:25,576 INFO [inference.py:310] Audio range: [-0.090, 0.104]
22
+ 2026-01-26 05:05:25,579 INFO [inference.py:311] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-26 05:05:34,838 INFO [inference.py:332] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-26 05:05:34,839 INFO [inference.py:333] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-26 05:05:34,839 INFO [inference.py:334] Encoder out range: [-13.684, 12.764]
26
+ 2026-01-26 05:05:35,536 INFO [inference.py:344] Number of hypotheses: 6
27
+ 2026-01-26 05:05:35,537 INFO [inference.py:346] First hypothesis: [171]
28
+ 2026-01-26 05:05:35,546 INFO [inference.py:309] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-26 05:05:35,547 INFO [inference.py:310] Audio range: [-0.401, 0.443]
30
+ 2026-01-26 05:05:35,547 INFO [inference.py:311] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
33
+ 2026-01-26 05:05:43,001 INFO [inference.py:332] Encoder out shape: torch.Size([23, 209, 1024])
34
+ 2026-01-26 05:05:43,003 INFO [inference.py:333] Encoder out lens: tensor([209, 207, 207, 200, 198, 196, 190, 190, 185, 184, 182, 174, 170, 163,
35
+ 162, 157, 157, 156, 151, 149, 148, 139, 137])
36
+ 2026-01-26 05:05:43,003 INFO [inference.py:334] Encoder out range: [-12.514, 12.004]
37
+ 2026-01-26 05:05:43,905 INFO [inference.py:344] Number of hypotheses: 23
38
+ 2026-01-26 05:05:43,905 INFO [inference.py:346] First hypothesis: [23, 51, 156, 6, 205, 18, 116, 113, 363]
39
+ 2026-01-26 05:05:43,925 INFO [inference.py:309] Audio shape: torch.Size([39, 40640]), dtype: torch.float32
40
+ 2026-01-26 05:05:43,926 INFO [inference.py:310] Audio range: [-0.439, 0.480]
41
+ 2026-01-26 05:05:43,926 INFO [inference.py:311] Audio lengths: tensor([40640, 37279, 36799, 36480, 36480, 33280, 33279, 32320, 32159, 30400,
42
+ 28800, 28480, 28160, 23520, 23039, 22880, 22400, 21920, 21920, 20960,
43
+ 20160, 20000, 19200, 19040, 18880, 18240, 16480, 14720, 13600, 12960,
44
+ 12320, 11680, 11520, 10880, 9440, 9120, 7840, 5920, 5760],
45
+ dtype=torch.int32)
46
+ 2026-01-26 05:05:51,027 INFO [inference.py:332] Encoder out shape: torch.Size([39, 126, 1024])
47
+ 2026-01-26 05:05:51,028 INFO [inference.py:333] Encoder out lens: tensor([126, 116, 114, 113, 113, 103, 103, 100, 100, 94, 89, 88, 87, 73,
48
+ 71, 71, 69, 68, 68, 65, 62, 62, 59, 59, 58, 56, 51, 45,
49
+ 42, 40, 38, 36, 35, 33, 29, 28, 24, 18, 17])
50
+ 2026-01-26 05:05:51,028 INFO [inference.py:334] Encoder out range: [-11.444, 10.811]
51
+ 2026-01-26 05:05:51,620 INFO [inference.py:344] Number of hypotheses: 39
52
+ 2026-01-26 05:05:51,620 INFO [inference.py:346] First hypothesis: [11]
53
+ 2026-01-26 05:05:51,628 INFO [inference.py:309] Audio shape: torch.Size([23, 68000]), dtype: torch.float32
54
+ 2026-01-26 05:05:51,629 INFO [inference.py:310] Audio range: [-0.314, 0.332]
55
+ 2026-01-26 05:05:51,629 INFO [inference.py:311] Audio lengths: tensor([68000, 65920, 65599, 64799, 64160, 63520, 62400, 61600, 59040, 58239,
56
+ 56480, 55840, 55520, 55359, 54719, 53440, 52800, 52640, 47200, 46239,
57
+ 46079, 45280, 44960], dtype=torch.int32)
58
+ 2026-01-26 05:05:59,021 INFO [inference.py:332] Encoder out shape: torch.Size([23, 212, 1024])
59
+ 2026-01-26 05:05:59,022 INFO [inference.py:333] Encoder out lens: tensor([212, 205, 204, 202, 200, 198, 194, 192, 184, 181, 176, 174, 173, 172,
60
+ 170, 166, 164, 164, 147, 144, 143, 141, 140])
61
+ 2026-01-26 05:05:59,023 INFO [inference.py:334] Encoder out range: [-13.261, 11.090]
62
+ 2026-01-26 05:05:59,931 INFO [inference.py:344] Number of hypotheses: 23
63
+ 2026-01-26 05:05:59,932 INFO [inference.py:346] First hypothesis: [20]
64
+ 2026-01-26 05:06:00,567 INFO [inference.py:309] Audio shape: torch.Size([5, 317280]), dtype: torch.float32
65
+ 2026-01-26 05:06:00,568 INFO [inference.py:310] Audio range: [-0.323, 0.414]
66
+ 2026-01-26 05:06:00,569 INFO [inference.py:311] Audio lengths: tensor([317280, 298079, 298080, 294559, 292480], dtype=torch.int32)
67
+ 2026-01-26 05:06:09,302 INFO [inference.py:332] Encoder out shape: torch.Size([5, 991, 1024])
68
+ 2026-01-26 05:06:09,303 INFO [inference.py:333] Encoder out lens: tensor([991, 931, 931, 920, 913])
69
+ 2026-01-26 05:06:09,304 INFO [inference.py:334] Encoder out range: [-14.241, 14.344]
70
+ 2026-01-26 05:06:10,112 INFO [inference.py:344] Number of hypotheses: 5
71
+ 2026-01-26 05:06:10,113 INFO [inference.py:346] First hypothesis: [37, 23, 127, 274, 5, 147, 80, 73, 6, 16, 29, 119, 5, 20, 84, 171, 15, 6, 328, 5, 127, 20, 56]
72
+ 2026-01-26 05:06:10,117 INFO [inference.py:309] Audio shape: torch.Size([40, 39360]), dtype: torch.float32
73
+ 2026-01-26 05:06:10,118 INFO [inference.py:310] Audio range: [-0.274, 0.362]
74
+ 2026-01-26 05:06:10,118 INFO [inference.py:311] Audio lengths: tensor([39359, 39199, 39039, 38080, 36000, 35200, 34880, 34880, 33760, 33760,
75
+ 33600, 33120, 29440, 29280, 27360, 24960, 24960, 23680, 21760, 21600,
76
+ 20800, 16800, 16320, 16160, 16000, 15679, 15040, 13440, 12320, 7040,
77
+ 6560, 6400, 5760, 5760, 5120, 4800, 4800, 4640, 4480, 3360],
78
+ dtype=torch.int32)
79
+ 2026-01-26 05:06:16,910 INFO [inference.py:332] Encoder out shape: torch.Size([40, 122, 1024])
80
+ 2026-01-26 05:06:16,911 INFO [inference.py:333] Encoder out lens: tensor([122, 122, 121, 118, 112, 109, 108, 108, 105, 105, 104, 103, 91, 91,
81
+ 85, 77, 77, 73, 67, 67, 64, 52, 50, 50, 49, 48, 46, 41,
82
+ 38, 21, 20, 19, 17, 17, 15, 14, 14, 14, 13, 10])
83
+ 2026-01-26 05:06:16,911 INFO [inference.py:334] Encoder out range: [-11.784, 11.570]
84
+ 2026-01-26 05:06:17,504 INFO [inference.py:344] Number of hypotheses: 40
85
+ 2026-01-26 05:06:17,504 INFO [inference.py:346] First hypothesis: []
86
+ 2026-01-26 05:06:17,513 INFO [inference.py:309] Audio shape: torch.Size([23, 66880]), dtype: torch.float32
87
+ 2026-01-26 05:06:17,514 INFO [inference.py:310] Audio range: [-0.514, 0.393]
88
+ 2026-01-26 05:06:17,514 INFO [inference.py:311] Audio lengths: tensor([66880, 65439, 60799, 60320, 59520, 58240, 57280, 56320, 55520, 54080,
89
+ 51840, 51520, 50720, 49920, 49600, 48319, 48320, 47999, 46880, 46079,
90
+ 44640, 44320, 44160], dtype=torch.int32)
91
+ 2026-01-26 05:06:24,334 INFO [inference.py:332] Encoder out shape: torch.Size([23, 208, 1024])
92
+ 2026-01-26 05:06:24,335 INFO [inference.py:333] Encoder out lens: tensor([208, 204, 189, 188, 185, 181, 178, 175, 173, 168, 161, 160, 158, 155,
93
+ 154, 150, 150, 149, 146, 143, 139, 138, 137])
94
+ 2026-01-26 05:06:24,335 INFO [inference.py:334] Encoder out range: [-12.152, 11.060]
95
+ 2026-01-26 05:06:25,044 INFO [inference.py:344] Number of hypotheses: 23
96
+ 2026-01-26 05:06:25,045 INFO [inference.py:346] First hypothesis: [51, 6, 4, 27, 5, 26, 70, 22, 20, 265, 33, 46]
97
+ 2026-01-26 05:06:25,108 INFO [inference.py:309] Audio shape: torch.Size([24, 65600]), dtype: torch.float32
98
+ 2026-01-26 05:06:25,109 INFO [inference.py:310] Audio range: [-0.416, 0.458]
99
+ 2026-01-26 05:06:25,109 INFO [inference.py:311] Audio lengths: tensor([65600, 64000, 63680, 61280, 60000, 58080, 55200, 52960, 51359, 51200,
100
+ 50720, 50720, 50080, 49280, 48639, 47840, 47360, 46880, 46400, 46240,
101
+ 45920, 44640, 43040, 42720], dtype=torch.int32)
102
+ 2026-01-26 05:06:32,219 INFO [inference.py:332] Encoder out shape: torch.Size([24, 204, 1024])
103
+ 2026-01-26 05:06:32,220 INFO [inference.py:333] Encoder out lens: tensor([204, 199, 198, 191, 187, 181, 172, 165, 160, 159, 158, 158, 156, 153,
104
+ 151, 149, 147, 146, 144, 144, 143, 139, 134, 133])
105
+ 2026-01-26 05:06:32,220 INFO [inference.py:334] Encoder out range: [-12.007, 11.624]
106
+ 2026-01-26 05:06:32,900 INFO [inference.py:344] Number of hypotheses: 24
107
+ 2026-01-26 05:06:32,900 INFO [inference.py:346] First hypothesis: [11]
108
+ 2026-01-26 05:06:32,908 INFO [inference.py:309] Audio shape: torch.Size([9, 176960]), dtype: torch.float32
109
+ 2026-01-26 05:06:32,920 INFO [inference.py:310] Audio range: [-0.135, 0.191]
110
+ 2026-01-26 05:06:32,921 INFO [inference.py:311] Audio lengths: tensor([176960, 170720, 164480, 155840, 154559, 151839, 151840, 151360, 147040],
111
+ dtype=torch.int32)
egs/ami/ASR/xlsr_transducer/inference_results/.ipynb_checkpoints/log-inference-ihm-2026-01-26-05-20-04-checkpoint ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:20:04,436 INFO [inference.py:613] ================================================================================
2
+ 2026-01-26 05:20:04,436 INFO [inference.py:614] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:20:04,436 INFO [inference.py:615] ================================================================================
4
+ 2026-01-26 05:20:04,436 INFO [inference.py:616] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:20:04,436 INFO [inference.py:617] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:20:04,436 INFO [inference.py:618] Test set: ihm
7
+ 2026-01-26 05:20:04,436 INFO [inference.py:619] Decoding method: modified_beam_search
8
+ 2026-01-26 05:20:04,436 INFO [inference.py:621] Beam size: 4
9
+ 2026-01-26 05:20:04,436 INFO [inference.py:622] Max states: 64
10
+ 2026-01-26 05:20:04,436 INFO [inference.py:623] Max symbols per frame: 3
11
+ 2026-01-26 05:20:04,437 INFO [inference.py:629] Device: cpu
12
+ 2026-01-26 05:20:04,437 INFO [inference.py:632] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 05:20:04,438 INFO [inference.py:640] Vocabulary size: 500
14
+ 2026-01-26 05:20:04,438 INFO [inference.py:641] Blank ID: 0
15
+ 2026-01-26 05:20:04,438 INFO [inference.py:644] Creating model
16
+ 2026-01-26 05:20:05,956 INFO [inference.py:651] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-train-loss.pt
17
+ 2026-01-26 05:20:05,957 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-train-loss.pt
18
+ 2026-01-26 05:20:10,638 INFO [inference.py:680] Number of model parameters: 317,511,772
19
+ 2026-01-26 05:20:10,639 INFO [inference.py:683] Loading test data
20
+ 2026-01-26 05:20:10,639 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 05:20:11,677 INFO [inference.py:694] Number of test utterances: 6676
22
+ 2026-01-26 05:20:11,677 INFO [inference.py:697] Starting inference...
egs/ami/ASR/xlsr_transducer/inference_results/.ipynb_checkpoints/log-inference-ihm-2026-01-26-05-29-29-checkpoint ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:29:29,151 INFO [inference.py:613] ================================================================================
2
+ 2026-01-26 05:29:29,151 INFO [inference.py:614] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:29:29,151 INFO [inference.py:615] ================================================================================
4
+ 2026-01-26 05:29:29,151 INFO [inference.py:616] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:29:29,151 INFO [inference.py:617] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:29:29,151 INFO [inference.py:618] Test set: ihm
7
+ 2026-01-26 05:29:29,151 INFO [inference.py:619] Decoding method: modified_beam_search
8
+ 2026-01-26 05:29:29,151 INFO [inference.py:621] Beam size: 4
9
+ 2026-01-26 05:29:29,151 INFO [inference.py:622] Max states: 64
10
+ 2026-01-26 05:29:29,151 INFO [inference.py:623] Max symbols per frame: 3
11
+ 2026-01-26 05:29:29,151 INFO [inference.py:629] Device: cpu
12
+ 2026-01-26 05:29:29,151 INFO [inference.py:632] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 05:29:29,153 INFO [inference.py:640] Vocabulary size: 500
14
+ 2026-01-26 05:29:29,153 INFO [inference.py:641] Blank ID: 0
15
+ 2026-01-26 05:29:29,153 INFO [inference.py:644] Creating model
16
+ 2026-01-26 05:29:30,733 INFO [inference.py:673] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
17
+ 2026-01-26 05:29:30,734 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
18
+ 2026-01-26 05:29:35,902 INFO [inference.py:680] Number of model parameters: 317,511,772
19
+ 2026-01-26 05:29:35,902 INFO [inference.py:683] Loading test data
20
+ 2026-01-26 05:29:35,902 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 05:29:37,022 INFO [inference.py:694] Number of test utterances: 6676
22
+ 2026-01-26 05:29:37,023 INFO [inference.py:697] Starting inference...
egs/ami/ASR/xlsr_transducer/inference_results/hyp-ihm.txt ADDED
The diff for this file is too large to render. See raw diff
 
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-25-15-47-40 ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-25 15:47:40,637 INFO [inference.py:419] ================================================================================
2
+ 2026-01-25 15:47:40,637 INFO [inference.py:420] XLSR-Transducer Inference on AMI
3
+ 2026-01-25 15:47:40,637 INFO [inference.py:421] ================================================================================
4
+ 2026-01-25 15:47:40,637 INFO [inference.py:422] Experiment dir: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd
5
+ 2026-01-25 15:47:40,637 INFO [inference.py:423] Output dir: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-25 15:47:40,637 INFO [inference.py:424] Test set: ihm
7
+ 2026-01-25 15:47:40,637 INFO [inference.py:425] Decoding method: greedy_search
8
+ 2026-01-25 15:47:40,637 INFO [inference.py:431] Device: cpu
9
+ 2026-01-25 15:47:40,637 INFO [inference.py:434] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-25 15:47:40,639 INFO [inference.py:442] Vocabulary size: 500
11
+ 2026-01-25 15:47:40,639 INFO [inference.py:443] Blank ID: 0
12
+ 2026-01-25 15:47:40,639 INFO [inference.py:446] Creating model
13
+ 2026-01-25 15:47:41,928 INFO [inference.py:453] Loading checkpoint: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
14
+ 2026-01-25 15:47:41,929 INFO [checkpoint.py:111] Loading checkpoint from /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
15
+ 2026-01-25 15:47:46,671 INFO [inference.py:482] Number of model parameters: 317,511,772
16
+ 2026-01-25 15:47:46,671 INFO [inference.py:485] Loading test data
17
+ 2026-01-25 15:47:46,671 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-25 15:47:47,717 INFO [inference.py:496] Number of test utterances: 6676
19
+ 2026-01-25 15:47:47,717 INFO [inference.py:499] Starting inference...
20
+ 2026-01-25 15:47:48,838 INFO [inference.py:318] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-25 15:47:48,842 INFO [inference.py:319] Audio range: [-0.090, 0.104]
22
+ 2026-01-25 15:47:48,845 INFO [inference.py:320] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-25 15:47:58,037 INFO [inference.py:341] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-25 15:47:58,038 INFO [inference.py:342] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-25 15:47:58,039 INFO [inference.py:343] Encoder out range: [-11.805, 12.741]
26
+ 2026-01-25 15:48:09,204 INFO [inference.py:353] Number of hypotheses: 6
27
+ 2026-01-25 15:48:09,205 INFO [inference.py:355] First hypothesis: [37, 9, 49, 17, 9, 49, 17, 9, 49, 17, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 10, 7, 5, 13, 59, 14, 164, 59, 21, 19, 40, 22, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 125, 13, 200, 130, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 49, 9, 7, 24, 154, 125, 13, 160, 202, 281, 116, 126, 281, 5, 8, 119, 55, 80, 59, 19, 75, 8, 14, 80, 59, 19, 75, 8, 14, 80, 59, 19, 75, 8, 14, 80, 59, 19, 75, 8, 14, 80, 59, 19, 75, 8, 14, 80, 59, 19, 75, 8, 14, 80, 59, 19, 75, 8, 14, 80, 4, 2, 11, 4, 2, 37, 4, 2, 37, 4, 2, 37, 4, 7, 197, 10, 7, 5, 13, 160, 157, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 2, 11]
28
+ 2026-01-25 15:48:09,222 INFO [inference.py:318] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-25 15:48:09,223 INFO [inference.py:319] Audio range: [-0.401, 0.443]
30
+ 2026-01-25 15:48:09,224 INFO [inference.py:320] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-43-42 ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 04:43:42,361 INFO [inference.py:419] ================================================================================
2
+ 2026-01-26 04:43:42,361 INFO [inference.py:420] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 04:43:42,361 INFO [inference.py:421] ================================================================================
4
+ 2026-01-26 04:43:42,361 INFO [inference.py:422] Experiment dir: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 04:43:42,361 INFO [inference.py:423] Output dir: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 04:43:42,361 INFO [inference.py:424] Test set: ihm
7
+ 2026-01-26 04:43:42,361 INFO [inference.py:425] Decoding method: greedy_search
8
+ 2026-01-26 04:43:42,361 INFO [inference.py:431] Device: cpu
9
+ 2026-01-26 04:43:42,361 INFO [inference.py:434] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-26 04:43:42,363 INFO [inference.py:442] Vocabulary size: 500
11
+ 2026-01-26 04:43:42,363 INFO [inference.py:443] Blank ID: 0
12
+ 2026-01-26 04:43:42,363 INFO [inference.py:446] Creating model
13
+ 2026-01-26 04:43:43,908 INFO [inference.py:453] Loading checkpoint: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/epoch-15.pt
14
+ 2026-01-26 04:43:43,908 INFO [checkpoint.py:111] Loading checkpoint from /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/epoch-15.pt
15
+ 2026-01-26 04:43:48,495 INFO [inference.py:482] Number of model parameters: 317,511,772
16
+ 2026-01-26 04:43:48,495 INFO [inference.py:485] Loading test data
17
+ 2026-01-26 04:43:48,495 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-26 04:43:49,572 INFO [inference.py:496] Number of test utterances: 6676
19
+ 2026-01-26 04:43:49,572 INFO [inference.py:499] Starting inference...
20
+ 2026-01-26 04:43:50,628 INFO [inference.py:318] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-26 04:43:50,631 INFO [inference.py:319] Audio range: [-0.090, 0.104]
22
+ 2026-01-26 04:43:50,633 INFO [inference.py:320] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-26 04:43:59,926 INFO [inference.py:341] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-26 04:43:59,926 INFO [inference.py:342] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-26 04:43:59,927 INFO [inference.py:343] Encoder out range: [-4.703, 6.664]
26
+ 2026-01-26 04:44:00,350 INFO [inference.py:353] Number of hypotheses: 6
27
+ 2026-01-26 04:44:00,350 INFO [inference.py:355] First hypothesis: [11]
28
+ 2026-01-26 04:44:00,355 INFO [inference.py:318] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-26 04:44:00,356 INFO [inference.py:319] Audio range: [-0.401, 0.443]
30
+ 2026-01-26 04:44:00,357 INFO [inference.py:320] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
33
+ 2026-01-26 04:44:07,432 INFO [inference.py:341] Encoder out shape: torch.Size([23, 209, 1024])
34
+ 2026-01-26 04:44:07,434 INFO [inference.py:342] Encoder out lens: tensor([209, 207, 207, 200, 198, 196, 190, 190, 185, 184, 182, 174, 170, 163,
35
+ 162, 157, 157, 156, 151, 149, 148, 139, 137])
36
+ 2026-01-26 04:44:07,434 INFO [inference.py:343] Encoder out range: [-4.701, 6.665]
37
+ 2026-01-26 04:44:08,039 INFO [inference.py:353] Number of hypotheses: 23
38
+ 2026-01-26 04:44:08,039 INFO [inference.py:355] First hypothesis: [11]
39
+ 2026-01-26 04:44:08,059 INFO [inference.py:318] Audio shape: torch.Size([39, 40640]), dtype: torch.float32
40
+ 2026-01-26 04:44:08,060 INFO [inference.py:319] Audio range: [-0.439, 0.480]
41
+ 2026-01-26 04:44:08,061 INFO [inference.py:320] Audio lengths: tensor([40640, 37279, 36799, 36480, 36480, 33280, 33279, 32320, 32159, 30400,
42
+ 28800, 28480, 28160, 23520, 23039, 22880, 22400, 21920, 21920, 20960,
43
+ 20160, 20000, 19200, 19040, 18880, 18240, 16480, 14720, 13600, 12960,
44
+ 12320, 11680, 11520, 10880, 9440, 9120, 7840, 5920, 5760],
45
+ dtype=torch.int32)
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-44-36 ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 04:44:36,166 INFO [inference.py:419] ================================================================================
2
+ 2026-01-26 04:44:36,166 INFO [inference.py:420] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 04:44:36,166 INFO [inference.py:421] ================================================================================
4
+ 2026-01-26 04:44:36,166 INFO [inference.py:422] Experiment dir: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 04:44:36,166 INFO [inference.py:423] Output dir: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 04:44:36,166 INFO [inference.py:424] Test set: ihm
7
+ 2026-01-26 04:44:36,166 INFO [inference.py:425] Decoding method: greedy_search
8
+ 2026-01-26 04:44:36,166 INFO [inference.py:431] Device: cpu
9
+ 2026-01-26 04:44:36,166 INFO [inference.py:434] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-26 04:44:36,168 INFO [inference.py:442] Vocabulary size: 500
11
+ 2026-01-26 04:44:36,168 INFO [inference.py:443] Blank ID: 0
12
+ 2026-01-26 04:44:36,168 INFO [inference.py:446] Creating model
13
+ 2026-01-26 04:44:37,655 INFO [inference.py:453] Loading checkpoint: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/checkpoint-30000.pt
14
+ 2026-01-26 04:44:37,655 INFO [checkpoint.py:111] Loading checkpoint from /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/checkpoint-30000.pt
15
+ 2026-01-26 04:44:42,489 INFO [inference.py:482] Number of model parameters: 317,511,772
16
+ 2026-01-26 04:44:42,489 INFO [inference.py:485] Loading test data
17
+ 2026-01-26 04:44:42,489 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-26 04:44:43,623 INFO [inference.py:496] Number of test utterances: 6676
19
+ 2026-01-26 04:44:43,623 INFO [inference.py:499] Starting inference...
20
+ 2026-01-26 04:44:44,773 INFO [inference.py:318] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-26 04:44:44,776 INFO [inference.py:319] Audio range: [-0.090, 0.104]
22
+ 2026-01-26 04:44:44,779 INFO [inference.py:320] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-26 04:44:52,532 INFO [inference.py:341] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-26 04:44:52,532 INFO [inference.py:342] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-26 04:44:52,533 INFO [inference.py:343] Encoder out range: [-4.808, 7.175]
26
+ 2026-01-26 04:44:52,980 INFO [inference.py:353] Number of hypotheses: 6
27
+ 2026-01-26 04:44:52,980 INFO [inference.py:355] First hypothesis: [11, 4, 2, 11]
28
+ 2026-01-26 04:44:52,988 INFO [inference.py:318] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-26 04:44:52,989 INFO [inference.py:319] Audio range: [-0.401, 0.443]
30
+ 2026-01-26 04:44:52,990 INFO [inference.py:320] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
33
+ 2026-01-26 04:45:00,200 INFO [inference.py:341] Encoder out shape: torch.Size([23, 209, 1024])
34
+ 2026-01-26 04:45:00,201 INFO [inference.py:342] Encoder out lens: tensor([209, 207, 207, 200, 198, 196, 190, 190, 185, 184, 182, 174, 170, 163,
35
+ 162, 157, 157, 156, 151, 149, 148, 139, 137])
36
+ 2026-01-26 04:45:00,201 INFO [inference.py:343] Encoder out range: [-4.808, 7.173]
37
+ 2026-01-26 04:45:01,019 INFO [inference.py:353] Number of hypotheses: 23
38
+ 2026-01-26 04:45:01,019 INFO [inference.py:355] First hypothesis: [11, 4, 2, 11]
39
+ 2026-01-26 04:45:01,031 INFO [inference.py:318] Audio shape: torch.Size([39, 40640]), dtype: torch.float32
40
+ 2026-01-26 04:45:01,032 INFO [inference.py:319] Audio range: [-0.439, 0.480]
41
+ 2026-01-26 04:45:01,033 INFO [inference.py:320] Audio lengths: tensor([40640, 37279, 36799, 36480, 36480, 33280, 33279, 32320, 32159, 30400,
42
+ 28800, 28480, 28160, 23520, 23039, 22880, 22400, 21920, 21920, 20960,
43
+ 20160, 20000, 19200, 19040, 18880, 18240, 16480, 14720, 13600, 12960,
44
+ 12320, 11680, 11520, 10880, 9440, 9120, 7840, 5920, 5760],
45
+ dtype=torch.int32)
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-45-26 ADDED
The diff for this file is too large to render. See raw diff
 
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-57-24 ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 04:57:24,557 INFO [inference.py:410] ================================================================================
2
+ 2026-01-26 04:57:24,557 INFO [inference.py:411] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 04:57:24,557 INFO [inference.py:412] ================================================================================
4
+ 2026-01-26 04:57:24,557 INFO [inference.py:413] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 04:57:24,557 INFO [inference.py:414] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 04:57:24,557 INFO [inference.py:415] Test set: ihm
7
+ 2026-01-26 04:57:24,558 INFO [inference.py:416] Decoding method: greedy_search
8
+ 2026-01-26 04:57:24,558 INFO [inference.py:422] Device: cpu
9
+ 2026-01-26 04:57:24,558 INFO [inference.py:425] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-26 04:57:24,559 INFO [inference.py:433] Vocabulary size: 500
11
+ 2026-01-26 04:57:24,559 INFO [inference.py:434] Blank ID: 0
12
+ 2026-01-26 04:57:24,559 INFO [inference.py:437] Creating model
13
+ 2026-01-26 04:57:26,107 INFO [inference.py:459] Loading checkpoint: xlsr_transducer/exp_16gb_scd/epoch-15.pt
14
+ 2026-01-26 04:57:26,108 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/epoch-15.pt
15
+ 2026-01-26 04:57:30,697 INFO [inference.py:473] Number of model parameters: 317,511,772
16
+ 2026-01-26 04:57:30,697 INFO [inference.py:476] Loading test data
17
+ 2026-01-26 04:57:30,697 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-26 04:57:31,812 INFO [inference.py:487] Number of test utterances: 6676
19
+ 2026-01-26 04:57:31,812 INFO [inference.py:490] Starting inference...
20
+ 2026-01-26 04:57:32,942 INFO [inference.py:309] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-26 04:57:32,945 INFO [inference.py:310] Audio range: [-0.090, 0.104]
22
+ 2026-01-26 04:57:32,948 INFO [inference.py:311] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-26 04:57:42,125 INFO [inference.py:332] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-26 04:57:42,126 INFO [inference.py:333] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-26 04:57:42,129 INFO [inference.py:334] Encoder out range: [-4.703, 6.664]
26
+ 2026-01-26 04:57:42,499 INFO [inference.py:344] Number of hypotheses: 6
27
+ 2026-01-26 04:57:42,500 INFO [inference.py:346] First hypothesis: []
28
+ 2026-01-26 04:57:42,506 INFO [inference.py:309] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-26 04:57:42,506 INFO [inference.py:310] Audio range: [-0.401, 0.443]
30
+ 2026-01-26 04:57:42,507 INFO [inference.py:311] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
33
+ 2026-01-26 04:57:49,615 INFO [inference.py:332] Encoder out shape: torch.Size([23, 209, 1024])
34
+ 2026-01-26 04:57:49,616 INFO [inference.py:333] Encoder out lens: tensor([209, 207, 207, 200, 198, 196, 190, 190, 185, 184, 182, 174, 170, 163,
35
+ 162, 157, 157, 156, 151, 149, 148, 139, 137])
36
+ 2026-01-26 04:57:49,616 INFO [inference.py:334] Encoder out range: [-4.701, 6.665]
37
+ 2026-01-26 04:57:50,141 INFO [inference.py:344] Number of hypotheses: 23
38
+ 2026-01-26 04:57:50,141 INFO [inference.py:346] First hypothesis: []
39
+ 2026-01-26 04:57:50,152 INFO [inference.py:309] Audio shape: torch.Size([39, 40640]), dtype: torch.float32
40
+ 2026-01-26 04:57:50,153 INFO [inference.py:310] Audio range: [-0.439, 0.480]
41
+ 2026-01-26 04:57:50,154 INFO [inference.py:311] Audio lengths: tensor([40640, 37279, 36799, 36480, 36480, 33280, 33279, 32320, 32159, 30400,
42
+ 28800, 28480, 28160, 23520, 23039, 22880, 22400, 21920, 21920, 20960,
43
+ 20160, 20000, 19200, 19040, 18880, 18240, 16480, 14720, 13600, 12960,
44
+ 12320, 11680, 11520, 10880, 9440, 9120, 7840, 5920, 5760],
45
+ dtype=torch.int32)
46
+ 2026-01-26 04:57:56,416 INFO [inference.py:332] Encoder out shape: torch.Size([39, 126, 1024])
47
+ 2026-01-26 04:57:56,417 INFO [inference.py:333] Encoder out lens: tensor([126, 116, 114, 113, 113, 103, 103, 100, 100, 94, 89, 88, 87, 73,
48
+ 71, 71, 69, 68, 68, 65, 62, 62, 59, 59, 58, 56, 51, 45,
49
+ 42, 40, 38, 36, 35, 33, 29, 28, 24, 18, 17])
50
+ 2026-01-26 04:57:56,417 INFO [inference.py:334] Encoder out range: [-4.699, 6.664]
51
+ 2026-01-26 04:57:56,902 INFO [inference.py:344] Number of hypotheses: 39
52
+ 2026-01-26 04:57:56,902 INFO [inference.py:346] First hypothesis: []
53
+ 2026-01-26 04:57:56,906 INFO [inference.py:309] Audio shape: torch.Size([23, 68000]), dtype: torch.float32
54
+ 2026-01-26 04:57:56,907 INFO [inference.py:310] Audio range: [-0.314, 0.332]
55
+ 2026-01-26 04:57:56,907 INFO [inference.py:311] Audio lengths: tensor([68000, 65920, 65599, 64799, 64160, 63520, 62400, 61600, 59040, 58239,
56
+ 56480, 55840, 55520, 55359, 54719, 53440, 52800, 52640, 47200, 46239,
57
+ 46079, 45280, 44960], dtype=torch.int32)
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-58-20 ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 04:58:20,350 INFO [inference.py:410] ================================================================================
2
+ 2026-01-26 04:58:20,350 INFO [inference.py:411] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 04:58:20,350 INFO [inference.py:412] ================================================================================
4
+ 2026-01-26 04:58:20,350 INFO [inference.py:413] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 04:58:20,350 INFO [inference.py:414] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 04:58:20,350 INFO [inference.py:415] Test set: ihm
7
+ 2026-01-26 04:58:20,350 INFO [inference.py:416] Decoding method: modified_beam_search
8
+ 2026-01-26 04:58:20,350 INFO [inference.py:422] Device: cpu
9
+ 2026-01-26 04:58:20,350 INFO [inference.py:425] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-26 04:58:20,352 INFO [inference.py:433] Vocabulary size: 500
11
+ 2026-01-26 04:58:20,352 INFO [inference.py:434] Blank ID: 0
12
+ 2026-01-26 04:58:20,352 INFO [inference.py:437] Creating model
13
+ 2026-01-26 04:58:21,896 INFO [inference.py:459] Loading checkpoint: xlsr_transducer/exp_16gb_scd/epoch-15.pt
14
+ 2026-01-26 04:58:21,897 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/epoch-15.pt
15
+ 2026-01-26 04:58:26,596 INFO [inference.py:473] Number of model parameters: 317,511,772
16
+ 2026-01-26 04:58:26,597 INFO [inference.py:476] Loading test data
17
+ 2026-01-26 04:58:26,597 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-26 04:58:27,751 INFO [inference.py:487] Number of test utterances: 6676
19
+ 2026-01-26 04:58:27,752 INFO [inference.py:490] Starting inference...
20
+ 2026-01-26 04:58:28,913 INFO [inference.py:309] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-26 04:58:28,920 INFO [inference.py:310] Audio range: [-0.090, 0.104]
22
+ 2026-01-26 04:58:28,923 INFO [inference.py:311] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-26 04:58:38,037 INFO [inference.py:332] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-26 04:58:38,037 INFO [inference.py:333] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-26 04:58:38,038 INFO [inference.py:334] Encoder out range: [-4.703, 6.664]
26
+ 2026-01-26 04:58:38,417 INFO [inference.py:344] Number of hypotheses: 6
27
+ 2026-01-26 04:58:38,418 INFO [inference.py:346] First hypothesis: []
28
+ 2026-01-26 04:58:38,426 INFO [inference.py:309] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-26 04:58:38,427 INFO [inference.py:310] Audio range: [-0.401, 0.443]
30
+ 2026-01-26 04:58:38,428 INFO [inference.py:311] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-04-59-21 ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 04:59:21,443 INFO [inference.py:410] ================================================================================
2
+ 2026-01-26 04:59:21,443 INFO [inference.py:411] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 04:59:21,443 INFO [inference.py:412] ================================================================================
4
+ 2026-01-26 04:59:21,443 INFO [inference.py:413] Experiment dir: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 04:59:21,443 INFO [inference.py:414] Output dir: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 04:59:21,443 INFO [inference.py:415] Test set: ihm
7
+ 2026-01-26 04:59:21,443 INFO [inference.py:416] Decoding method: modified_beam_search
8
+ 2026-01-26 04:59:21,443 INFO [inference.py:422] Device: cpu
9
+ 2026-01-26 04:59:21,443 INFO [inference.py:425] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-26 04:59:21,445 INFO [inference.py:433] Vocabulary size: 500
11
+ 2026-01-26 04:59:21,445 INFO [inference.py:434] Blank ID: 0
12
+ 2026-01-26 04:59:21,445 INFO [inference.py:437] Creating model
13
+ 2026-01-26 04:59:23,052 INFO [inference.py:444] Loading checkpoint: /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/best-train-loss.pt
14
+ 2026-01-26 04:59:23,052 INFO [checkpoint.py:111] Loading checkpoint from /workspace/icefall/egs/ami/ASR/xlsr_transducer/exp_16gb_scd/best-train-loss.pt
15
+ 2026-01-26 04:59:27,784 INFO [inference.py:473] Number of model parameters: 317,511,772
16
+ 2026-01-26 04:59:27,784 INFO [inference.py:476] Loading test data
17
+ 2026-01-26 04:59:27,784 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-26 04:59:28,889 INFO [inference.py:487] Number of test utterances: 6676
19
+ 2026-01-26 04:59:28,889 INFO [inference.py:490] Starting inference...
20
+ 2026-01-26 04:59:29,994 INFO [inference.py:309] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-26 04:59:29,997 INFO [inference.py:310] Audio range: [-0.090, 0.104]
22
+ 2026-01-26 04:59:30,000 INFO [inference.py:311] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-26 04:59:39,304 INFO [inference.py:332] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-26 04:59:39,305 INFO [inference.py:333] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-26 04:59:39,306 INFO [inference.py:334] Encoder out range: [-13.684, 12.764]
26
+ 2026-01-26 04:59:39,937 INFO [inference.py:344] Number of hypotheses: 6
27
+ 2026-01-26 04:59:39,938 INFO [inference.py:346] First hypothesis: [171]
28
+ 2026-01-26 04:59:39,943 INFO [inference.py:309] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-26 04:59:39,998 INFO [inference.py:310] Audio range: [-0.401, 0.443]
30
+ 2026-01-26 04:59:39,999 INFO [inference.py:311] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
33
+ 2026-01-26 04:59:47,631 INFO [inference.py:332] Encoder out shape: torch.Size([23, 209, 1024])
34
+ 2026-01-26 04:59:47,632 INFO [inference.py:333] Encoder out lens: tensor([209, 207, 207, 200, 198, 196, 190, 190, 185, 184, 182, 174, 170, 163,
35
+ 162, 157, 157, 156, 151, 149, 148, 139, 137])
36
+ 2026-01-26 04:59:47,632 INFO [inference.py:334] Encoder out range: [-12.514, 12.004]
37
+ 2026-01-26 04:59:48,802 INFO [inference.py:344] Number of hypotheses: 23
38
+ 2026-01-26 04:59:48,802 INFO [inference.py:346] First hypothesis: [23, 51, 156, 6, 205, 18, 116, 113, 363]
39
+ 2026-01-26 04:59:49,215 INFO [inference.py:309] Audio shape: torch.Size([39, 40640]), dtype: torch.float32
40
+ 2026-01-26 04:59:49,220 INFO [inference.py:310] Audio range: [-0.439, 0.480]
41
+ 2026-01-26 04:59:49,221 INFO [inference.py:311] Audio lengths: tensor([40640, 37279, 36799, 36480, 36480, 33280, 33279, 32320, 32159, 30400,
42
+ 28800, 28480, 28160, 23520, 23039, 22880, 22400, 21920, 21920, 20960,
43
+ 20160, 20000, 19200, 19040, 18880, 18240, 16480, 14720, 13600, 12960,
44
+ 12320, 11680, 11520, 10880, 9440, 9120, 7840, 5920, 5760],
45
+ dtype=torch.int32)
46
+ 2026-01-26 04:59:56,731 INFO [inference.py:332] Encoder out shape: torch.Size([39, 126, 1024])
47
+ 2026-01-26 04:59:56,732 INFO [inference.py:333] Encoder out lens: tensor([126, 116, 114, 113, 113, 103, 103, 100, 100, 94, 89, 88, 87, 73,
48
+ 71, 71, 69, 68, 68, 65, 62, 62, 59, 59, 58, 56, 51, 45,
49
+ 42, 40, 38, 36, 35, 33, 29, 28, 24, 18, 17])
50
+ 2026-01-26 04:59:56,733 INFO [inference.py:334] Encoder out range: [-11.444, 10.811]
51
+ 2026-01-26 04:59:57,403 INFO [inference.py:344] Number of hypotheses: 39
52
+ 2026-01-26 04:59:57,403 INFO [inference.py:346] First hypothesis: [11]
53
+ 2026-01-26 04:59:57,409 INFO [inference.py:309] Audio shape: torch.Size([23, 68000]), dtype: torch.float32
54
+ 2026-01-26 04:59:57,420 INFO [inference.py:310] Audio range: [-0.314, 0.332]
55
+ 2026-01-26 04:59:57,420 INFO [inference.py:311] Audio lengths: tensor([68000, 65920, 65599, 64799, 64160, 63520, 62400, 61600, 59040, 58239,
56
+ 56480, 55840, 55520, 55359, 54719, 53440, 52800, 52640, 47200, 46239,
57
+ 46079, 45280, 44960], dtype=torch.int32)
58
+ 2026-01-26 05:00:05,318 INFO [inference.py:332] Encoder out shape: torch.Size([23, 212, 1024])
59
+ 2026-01-26 05:00:05,319 INFO [inference.py:333] Encoder out lens: tensor([212, 205, 204, 202, 200, 198, 194, 192, 184, 181, 176, 174, 173, 172,
60
+ 170, 166, 164, 164, 147, 144, 143, 141, 140])
61
+ 2026-01-26 05:00:05,319 INFO [inference.py:334] Encoder out range: [-13.261, 11.090]
62
+ 2026-01-26 05:00:06,035 INFO [inference.py:344] Number of hypotheses: 23
63
+ 2026-01-26 05:00:06,035 INFO [inference.py:346] First hypothesis: [20]
64
+ 2026-01-26 05:00:06,104 INFO [inference.py:309] Audio shape: torch.Size([5, 317280]), dtype: torch.float32
65
+ 2026-01-26 05:00:06,105 INFO [inference.py:310] Audio range: [-0.323, 0.414]
66
+ 2026-01-26 05:00:06,105 INFO [inference.py:311] Audio lengths: tensor([317280, 298079, 298080, 294559, 292480], dtype=torch.int32)
67
+ 2026-01-26 05:00:14,039 INFO [inference.py:332] Encoder out shape: torch.Size([5, 991, 1024])
68
+ 2026-01-26 05:00:14,040 INFO [inference.py:333] Encoder out lens: tensor([991, 931, 931, 920, 913])
69
+ 2026-01-26 05:00:14,098 INFO [inference.py:334] Encoder out range: [-14.241, 14.344]
70
+ 2026-01-26 05:00:14,713 INFO [inference.py:344] Number of hypotheses: 5
71
+ 2026-01-26 05:00:14,713 INFO [inference.py:346] First hypothesis: [37, 23, 127, 274, 5, 147, 80, 73, 6, 16, 29, 119, 5, 20, 84, 171, 15, 6, 328, 5, 127, 20, 56]
72
+ 2026-01-26 05:00:14,718 INFO [inference.py:309] Audio shape: torch.Size([40, 39360]), dtype: torch.float32
73
+ 2026-01-26 05:00:14,719 INFO [inference.py:310] Audio range: [-0.274, 0.362]
74
+ 2026-01-26 05:00:14,719 INFO [inference.py:311] Audio lengths: tensor([39359, 39199, 39039, 38080, 36000, 35200, 34880, 34880, 33760, 33760,
75
+ 33600, 33120, 29440, 29280, 27360, 24960, 24960, 23680, 21760, 21600,
76
+ 20800, 16800, 16320, 16160, 16000, 15679, 15040, 13440, 12320, 7040,
77
+ 6560, 6400, 5760, 5760, 5120, 4800, 4800, 4640, 4480, 3360],
78
+ dtype=torch.int32)
79
+ 2026-01-26 05:00:21,633 INFO [inference.py:332] Encoder out shape: torch.Size([40, 122, 1024])
80
+ 2026-01-26 05:00:21,634 INFO [inference.py:333] Encoder out lens: tensor([122, 122, 121, 118, 112, 109, 108, 108, 105, 105, 104, 103, 91, 91,
81
+ 85, 77, 77, 73, 67, 67, 64, 52, 50, 50, 49, 48, 46, 41,
82
+ 38, 21, 20, 19, 17, 17, 15, 14, 14, 14, 13, 10])
83
+ 2026-01-26 05:00:21,635 INFO [inference.py:334] Encoder out range: [-11.784, 11.570]
84
+ 2026-01-26 05:00:22,302 INFO [inference.py:344] Number of hypotheses: 40
85
+ 2026-01-26 05:00:22,302 INFO [inference.py:346] First hypothesis: []
86
+ 2026-01-26 05:00:22,310 INFO [inference.py:309] Audio shape: torch.Size([23, 66880]), dtype: torch.float32
87
+ 2026-01-26 05:00:22,311 INFO [inference.py:310] Audio range: [-0.514, 0.393]
88
+ 2026-01-26 05:00:22,311 INFO [inference.py:311] Audio lengths: tensor([66880, 65439, 60799, 60320, 59520, 58240, 57280, 56320, 55520, 54080,
89
+ 51840, 51520, 50720, 49920, 49600, 48319, 48320, 47999, 46880, 46079,
90
+ 44640, 44320, 44160], dtype=torch.int32)
91
+ 2026-01-26 05:00:29,229 INFO [inference.py:332] Encoder out shape: torch.Size([23, 208, 1024])
92
+ 2026-01-26 05:00:29,230 INFO [inference.py:333] Encoder out lens: tensor([208, 204, 189, 188, 185, 181, 178, 175, 173, 168, 161, 160, 158, 155,
93
+ 154, 150, 150, 149, 146, 143, 139, 138, 137])
94
+ 2026-01-26 05:00:29,230 INFO [inference.py:334] Encoder out range: [-12.152, 11.060]
95
+ 2026-01-26 05:00:29,913 INFO [inference.py:344] Number of hypotheses: 23
96
+ 2026-01-26 05:00:29,913 INFO [inference.py:346] First hypothesis: [51, 6, 4, 27, 5, 26, 70, 22, 20, 265, 33, 46]
97
+ 2026-01-26 05:00:29,920 INFO [inference.py:309] Audio shape: torch.Size([24, 65600]), dtype: torch.float32
98
+ 2026-01-26 05:00:29,921 INFO [inference.py:310] Audio range: [-0.416, 0.458]
99
+ 2026-01-26 05:00:29,921 INFO [inference.py:311] Audio lengths: tensor([65600, 64000, 63680, 61280, 60000, 58080, 55200, 52960, 51359, 51200,
100
+ 50720, 50720, 50080, 49280, 48639, 47840, 47360, 46880, 46400, 46240,
101
+ 45920, 44640, 43040, 42720], dtype=torch.int32)
102
+ 2026-01-26 05:00:37,217 INFO [inference.py:332] Encoder out shape: torch.Size([24, 204, 1024])
103
+ 2026-01-26 05:00:37,217 INFO [inference.py:333] Encoder out lens: tensor([204, 199, 198, 191, 187, 181, 172, 165, 160, 159, 158, 158, 156, 153,
104
+ 151, 149, 147, 146, 144, 144, 143, 139, 134, 133])
105
+ 2026-01-26 05:00:37,218 INFO [inference.py:334] Encoder out range: [-12.007, 11.624]
106
+ 2026-01-26 05:00:37,807 INFO [inference.py:344] Number of hypotheses: 24
107
+ 2026-01-26 05:00:37,808 INFO [inference.py:346] First hypothesis: [11]
108
+ 2026-01-26 05:00:37,815 INFO [inference.py:309] Audio shape: torch.Size([9, 176960]), dtype: torch.float32
109
+ 2026-01-26 05:00:37,816 INFO [inference.py:310] Audio range: [-0.135, 0.191]
110
+ 2026-01-26 05:00:37,816 INFO [inference.py:311] Audio lengths: tensor([176960, 170720, 164480, 155840, 154559, 151839, 151840, 151360, 147040],
111
+ dtype=torch.int32)
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-02-37 ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:02:37,849 INFO [inference.py:410] ================================================================================
2
+ 2026-01-26 05:02:37,849 INFO [inference.py:411] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:02:37,850 INFO [inference.py:412] ================================================================================
4
+ 2026-01-26 05:02:37,850 INFO [inference.py:413] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:02:37,850 INFO [inference.py:414] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:02:37,850 INFO [inference.py:415] Test set: ihm
7
+ 2026-01-26 05:02:37,850 INFO [inference.py:416] Decoding method: modified_beam_search
8
+ 2026-01-26 05:02:37,850 INFO [inference.py:422] Device: cpu
9
+ 2026-01-26 05:02:37,850 INFO [inference.py:425] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-26 05:02:37,851 INFO [inference.py:433] Vocabulary size: 500
11
+ 2026-01-26 05:02:37,851 INFO [inference.py:434] Blank ID: 0
12
+ 2026-01-26 05:02:37,851 INFO [inference.py:437] Creating model
13
+ 2026-01-26 05:02:39,443 INFO [inference.py:459] Loading checkpoint: xlsr_transducer/exp_16gb_scd/epoch-15.pt
14
+ 2026-01-26 05:02:39,444 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/epoch-15.pt
15
+ 2026-01-26 05:02:44,138 INFO [inference.py:473] Number of model parameters: 317,511,772
16
+ 2026-01-26 05:02:44,139 INFO [inference.py:476] Loading test data
17
+ 2026-01-26 05:02:44,139 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-26 05:02:45,310 INFO [inference.py:487] Number of test utterances: 6676
19
+ 2026-01-26 05:02:45,310 INFO [inference.py:490] Starting inference...
20
+ 2026-01-26 05:02:46,398 INFO [inference.py:309] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-26 05:02:46,400 INFO [inference.py:310] Audio range: [-0.090, 0.104]
22
+ 2026-01-26 05:02:46,404 INFO [inference.py:311] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-26 05:02:55,240 INFO [inference.py:332] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-26 05:02:55,241 INFO [inference.py:333] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-26 05:02:55,298 INFO [inference.py:334] Encoder out range: [-4.703, 6.664]
26
+ 2026-01-26 05:02:55,596 INFO [inference.py:344] Number of hypotheses: 6
27
+ 2026-01-26 05:02:55,596 INFO [inference.py:346] First hypothesis: []
28
+ 2026-01-26 05:02:55,605 INFO [inference.py:309] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-26 05:02:55,606 INFO [inference.py:310] Audio range: [-0.401, 0.443]
30
+ 2026-01-26 05:02:55,607 INFO [inference.py:311] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-03-42 ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:03:42,123 INFO [inference.py:410] ================================================================================
2
+ 2026-01-26 05:03:42,123 INFO [inference.py:411] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:03:42,123 INFO [inference.py:412] ================================================================================
4
+ 2026-01-26 05:03:42,123 INFO [inference.py:413] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:03:42,123 INFO [inference.py:414] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:03:42,123 INFO [inference.py:415] Test set: ihm
7
+ 2026-01-26 05:03:42,123 INFO [inference.py:416] Decoding method: modified_beam_search
8
+ 2026-01-26 05:03:42,123 INFO [inference.py:422] Device: cpu
9
+ 2026-01-26 05:03:42,124 INFO [inference.py:425] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-26 05:03:42,125 INFO [inference.py:433] Vocabulary size: 500
11
+ 2026-01-26 05:03:42,125 INFO [inference.py:434] Blank ID: 0
12
+ 2026-01-26 05:03:42,125 INFO [inference.py:437] Creating model
13
+ 2026-01-26 05:03:43,760 INFO [inference.py:459] Loading checkpoint: xlsr_transducer/exp_16gb_scd/epoch-15.pt
14
+ 2026-01-26 05:03:43,760 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/epoch-15.pt
15
+ 2026-01-26 05:03:48,510 INFO [inference.py:473] Number of model parameters: 317,511,772
16
+ 2026-01-26 05:03:48,511 INFO [inference.py:476] Loading test data
17
+ 2026-01-26 05:03:48,511 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-26 05:03:49,613 INFO [inference.py:487] Number of test utterances: 6676
19
+ 2026-01-26 05:03:49,613 INFO [inference.py:490] Starting inference...
20
+ 2026-01-26 05:03:50,741 INFO [inference.py:309] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-26 05:03:50,756 INFO [inference.py:310] Audio range: [-0.090, 0.104]
22
+ 2026-01-26 05:03:50,759 INFO [inference.py:311] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-26 05:03:59,827 INFO [inference.py:332] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-26 05:03:59,828 INFO [inference.py:333] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-26 05:03:59,828 INFO [inference.py:334] Encoder out range: [-4.703, 6.664]
26
+ 2026-01-26 05:04:00,148 INFO [inference.py:344] Number of hypotheses: 6
27
+ 2026-01-26 05:04:00,148 INFO [inference.py:346] First hypothesis: []
28
+ 2026-01-26 05:04:00,158 INFO [inference.py:309] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-26 05:04:00,162 INFO [inference.py:310] Audio range: [-0.401, 0.443]
30
+ 2026-01-26 05:04:00,163 INFO [inference.py:311] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-05-16 ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:05:16,888 INFO [inference.py:410] ================================================================================
2
+ 2026-01-26 05:05:16,888 INFO [inference.py:411] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:05:16,888 INFO [inference.py:412] ================================================================================
4
+ 2026-01-26 05:05:16,888 INFO [inference.py:413] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:05:16,888 INFO [inference.py:414] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:05:16,888 INFO [inference.py:415] Test set: ihm
7
+ 2026-01-26 05:05:16,888 INFO [inference.py:416] Decoding method: modified_beam_search
8
+ 2026-01-26 05:05:16,888 INFO [inference.py:422] Device: cpu
9
+ 2026-01-26 05:05:16,888 INFO [inference.py:425] Loading BPE model from data/lang_bpe_500_scd
10
+ 2026-01-26 05:05:16,890 INFO [inference.py:433] Vocabulary size: 500
11
+ 2026-01-26 05:05:16,890 INFO [inference.py:434] Blank ID: 0
12
+ 2026-01-26 05:05:16,890 INFO [inference.py:437] Creating model
13
+ 2026-01-26 05:05:18,544 INFO [inference.py:444] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-train-loss.pt
14
+ 2026-01-26 05:05:18,544 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-train-loss.pt
15
+ 2026-01-26 05:05:23,319 INFO [inference.py:473] Number of model parameters: 317,511,772
16
+ 2026-01-26 05:05:23,320 INFO [inference.py:476] Loading test data
17
+ 2026-01-26 05:05:23,320 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
18
+ 2026-01-26 05:05:24,403 INFO [inference.py:487] Number of test utterances: 6676
19
+ 2026-01-26 05:05:24,403 INFO [inference.py:490] Starting inference...
20
+ 2026-01-26 05:05:25,573 INFO [inference.py:309] Audio shape: torch.Size([6, 246400]), dtype: torch.float32
21
+ 2026-01-26 05:05:25,576 INFO [inference.py:310] Audio range: [-0.090, 0.104]
22
+ 2026-01-26 05:05:25,579 INFO [inference.py:311] Audio lengths: tensor([246400, 244799, 238079, 228000, 224000, 222880], dtype=torch.int32)
23
+ 2026-01-26 05:05:34,838 INFO [inference.py:332] Encoder out shape: torch.Size([6, 769, 1024])
24
+ 2026-01-26 05:05:34,839 INFO [inference.py:333] Encoder out lens: tensor([769, 764, 743, 712, 699, 696])
25
+ 2026-01-26 05:05:34,839 INFO [inference.py:334] Encoder out range: [-13.684, 12.764]
26
+ 2026-01-26 05:05:35,536 INFO [inference.py:344] Number of hypotheses: 6
27
+ 2026-01-26 05:05:35,537 INFO [inference.py:346] First hypothesis: [171]
28
+ 2026-01-26 05:05:35,546 INFO [inference.py:309] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
29
+ 2026-01-26 05:05:35,547 INFO [inference.py:310] Audio range: [-0.401, 0.443]
30
+ 2026-01-26 05:05:35,547 INFO [inference.py:311] Audio lengths: tensor([67200, 66559, 66400, 64159, 63680, 63040, 61120, 60960, 59519, 59040,
31
+ 58560, 55840, 54559, 52480, 52160, 50559, 50400, 50240, 48480, 47840,
32
+ 47520, 44639, 44000], dtype=torch.int32)
33
+ 2026-01-26 05:05:43,001 INFO [inference.py:332] Encoder out shape: torch.Size([23, 209, 1024])
34
+ 2026-01-26 05:05:43,003 INFO [inference.py:333] Encoder out lens: tensor([209, 207, 207, 200, 198, 196, 190, 190, 185, 184, 182, 174, 170, 163,
35
+ 162, 157, 157, 156, 151, 149, 148, 139, 137])
36
+ 2026-01-26 05:05:43,003 INFO [inference.py:334] Encoder out range: [-12.514, 12.004]
37
+ 2026-01-26 05:05:43,905 INFO [inference.py:344] Number of hypotheses: 23
38
+ 2026-01-26 05:05:43,905 INFO [inference.py:346] First hypothesis: [23, 51, 156, 6, 205, 18, 116, 113, 363]
39
+ 2026-01-26 05:05:43,925 INFO [inference.py:309] Audio shape: torch.Size([39, 40640]), dtype: torch.float32
40
+ 2026-01-26 05:05:43,926 INFO [inference.py:310] Audio range: [-0.439, 0.480]
41
+ 2026-01-26 05:05:43,926 INFO [inference.py:311] Audio lengths: tensor([40640, 37279, 36799, 36480, 36480, 33280, 33279, 32320, 32159, 30400,
42
+ 28800, 28480, 28160, 23520, 23039, 22880, 22400, 21920, 21920, 20960,
43
+ 20160, 20000, 19200, 19040, 18880, 18240, 16480, 14720, 13600, 12960,
44
+ 12320, 11680, 11520, 10880, 9440, 9120, 7840, 5920, 5760],
45
+ dtype=torch.int32)
46
+ 2026-01-26 05:05:51,027 INFO [inference.py:332] Encoder out shape: torch.Size([39, 126, 1024])
47
+ 2026-01-26 05:05:51,028 INFO [inference.py:333] Encoder out lens: tensor([126, 116, 114, 113, 113, 103, 103, 100, 100, 94, 89, 88, 87, 73,
48
+ 71, 71, 69, 68, 68, 65, 62, 62, 59, 59, 58, 56, 51, 45,
49
+ 42, 40, 38, 36, 35, 33, 29, 28, 24, 18, 17])
50
+ 2026-01-26 05:05:51,028 INFO [inference.py:334] Encoder out range: [-11.444, 10.811]
51
+ 2026-01-26 05:05:51,620 INFO [inference.py:344] Number of hypotheses: 39
52
+ 2026-01-26 05:05:51,620 INFO [inference.py:346] First hypothesis: [11]
53
+ 2026-01-26 05:05:51,628 INFO [inference.py:309] Audio shape: torch.Size([23, 68000]), dtype: torch.float32
54
+ 2026-01-26 05:05:51,629 INFO [inference.py:310] Audio range: [-0.314, 0.332]
55
+ 2026-01-26 05:05:51,629 INFO [inference.py:311] Audio lengths: tensor([68000, 65920, 65599, 64799, 64160, 63520, 62400, 61600, 59040, 58239,
56
+ 56480, 55840, 55520, 55359, 54719, 53440, 52800, 52640, 47200, 46239,
57
+ 46079, 45280, 44960], dtype=torch.int32)
58
+ 2026-01-26 05:05:59,021 INFO [inference.py:332] Encoder out shape: torch.Size([23, 212, 1024])
59
+ 2026-01-26 05:05:59,022 INFO [inference.py:333] Encoder out lens: tensor([212, 205, 204, 202, 200, 198, 194, 192, 184, 181, 176, 174, 173, 172,
60
+ 170, 166, 164, 164, 147, 144, 143, 141, 140])
61
+ 2026-01-26 05:05:59,023 INFO [inference.py:334] Encoder out range: [-13.261, 11.090]
62
+ 2026-01-26 05:05:59,931 INFO [inference.py:344] Number of hypotheses: 23
63
+ 2026-01-26 05:05:59,932 INFO [inference.py:346] First hypothesis: [20]
64
+ 2026-01-26 05:06:00,567 INFO [inference.py:309] Audio shape: torch.Size([5, 317280]), dtype: torch.float32
65
+ 2026-01-26 05:06:00,568 INFO [inference.py:310] Audio range: [-0.323, 0.414]
66
+ 2026-01-26 05:06:00,569 INFO [inference.py:311] Audio lengths: tensor([317280, 298079, 298080, 294559, 292480], dtype=torch.int32)
67
+ 2026-01-26 05:06:09,302 INFO [inference.py:332] Encoder out shape: torch.Size([5, 991, 1024])
68
+ 2026-01-26 05:06:09,303 INFO [inference.py:333] Encoder out lens: tensor([991, 931, 931, 920, 913])
69
+ 2026-01-26 05:06:09,304 INFO [inference.py:334] Encoder out range: [-14.241, 14.344]
70
+ 2026-01-26 05:06:10,112 INFO [inference.py:344] Number of hypotheses: 5
71
+ 2026-01-26 05:06:10,113 INFO [inference.py:346] First hypothesis: [37, 23, 127, 274, 5, 147, 80, 73, 6, 16, 29, 119, 5, 20, 84, 171, 15, 6, 328, 5, 127, 20, 56]
72
+ 2026-01-26 05:06:10,117 INFO [inference.py:309] Audio shape: torch.Size([40, 39360]), dtype: torch.float32
73
+ 2026-01-26 05:06:10,118 INFO [inference.py:310] Audio range: [-0.274, 0.362]
74
+ 2026-01-26 05:06:10,118 INFO [inference.py:311] Audio lengths: tensor([39359, 39199, 39039, 38080, 36000, 35200, 34880, 34880, 33760, 33760,
75
+ 33600, 33120, 29440, 29280, 27360, 24960, 24960, 23680, 21760, 21600,
76
+ 20800, 16800, 16320, 16160, 16000, 15679, 15040, 13440, 12320, 7040,
77
+ 6560, 6400, 5760, 5760, 5120, 4800, 4800, 4640, 4480, 3360],
78
+ dtype=torch.int32)
79
+ 2026-01-26 05:06:16,910 INFO [inference.py:332] Encoder out shape: torch.Size([40, 122, 1024])
80
+ 2026-01-26 05:06:16,911 INFO [inference.py:333] Encoder out lens: tensor([122, 122, 121, 118, 112, 109, 108, 108, 105, 105, 104, 103, 91, 91,
81
+ 85, 77, 77, 73, 67, 67, 64, 52, 50, 50, 49, 48, 46, 41,
82
+ 38, 21, 20, 19, 17, 17, 15, 14, 14, 14, 13, 10])
83
+ 2026-01-26 05:06:16,911 INFO [inference.py:334] Encoder out range: [-11.784, 11.570]
84
+ 2026-01-26 05:06:17,504 INFO [inference.py:344] Number of hypotheses: 40
85
+ 2026-01-26 05:06:17,504 INFO [inference.py:346] First hypothesis: []
86
+ 2026-01-26 05:06:17,513 INFO [inference.py:309] Audio shape: torch.Size([23, 66880]), dtype: torch.float32
87
+ 2026-01-26 05:06:17,514 INFO [inference.py:310] Audio range: [-0.514, 0.393]
88
+ 2026-01-26 05:06:17,514 INFO [inference.py:311] Audio lengths: tensor([66880, 65439, 60799, 60320, 59520, 58240, 57280, 56320, 55520, 54080,
89
+ 51840, 51520, 50720, 49920, 49600, 48319, 48320, 47999, 46880, 46079,
90
+ 44640, 44320, 44160], dtype=torch.int32)
91
+ 2026-01-26 05:06:24,334 INFO [inference.py:332] Encoder out shape: torch.Size([23, 208, 1024])
92
+ 2026-01-26 05:06:24,335 INFO [inference.py:333] Encoder out lens: tensor([208, 204, 189, 188, 185, 181, 178, 175, 173, 168, 161, 160, 158, 155,
93
+ 154, 150, 150, 149, 146, 143, 139, 138, 137])
94
+ 2026-01-26 05:06:24,335 INFO [inference.py:334] Encoder out range: [-12.152, 11.060]
95
+ 2026-01-26 05:06:25,044 INFO [inference.py:344] Number of hypotheses: 23
96
+ 2026-01-26 05:06:25,045 INFO [inference.py:346] First hypothesis: [51, 6, 4, 27, 5, 26, 70, 22, 20, 265, 33, 46]
97
+ 2026-01-26 05:06:25,108 INFO [inference.py:309] Audio shape: torch.Size([24, 65600]), dtype: torch.float32
98
+ 2026-01-26 05:06:25,109 INFO [inference.py:310] Audio range: [-0.416, 0.458]
99
+ 2026-01-26 05:06:25,109 INFO [inference.py:311] Audio lengths: tensor([65600, 64000, 63680, 61280, 60000, 58080, 55200, 52960, 51359, 51200,
100
+ 50720, 50720, 50080, 49280, 48639, 47840, 47360, 46880, 46400, 46240,
101
+ 45920, 44640, 43040, 42720], dtype=torch.int32)
102
+ 2026-01-26 05:06:32,219 INFO [inference.py:332] Encoder out shape: torch.Size([24, 204, 1024])
103
+ 2026-01-26 05:06:32,220 INFO [inference.py:333] Encoder out lens: tensor([204, 199, 198, 191, 187, 181, 172, 165, 160, 159, 158, 158, 156, 153,
104
+ 151, 149, 147, 146, 144, 144, 143, 139, 134, 133])
105
+ 2026-01-26 05:06:32,220 INFO [inference.py:334] Encoder out range: [-12.007, 11.624]
106
+ 2026-01-26 05:06:32,900 INFO [inference.py:344] Number of hypotheses: 24
107
+ 2026-01-26 05:06:32,900 INFO [inference.py:346] First hypothesis: [11]
108
+ 2026-01-26 05:06:32,908 INFO [inference.py:309] Audio shape: torch.Size([9, 176960]), dtype: torch.float32
109
+ 2026-01-26 05:06:32,920 INFO [inference.py:310] Audio range: [-0.135, 0.191]
110
+ 2026-01-26 05:06:32,921 INFO [inference.py:311] Audio lengths: tensor([176960, 170720, 164480, 155840, 154559, 151839, 151840, 151360, 147040],
111
+ dtype=torch.int32)
112
+ 2026-01-26 05:06:40,613 INFO [inference.py:332] Encoder out shape: torch.Size([9, 552, 1024])
113
+ 2026-01-26 05:06:40,614 INFO [inference.py:333] Encoder out lens: tensor([552, 533, 513, 486, 482, 474, 474, 472, 459])
114
+ 2026-01-26 05:06:40,614 INFO [inference.py:334] Encoder out range: [-13.325, 12.083]
115
+ 2026-01-26 05:06:41,231 INFO [inference.py:344] Number of hypotheses: 9
116
+ 2026-01-26 05:06:41,231 INFO [inference.py:346] First hypothesis: [11, 87, 7, 5, 13, 379, 130, 101, 6, 16, 29, 119, 5, 6, 16, 29, 119, 18, 115, 93, 58, 36, 30, 201, 38, 58, 134, 50, 6, 16, 29, 119, 20, 81, 7, 85, 272, 73, 105, 6, 205, 17, 47]
117
+ 2026-01-26 05:06:41,240 INFO [inference.py:309] Audio shape: torch.Size([14, 112320]), dtype: torch.float32
118
+ 2026-01-26 05:06:41,241 INFO [inference.py:310] Audio range: [-0.469, 0.457]
119
+ 2026-01-26 05:06:41,242 INFO [inference.py:311] Audio lengths: tensor([112320, 105920, 105439, 104000, 103840, 101920, 98720, 98400, 96960,
120
+ 96800, 96320, 95680, 93760, 93600], dtype=torch.int32)
121
+ 2026-01-26 05:06:49,007 INFO [inference.py:332] Encoder out shape: torch.Size([14, 350, 1024])
122
+ 2026-01-26 05:06:49,008 INFO [inference.py:333] Encoder out lens: tensor([350, 330, 329, 324, 324, 318, 308, 307, 302, 302, 300, 298, 292, 292])
123
+ 2026-01-26 05:06:49,009 INFO [inference.py:334] Encoder out range: [-14.286, 11.940]
124
+ 2026-01-26 05:06:49,714 INFO [inference.py:344] Number of hypotheses: 14
125
+ 2026-01-26 05:06:49,714 INFO [inference.py:346] First hypothesis: [39, 9, 83, 7, 8, 148, 122, 26, 48]
126
+ 2026-01-26 05:06:49,714 INFO [inference.py:535] Processed 206 utterances in 10 batches
127
+ 2026-01-26 05:06:49,723 INFO [inference.py:309] Audio shape: torch.Size([38, 41440]), dtype: torch.float32
128
+ 2026-01-26 05:06:49,724 INFO [inference.py:310] Audio range: [-0.272, 0.322]
129
+ 2026-01-26 05:06:49,726 INFO [inference.py:311] Audio lengths: tensor([41440, 41120, 40160, 35680, 33120, 32960, 32800, 31520, 31040, 30880,
130
+ 30239, 29920, 29120, 27360, 25279, 24480, 23520, 22720, 22720, 21600,
131
+ 20800, 20320, 19840, 19840, 17600, 15520, 13120, 12480, 12320, 11040,
132
+ 10560, 9600, 8640, 7520, 5440, 5120, 5120, 4640],
133
+ dtype=torch.int32)
134
+ 2026-01-26 05:06:57,233 INFO [inference.py:332] Encoder out shape: torch.Size([38, 129, 1024])
135
+ 2026-01-26 05:06:57,234 INFO [inference.py:333] Encoder out lens: tensor([129, 128, 125, 111, 103, 102, 102, 98, 96, 96, 94, 93, 90, 85,
136
+ 78, 76, 73, 70, 70, 67, 64, 63, 61, 61, 54, 48, 40, 38,
137
+ 38, 34, 32, 29, 26, 23, 16, 15, 15, 14])
138
+ 2026-01-26 05:06:57,235 INFO [inference.py:334] Encoder out range: [-13.512, 11.822]
139
+ 2026-01-26 05:06:57,919 INFO [inference.py:344] Number of hypotheses: 38
140
+ 2026-01-26 05:06:57,920 INFO [inference.py:346] First hypothesis: [56, 199, 130]
141
+ 2026-01-26 05:06:57,928 INFO [inference.py:309] Audio shape: torch.Size([38, 41280]), dtype: torch.float32
142
+ 2026-01-26 05:06:57,929 INFO [inference.py:310] Audio range: [-0.080, 0.105]
143
+ 2026-01-26 05:06:57,930 INFO [inference.py:311] Audio lengths: tensor([41280, 40320, 36800, 35680, 34880, 34879, 34080, 34080, 32000, 30400,
144
+ 29280, 29280, 28320, 24000, 23040, 20960, 20960, 20960, 20160, 16960,
145
+ 14080, 13280, 12640, 12160, 10720, 9440, 8640, 6240, 6080, 5440,
146
+ 5440, 5120, 4800, 4800, 4640, 4480, 4320, 4160],
147
+ dtype=torch.int32)
148
+ 2026-01-26 05:07:05,017 INFO [inference.py:332] Encoder out shape: torch.Size([38, 128, 1024])
149
+ 2026-01-26 05:07:05,019 INFO [inference.py:333] Encoder out lens: tensor([128, 125, 114, 111, 108, 108, 106, 106, 99, 94, 91, 91, 88, 74,
150
+ 71, 65, 65, 65, 62, 52, 43, 41, 39, 37, 33, 29, 26, 19,
151
+ 18, 16, 16, 15, 14, 14, 14, 13, 13, 12])
152
+ 2026-01-26 05:07:05,019 INFO [inference.py:334] Encoder out range: [-11.071, 11.522]
153
+ 2026-01-26 05:07:05,620 INFO [inference.py:344] Number of hypotheses: 38
154
+ 2026-01-26 05:07:05,620 INFO [inference.py:346] First hypothesis: [10, 7, 5, 6, 148]
155
+ 2026-01-26 05:07:05,630 INFO [inference.py:309] Audio shape: torch.Size([38, 41760]), dtype: torch.float32
156
+ 2026-01-26 05:07:05,631 INFO [inference.py:310] Audio range: [-0.246, 0.340]
157
+ 2026-01-26 05:07:05,631 INFO [inference.py:311] Audio lengths: tensor([41760, 39680, 38880, 36799, 36639, 36000, 34559, 34240, 33120, 31840,
158
+ 30720, 30560, 29760, 29280, 24640, 24160, 22720, 21759, 21600, 20960,
159
+ 16320, 14400, 13600, 11360, 10880, 10399, 10400, 9760, 9440, 9280,
160
+ 8320, 8320, 7680, 7360, 6880, 6880, 6240, 6240],
161
+ dtype=torch.int32)
162
+ 2026-01-26 05:07:13,101 INFO [inference.py:332] Encoder out shape: torch.Size([38, 130, 1024])
163
+ 2026-01-26 05:07:13,102 INFO [inference.py:333] Encoder out lens: tensor([130, 123, 121, 114, 114, 112, 107, 106, 103, 99, 95, 95, 92, 91,
164
+ 76, 75, 70, 67, 67, 65, 50, 44, 42, 35, 33, 32, 32, 30,
165
+ 29, 28, 25, 25, 23, 22, 21, 21, 19, 19])
166
+ 2026-01-26 05:07:13,103 INFO [inference.py:334] Encoder out range: [-11.967, 11.229]
167
+ 2026-01-26 05:07:13,708 INFO [inference.py:344] Number of hypotheses: 38
168
+ 2026-01-26 05:07:13,708 INFO [inference.py:346] First hypothesis: [145, 9, 7, 24, 44, 205]
169
+ 2026-01-26 05:07:13,715 INFO [inference.py:309] Audio shape: torch.Size([9, 170400]), dtype: torch.float32
170
+ 2026-01-26 05:07:13,716 INFO [inference.py:310] Audio range: [-0.370, 0.393]
171
+ 2026-01-26 05:07:13,716 INFO [inference.py:311] Audio lengths: tensor([170400, 166559, 165919, 164800, 156800, 152480, 147520, 146559, 145759],
172
+ dtype=torch.int32)
173
+ 2026-01-26 05:07:21,734 INFO [inference.py:332] Encoder out shape: torch.Size([9, 532, 1024])
174
+ 2026-01-26 05:07:21,735 INFO [inference.py:333] Encoder out lens: tensor([532, 520, 518, 514, 489, 476, 460, 457, 455])
175
+ 2026-01-26 05:07:21,735 INFO [inference.py:334] Encoder out range: [-12.221, 14.348]
176
+ 2026-01-26 05:07:22,459 INFO [inference.py:344] Number of hypotheses: 9
177
+ 2026-01-26 05:07:22,459 INFO [inference.py:346] First hypothesis: [37, 4, 2, 11]
178
+ 2026-01-26 05:07:22,468 INFO [inference.py:309] Audio shape: torch.Size([5, 315520]), dtype: torch.float32
179
+ 2026-01-26 05:07:22,469 INFO [inference.py:310] Audio range: [-0.297, 0.334]
180
+ 2026-01-26 05:07:22,470 INFO [inference.py:311] Audio lengths: tensor([315520, 301440, 294399, 292480, 289919], dtype=torch.int32)
181
+ 2026-01-26 05:07:31,016 INFO [inference.py:332] Encoder out shape: torch.Size([5, 985, 1024])
182
+ 2026-01-26 05:07:31,017 INFO [inference.py:333] Encoder out lens: tensor([985, 941, 919, 913, 905])
183
+ 2026-01-26 05:07:31,017 INFO [inference.py:334] Encoder out range: [-12.260, 13.635]
184
+ 2026-01-26 05:07:31,753 INFO [inference.py:344] Number of hypotheses: 5
185
+ 2026-01-26 05:07:31,753 INFO [inference.py:346] First hypothesis: [11, 52, 87, 7, 5, 272, 25, 313, 359, 5, 6, 24, 25, 297, 5, 114, 32, 7, 8, 38, 204, 51, 13, 58, 63, 5, 18, 47, 259, 101, 18, 34, 16, 29, 119, 5, 113, 64, 113, 64]
186
+ 2026-01-26 05:07:31,760 INFO [inference.py:309] Audio shape: torch.Size([6, 239520]), dtype: torch.float32
187
+ 2026-01-26 05:07:31,773 INFO [inference.py:310] Audio range: [-0.116, 0.111]
188
+ 2026-01-26 05:07:31,773 INFO [inference.py:311] Audio lengths: tensor([239519, 234240, 223840, 223360, 219679, 215680], dtype=torch.int32)
189
+ 2026-01-26 05:07:39,824 INFO [inference.py:332] Encoder out shape: torch.Size([6, 748, 1024])
190
+ 2026-01-26 05:07:39,824 INFO [inference.py:333] Encoder out lens: tensor([748, 731, 699, 697, 686, 673])
191
+ 2026-01-26 05:07:39,825 INFO [inference.py:334] Encoder out range: [-13.591, 10.919]
192
+ 2026-01-26 05:07:40,203 INFO [inference.py:344] Number of hypotheses: 6
193
+ 2026-01-26 05:07:40,204 INFO [inference.py:346] First hypothesis: [23]
194
+ 2026-01-26 05:07:40,210 INFO [inference.py:309] Audio shape: torch.Size([5, 315200]), dtype: torch.float32
195
+ 2026-01-26 05:07:40,210 INFO [inference.py:310] Audio range: [-0.082, 0.158]
196
+ 2026-01-26 05:07:40,211 INFO [inference.py:311] Audio lengths: tensor([315200, 310560, 300000, 299680, 296959], dtype=torch.int32)
197
+ 2026-01-26 05:07:49,627 INFO [inference.py:332] Encoder out shape: torch.Size([5, 984, 1024])
198
+ 2026-01-26 05:07:49,628 INFO [inference.py:333] Encoder out lens: tensor([984, 970, 937, 936, 927])
199
+ 2026-01-26 05:07:49,628 INFO [inference.py:334] Encoder out range: [-14.589, 11.647]
200
+ 2026-01-26 05:07:50,241 INFO [inference.py:344] Number of hypotheses: 5
201
+ 2026-01-26 05:07:50,241 INFO [inference.py:346] First hypothesis: [310, 20, 51, 46, 6, 155, 22, 51, 274, 73, 152, 25, 383, 73, 155, 22, 26, 93, 53, 183, 5, 73, 93, 269, 27, 5, 31]
202
+ 2026-01-26 05:07:50,247 INFO [inference.py:309] Audio shape: torch.Size([6, 237280]), dtype: torch.float32
203
+ 2026-01-26 05:07:50,248 INFO [inference.py:310] Audio range: [-0.130, 0.131]
204
+ 2026-01-26 05:07:50,248 INFO [inference.py:311] Audio lengths: tensor([237280, 228159, 220639, 220480, 219359, 213119], dtype=torch.int32)
205
+ 2026-01-26 05:07:58,004 INFO [inference.py:332] Encoder out shape: torch.Size([6, 741, 1024])
206
+ 2026-01-26 05:07:58,005 INFO [inference.py:333] Encoder out lens: tensor([741, 712, 689, 688, 685, 665])
207
+ 2026-01-26 05:07:58,005 INFO [inference.py:334] Encoder out range: [-13.120, 12.506]
208
+ 2026-01-26 05:07:58,629 INFO [inference.py:344] Number of hypotheses: 6
209
+ 2026-01-26 05:07:58,630 INFO [inference.py:346] First hypothesis: [37, 349, 41, 18, 349, 41, 49, 101, 6]
210
+ 2026-01-26 05:07:58,637 INFO [inference.py:309] Audio shape: torch.Size([17, 91040]), dtype: torch.float32
211
+ 2026-01-26 05:07:58,637 INFO [inference.py:310] Audio range: [-0.574, 0.629]
212
+ 2026-01-26 05:07:58,638 INFO [inference.py:311] Audio lengths: tensor([91040, 90240, 89119, 88480, 87520, 86079, 83680, 82880, 81120, 79520,
213
+ 79520, 78079, 76800, 76480, 73760, 73600, 73599], dtype=torch.int32)
214
+ 2026-01-26 05:08:06,806 INFO [inference.py:332] Encoder out shape: torch.Size([17, 284, 1024])
215
+ 2026-01-26 05:08:06,807 INFO [inference.py:333] Encoder out lens: tensor([284, 281, 278, 276, 273, 268, 261, 258, 253, 248, 248, 243, 239, 238,
216
+ 230, 229, 229])
217
+ 2026-01-26 05:08:06,807 INFO [inference.py:334] Encoder out range: [-13.703, 11.821]
218
+ 2026-01-26 05:08:07,431 INFO [inference.py:344] Number of hypotheses: 17
219
+ 2026-01-26 05:08:07,432 INFO [inference.py:346] First hypothesis: [131, 214, 33, 259, 26, 101, 265, 6, 205]
220
+ 2026-01-26 05:08:07,438 INFO [inference.py:309] Audio shape: torch.Size([23, 68960]), dtype: torch.float32
221
+ 2026-01-26 05:08:07,439 INFO [inference.py:310] Audio range: [-0.269, 0.266]
222
+ 2026-01-26 05:08:07,439 INFO [inference.py:311] Audio lengths: tensor([68959, 66880, 64800, 64479, 61920, 59680, 54400, 53440, 52479, 52319,
223
+ 51840, 46880, 46559, 45120, 44480, 43360, 43360, 43360, 43040, 43040,
224
+ 43040, 42880, 42560], dtype=torch.int32)
225
+ 2026-01-26 05:08:16,530 INFO [inference.py:332] Encoder out shape: torch.Size([23, 215, 1024])
226
+ 2026-01-26 05:08:16,531 INFO [inference.py:333] Encoder out lens: tensor([215, 208, 202, 201, 193, 186, 169, 166, 163, 163, 161, 146, 145, 140,
227
+ 138, 135, 135, 135, 134, 134, 134, 133, 132])
228
+ 2026-01-26 05:08:16,531 INFO [inference.py:334] Encoder out range: [-13.477, 12.445]
229
+ 2026-01-26 05:08:17,420 INFO [inference.py:344] Number of hypotheses: 23
230
+ 2026-01-26 05:08:17,420 INFO [inference.py:346] First hypothesis: [225]
231
+ 2026-01-26 05:08:17,420 INFO [inference.py:535] Processed 391 utterances in 20 batches
232
+ 2026-01-26 05:08:17,426 INFO [inference.py:309] Audio shape: torch.Size([17, 92320]), dtype: torch.float32
233
+ 2026-01-26 05:08:17,427 INFO [inference.py:310] Audio range: [-0.234, 0.300]
234
+ 2026-01-26 05:08:17,427 INFO [inference.py:311] Audio lengths: tensor([92320, 91200, 91200, 90560, 89120, 84000, 83840, 83360, 82880, 82079,
235
+ 79840, 79520, 76800, 73760, 73280, 70079, 69600], dtype=torch.int32)
236
+ 2026-01-26 05:08:25,743 INFO [inference.py:332] Encoder out shape: torch.Size([17, 288, 1024])
237
+ 2026-01-26 05:08:25,744 INFO [inference.py:333] Encoder out lens: tensor([288, 284, 284, 282, 278, 262, 261, 260, 258, 256, 249, 248, 239, 230,
238
+ 228, 218, 217])
239
+ 2026-01-26 05:08:25,798 INFO [inference.py:334] Encoder out range: [-13.483, 12.297]
240
+ 2026-01-26 05:08:26,544 INFO [inference.py:344] Number of hypotheses: 17
241
+ 2026-01-26 05:08:26,545 INFO [inference.py:346] First hypothesis: [39, 52, 10, 7, 85, 58, 134, 5, 84, 189, 29, 14, 43, 8, 93, 130, 16, 34, 84]
242
+ 2026-01-26 05:08:26,602 INFO [inference.py:309] Audio shape: torch.Size([23, 68800]), dtype: torch.float32
243
+ 2026-01-26 05:08:26,604 INFO [inference.py:310] Audio range: [-0.321, 0.370]
244
+ 2026-01-26 05:08:26,604 INFO [inference.py:311] Audio lengths: tensor([68799, 66720, 62560, 62240, 61919, 60160, 59840, 58080, 57920, 57280,
245
+ 53920, 52960, 51040, 50080, 49920, 49280, 48160, 48160, 47680, 47200,
246
+ 44800, 44000, 42560], dtype=torch.int32)
247
+ 2026-01-26 05:08:34,725 INFO [inference.py:332] Encoder out shape: torch.Size([23, 214, 1024])
248
+ 2026-01-26 05:08:34,725 INFO [inference.py:333] Encoder out lens: tensor([214, 208, 195, 194, 193, 187, 186, 181, 180, 178, 168, 165, 159, 156,
249
+ 155, 153, 150, 150, 148, 147, 139, 137, 132])
250
+ 2026-01-26 05:08:34,726 INFO [inference.py:334] Encoder out range: [-11.273, 12.003]
251
+ 2026-01-26 05:08:35,331 INFO [inference.py:344] Number of hypotheses: 23
252
+ 2026-01-26 05:08:35,331 INFO [inference.py:346] First hypothesis: [218, 4, 2, 11]
253
+ 2026-01-26 05:08:35,338 INFO [inference.py:309] Audio shape: torch.Size([38, 42080]), dtype: torch.float32
254
+ 2026-01-26 05:08:35,339 INFO [inference.py:310] Audio range: [-0.400, 0.452]
255
+ 2026-01-26 05:08:35,340 INFO [inference.py:311] Audio lengths: tensor([42080, 39200, 37439, 36960, 35520, 34560, 34079, 33599, 33600, 33280,
256
+ 31520, 31200, 29760, 28160, 28000, 27200, 26720, 25600, 25120, 23200,
257
+ 22880, 21280, 20800, 20000, 19680, 19520, 19200, 18080, 17600, 17600,
258
+ 16320, 13120, 12320, 11680, 8000, 6400, 5120, 3840],
259
+ dtype=torch.int32)
260
+ 2026-01-26 05:08:43,838 INFO [inference.py:332] Encoder out shape: torch.Size([38, 131, 1024])
261
+ 2026-01-26 05:08:43,839 INFO [inference.py:333] Encoder out lens: tensor([131, 122, 116, 115, 110, 107, 106, 104, 104, 103, 98, 97, 92, 87,
262
+ 87, 84, 83, 79, 78, 72, 71, 66, 64, 62, 61, 60, 59, 56,
263
+ 54, 54, 50, 40, 38, 36, 24, 19, 15, 11])
264
+ 2026-01-26 05:08:43,839 INFO [inference.py:334] Encoder out range: [-11.872, 11.798]
265
+ 2026-01-26 05:08:44,627 INFO [inference.py:344] Number of hypotheses: 38
266
+ 2026-01-26 05:08:44,627 INFO [inference.py:346] First hypothesis: []
267
+ 2026-01-26 05:08:44,634 INFO [inference.py:309] Audio shape: torch.Size([5, 280640]), dtype: torch.float32
268
+ 2026-01-26 05:08:44,635 INFO [inference.py:310] Audio range: [-0.100, 0.092]
269
+ 2026-01-26 05:08:44,635 INFO [inference.py:311] Audio lengths: tensor([280639, 280640, 272800, 270080, 256480], dtype=torch.int32)
270
+ 2026-01-26 05:08:53,033 INFO [inference.py:332] Encoder out shape: torch.Size([5, 876, 1024])
271
+ 2026-01-26 05:08:53,034 INFO [inference.py:333] Encoder out lens: tensor([876, 876, 852, 843, 801])
272
+ 2026-01-26 05:08:53,034 INFO [inference.py:334] Encoder out range: [-14.497, 12.570]
273
+ 2026-01-26 05:08:53,729 INFO [inference.py:344] Number of hypotheses: 5
274
+ 2026-01-26 05:08:53,730 INFO [inference.py:346] First hypothesis: [68, 13, 211, 25, 294, 6, 344, 131, 214, 56, 18, 34, 146, 16, 56, 46, 136, 40, 26, 5, 156, 6, 189]
275
+ 2026-01-26 05:08:53,736 INFO [inference.py:309] Audio shape: torch.Size([6, 248640]), dtype: torch.float32
276
+ 2026-01-26 05:08:53,737 INFO [inference.py:310] Audio range: [-0.080, 0.094]
277
+ 2026-01-26 05:08:53,738 INFO [inference.py:311] Audio lengths: tensor([248639, 242720, 233119, 227199, 217440, 216479], dtype=torch.int32)
278
+ 2026-01-26 05:09:02,511 INFO [inference.py:332] Encoder out shape: torch.Size([6, 776, 1024])
279
+ 2026-01-26 05:09:02,511 INFO [inference.py:333] Encoder out lens: tensor([776, 758, 728, 709, 679, 676])
280
+ 2026-01-26 05:09:02,512 INFO [inference.py:334] Encoder out range: [-12.796, 12.210]
281
+ 2026-01-26 05:09:03,260 INFO [inference.py:344] Number of hypotheses: 6
282
+ 2026-01-26 05:09:03,260 INFO [inference.py:346] First hypothesis: [11, 87, 7, 5, 58, 134, 5, 71, 67, 6, 195, 25, 98, 36, 67, 51, 121, 26, 117, 10, 208, 13, 265, 39, 9, 100]
283
+ 2026-01-26 05:09:03,270 INFO [inference.py:309] Audio shape: torch.Size([13, 117120]), dtype: torch.float32
284
+ 2026-01-26 05:09:03,270 INFO [inference.py:310] Audio range: [-0.283, 0.260]
285
+ 2026-01-26 05:09:03,271 INFO [inference.py:311] Audio lengths: tensor([117120, 111680, 107200, 106720, 106239, 104639, 104480, 101920, 100960,
286
+ 98880, 96960, 93920, 93600], dtype=torch.int32)
287
+ 2026-01-26 05:09:11,401 INFO [inference.py:332] Encoder out shape: torch.Size([13, 365, 1024])
288
+ 2026-01-26 05:09:11,402 INFO [inference.py:333] Encoder out lens: tensor([365, 348, 334, 333, 331, 326, 326, 318, 315, 308, 302, 293, 292])
289
+ 2026-01-26 05:09:11,402 INFO [inference.py:334] Encoder out range: [-12.112, 13.452]
290
+ 2026-01-26 05:09:12,027 INFO [inference.py:344] Number of hypotheses: 13
291
+ 2026-01-26 05:09:12,027 INFO [inference.py:346] First hypothesis: [11]
292
+ 2026-01-26 05:09:12,035 INFO [inference.py:309] Audio shape: torch.Size([23, 67200]), dtype: torch.float32
293
+ 2026-01-26 05:09:12,036 INFO [inference.py:310] Audio range: [-0.233, 0.248]
294
+ 2026-01-26 05:09:12,037 INFO [inference.py:311] Audio lengths: tensor([67200, 67039, 66079, 62079, 61760, 60480, 59520, 58080, 57760, 54239,
295
+ 54080, 54080, 52960, 50080, 49920, 49280, 49119, 47840, 47840, 46720,
296
+ 45600, 44800, 44000], dtype=torch.int32)
297
+ 2026-01-26 05:09:19,942 INFO [inference.py:332] Encoder out shape: torch.Size([23, 209, 1024])
298
+ 2026-01-26 05:09:19,943 INFO [inference.py:333] Encoder out lens: tensor([209, 209, 206, 193, 192, 188, 185, 181, 180, 169, 168, 168, 165, 156,
299
+ 155, 153, 153, 149, 149, 145, 142, 139, 137])
300
+ 2026-01-26 05:09:19,943 INFO [inference.py:334] Encoder out range: [-14.993, 12.111]
301
+ 2026-01-26 05:09:20,821 INFO [inference.py:344] Number of hypotheses: 23
302
+ 2026-01-26 05:09:20,821 INFO [inference.py:346] First hypothesis: [18, 34, 174, 15, 44, 84, 15, 34, 81, 7, 69, 57, 101]
303
+ 2026-01-26 05:09:20,829 INFO [inference.py:309] Audio shape: torch.Size([6, 269120]), dtype: torch.float32
304
+ 2026-01-26 05:09:20,830 INFO [inference.py:310] Audio range: [-0.516, 0.413]
305
+ 2026-01-26 05:09:20,830 INFO [inference.py:311] Audio lengths: tensor([269119, 263680, 262719, 262559, 258240, 249759], dtype=torch.int32)
306
+ 2026-01-26 05:09:30,645 INFO [inference.py:332] Encoder out shape: torch.Size([6, 840, 1024])
307
+ 2026-01-26 05:09:30,645 INFO [inference.py:333] Encoder out lens: tensor([840, 823, 820, 820, 806, 780])
308
+ 2026-01-26 05:09:30,646 INFO [inference.py:334] Encoder out range: [-11.696, 10.834]
309
+ 2026-01-26 05:09:31,125 INFO [inference.py:344] Number of hypotheses: 6
310
+ 2026-01-26 05:09:31,125 INFO [inference.py:346] First hypothesis: []
311
+ 2026-01-26 05:09:31,132 INFO [inference.py:309] Audio shape: torch.Size([6, 241440]), dtype: torch.float32
312
+ 2026-01-26 05:09:31,133 INFO [inference.py:310] Audio range: [-0.067, 0.106]
313
+ 2026-01-26 05:09:31,134 INFO [inference.py:311] Audio lengths: tensor([241440, 240479, 238079, 236800, 224800, 224159], dtype=torch.int32)
314
+ 2026-01-26 05:09:39,233 INFO [inference.py:332] Encoder out shape: torch.Size([6, 754, 1024])
315
+ 2026-01-26 05:09:39,234 INFO [inference.py:333] Encoder out lens: tensor([754, 751, 743, 739, 702, 700])
316
+ 2026-01-26 05:09:39,234 INFO [inference.py:334] Encoder out range: [-13.524, 12.974]
317
+ 2026-01-26 05:09:39,812 INFO [inference.py:344] Number of hypotheses: 6
318
+ 2026-01-26 05:09:39,812 INFO [inference.py:346] First hypothesis: [87, 7, 5, 51, 195, 25, 6, 362, 39, 114, 38, 65, 18, 354, 16, 38, 89, 174]
319
+ 2026-01-26 05:09:39,820 INFO [inference.py:309] Audio shape: torch.Size([23, 68000]), dtype: torch.float32
320
+ 2026-01-26 05:09:39,821 INFO [inference.py:310] Audio range: [-0.180, 0.177]
321
+ 2026-01-26 05:09:39,821 INFO [inference.py:311] Audio lengths: tensor([68000, 66080, 65120, 64319, 64000, 60960, 58880, 58400, 58240, 57600,
322
+ 50239, 49760, 48480, 48480, 47520, 47200, 46560, 46080, 44960, 44480,
323
+ 43200, 42719, 42240], dtype=torch.int32)
324
+ 2026-01-26 05:09:48,028 INFO [inference.py:332] Encoder out shape: torch.Size([23, 212, 1024])
325
+ 2026-01-26 05:09:48,029 INFO [inference.py:333] Encoder out lens: tensor([212, 206, 203, 200, 199, 190, 183, 182, 181, 179, 156, 155, 151, 151,
326
+ 148, 147, 145, 143, 140, 138, 134, 133, 131])
327
+ 2026-01-26 05:09:48,029 INFO [inference.py:334] Encoder out range: [-13.762, 11.575]
328
+ 2026-01-26 05:09:48,731 INFO [inference.py:344] Number of hypotheses: 23
329
+ 2026-01-26 05:09:48,731 INFO [inference.py:346] First hypothesis: [61, 51, 184, 13, 4, 140, 5, 169, 93, 25, 6, 260]
330
+ 2026-01-26 05:09:48,731 INFO [inference.py:535] Processed 551 utterances in 30 batches
331
+ 2026-01-26 05:09:48,738 INFO [inference.py:309] Audio shape: torch.Size([5, 317280]), dtype: torch.float32
332
+ 2026-01-26 05:09:48,739 INFO [inference.py:310] Audio range: [-0.122, 0.148]
333
+ 2026-01-26 05:09:48,739 INFO [inference.py:311] Audio lengths: tensor([317280, 311840, 309600, 301120, 295680], dtype=torch.int32)
334
+ 2026-01-26 05:09:58,852 INFO [inference.py:332] Encoder out shape: torch.Size([5, 991, 1024])
335
+ 2026-01-26 05:09:58,853 INFO [inference.py:333] Encoder out lens: tensor([991, 974, 967, 940, 923])
336
+ 2026-01-26 05:09:58,853 INFO [inference.py:334] Encoder out range: [-13.657, 13.923]
337
+ 2026-01-26 05:09:59,738 INFO [inference.py:344] Number of hypotheses: 5
338
+ 2026-01-26 05:09:59,739 INFO [inference.py:346] First hypothesis: [39, 231, 32, 51, 49, 25, 93, 221, 18, 118, 159, 6, 4, 27, 5, 8, 93, 193, 39, 6, 130, 18, 7, 69, 176, 33, 152, 25, 284, 251, 205]
339
+ 2026-01-26 05:09:59,746 INFO [inference.py:309] Audio shape: torch.Size([24, 64160]), dtype: torch.float32
340
+ 2026-01-26 05:09:59,747 INFO [inference.py:310] Audio range: [-0.274, 0.264]
341
+ 2026-01-26 05:09:59,748 INFO [inference.py:311] Audio lengths: tensor([64160, 61760, 61759, 61760, 59520, 58720, 57280, 55840, 55520, 54720,
342
+ 51520, 50880, 50880, 50720, 49600, 49440, 49280, 47839, 46719, 46399,
343
+ 45279, 43999, 43520, 42240], dtype=torch.int32)
344
+ 2026-01-26 05:10:08,023 INFO [inference.py:332] Encoder out shape: torch.Size([24, 200, 1024])
345
+ 2026-01-26 05:10:08,024 INFO [inference.py:333] Encoder out lens: tensor([200, 192, 192, 192, 185, 183, 178, 174, 173, 170, 160, 158, 158, 158,
346
+ 154, 154, 153, 149, 145, 144, 141, 137, 135, 131])
347
+ 2026-01-26 05:10:08,024 INFO [inference.py:334] Encoder out range: [-13.370, 11.318]
348
+ 2026-01-26 05:10:08,807 INFO [inference.py:344] Number of hypotheses: 24
349
+ 2026-01-26 05:10:08,807 INFO [inference.py:346] First hypothesis: [61, 49, 39, 17, 7, 5, 64, 9, 115, 16, 34, 57, 7, 5, 206, 221, 16]
350
+ 2026-01-26 05:10:08,814 INFO [inference.py:309] Audio shape: torch.Size([5, 287520]), dtype: torch.float32
351
+ 2026-01-26 05:10:08,814 INFO [inference.py:310] Audio range: [-0.099, 0.090]
352
+ 2026-01-26 05:10:08,815 INFO [inference.py:311] Audio lengths: tensor([287520, 283360, 264959, 261760, 259360], dtype=torch.int32)
353
+ 2026-01-26 05:10:16,810 INFO [inference.py:332] Encoder out shape: torch.Size([5, 898, 1024])
354
+ 2026-01-26 05:10:16,811 INFO [inference.py:333] Encoder out lens: tensor([898, 885, 827, 817, 810])
355
+ 2026-01-26 05:10:16,811 INFO [inference.py:334] Encoder out range: [-13.209, 12.373]
356
+ 2026-01-26 05:10:17,639 INFO [inference.py:344] Number of hypotheses: 5
357
+ 2026-01-26 05:10:17,640 INFO [inference.py:346] First hypothesis: [231, 178, 51, 144, 193, 46, 13, 38, 86, 120, 194, 16, 95, 13, 104, 19, 36, 26, 87, 20, 193]
358
+ 2026-01-26 05:10:17,648 INFO [inference.py:309] Audio shape: torch.Size([5, 310720]), dtype: torch.float32
359
+ 2026-01-26 05:10:17,649 INFO [inference.py:310] Audio range: [-0.046, 0.111]
360
+ 2026-01-26 05:10:17,650 INFO [inference.py:311] Audio lengths: tensor([310719, 308639, 298560, 294880, 293759], dtype=torch.int32)
361
+ 2026-01-26 05:10:26,341 INFO [inference.py:332] Encoder out shape: torch.Size([5, 970, 1024])
362
+ 2026-01-26 05:10:26,342 INFO [inference.py:333] Encoder out lens: tensor([970, 964, 932, 921, 917])
363
+ 2026-01-26 05:10:26,342 INFO [inference.py:334] Encoder out range: [-13.787, 12.644]
364
+ 2026-01-26 05:10:26,950 INFO [inference.py:344] Number of hypotheses: 5
365
+ 2026-01-26 05:10:26,951 INFO [inference.py:346] First hypothesis: [17, 7, 5, 46, 48, 66, 48, 84, 51, 66]
366
+ 2026-01-26 05:10:26,958 INFO [inference.py:309] Audio shape: torch.Size([38, 41600]), dtype: torch.float32
367
+ 2026-01-26 05:10:26,959 INFO [inference.py:310] Audio range: [-0.528, 0.544]
368
+ 2026-01-26 05:10:26,960 INFO [inference.py:311] Audio lengths: tensor([41599, 39200, 37119, 36799, 34400, 34079, 33439, 32960, 31200, 31200,
369
+ 26400, 25600, 24000, 22560, 22080, 21919, 21920, 21280, 20799, 19360,
370
+ 18880, 18880, 17600, 17440, 15200, 13760, 12640, 11360, 5760, 5280,
371
+ 5120, 4640, 4320, 3840, 3680, 3360, 3360, 3200],
372
+ dtype=torch.int32)
373
+ 2026-01-26 05:10:34,801 INFO [inference.py:332] Encoder out shape: torch.Size([38, 129, 1024])
374
+ 2026-01-26 05:10:34,802 INFO [inference.py:333] Encoder out lens: tensor([129, 122, 115, 114, 107, 106, 104, 102, 97, 97, 82, 79, 74, 70,
375
+ 68, 68, 68, 66, 64, 60, 58, 58, 54, 54, 47, 42, 39, 35,
376
+ 17, 16, 15, 14, 13, 11, 11, 10, 10, 9])
377
+ 2026-01-26 05:10:34,802 INFO [inference.py:334] Encoder out range: [-12.505, 11.696]
378
+ 2026-01-26 05:10:35,305 INFO [inference.py:344] Number of hypotheses: 38
379
+ 2026-01-26 05:10:35,305 INFO [inference.py:346] First hypothesis: []
380
+ 2026-01-26 05:10:35,312 INFO [inference.py:309] Audio shape: torch.Size([11, 143680]), dtype: torch.float32
381
+ 2026-01-26 05:10:35,312 INFO [inference.py:310] Audio range: [-0.331, 0.228]
382
+ 2026-01-26 05:10:35,313 INFO [inference.py:311] Audio lengths: tensor([143680, 143360, 143200, 137439, 130559, 129279, 128960, 125280, 125280,
383
+ 123040, 118079], dtype=torch.int32)
384
+ 2026-01-26 05:10:44,633 INFO [inference.py:332] Encoder out shape: torch.Size([11, 448, 1024])
385
+ 2026-01-26 05:10:44,633 INFO [inference.py:333] Encoder out lens: tensor([448, 447, 447, 429, 407, 403, 402, 391, 391, 384, 368])
386
+ 2026-01-26 05:10:44,634 INFO [inference.py:334] Encoder out range: [-12.574, 13.090]
387
+ 2026-01-26 05:10:45,323 INFO [inference.py:344] Number of hypotheses: 11
388
+ 2026-01-26 05:10:45,323 INFO [inference.py:346] First hypothesis: [11, 37, 9, 102, 18, 230, 95, 6, 24, 64, 16, 163, 73, 6, 165, 5, 17, 171, 15, 267, 153, 5]
389
+ 2026-01-26 05:10:45,330 INFO [inference.py:309] Audio shape: torch.Size([5, 283680]), dtype: torch.float32
390
+ 2026-01-26 05:10:45,331 INFO [inference.py:310] Audio range: [-0.096, 0.119]
391
+ 2026-01-26 05:10:45,332 INFO [inference.py:311] Audio lengths: tensor([283680, 281119, 271360, 262560, 252479], dtype=torch.int32)
392
+ 2026-01-26 05:10:53,624 INFO [inference.py:332] Encoder out shape: torch.Size([5, 886, 1024])
393
+ 2026-01-26 05:10:53,625 INFO [inference.py:333] Encoder out lens: tensor([886, 878, 847, 820, 788])
394
+ 2026-01-26 05:10:53,625 INFO [inference.py:334] Encoder out range: [-12.921, 13.557]
395
+ 2026-01-26 05:10:54,461 INFO [inference.py:344] Number of hypotheses: 5
396
+ 2026-01-26 05:10:54,462 INFO [inference.py:346] First hypothesis: [112, 81, 230, 38, 105, 34, 16, 34, 200, 294, 171, 15, 58, 134, 29, 16, 122, 25, 6, 25, 6, 16, 29, 119, 5, 25, 6, 205, 5, 17, 220]
397
+ 2026-01-26 05:10:54,467 INFO [inference.py:309] Audio shape: torch.Size([23, 68800]), dtype: torch.float32
398
+ 2026-01-26 05:10:54,468 INFO [inference.py:310] Audio range: [-0.356, 0.274]
399
+ 2026-01-26 05:10:54,468 INFO [inference.py:311] Audio lengths: tensor([68800, 65600, 64800, 64480, 62400, 58079, 57119, 56159, 54560, 53920,
400
+ 53920, 51840, 51520, 49280, 49280, 47519, 46240, 45280, 44960, 44960,
401
+ 44480, 43680, 42560], dtype=torch.int32)
402
+ 2026-01-26 05:11:03,138 INFO [inference.py:332] Encoder out shape: torch.Size([23, 214, 1024])
403
+ 2026-01-26 05:11:03,139 INFO [inference.py:333] Encoder out lens: tensor([214, 204, 202, 201, 194, 181, 178, 175, 170, 168, 168, 161, 160, 153,
404
+ 153, 148, 144, 141, 140, 140, 138, 136, 132])
405
+ 2026-01-26 05:11:03,139 INFO [inference.py:334] Encoder out range: [-13.047, 12.227]
406
+ 2026-01-26 05:11:04,047 INFO [inference.py:344] Number of hypotheses: 23
407
+ 2026-01-26 05:11:04,048 INFO [inference.py:346] First hypothesis: [57, 33, 193, 16, 48, 66, 33, 13, 74, 19, 201]
408
+ 2026-01-26 05:11:04,053 INFO [inference.py:309] Audio shape: torch.Size([11, 139520]), dtype: torch.float32
409
+ 2026-01-26 05:11:04,054 INFO [inference.py:310] Audio range: [-0.117, 0.153]
410
+ 2026-01-26 05:11:04,054 INFO [inference.py:311] Audio lengths: tensor([139520, 139200, 138880, 138079, 137440, 134720, 128320, 124000, 121600,
411
+ 120160, 118240], dtype=torch.int32)
412
+ 2026-01-26 05:11:12,432 INFO [inference.py:332] Encoder out shape: torch.Size([11, 435, 1024])
413
+ 2026-01-26 05:11:12,433 INFO [inference.py:333] Encoder out lens: tensor([435, 434, 433, 431, 429, 420, 400, 387, 379, 375, 369])
414
+ 2026-01-26 05:11:12,433 INFO [inference.py:334] Encoder out range: [-13.984, 12.798]
415
+ 2026-01-26 05:11:13,304 INFO [inference.py:344] Number of hypotheses: 11
416
+ 2026-01-26 05:11:13,304 INFO [inference.py:346] First hypothesis: [105, 206, 66, 5, 18, 47, 236, 49]
417
+ 2026-01-26 05:11:13,310 INFO [inference.py:309] Audio shape: torch.Size([6, 248640]), dtype: torch.float32
418
+ 2026-01-26 05:11:13,311 INFO [inference.py:310] Audio range: [-0.155, 0.171]
419
+ 2026-01-26 05:11:13,312 INFO [inference.py:311] Audio lengths: tensor([248639, 231359, 228480, 225440, 223360, 212800], dtype=torch.int32)
420
+ 2026-01-26 05:11:22,125 INFO [inference.py:332] Encoder out shape: torch.Size([6, 776, 1024])
421
+ 2026-01-26 05:11:22,126 INFO [inference.py:333] Encoder out lens: tensor([776, 722, 713, 704, 697, 664])
422
+ 2026-01-26 05:11:22,126 INFO [inference.py:334] Encoder out range: [-13.353, 11.759]
423
+ 2026-01-26 05:11:22,718 INFO [inference.py:344] Number of hypotheses: 6
424
+ 2026-01-26 05:11:22,719 INFO [inference.py:346] First hypothesis: [39, 93, 25, 6, 24, 6, 130, 117, 66, 100, 87, 7, 5, 110, 17, 172, 51, 9, 51, 49, 26, 87, 23, 6, 66, 33, 6, 91, 18, 7, 27, 46, 333, 130, 33, 357, 136, 21, 24]
425
+ 2026-01-26 05:11:22,719 INFO [inference.py:535] Processed 684 utterances in 40 batches
426
+ 2026-01-26 05:11:22,724 INFO [inference.py:309] Audio shape: torch.Size([9, 173600]), dtype: torch.float32
427
+ 2026-01-26 05:11:22,725 INFO [inference.py:310] Audio range: [-0.501, 0.295]
428
+ 2026-01-26 05:11:22,725 INFO [inference.py:311] Audio lengths: tensor([173599, 168319, 161760, 161760, 159040, 158719, 156800, 148000, 147359],
429
+ dtype=torch.int32)
430
+ 2026-01-26 05:11:31,816 INFO [inference.py:332] Encoder out shape: torch.Size([9, 542, 1024])
431
+ 2026-01-26 05:11:31,816 INFO [inference.py:333] Encoder out lens: tensor([542, 525, 505, 505, 496, 495, 489, 462, 460])
432
+ 2026-01-26 05:11:31,817 INFO [inference.py:334] Encoder out range: [-11.960, 12.653]
433
+ 2026-01-26 05:11:32,422 INFO [inference.py:344] Number of hypotheses: 9
434
+ 2026-01-26 05:11:32,422 INFO [inference.py:346] First hypothesis: [11, 159, 110, 17, 50, 6, 51, 195, 25, 34, 13, 260, 6, 395, 195, 26, 17, 18, 118, 53, 86, 56]
435
+ 2026-01-26 05:11:32,428 INFO [inference.py:309] Audio shape: torch.Size([11, 144640]), dtype: torch.float32
436
+ 2026-01-26 05:11:32,429 INFO [inference.py:310] Audio range: [-0.332, 0.358]
437
+ 2026-01-26 05:11:32,429 INFO [inference.py:311] Audio lengths: tensor([144639, 143520, 140159, 139840, 133760, 128159, 128159, 124000, 119680,
438
+ 119200, 119040], dtype=torch.int32)
439
+ 2026-01-26 05:11:41,731 INFO [inference.py:332] Encoder out shape: torch.Size([11, 451, 1024])
440
+ 2026-01-26 05:11:41,731 INFO [inference.py:333] Encoder out lens: tensor([451, 448, 437, 436, 417, 400, 400, 387, 373, 372, 371])
441
+ 2026-01-26 05:11:41,732 INFO [inference.py:334] Encoder out range: [-13.569, 12.367]
442
+ 2026-01-26 05:11:42,844 INFO [inference.py:344] Number of hypotheses: 11
443
+ 2026-01-26 05:11:42,844 INFO [inference.py:346] First hypothesis: [51, 419, 26, 15, 72, 113, 6, 91, 33]
444
+ 2026-01-26 05:11:42,850 INFO [inference.py:309] Audio shape: torch.Size([13, 116640]), dtype: torch.float32
445
+ 2026-01-26 05:11:42,850 INFO [inference.py:310] Audio range: [-0.268, 0.323]
446
+ 2026-01-26 05:11:42,851 INFO [inference.py:311] Audio lengths: tensor([116640, 116000, 112799, 110240, 104319, 101919, 100799, 100800, 98400,
447
+ 96480, 95039, 93920, 93600], dtype=torch.int32)
448
+ 2026-01-26 05:11:50,812 INFO [inference.py:332] Encoder out shape: torch.Size([13, 364, 1024])
449
+ 2026-01-26 05:11:50,813 INFO [inference.py:333] Encoder out lens: tensor([364, 362, 352, 344, 325, 318, 314, 314, 307, 301, 296, 293, 292])
450
+ 2026-01-26 05:11:50,813 INFO [inference.py:334] Encoder out range: [-11.927, 13.414]
451
+ 2026-01-26 05:11:51,360 INFO [inference.py:344] Number of hypotheses: 13
452
+ 2026-01-26 05:11:51,360 INFO [inference.py:346] First hypothesis: [111, 114, 157, 57, 152, 25, 130, 101, 210, 96]
453
+ 2026-01-26 05:11:51,366 INFO [inference.py:309] Audio shape: torch.Size([13, 114560]), dtype: torch.float32
454
+ 2026-01-26 05:11:51,367 INFO [inference.py:310] Audio range: [-0.431, 0.430]
455
+ 2026-01-26 05:11:51,368 INFO [inference.py:311] Audio lengths: tensor([114559, 111359, 110240, 108639, 107840, 103519, 102240, 101759, 101120,
456
+ 100639, 98560, 97760, 97759], dtype=torch.int32)
457
+ 2026-01-26 05:11:59,310 INFO [inference.py:332] Encoder out shape: torch.Size([13, 357, 1024])
458
+ 2026-01-26 05:11:59,311 INFO [inference.py:333] Encoder out lens: tensor([357, 347, 344, 339, 336, 323, 319, 317, 315, 314, 307, 305, 305])
459
+ 2026-01-26 05:11:59,311 INFO [inference.py:334] Encoder out range: [-11.355, 12.943]
460
+ 2026-01-26 05:11:59,902 INFO [inference.py:344] Number of hypotheses: 13
461
+ 2026-01-26 05:11:59,902 INFO [inference.py:346] First hypothesis: [264, 48]
462
+ 2026-01-26 05:11:59,908 INFO [inference.py:309] Audio shape: torch.Size([6, 243200]), dtype: torch.float32
463
+ 2026-01-26 05:11:59,909 INFO [inference.py:310] Audio range: [-0.339, 0.341]
464
+ 2026-01-26 05:11:59,909 INFO [inference.py:311] Audio lengths: tensor([243200, 242079, 241760, 237920, 231679, 212799], dtype=torch.int32)
465
+ 2026-01-26 05:12:08,215 INFO [inference.py:332] Encoder out shape: torch.Size([6, 759, 1024])
466
+ 2026-01-26 05:12:08,215 INFO [inference.py:333] Encoder out lens: tensor([759, 756, 755, 743, 723, 664])
467
+ 2026-01-26 05:12:08,216 INFO [inference.py:334] Encoder out range: [-13.935, 11.852]
468
+ 2026-01-26 05:12:08,701 INFO [inference.py:344] Number of hypotheses: 6
469
+ 2026-01-26 05:12:08,701 INFO [inference.py:346] First hypothesis: [66, 89, 174, 20]
470
+ 2026-01-26 05:12:08,707 INFO [inference.py:309] Audio shape: torch.Size([5, 275520]), dtype: torch.float32
471
+ 2026-01-26 05:12:08,708 INFO [inference.py:310] Audio range: [-0.126, 0.148]
472
+ 2026-01-26 05:12:08,709 INFO [inference.py:311] Audio lengths: tensor([275520, 274880, 274880, 263999, 254879], dtype=torch.int32)
473
+ 2026-01-26 05:12:16,831 INFO [inference.py:332] Encoder out shape: torch.Size([5, 860, 1024])
474
+ 2026-01-26 05:12:16,831 INFO [inference.py:333] Encoder out lens: tensor([860, 858, 858, 824, 796])
475
+ 2026-01-26 05:12:16,832 INFO [inference.py:334] Encoder out range: [-12.819, 13.634]
476
+ 2026-01-26 05:12:17,414 INFO [inference.py:344] Number of hypotheses: 5
477
+ 2026-01-26 05:12:17,414 INFO [inference.py:346] First hypothesis: [11, 39, 51, 49, 46, 171, 81, 7, 69]
478
+ 2026-01-26 05:12:17,420 INFO [inference.py:309] Audio shape: torch.Size([38, 41920]), dtype: torch.float32
479
+ 2026-01-26 05:12:17,421 INFO [inference.py:310] Audio range: [-0.380, 0.393]
480
+ 2026-01-26 05:12:17,422 INFO [inference.py:311] Audio lengths: tensor([41919, 41760, 41599, 38560, 38080, 37440, 34400, 33600, 32159, 29120,
481
+ 27200, 26560, 25600, 24800, 23680, 23520, 23360, 19680, 18880, 16160,
482
+ 15360, 15200, 14880, 13600, 13440, 10080, 7840, 6720, 6400, 6080,
483
+ 6080, 5600, 5440, 5120, 4640, 4000, 3840, 3520],
484
+ dtype=torch.int32)
485
+ 2026-01-26 05:12:25,625 INFO [inference.py:332] Encoder out shape: torch.Size([38, 130, 1024])
486
+ 2026-01-26 05:12:25,626 INFO [inference.py:333] Encoder out lens: tensor([130, 130, 129, 120, 118, 116, 107, 104, 100, 90, 84, 82, 79, 77,
487
+ 73, 73, 72, 61, 58, 50, 47, 47, 46, 42, 41, 31, 24, 20,
488
+ 19, 18, 18, 17, 16, 15, 14, 12, 11, 10])
489
+ 2026-01-26 05:12:25,626 INFO [inference.py:334] Encoder out range: [-12.608, 11.500]
490
+ 2026-01-26 05:12:26,111 INFO [inference.py:344] Number of hypotheses: 38
491
+ 2026-01-26 05:12:26,111 INFO [inference.py:346] First hypothesis: [11]
492
+ 2026-01-26 05:12:26,117 INFO [inference.py:309] Audio shape: torch.Size([5, 289760]), dtype: torch.float32
493
+ 2026-01-26 05:12:26,117 INFO [inference.py:310] Audio range: [-0.259, 0.249]
494
+ 2026-01-26 05:12:26,118 INFO [inference.py:311] Audio lengths: tensor([289760, 283039, 277760, 261599, 250080], dtype=torch.int32)
495
+ 2026-01-26 05:12:34,901 INFO [inference.py:332] Encoder out shape: torch.Size([5, 905, 1024])
496
+ 2026-01-26 05:12:34,902 INFO [inference.py:333] Encoder out lens: tensor([905, 884, 867, 817, 781])
497
+ 2026-01-26 05:12:34,903 INFO [inference.py:334] Encoder out range: [-12.988, 13.561]
498
+ 2026-01-26 05:12:35,738 INFO [inference.py:344] Number of hypotheses: 5
499
+ 2026-01-26 05:12:35,738 INFO [inference.py:346] First hypothesis: [6, 290, 20, 48, 33, 238, 205, 37, 48, 265, 274]
500
+ 2026-01-26 05:12:35,744 INFO [inference.py:309] Audio shape: torch.Size([23, 68640]), dtype: torch.float32
501
+ 2026-01-26 05:12:35,745 INFO [inference.py:310] Audio range: [-0.114, 0.158]
502
+ 2026-01-26 05:12:35,745 INFO [inference.py:311] Audio lengths: tensor([68640, 67680, 66719, 66080, 65759, 65600, 64159, 64159, 61119, 60000,
503
+ 56800, 56639, 53760, 53440, 52640, 52479, 50720, 50400, 49760, 46880,
504
+ 46080, 45280, 45120], dtype=torch.int32)
505
+ 2026-01-26 05:12:44,120 INFO [inference.py:332] Encoder out shape: torch.Size([23, 214, 1024])
506
+ 2026-01-26 05:12:44,120 INFO [inference.py:333] Encoder out lens: tensor([214, 211, 208, 206, 205, 204, 200, 200, 190, 187, 177, 176, 167, 166,
507
+ 164, 163, 158, 157, 155, 146, 143, 141, 140])
508
+ 2026-01-26 05:12:44,121 INFO [inference.py:334] Encoder out range: [-13.289, 13.747]
509
+ 2026-01-26 05:12:44,824 INFO [inference.py:344] Number of hypotheses: 23
510
+ 2026-01-26 05:12:44,824 INFO [inference.py:346] First hypothesis: [89, 186, 32, 7, 8, 234, 13]
511
+ 2026-01-26 05:12:44,831 INFO [inference.py:309] Audio shape: torch.Size([40, 39520]), dtype: torch.float32
512
+ 2026-01-26 05:12:44,832 INFO [inference.py:310] Audio range: [-0.170, 0.217]
513
+ 2026-01-26 05:12:44,832 INFO [inference.py:311] Audio lengths: tensor([39520, 38720, 37760, 36800, 36320, 36159, 34720, 33919, 32640, 31200,
514
+ 29760, 28479, 27840, 27840, 24320, 23040, 21120, 20639, 17920, 16800,
515
+ 16160, 15840, 14720, 14560, 14560, 14400, 13760, 11520, 9920, 9919,
516
+ 9760, 9120, 7840, 7360, 7040, 5440, 4960, 4960, 4800, 4320],
517
+ dtype=torch.int32)
518
+ 2026-01-26 05:12:52,729 INFO [inference.py:332] Encoder out shape: torch.Size([40, 123, 1024])
519
+ 2026-01-26 05:12:52,730 INFO [inference.py:333] Encoder out lens: tensor([123, 120, 117, 114, 113, 112, 108, 105, 101, 97, 92, 88, 86, 86,
520
+ 75, 71, 65, 64, 55, 52, 50, 49, 45, 45, 45, 44, 42, 35,
521
+ 30, 30, 30, 28, 24, 22, 21, 16, 15, 15, 14, 13])
522
+ 2026-01-26 05:12:52,730 INFO [inference.py:334] Encoder out range: [-11.403, 12.142]
523
+ 2026-01-26 05:12:53,318 INFO [inference.py:344] Number of hypotheses: 40
524
+ 2026-01-26 05:12:53,319 INFO [inference.py:346] First hypothesis: [89]
525
+ 2026-01-26 05:12:53,319 INFO [inference.py:535] Processed 847 utterances in 50 batches
526
+ 2026-01-26 05:12:53,326 INFO [inference.py:309] Audio shape: torch.Size([9, 176320]), dtype: torch.float32
527
+ 2026-01-26 05:12:53,326 INFO [inference.py:310] Audio range: [-0.145, 0.173]
528
+ 2026-01-26 05:12:53,327 INFO [inference.py:311] Audio lengths: tensor([176320, 174879, 170880, 161280, 161120, 158880, 155039, 153760, 146079],
529
+ dtype=torch.int32)
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-13-05 ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:13:05,965 INFO [inference.py:617] ================================================================================
2
+ 2026-01-26 05:13:05,965 INFO [inference.py:618] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:13:05,965 INFO [inference.py:619] ================================================================================
4
+ 2026-01-26 05:13:05,965 INFO [inference.py:620] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:13:05,965 INFO [inference.py:621] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:13:05,965 INFO [inference.py:622] Test set: ihm
7
+ 2026-01-26 05:13:05,965 INFO [inference.py:623] Decoding method: modified_beam_search
8
+ 2026-01-26 05:13:05,966 INFO [inference.py:625] Beam size: 4
9
+ 2026-01-26 05:13:05,966 INFO [inference.py:626] Max states: 64
10
+ 2026-01-26 05:13:05,966 INFO [inference.py:627] Max symbols per frame: 3
11
+ 2026-01-26 05:13:05,966 INFO [inference.py:633] Device: cpu
12
+ 2026-01-26 05:13:05,966 INFO [inference.py:636] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 05:13:05,967 INFO [inference.py:644] Vocabulary size: 500
14
+ 2026-01-26 05:13:05,967 INFO [inference.py:645] Blank ID: 0
15
+ 2026-01-26 05:13:05,967 INFO [inference.py:648] Creating model
16
+ 2026-01-26 05:13:07,626 INFO [inference.py:655] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-train-loss.pt
17
+ 2026-01-26 05:13:07,626 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-train-loss.pt
18
+ 2026-01-26 05:13:12,816 INFO [inference.py:684] Number of model parameters: 317,511,772
19
+ 2026-01-26 05:13:12,816 INFO [inference.py:687] Loading test data
20
+ 2026-01-26 05:13:12,816 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 05:13:13,985 INFO [inference.py:698] Number of test utterances: 6676
22
+ 2026-01-26 05:13:13,986 INFO [inference.py:701] Starting inference...
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-14-59 ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:14:59,640 INFO [inference.py:625] ================================================================================
2
+ 2026-01-26 05:14:59,640 INFO [inference.py:626] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:14:59,640 INFO [inference.py:627] ================================================================================
4
+ 2026-01-26 05:14:59,640 INFO [inference.py:628] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:14:59,640 INFO [inference.py:629] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:14:59,640 INFO [inference.py:630] Test set: ihm
7
+ 2026-01-26 05:14:59,640 INFO [inference.py:631] Decoding method: modified_beam_search
8
+ 2026-01-26 05:14:59,640 INFO [inference.py:633] Beam size: 4
9
+ 2026-01-26 05:14:59,640 INFO [inference.py:634] Max states: 64
10
+ 2026-01-26 05:14:59,640 INFO [inference.py:635] Max symbols per frame: 3
11
+ 2026-01-26 05:14:59,640 INFO [inference.py:641] Device: cpu
12
+ 2026-01-26 05:14:59,640 INFO [inference.py:644] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 05:14:59,642 INFO [inference.py:652] Vocabulary size: 500
14
+ 2026-01-26 05:14:59,642 INFO [inference.py:653] Blank ID: 0
15
+ 2026-01-26 05:14:59,642 INFO [inference.py:656] Creating model
16
+ 2026-01-26 05:15:01,252 INFO [inference.py:663] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-train-loss.pt
17
+ 2026-01-26 05:15:01,252 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-train-loss.pt
18
+ 2026-01-26 05:15:06,040 INFO [inference.py:692] Number of model parameters: 317,511,772
19
+ 2026-01-26 05:15:06,040 INFO [inference.py:695] Loading test data
20
+ 2026-01-26 05:15:06,040 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 05:15:07,133 INFO [inference.py:706] Number of test utterances: 6676
22
+ 2026-01-26 05:15:07,133 INFO [inference.py:709] Starting inference...
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-17-40 ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:17:40,782 INFO [inference.py:622] ================================================================================
2
+ 2026-01-26 05:17:40,782 INFO [inference.py:623] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:17:40,782 INFO [inference.py:624] ================================================================================
4
+ 2026-01-26 05:17:40,782 INFO [inference.py:625] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:17:40,782 INFO [inference.py:626] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:17:40,783 INFO [inference.py:627] Test set: ihm
7
+ 2026-01-26 05:17:40,783 INFO [inference.py:628] Decoding method: modified_beam_search
8
+ 2026-01-26 05:17:40,783 INFO [inference.py:630] Beam size: 4
9
+ 2026-01-26 05:17:40,783 INFO [inference.py:631] Max states: 64
10
+ 2026-01-26 05:17:40,783 INFO [inference.py:632] Max symbols per frame: 3
11
+ 2026-01-26 05:17:40,783 INFO [inference.py:638] Device: cpu
12
+ 2026-01-26 05:17:40,783 INFO [inference.py:641] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 05:17:40,784 INFO [inference.py:649] Vocabulary size: 500
14
+ 2026-01-26 05:17:40,784 INFO [inference.py:650] Blank ID: 0
15
+ 2026-01-26 05:17:40,785 INFO [inference.py:653] Creating model
16
+ 2026-01-26 05:17:42,399 INFO [inference.py:660] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-train-loss.pt
17
+ 2026-01-26 05:17:42,400 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-train-loss.pt
18
+ 2026-01-26 05:17:47,415 INFO [inference.py:689] Number of model parameters: 317,511,772
19
+ 2026-01-26 05:17:47,416 INFO [inference.py:692] Loading test data
20
+ 2026-01-26 05:17:47,416 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 05:17:48,537 INFO [inference.py:703] Number of test utterances: 6676
22
+ 2026-01-26 05:17:48,538 INFO [inference.py:706] Starting inference...
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-20-04 ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:20:04,436 INFO [inference.py:613] ================================================================================
2
+ 2026-01-26 05:20:04,436 INFO [inference.py:614] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:20:04,436 INFO [inference.py:615] ================================================================================
4
+ 2026-01-26 05:20:04,436 INFO [inference.py:616] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:20:04,436 INFO [inference.py:617] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:20:04,436 INFO [inference.py:618] Test set: ihm
7
+ 2026-01-26 05:20:04,436 INFO [inference.py:619] Decoding method: modified_beam_search
8
+ 2026-01-26 05:20:04,436 INFO [inference.py:621] Beam size: 4
9
+ 2026-01-26 05:20:04,436 INFO [inference.py:622] Max states: 64
10
+ 2026-01-26 05:20:04,436 INFO [inference.py:623] Max symbols per frame: 3
11
+ 2026-01-26 05:20:04,437 INFO [inference.py:629] Device: cpu
12
+ 2026-01-26 05:20:04,437 INFO [inference.py:632] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 05:20:04,438 INFO [inference.py:640] Vocabulary size: 500
14
+ 2026-01-26 05:20:04,438 INFO [inference.py:641] Blank ID: 0
15
+ 2026-01-26 05:20:04,438 INFO [inference.py:644] Creating model
16
+ 2026-01-26 05:20:05,956 INFO [inference.py:651] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-train-loss.pt
17
+ 2026-01-26 05:20:05,957 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-train-loss.pt
18
+ 2026-01-26 05:20:10,638 INFO [inference.py:680] Number of model parameters: 317,511,772
19
+ 2026-01-26 05:20:10,639 INFO [inference.py:683] Loading test data
20
+ 2026-01-26 05:20:10,639 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 05:20:11,677 INFO [inference.py:694] Number of test utterances: 6676
22
+ 2026-01-26 05:20:11,677 INFO [inference.py:697] Starting inference...
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-29-29 ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:29:29,151 INFO [inference.py:613] ================================================================================
2
+ 2026-01-26 05:29:29,151 INFO [inference.py:614] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:29:29,151 INFO [inference.py:615] ================================================================================
4
+ 2026-01-26 05:29:29,151 INFO [inference.py:616] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:29:29,151 INFO [inference.py:617] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:29:29,151 INFO [inference.py:618] Test set: ihm
7
+ 2026-01-26 05:29:29,151 INFO [inference.py:619] Decoding method: modified_beam_search
8
+ 2026-01-26 05:29:29,151 INFO [inference.py:621] Beam size: 4
9
+ 2026-01-26 05:29:29,151 INFO [inference.py:622] Max states: 64
10
+ 2026-01-26 05:29:29,151 INFO [inference.py:623] Max symbols per frame: 3
11
+ 2026-01-26 05:29:29,151 INFO [inference.py:629] Device: cpu
12
+ 2026-01-26 05:29:29,151 INFO [inference.py:632] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 05:29:29,153 INFO [inference.py:640] Vocabulary size: 500
14
+ 2026-01-26 05:29:29,153 INFO [inference.py:641] Blank ID: 0
15
+ 2026-01-26 05:29:29,153 INFO [inference.py:644] Creating model
16
+ 2026-01-26 05:29:30,733 INFO [inference.py:673] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
17
+ 2026-01-26 05:29:30,734 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
18
+ 2026-01-26 05:29:35,902 INFO [inference.py:680] Number of model parameters: 317,511,772
19
+ 2026-01-26 05:29:35,902 INFO [inference.py:683] Loading test data
20
+ 2026-01-26 05:29:35,902 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 05:29:37,022 INFO [inference.py:694] Number of test utterances: 6676
22
+ 2026-01-26 05:29:37,023 INFO [inference.py:697] Starting inference...
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-48-19 ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:48:19,123 INFO [inference.py:613] ================================================================================
2
+ 2026-01-26 05:48:19,123 INFO [inference.py:614] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:48:19,123 INFO [inference.py:615] ================================================================================
4
+ 2026-01-26 05:48:19,123 INFO [inference.py:616] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:48:19,123 INFO [inference.py:617] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:48:19,123 INFO [inference.py:618] Test set: ihm
7
+ 2026-01-26 05:48:19,123 INFO [inference.py:619] Decoding method: modified_beam_search
8
+ 2026-01-26 05:48:19,123 INFO [inference.py:621] Beam size: 4
9
+ 2026-01-26 05:48:19,123 INFO [inference.py:622] Max states: 64
10
+ 2026-01-26 05:48:19,123 INFO [inference.py:623] Max symbols per frame: 3
11
+ 2026-01-26 05:48:19,123 INFO [inference.py:629] Device: cpu
12
+ 2026-01-26 05:48:19,123 INFO [inference.py:632] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 05:48:19,125 INFO [inference.py:640] Vocabulary size: 500
14
+ 2026-01-26 05:48:19,125 INFO [inference.py:641] Blank ID: 0
15
+ 2026-01-26 05:48:19,125 INFO [inference.py:644] Creating model
16
+ 2026-01-26 05:48:22,516 INFO [inference.py:651] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
17
+ 2026-01-26 05:48:22,517 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
18
+ 2026-01-26 05:48:39,229 INFO [inference.py:680] Number of model parameters: 317,511,772
19
+ 2026-01-26 05:48:39,229 INFO [inference.py:683] Loading test data
20
+ 2026-01-26 05:48:39,229 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 05:48:41,915 INFO [inference.py:694] Number of test utterances: 6676
22
+ 2026-01-26 05:48:41,915 INFO [inference.py:697] Starting inference...
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-50-10 ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:50:10,649 INFO [inference.py:613] ================================================================================
2
+ 2026-01-26 05:50:10,649 INFO [inference.py:614] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:50:10,649 INFO [inference.py:615] ================================================================================
4
+ 2026-01-26 05:50:10,649 INFO [inference.py:616] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:50:10,649 INFO [inference.py:617] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:50:10,649 INFO [inference.py:618] Test set: ihm
7
+ 2026-01-26 05:50:10,649 INFO [inference.py:619] Decoding method: modified_beam_search
8
+ 2026-01-26 05:50:10,649 INFO [inference.py:621] Beam size: 4
9
+ 2026-01-26 05:50:10,649 INFO [inference.py:622] Max states: 64
10
+ 2026-01-26 05:50:10,649 INFO [inference.py:623] Max symbols per frame: 3
11
+ 2026-01-26 05:50:10,649 INFO [inference.py:629] Device: cuda:0
12
+ 2026-01-26 05:50:10,649 INFO [inference.py:632] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 05:50:10,651 INFO [inference.py:640] Vocabulary size: 500
14
+ 2026-01-26 05:50:10,651 INFO [inference.py:641] Blank ID: 0
15
+ 2026-01-26 05:50:10,651 INFO [inference.py:644] Creating model
16
+ 2026-01-26 05:50:12,218 INFO [inference.py:651] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
17
+ 2026-01-26 05:50:12,219 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
18
+ 2026-01-26 05:50:18,117 INFO [inference.py:680] Number of model parameters: 317,511,772
19
+ 2026-01-26 05:50:18,118 INFO [inference.py:683] Loading test data
20
+ 2026-01-26 05:50:18,118 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 05:50:19,185 INFO [inference.py:694] Number of test utterances: 6676
22
+ 2026-01-26 05:50:19,186 INFO [inference.py:697] Starting inference...
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-05-54-32 ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 05:54:32,577 INFO [inference.py:613] ================================================================================
2
+ 2026-01-26 05:54:32,577 INFO [inference.py:614] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 05:54:32,577 INFO [inference.py:615] ================================================================================
4
+ 2026-01-26 05:54:32,578 INFO [inference.py:616] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 05:54:32,578 INFO [inference.py:617] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 05:54:32,578 INFO [inference.py:618] Test set: ihm
7
+ 2026-01-26 05:54:32,578 INFO [inference.py:619] Decoding method: modified_beam_search
8
+ 2026-01-26 05:54:32,578 INFO [inference.py:621] Beam size: 4
9
+ 2026-01-26 05:54:32,578 INFO [inference.py:622] Max states: 64
10
+ 2026-01-26 05:54:32,578 INFO [inference.py:623] Max symbols per frame: 3
11
+ 2026-01-26 05:54:32,578 INFO [inference.py:627] Device: cuda:0
12
+ 2026-01-26 05:54:32,578 INFO [inference.py:630] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 05:54:32,579 INFO [inference.py:638] Vocabulary size: 500
14
+ 2026-01-26 05:54:32,580 INFO [inference.py:639] Blank ID: 0
15
+ 2026-01-26 05:54:32,580 INFO [inference.py:642] Creating model
16
+ 2026-01-26 05:54:34,158 INFO [inference.py:649] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
17
+ 2026-01-26 05:54:34,158 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
18
+ 2026-01-26 05:54:40,547 INFO [inference.py:678] Number of model parameters: 317,511,772
19
+ 2026-01-26 05:54:40,547 INFO [inference.py:681] Loading test data
20
+ 2026-01-26 05:54:40,548 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 05:54:41,686 INFO [inference.py:692] Number of test utterances: 6676
22
+ 2026-01-26 05:54:41,686 INFO [inference.py:695] Starting inference...
23
+ 2026-01-26 05:54:41,686 INFO [inference.py:696] Note: First batch may take longer due to GPU warmup
24
+ 2026-01-26 05:54:42,879 INFO [inference.py:711]
25
+ ============================================================
26
+ 2026-01-26 05:54:42,879 INFO [inference.py:712] Processing batch 1
27
+ 2026-01-26 05:54:42,879 INFO [inference.py:718] Batch size: 6
28
+ 2026-01-26 05:54:42,879 INFO [inference.py:736] Starting decoding for this batch...
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-06-02-34 ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 06:02:34,679 INFO [inference.py:630] ================================================================================
2
+ 2026-01-26 06:02:34,679 INFO [inference.py:631] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 06:02:34,679 INFO [inference.py:632] ================================================================================
4
+ 2026-01-26 06:02:34,679 INFO [inference.py:633] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 06:02:34,679 INFO [inference.py:634] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 06:02:34,679 INFO [inference.py:635] Test set: ihm
7
+ 2026-01-26 06:02:34,679 INFO [inference.py:636] Decoding method: modified_beam_search
8
+ 2026-01-26 06:02:34,679 INFO [inference.py:638] Beam size: 4
9
+ 2026-01-26 06:02:34,679 INFO [inference.py:639] Max states: 64
10
+ 2026-01-26 06:02:34,680 INFO [inference.py:640] Max symbols per frame: 3
11
+ 2026-01-26 06:02:34,680 INFO [inference.py:644] Device: cuda:0
12
+ 2026-01-26 06:02:34,680 INFO [inference.py:647] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 06:02:34,681 INFO [inference.py:655] Vocabulary size: 500
14
+ 2026-01-26 06:02:34,681 INFO [inference.py:656] Blank ID: 0
15
+ 2026-01-26 06:02:34,681 INFO [inference.py:659] Creating model
16
+ 2026-01-26 06:02:36,292 INFO [inference.py:666] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
17
+ 2026-01-26 06:02:36,293 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
18
+ 2026-01-26 06:02:42,168 INFO [inference.py:695] Number of model parameters: 317,511,772
19
+ 2026-01-26 06:02:42,168 INFO [inference.py:698] Loading test data
20
+ 2026-01-26 06:02:42,168 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 06:02:43,277 INFO [inference.py:709] Number of test utterances: 6676
22
+ 2026-01-26 06:02:43,278 INFO [inference.py:712] Starting inference...
23
+ 2026-01-26 06:02:43,278 INFO [inference.py:713] Note: First batch may take longer due to GPU warmup
24
+ 2026-01-26 06:02:44,374 INFO [inference.py:728]
25
+ ============================================================
26
+ 2026-01-26 06:02:44,375 INFO [inference.py:729] Processing batch 1
27
+ 2026-01-26 06:02:44,375 INFO [inference.py:735] Batch size: 6
28
+ 2026-01-26 06:02:44,375 INFO [inference.py:753] Starting decoding for this batch...
29
+ 2026-01-26 06:02:45,363 INFO [inference.py:299] Beam search: Processing 6 utterances
30
+ 2026-01-26 06:02:45,363 INFO [inference.py:305] Utterance 1/6: 769 frames
31
+ 2026-01-26 06:02:45,363 INFO [inference.py:312] Frame 0/769, |B|=1
32
+ 2026-01-26 06:02:45,503 INFO [inference.py:362] After initial expansion: |A|=5
33
+ 2026-01-26 06:02:45,503 INFO [inference.py:371] Emission iteration 0, |A|=5
34
+ 2026-01-26 06:02:45,506 INFO [inference.py:371] Emission iteration 1, |A|=15
35
+ 2026-01-26 06:02:45,513 INFO [inference.py:371] Emission iteration 2, |A|=45
36
+ 2026-01-26 06:02:56,541 INFO [inference.py:312] Frame 100/769, |B|=64
37
+ 2026-01-26 06:02:56,567 INFO [inference.py:362] After initial expansion: |A|=320
38
+ 2026-01-26 06:02:56,567 INFO [inference.py:371] Emission iteration 0, |A|=320
39
+ 2026-01-26 06:02:56,594 INFO [inference.py:371] Emission iteration 1, |A|=64
40
+ 2026-01-26 06:02:56,620 INFO [inference.py:371] Emission iteration 2, |A|=64
41
+ 2026-01-26 06:03:07,191 INFO [inference.py:312] Frame 200/769, |B|=64
42
+ 2026-01-26 06:03:07,216 INFO [inference.py:362] After initial expansion: |A|=320
43
+ 2026-01-26 06:03:07,217 INFO [inference.py:371] Emission iteration 0, |A|=320
44
+ 2026-01-26 06:03:07,243 INFO [inference.py:371] Emission iteration 1, |A|=64
45
+ 2026-01-26 06:03:07,270 INFO [inference.py:371] Emission iteration 2, |A|=64
46
+ 2026-01-26 06:03:17,826 INFO [inference.py:312] Frame 300/769, |B|=64
47
+ 2026-01-26 06:03:17,851 INFO [inference.py:362] After initial expansion: |A|=320
48
+ 2026-01-26 06:03:17,851 INFO [inference.py:371] Emission iteration 0, |A|=320
49
+ 2026-01-26 06:03:17,878 INFO [inference.py:371] Emission iteration 1, |A|=64
50
+ 2026-01-26 06:03:17,904 INFO [inference.py:371] Emission iteration 2, |A|=64
51
+ 2026-01-26 06:03:28,408 INFO [inference.py:312] Frame 400/769, |B|=64
52
+ 2026-01-26 06:03:28,434 INFO [inference.py:362] After initial expansion: |A|=320
53
+ 2026-01-26 06:03:28,434 INFO [inference.py:371] Emission iteration 0, |A|=320
54
+ 2026-01-26 06:03:28,460 INFO [inference.py:371] Emission iteration 1, |A|=64
55
+ 2026-01-26 06:03:28,487 INFO [inference.py:371] Emission iteration 2, |A|=64
56
+ 2026-01-26 06:03:39,030 INFO [inference.py:312] Frame 500/769, |B|=64
57
+ 2026-01-26 06:03:39,060 INFO [inference.py:362] After initial expansion: |A|=320
58
+ 2026-01-26 06:03:39,060 INFO [inference.py:371] Emission iteration 0, |A|=320
59
+ 2026-01-26 06:03:39,094 INFO [inference.py:371] Emission iteration 1, |A|=64
60
+ 2026-01-26 06:03:39,125 INFO [inference.py:371] Emission iteration 2, |A|=64
61
+ 2026-01-26 06:03:49,620 INFO [inference.py:312] Frame 600/769, |B|=64
62
+ 2026-01-26 06:03:49,646 INFO [inference.py:362] After initial expansion: |A|=320
63
+ 2026-01-26 06:03:49,646 INFO [inference.py:371] Emission iteration 0, |A|=320
64
+ 2026-01-26 06:03:49,673 INFO [inference.py:371] Emission iteration 1, |A|=64
65
+ 2026-01-26 06:03:49,699 INFO [inference.py:371] Emission iteration 2, |A|=64
66
+ 2026-01-26 06:04:00,283 INFO [inference.py:312] Frame 700/769, |B|=64
67
+ 2026-01-26 06:04:00,309 INFO [inference.py:362] After initial expansion: |A|=320
68
+ 2026-01-26 06:04:00,309 INFO [inference.py:371] Emission iteration 0, |A|=320
69
+ 2026-01-26 06:04:00,335 INFO [inference.py:371] Emission iteration 1, |A|=64
70
+ 2026-01-26 06:04:00,362 INFO [inference.py:371] Emission iteration 2, |A|=64
71
+ 2026-01-26 06:04:07,525 INFO [inference.py:455] Utterance 1 result: 2 tokens
72
+ 2026-01-26 06:04:07,525 INFO [inference.py:305] Utterance 2/6: 764 frames
73
+ 2026-01-26 06:04:07,525 INFO [inference.py:312] Frame 0/764, |B|=1
74
+ 2026-01-26 06:04:07,526 INFO [inference.py:362] After initial expansion: |A|=5
75
+ 2026-01-26 06:04:07,526 INFO [inference.py:371] Emission iteration 0, |A|=5
76
+ 2026-01-26 06:04:07,528 INFO [inference.py:371] Emission iteration 1, |A|=15
77
+ 2026-01-26 06:04:07,534 INFO [inference.py:371] Emission iteration 2, |A|=45
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-06-04-30 ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-26 06:04:30,814 INFO [inference.py:578] ================================================================================
2
+ 2026-01-26 06:04:30,815 INFO [inference.py:579] XLSR-Transducer Inference on AMI
3
+ 2026-01-26 06:04:30,815 INFO [inference.py:580] ================================================================================
4
+ 2026-01-26 06:04:30,815 INFO [inference.py:581] Experiment dir: xlsr_transducer/exp_16gb_scd
5
+ 2026-01-26 06:04:30,815 INFO [inference.py:582] Output dir: xlsr_transducer/exp_16gb_scd/inference_results
6
+ 2026-01-26 06:04:30,815 INFO [inference.py:583] Test set: ihm
7
+ 2026-01-26 06:04:30,815 INFO [inference.py:584] Decoding method: modified_beam_search
8
+ 2026-01-26 06:04:30,815 INFO [inference.py:586] Beam size: 4
9
+ 2026-01-26 06:04:30,815 INFO [inference.py:587] Max states: 64
10
+ 2026-01-26 06:04:30,815 INFO [inference.py:588] Max symbols per frame: 3
11
+ 2026-01-26 06:04:30,815 INFO [inference.py:592] Device: cuda:0
12
+ 2026-01-26 06:04:30,815 INFO [inference.py:595] Loading BPE model from data/lang_bpe_500_scd
13
+ 2026-01-26 06:04:30,817 INFO [inference.py:603] Vocabulary size: 500
14
+ 2026-01-26 06:04:30,817 INFO [inference.py:604] Blank ID: 0
15
+ 2026-01-26 06:04:30,817 INFO [inference.py:607] Creating model
16
+ 2026-01-26 06:04:32,424 INFO [inference.py:614] Loading checkpoint: xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
17
+ 2026-01-26 06:04:32,424 INFO [checkpoint.py:111] Loading checkpoint from xlsr_transducer/exp_16gb_scd/best-valid-loss.pt
18
+ 2026-01-26 06:04:38,254 INFO [inference.py:643] Number of model parameters: 317,511,772
19
+ 2026-01-26 06:04:38,254 INFO [inference.py:646] Loading test data
20
+ 2026-01-26 06:04:38,254 INFO [asr_datamodule.py:448] About to get AMI IHM test cuts with [SCD] tokens
21
+ 2026-01-26 06:04:39,360 INFO [inference.py:657] Number of test utterances: 6676
22
+ 2026-01-26 06:04:39,361 INFO [inference.py:660] Starting inference...
23
+ 2026-01-26 06:04:39,361 INFO [inference.py:661] Note: First batch may take longer due to GPU warmup
24
+ 2026-01-26 06:04:40,450 INFO [inference.py:676]
25
+ ============================================================
26
+ 2026-01-26 06:04:40,450 INFO [inference.py:677] Processing batch 1
27
+ 2026-01-26 06:04:40,450 INFO [inference.py:683] Batch size: 6
28
+ 2026-01-26 06:04:40,450 INFO [inference.py:701] Starting decoding for this batch...
29
+ 2026-01-26 06:04:41,439 INFO [inference.py:283] Beam search: Processing 6 utterances
30
+ 2026-01-26 06:04:41,440 INFO [inference.py:289] Utterance 1/6: 769 frames
31
+ 2026-01-26 06:04:41,440 INFO [inference.py:296] Frame 0/769, |B|=1
32
+ 2026-01-26 06:04:41,938 INFO [inference.py:296] Frame 200/769, |B|=4
33
+ 2026-01-26 06:04:42,252 INFO [inference.py:296] Frame 400/769, |B|=4
34
+ 2026-01-26 06:04:42,564 INFO [inference.py:296] Frame 600/769, |B|=4
35
+ 2026-01-26 06:04:42,846 INFO [inference.py:403] Utterance 1 result: 1 tokens
36
+ 2026-01-26 06:04:42,846 INFO [inference.py:289] Utterance 2/6: 764 frames
37
+ 2026-01-26 06:04:42,846 INFO [inference.py:296] Frame 0/764, |B|=1
38
+ 2026-01-26 06:04:43,158 INFO [inference.py:296] Frame 200/764, |B|=4
39
+ 2026-01-26 06:04:43,477 INFO [inference.py:296] Frame 400/764, |B|=4
40
+ 2026-01-26 06:04:43,804 INFO [inference.py:296] Frame 600/764, |B|=4
41
+ 2026-01-26 06:04:44,077 INFO [inference.py:403] Utterance 2 result: 31 tokens
42
+ 2026-01-26 06:04:44,078 INFO [inference.py:289] Utterance 3/6: 743 frames
43
+ 2026-01-26 06:04:44,078 INFO [inference.py:296] Frame 0/743, |B|=1
44
+ 2026-01-26 06:04:44,393 INFO [inference.py:296] Frame 200/743, |B|=4
45
+ 2026-01-26 06:04:44,721 INFO [inference.py:296] Frame 400/743, |B|=4
46
+ 2026-01-26 06:04:45,054 INFO [inference.py:296] Frame 600/743, |B|=4
47
+ 2026-01-26 06:04:45,278 INFO [inference.py:403] Utterance 3 result: 1 tokens
48
+ 2026-01-26 06:04:45,278 INFO [inference.py:289] Utterance 4/6: 712 frames
49
+ 2026-01-26 06:04:45,278 INFO [inference.py:296] Frame 0/712, |B|=1
50
+ 2026-01-26 06:04:45,592 INFO [inference.py:296] Frame 200/712, |B|=4
51
+ 2026-01-26 06:04:45,907 INFO [inference.py:296] Frame 400/712, |B|=4
52
+ 2026-01-26 06:04:46,221 INFO [inference.py:296] Frame 600/712, |B|=4
53
+ 2026-01-26 06:04:46,396 INFO [inference.py:403] Utterance 4 result: 13 tokens
54
+ 2026-01-26 06:04:46,397 INFO [inference.py:289] Utterance 5/6: 699 frames
55
+ 2026-01-26 06:04:46,397 INFO [inference.py:296] Frame 0/699, |B|=1
56
+ 2026-01-26 06:04:46,713 INFO [inference.py:296] Frame 200/699, |B|=4
57
+ 2026-01-26 06:04:47,059 INFO [inference.py:296] Frame 400/699, |B|=4
58
+ 2026-01-26 06:04:47,404 INFO [inference.py:296] Frame 600/699, |B|=4
59
+ 2026-01-26 06:04:47,572 INFO [inference.py:403] Utterance 5 result: 11 tokens
60
+ 2026-01-26 06:04:47,572 INFO [inference.py:289] Utterance 6/6: 696 frames
61
+ 2026-01-26 06:04:47,572 INFO [inference.py:296] Frame 0/696, |B|=1
62
+ 2026-01-26 06:04:47,895 INFO [inference.py:296] Frame 200/696, |B|=4
63
+ 2026-01-26 06:04:48,221 INFO [inference.py:296] Frame 400/696, |B|=4
64
+ 2026-01-26 06:04:48,558 INFO [inference.py:296] Frame 600/696, |B|=4
65
+ 2026-01-26 06:04:48,713 INFO [inference.py:403] Utterance 6 result: 13 tokens
66
+ 2026-01-26 06:04:48,713 INFO [inference.py:410] Beam search complete
67
+ 2026-01-26 06:04:48,713 INFO [inference.py:707] Decoding completed in 8.26s
68
+ 2026-01-26 06:04:48,713 INFO [inference.py:710] Converting tokens to text...
69
+ 2026-01-26 06:04:48,714 INFO [inference.py:715] First hypothesis: OKAY...
70
+ 2026-01-26 06:04:48,714 INFO [inference.py:723] Batch 1 completed in 8.26s
71
+ 2026-01-26 06:04:48,714 INFO [inference.py:724] Average time per utterance: 1.38s
72
+ 2026-01-26 06:04:48,714 INFO [inference.py:725] Total processed so far: 6 utterances in 1 batches
egs/ami/ASR/xlsr_transducer/inference_results/log-inference-ihm-2026-01-26-06-07-36 ADDED
The diff for this file is too large to render. See raw diff
 
egs/ami/ASR/xlsr_transducer/inference_results/metrics-ihm.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ================================================================================
2
+ XLSR-Transducer Inference Results
3
+ ================================================================================
4
+ Experiment: xlsr_transducer/exp_16gb_scd
5
+ Test set: ihm
6
+ Decoding method: modified_beam_search
7
+ Beam size: 4
8
+ Max states: 64
9
+ Max symbols per frame: 3
10
+ Number of utterances: 6676
11
+ Total words: 92205
12
+ Total errors: 73964
13
+ WER: 80.22%
14
+ Total inference time: 46.9 minutes
15
+ Average time per utterance: 0.42s
16
+ ================================================================================
egs/ami/ASR/xlsr_transducer/inference_results/ref-ihm.txt ADDED
The diff for this file is too large to render. See raw diff
 
egs/ami/ASR/xlsr_transducer/log/log-train-2026-01-25-02-57-28 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27310bd90828a6f6d515d1181fa187228601dfe8247ecc89d39848c95e54ea20
3
+ size 174840669
egs/ami/ASR/xlsr_transducer/tensorboard/events.out.tfevents.1769309848.3edaabdb707c.1028020.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bb2715978701cb9358c38337c7bb5316cffc55440353a079aeb9c0bdc3867f2
3
+ size 158109