diff --git "a/info.log" "b/info.log" new file mode 100644--- /dev/null +++ "b/info.log" @@ -0,0 +1,3454 @@ +2025-10-14 02:33:37,074 - train - INFO - ConformerModel( + (linear_layer_1): Linear(in_features=128, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (conformer_blocks): Sequential( + (0): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (1): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (2): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (3): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (4): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (5): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (6): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (7): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (8): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (9): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + ) + (linear_layer_2): Linear(in_features=256, out_features=28, bias=True) +) +2025-10-14 02:41:40,845 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 02:43:53,269 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 02:53:22,675 - train - INFO - epoch : 1 +2025-10-14 02:53:22,676 - train - INFO - loss : 1.8004077727144414 +2025-10-14 02:53:22,676 - train - INFO - grad_norm : 1.1968199670916857 +2025-10-14 02:53:22,676 - train - INFO - val_loss : 1.467006968049442 +2025-10-14 02:53:22,676 - train - INFO - val_CER_(Argmax): 0.46757403776657935 +2025-10-14 02:53:22,676 - train - INFO - val_WER_(Argmax): 0.91038943818923 +2025-10-14 02:53:22,676 - train - INFO - val_WER : 0.91038943818923 +2025-10-14 02:53:22,677 - train - INFO - val_CER : 0.4676158105801009 +2025-10-14 02:53:22,677 - train - INFO - test_loss : 1.430062472820282 +2025-10-14 02:53:22,677 - train - INFO - test_CER_(Argmax): 0.4589772069767894 +2025-10-14 02:53:22,677 - train - INFO - test_WER_(Argmax): 0.906752747897352 +2025-10-14 02:53:22,677 - train - INFO - test_WER : 0.906752747897352 +2025-10-14 02:53:22,677 - train - INFO - test_CER : 0.45909848600936254 +2025-10-14 02:53:23,214 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 03:03:20,978 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 03:05:39,207 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 03:13:20,217 - train - INFO - epoch : 2 +2025-10-14 03:13:20,217 - train - INFO - loss : 0.9497057100636276 +2025-10-14 03:13:20,217 - train - INFO - grad_norm : 1.2964627569045253 +2025-10-14 03:13:20,218 - train - INFO - val_loss : 0.9703210683429943 +2025-10-14 03:13:20,218 - train - INFO - val_CER_(Argmax): 0.3104969995818503 +2025-10-14 03:13:20,218 - train - INFO - val_WER_(Argmax): 0.7484133288798358 +2025-10-14 03:13:20,218 - train - INFO - val_WER : 0.7484133288798358 +2025-10-14 03:13:20,218 - train - INFO - val_CER : 0.30930462548815907 +2025-10-14 03:13:20,218 - train - INFO - test_loss : 0.9425025051686822 +2025-10-14 03:13:20,218 - train - INFO - test_CER_(Argmax): 0.30274019803097907 +2025-10-14 03:13:20,218 - train - INFO - test_WER_(Argmax): 0.7372494591821148 +2025-10-14 03:13:20,218 - train - INFO - test_WER : 0.7372494591821148 +2025-10-14 03:13:20,219 - train - INFO - test_CER : 0.30157728834330844 +2025-10-14 03:13:20,885 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 03:25:20,451 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 03:33:10,510 - train - INFO - epoch : 3 +2025-10-14 03:33:10,510 - train - INFO - loss : 0.7622273603081703 +2025-10-14 03:33:10,510 - train - INFO - grad_norm : 1.1402055078744888 +2025-10-14 03:33:10,511 - train - INFO - val_loss : 0.8829054685199962 +2025-10-14 03:33:10,511 - train - INFO - val_CER_(Argmax): 0.28195997427460867 +2025-10-14 03:33:10,511 - train - INFO - val_WER_(Argmax): 0.686879415725692 +2025-10-14 03:33:10,511 - train - INFO - val_WER : 0.686879415725692 +2025-10-14 03:33:10,511 - train - INFO - val_CER : 0.28205835407842134 +2025-10-14 03:33:10,511 - train - INFO - test_loss : 0.8557483960942525 +2025-10-14 03:33:10,511 - train - INFO - test_CER_(Argmax): 0.2709563942448364 +2025-10-14 03:33:10,511 - train - INFO - test_WER_(Argmax): 0.6692670294343694 +2025-10-14 03:33:10,511 - train - INFO - test_WER : 0.6692670294343694 +2025-10-14 03:33:10,512 - train - INFO - test_CER : 0.27099345931346924 +2025-10-14 03:33:11,148 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 03:53:06,907 - train - INFO - epoch : 4 +2025-10-14 03:53:06,908 - train - INFO - loss : 0.602354964017868 +2025-10-14 03:53:06,908 - train - INFO - grad_norm : 0.8039740589261055 +2025-10-14 03:53:06,908 - train - INFO - val_loss : 0.7007653839447919 +2025-10-14 03:53:06,908 - train - INFO - val_CER_(Argmax): 0.22225459871101402 +2025-10-14 03:53:06,908 - train - INFO - val_WER_(Argmax): 0.5930371723301392 +2025-10-14 03:53:06,908 - train - INFO - val_WER : 0.5930371723301392 +2025-10-14 03:53:06,908 - train - INFO - val_CER : 0.22210362264174635 +2025-10-14 03:53:06,908 - train - INFO - test_loss : 0.6907694921260927 +2025-10-14 03:53:06,908 - train - INFO - test_CER_(Argmax): 0.2163203087967603 +2025-10-14 03:53:06,909 - train - INFO - test_WER_(Argmax): 0.5772301195235712 +2025-10-14 03:53:06,909 - train - INFO - test_WER : 0.5772301195235712 +2025-10-14 03:53:06,909 - train - INFO - test_CER : 0.21606124644889366 +2025-10-14 03:53:07,539 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 03:58:05,821 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 04:13:08,317 - train - INFO - epoch : 5 +2025-10-14 04:13:08,318 - train - INFO - loss : 0.5589919736981392 +2025-10-14 04:13:08,318 - train - INFO - grad_norm : 0.6693760113418102 +2025-10-14 04:13:08,318 - train - INFO - val_loss : 0.6744126432082233 +2025-10-14 04:13:08,318 - train - INFO - val_CER_(Argmax): 0.21458751555183747 +2025-10-14 04:13:08,319 - train - INFO - val_WER_(Argmax): 0.5772959000644565 +2025-10-14 04:13:08,319 - train - INFO - val_WER : 0.5772959000644565 +2025-10-14 04:13:08,319 - train - INFO - val_CER : 0.2145634022145982 +2025-10-14 04:13:08,319 - train - INFO - test_loss : 0.6590640392245316 +2025-10-14 04:13:08,319 - train - INFO - test_CER_(Argmax): 0.20839209387247756 +2025-10-14 04:13:08,319 - train - INFO - test_WER_(Argmax): 0.5701414952271677 +2025-10-14 04:13:08,319 - train - INFO - test_WER : 0.5701414952271677 +2025-10-14 04:13:08,320 - train - INFO - test_CER : 0.20839523910387514 +2025-10-14 04:13:08,988 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 04:14:24,248 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 04:24:28,763 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 04:33:06,709 - train - INFO - epoch : 6 +2025-10-14 04:33:06,709 - train - INFO - loss : 0.510627674460411 +2025-10-14 04:33:06,709 - train - INFO - grad_norm : 0.5528287927806378 +2025-10-14 04:33:06,710 - train - INFO - val_loss : 0.6022388451239642 +2025-10-14 04:33:06,710 - train - INFO - val_CER_(Argmax): 0.18296962699849365 +2025-10-14 04:33:06,710 - train - INFO - val_WER_(Argmax): 0.5093214369543156 +2025-10-14 04:33:06,710 - train - INFO - val_WER : 0.5093214369543156 +2025-10-14 04:33:06,710 - train - INFO - val_CER : 0.18261087096328094 +2025-10-14 04:33:06,710 - train - INFO - test_loss : 0.6027363142589244 +2025-10-14 04:33:06,710 - train - INFO - test_CER_(Argmax): 0.1820291797987071 +2025-10-14 04:33:06,710 - train - INFO - test_WER_(Argmax): 0.5047745633151853 +2025-10-14 04:33:06,710 - train - INFO - test_WER : 0.5047745633151853 +2025-10-14 04:33:06,710 - train - INFO - test_CER : 0.18158809112179367 +2025-10-14 04:33:07,385 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 04:42:03,191 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 04:53:08,991 - train - INFO - epoch : 7 +2025-10-14 04:53:08,992 - train - INFO - loss : 0.49359407886188833 +2025-10-14 04:53:08,992 - train - INFO - grad_norm : 0.5190753033712282 +2025-10-14 04:53:08,992 - train - INFO - val_loss : 0.6014730471021988 +2025-10-14 04:53:08,992 - train - INFO - val_CER_(Argmax): 0.181704769539811 +2025-10-14 04:53:08,993 - train - INFO - val_WER_(Argmax): 0.5059981667294224 +2025-10-14 04:53:08,993 - train - INFO - val_WER : 0.5059981667294224 +2025-10-14 04:53:08,993 - train - INFO - val_CER : 0.1815978525961797 +2025-10-14 04:53:08,993 - train - INFO - test_loss : 0.5996043998293761 +2025-10-14 04:53:08,993 - train - INFO - test_CER_(Argmax): 0.1791391732758246 +2025-10-14 04:53:08,993 - train - INFO - test_WER_(Argmax): 0.4997126493658538 +2025-10-14 04:53:08,993 - train - INFO - test_WER : 0.4997126493658538 +2025-10-14 04:53:08,994 - train - INFO - test_CER : 0.179074904175568 +2025-10-14 04:53:09,676 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 04:58:15,378 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 05:13:07,029 - train - INFO - epoch : 8 +2025-10-14 05:13:07,030 - train - INFO - loss : 0.46707881450653077 +2025-10-14 05:13:07,030 - train - INFO - grad_norm : 0.48182750105857847 +2025-10-14 05:13:07,030 - train - INFO - val_loss : 0.6367673782741322 +2025-10-14 05:13:07,030 - train - INFO - val_CER_(Argmax): 0.19087030186457948 +2025-10-14 05:13:07,030 - train - INFO - val_WER_(Argmax): 0.5220166510054483 +2025-10-14 05:13:07,030 - train - INFO - val_WER : 0.5220166510054483 +2025-10-14 05:13:07,030 - train - INFO - val_CER : 0.190838427285644 +2025-10-14 05:13:07,031 - train - INFO - test_loss : 0.6175299279573487 +2025-10-14 05:13:07,031 - train - INFO - test_CER_(Argmax): 0.18446621558922244 +2025-10-14 05:13:07,031 - train - INFO - test_WER_(Argmax): 0.5084007759551228 +2025-10-14 05:13:07,031 - train - INFO - test_WER : 0.5084007759551228 +2025-10-14 05:13:07,031 - train - INFO - test_CER : 0.184502159264417 +2025-10-14 05:20:54,798 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 05:33:02,433 - train - INFO - epoch : 9 +2025-10-14 05:33:02,434 - train - INFO - loss : 0.45376209877244195 +2025-10-14 05:33:02,434 - train - INFO - grad_norm : 0.45064780400626026 +2025-10-14 05:33:02,435 - train - INFO - val_loss : 0.5756007843157824 +2025-10-14 05:33:02,435 - train - INFO - val_CER_(Argmax): 0.16914851214323548 +2025-10-14 05:33:02,435 - train - INFO - val_WER_(Argmax): 0.47379946747058904 +2025-10-14 05:33:02,435 - train - INFO - val_WER : 0.47379946747058904 +2025-10-14 05:33:02,435 - train - INFO - val_CER : 0.16900721738636365 +2025-10-14 05:33:02,435 - train - INFO - test_loss : 0.5721487871757368 +2025-10-14 05:33:02,436 - train - INFO - test_CER_(Argmax): 0.16664663014275632 +2025-10-14 05:33:02,436 - train - INFO - test_WER_(Argmax): 0.4681725557437241 +2025-10-14 05:33:02,436 - train - INFO - test_WER : 0.4681725557437241 +2025-10-14 05:33:02,436 - train - INFO - test_CER : 0.16649619330717766 +2025-10-14 05:33:03,081 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 05:33:14,650 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 05:43:25,713 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 05:52:54,270 - train - INFO - epoch : 10 +2025-10-14 05:52:54,271 - train - INFO - loss : 0.45315693199634555 +2025-10-14 05:52:54,271 - train - INFO - grad_norm : 0.4713628640770912 +2025-10-14 05:52:54,271 - train - INFO - val_loss : 0.5362565857522628 +2025-10-14 05:52:54,271 - train - INFO - val_CER_(Argmax): 0.16329148533434748 +2025-10-14 05:52:54,271 - train - INFO - val_WER_(Argmax): 0.46387223589891163 +2025-10-14 05:52:54,271 - train - INFO - val_WER : 0.46387223589891163 +2025-10-14 05:52:54,271 - train - INFO - val_CER : 0.16286558348739288 +2025-10-14 05:52:54,271 - train - INFO - test_loss : 0.5317229174259233 +2025-10-14 05:52:54,271 - train - INFO - test_CER_(Argmax): 0.16059709717785184 +2025-10-14 05:52:54,271 - train - INFO - test_WER_(Argmax): 0.45693877851173287 +2025-10-14 05:52:54,272 - train - INFO - test_WER : 0.45693877851173287 +2025-10-14 05:52:54,272 - train - INFO - test_CER : 0.1603862774668369 +2025-10-14 05:52:54,905 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 06:12:35,761 - train - INFO - epoch : 11 +2025-10-14 06:12:35,762 - train - INFO - loss : 0.39052830457687376 +2025-10-14 06:12:35,762 - train - INFO - grad_norm : 0.4450642728060484 +2025-10-14 06:12:35,762 - train - INFO - val_loss : 0.51932592532214 +2025-10-14 06:12:35,762 - train - INFO - val_CER_(Argmax): 0.15941955542976455 +2025-10-14 06:12:35,762 - train - INFO - val_WER_(Argmax): 0.45788092649283985 +2025-10-14 06:12:35,763 - train - INFO - val_WER : 0.45788092649283985 +2025-10-14 06:12:35,763 - train - INFO - val_CER : 0.15936533181396875 +2025-10-14 06:12:35,763 - train - INFO - test_loss : 0.5151232182979584 +2025-10-14 06:12:35,763 - train - INFO - test_CER_(Argmax): 0.15455427775801173 +2025-10-14 06:12:35,763 - train - INFO - test_WER_(Argmax): 0.44637718160637985 +2025-10-14 06:12:35,763 - train - INFO - test_WER : 0.44637718160637985 +2025-10-14 06:12:35,763 - train - INFO - test_CER : 0.15453517470721737 +2025-10-14 06:12:36,429 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 06:13:48,904 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 06:23:40,844 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 06:32:20,518 - train - INFO - epoch : 12 +2025-10-14 06:32:20,518 - train - INFO - loss : 0.34388998910784724 +2025-10-14 06:32:20,518 - train - INFO - grad_norm : 0.40590107560157773 +2025-10-14 06:32:20,518 - train - INFO - val_loss : 0.5381783243487863 +2025-10-14 06:32:20,518 - train - INFO - val_CER_(Argmax): 0.16583726089869136 +2025-10-14 06:32:20,519 - train - INFO - val_WER_(Argmax): 0.465543927183187 +2025-10-14 06:32:20,519 - train - INFO - val_WER : 0.465543927183187 +2025-10-14 06:32:20,519 - train - INFO - val_CER : 0.16531468827653925 +2025-10-14 06:32:20,519 - train - INFO - test_loss : 0.5295760235408458 +2025-10-14 06:32:20,519 - train - INFO - test_CER_(Argmax): 0.16174688828818035 +2025-10-14 06:32:20,519 - train - INFO - test_WER_(Argmax): 0.4550225790980045 +2025-10-14 06:32:20,519 - train - INFO - test_WER : 0.4550225790980045 +2025-10-14 06:32:20,519 - train - INFO - test_CER : 0.16122770436209705 +2025-10-14 06:43:29,714 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 06:52:09,495 - train - INFO - epoch : 13 +2025-10-14 06:52:09,496 - train - INFO - loss : 0.34937265180051325 +2025-10-14 06:52:09,496 - train - INFO - grad_norm : 0.4043967518210411 +2025-10-14 06:52:09,496 - train - INFO - val_loss : 0.48908867941183204 +2025-10-14 06:52:09,496 - train - INFO - val_CER_(Argmax): 0.14971865099250545 +2025-10-14 06:52:09,497 - train - INFO - val_WER_(Argmax): 0.4332590763851437 +2025-10-14 06:52:09,497 - train - INFO - val_WER : 0.4332590763851437 +2025-10-14 06:52:09,497 - train - INFO - val_CER : 0.1497014184247999 +2025-10-14 06:52:09,497 - train - INFO - test_loss : 0.49085995009759575 +2025-10-14 06:52:09,497 - train - INFO - test_CER_(Argmax): 0.1494535796558252 +2025-10-14 06:52:09,497 - train - INFO - test_WER_(Argmax): 0.4310647728310606 +2025-10-14 06:52:09,497 - train - INFO - test_WER : 0.4310647728310606 +2025-10-14 06:52:09,497 - train - INFO - test_CER : 0.14945445213192635 +2025-10-14 06:52:10,157 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 06:57:44,296 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 07:11:49,105 - train - INFO - epoch : 14 +2025-10-14 07:11:49,105 - train - INFO - loss : 0.32730326041579244 +2025-10-14 07:11:49,105 - train - INFO - grad_norm : 0.37786272332072257 +2025-10-14 07:11:49,105 - train - INFO - val_loss : 0.5277743637561798 +2025-10-14 07:11:49,106 - train - INFO - val_CER_(Argmax): 0.15338718155863898 +2025-10-14 07:11:49,106 - train - INFO - val_WER_(Argmax): 0.44184830517379275 +2025-10-14 07:11:49,106 - train - INFO - val_WER : 0.44184830517379275 +2025-10-14 07:11:49,106 - train - INFO - val_CER : 0.15328479401941472 +2025-10-14 07:11:49,106 - train - INFO - test_loss : 0.5241630139147363 +2025-10-14 07:11:49,106 - train - INFO - test_CER_(Argmax): 0.14972207345168487 +2025-10-14 07:11:49,106 - train - INFO - test_WER_(Argmax): 0.43158279188478127 +2025-10-14 07:11:49,106 - train - INFO - test_WER : 0.43158279188478127 +2025-10-14 07:11:49,106 - train - INFO - test_CER : 0.14960003782834835 +2025-10-14 07:31:25,217 - train - INFO - epoch : 15 +2025-10-14 07:31:25,218 - train - INFO - loss : 0.3190571042895317 +2025-10-14 07:31:25,219 - train - INFO - grad_norm : 0.3733240906894207 +2025-10-14 07:31:25,219 - train - INFO - val_loss : 0.4872164645615746 +2025-10-14 07:31:25,219 - train - INFO - val_CER_(Argmax): 0.14572484758853044 +2025-10-14 07:31:25,219 - train - INFO - val_WER_(Argmax): 0.4191253071785101 +2025-10-14 07:31:25,219 - train - INFO - val_WER : 0.4191253071785101 +2025-10-14 07:31:25,219 - train - INFO - val_CER : 0.1452825863232472 +2025-10-14 07:31:25,219 - train - INFO - test_loss : 0.48767780676120664 +2025-10-14 07:31:25,219 - train - INFO - test_CER_(Argmax): 0.1442184158052019 +2025-10-14 07:31:25,219 - train - INFO - test_WER_(Argmax): 0.4156251226875968 +2025-10-14 07:31:25,219 - train - INFO - test_WER : 0.4156251226875968 +2025-10-14 07:31:25,220 - train - INFO - test_CER : 0.14397894549224005 +2025-10-14 07:31:25,873 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 07:32:07,397 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 07:38:29,332 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 07:51:09,085 - train - INFO - epoch : 16 +2025-10-14 07:51:09,085 - train - INFO - loss : 0.32288431376218796 +2025-10-14 07:51:09,085 - train - INFO - grad_norm : 0.3875794377923012 +2025-10-14 07:51:09,085 - train - INFO - val_loss : 0.4824349988909329 +2025-10-14 07:51:09,086 - train - INFO - val_CER_(Argmax): 0.1440481174169628 +2025-10-14 07:51:09,086 - train - INFO - val_WER_(Argmax): 0.41937533251399106 +2025-10-14 07:51:09,086 - train - INFO - val_WER : 0.41937533251399106 +2025-10-14 07:51:09,086 - train - INFO - val_CER : 0.14402863422008916 +2025-10-14 07:51:09,086 - train - INFO - test_loss : 0.4875093171509301 +2025-10-14 07:51:09,086 - train - INFO - test_CER_(Argmax): 0.1435495187572233 +2025-10-14 07:51:09,086 - train - INFO - test_WER_(Argmax): 0.41857261206475077 +2025-10-14 07:51:09,086 - train - INFO - test_WER : 0.41857261206475077 +2025-10-14 07:51:09,086 - train - INFO - test_CER : 0.14345600543528397 +2025-10-14 07:53:04,340 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 08:11:00,249 - train - INFO - epoch : 17 +2025-10-14 08:11:00,250 - train - INFO - loss : 0.3117878006398678 +2025-10-14 08:11:00,250 - train - INFO - grad_norm : 0.37129794418811796 +2025-10-14 08:11:00,250 - train - INFO - val_loss : 0.46727428611587074 +2025-10-14 08:11:00,251 - train - INFO - val_CER_(Argmax): 0.1398674099185118 +2025-10-14 08:11:00,251 - train - INFO - val_WER_(Argmax): 0.4029957532757304 +2025-10-14 08:11:00,251 - train - INFO - val_WER : 0.4029957532757304 +2025-10-14 08:11:00,251 - train - INFO - val_CER : 0.138428207923737 +2025-10-14 08:11:00,251 - train - INFO - test_loss : 0.4605345202655327 +2025-10-14 08:11:00,251 - train - INFO - test_CER_(Argmax): 0.13763281168297406 +2025-10-14 08:11:00,251 - train - INFO - test_WER_(Argmax): 0.40026093361768017 +2025-10-14 08:11:00,251 - train - INFO - test_WER : 0.40026093361768017 +2025-10-14 08:11:00,251 - train - INFO - test_CER : 0.13695235893427762 +2025-10-14 08:11:00,897 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 08:20:41,969 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 08:31:03,757 - train - INFO - epoch : 18 +2025-10-14 08:31:03,757 - train - INFO - loss : 0.3081890671846256 +2025-10-14 08:31:03,758 - train - INFO - grad_norm : 0.37016375235576726 +2025-10-14 08:31:03,758 - train - INFO - val_loss : 0.4678590574685265 +2025-10-14 08:31:03,758 - train - INFO - val_CER_(Argmax): 0.13379779068102082 +2025-10-14 08:31:03,758 - train - INFO - val_WER_(Argmax): 0.3922874175956862 +2025-10-14 08:31:03,758 - train - INFO - val_WER : 0.3922874175956862 +2025-10-14 08:31:03,758 - train - INFO - val_CER : 0.13369347277137167 +2025-10-14 08:31:03,758 - train - INFO - test_loss : 0.4725317922307224 +2025-10-14 08:31:03,758 - train - INFO - test_CER_(Argmax): 0.13567544579262084 +2025-10-14 08:31:03,758 - train - INFO - test_WER_(Argmax): 0.3943974565677792 +2025-10-14 08:31:03,758 - train - INFO - test_WER : 0.3943974565677792 +2025-10-14 08:31:03,759 - train - INFO - test_CER : 0.13559223799657805 +2025-10-14 08:31:04,419 - train - INFO - Saving current best: model_best.pth ... +2025-10-14 08:36:17,211 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 08:50:46,825 - train - INFO - epoch : 19 +2025-10-14 08:50:46,825 - train - INFO - loss : 0.2682218397408724 +2025-10-14 08:50:46,825 - train - INFO - grad_norm : 0.32187166772782805 +2025-10-14 08:50:46,826 - train - INFO - val_loss : 0.47410393462461586 +2025-10-14 08:50:46,826 - train - INFO - val_CER_(Argmax): 0.14253732992734036 +2025-10-14 08:50:46,826 - train - INFO - val_WER_(Argmax): 0.409711592751532 +2025-10-14 08:50:46,826 - train - INFO - val_WER : 0.409711592751532 +2025-10-14 08:50:46,826 - train - INFO - val_CER : 0.14251998090573448 +2025-10-14 08:50:46,826 - train - INFO - test_loss : 0.470459710170583 +2025-10-14 08:50:46,826 - train - INFO - test_CER_(Argmax): 0.14044063030058693 +2025-10-14 08:50:46,826 - train - INFO - test_WER_(Argmax): 0.40506228117177334 +2025-10-14 08:50:46,826 - train - INFO - test_WER : 0.40506228117177334 +2025-10-14 08:50:46,827 - train - INFO - test_CER : 0.1404145806112171 +2025-10-14 08:53:19,357 - urllib3.connectionpool - WARNING - Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='www.comet.com', port=443): Read timed out. (read timeout=10)")': /clientlib/status-report/update +2025-10-14 08:53:21,310 - urllib3.connectionpool - WARNING - Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='www.comet.com', port=443): Read timed out. (read timeout=10)")': /clientlib/rest/v2/write/experiment/output +2025-10-14 08:57:22,238 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 09:02:29,246 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 09:10:42,467 - train - INFO - epoch : 20 +2025-10-14 09:10:42,468 - train - INFO - loss : 0.26297851376235487 +2025-10-14 09:10:42,468 - train - INFO - grad_norm : 0.36333887211978433 +2025-10-14 09:10:42,468 - train - INFO - val_loss : 0.498136385048137 +2025-10-14 09:10:42,468 - train - INFO - val_CER_(Argmax): 0.1409829398308886 +2025-10-14 09:10:42,468 - train - INFO - val_WER_(Argmax): 0.40756040553729267 +2025-10-14 09:10:42,468 - train - INFO - val_WER : 0.40756040553729267 +2025-10-14 09:10:42,468 - train - INFO - val_CER : 0.14095097487519742 +2025-10-14 09:10:42,468 - train - INFO - test_loss : 0.48370018637761836 +2025-10-14 09:10:42,469 - train - INFO - test_CER_(Argmax): 0.1360583083333615 +2025-10-14 09:10:42,469 - train - INFO - test_WER_(Argmax): 0.39381178720670307 +2025-10-14 09:10:42,469 - train - INFO - test_WER : 0.39381178720670307 +2025-10-14 09:10:42,469 - train - INFO - test_CER : 0.1360182842490938 +2025-10-14 09:10:42,992 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch20.pth ... +2025-10-14 09:11:48,738 - train - INFO - Saving model on keyboard interrupt +2025-10-14 09:11:49,259 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch21.pth ... +2025-10-14 23:15:37,958 - train - INFO - ConformerModel( + (linear_layer_1): Linear(in_features=128, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (conformer_blocks): Sequential( + (0): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (1): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (2): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (3): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (4): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (5): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (6): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (7): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (8): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (9): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + ) + (linear_layer_2): Linear(in_features=256, out_features=28, bias=True) +) +2025-10-14 23:15:37,974 - train - INFO - Loading checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch21.pth ... +2025-10-14 23:15:38,389 - train - INFO - Checkpoint loaded. Resume training from epoch 22 +2025-10-14 23:23:52,739 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 23:26:06,300 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 23:36:17,410 - train - INFO - epoch : 22 +2025-10-14 23:36:17,411 - train - INFO - loss : 0.2934816901129906 +2025-10-14 23:36:17,411 - train - INFO - grad_norm : 0.3821600427982783 +2025-10-14 23:36:17,411 - train - INFO - val_loss : 0.4874178209725548 +2025-10-14 23:36:17,411 - train - INFO - val_CER_(Argmax): 0.13945358821010811 +2025-10-14 23:36:17,411 - train - INFO - val_WER_(Argmax): 0.40280195242601624 +2025-10-14 23:36:17,412 - train - INFO - val_WER : 0.40280195242601624 +2025-10-14 23:36:17,412 - train - INFO - val_CER : 0.13942245845447518 +2025-10-14 23:36:17,412 - train - INFO - test_loss : 0.48479962312593694 +2025-10-14 23:36:17,412 - train - INFO - test_CER_(Argmax): 0.13631144181611937 +2025-10-14 23:36:17,412 - train - INFO - test_WER_(Argmax): 0.3958590163824199 +2025-10-14 23:36:17,412 - train - INFO - test_WER : 0.3958590163824199 +2025-10-14 23:36:17,412 - train - INFO - test_CER : 0.13629110738251396 +2025-10-14 23:46:20,954 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 23:48:42,744 - train - WARNING - OOM on batch. Skipping batch. +2025-10-14 23:56:39,785 - train - INFO - epoch : 23 +2025-10-14 23:56:39,786 - train - INFO - loss : 0.2551273109775093 +2025-10-14 23:56:39,786 - train - INFO - grad_norm : 0.3137171796368594 +2025-10-14 23:56:39,786 - train - INFO - val_loss : 0.46369018204071943 +2025-10-14 23:56:39,786 - train - INFO - val_CER_(Argmax): 0.13684675847070465 +2025-10-14 23:56:39,786 - train - INFO - val_WER_(Argmax): 0.39398508692774187 +2025-10-14 23:56:39,787 - train - INFO - val_WER : 0.39398508692774187 +2025-10-14 23:56:39,787 - train - INFO - val_CER : 0.13685407007272402 +2025-10-14 23:56:39,787 - train - INFO - test_loss : 0.4581556451029894 +2025-10-14 23:56:39,787 - train - INFO - test_CER_(Argmax): 0.1350574156452121 +2025-10-14 23:56:39,787 - train - INFO - test_WER_(Argmax): 0.3875261612050869 +2025-10-14 23:56:39,787 - train - INFO - test_WER : 0.3875261612050869 +2025-10-14 23:56:39,787 - train - INFO - test_CER : 0.13511500315245614 +2025-10-15 00:08:52,697 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 00:17:07,562 - train - INFO - epoch : 24 +2025-10-15 00:17:07,562 - train - INFO - loss : 0.245378298163414 +2025-10-15 00:17:07,563 - train - INFO - grad_norm : 0.3387820851057768 +2025-10-15 00:17:07,563 - train - INFO - val_loss : 0.5287014912156498 +2025-10-15 00:17:07,563 - train - INFO - val_CER_(Argmax): 0.1539106275040438 +2025-10-15 00:17:07,563 - train - INFO - val_WER_(Argmax): 0.42460000884399807 +2025-10-15 00:17:07,563 - train - INFO - val_WER : 0.42460000884399807 +2025-10-15 00:17:07,563 - train - INFO - val_CER : 0.15297842974722475 +2025-10-15 00:17:07,563 - train - INFO - test_loss : 0.5029914589916787 +2025-10-15 00:17:07,563 - train - INFO - test_CER_(Argmax): 0.14657635076122771 +2025-10-15 00:17:07,564 - train - INFO - test_WER_(Argmax): 0.41203503318541046 +2025-10-15 00:17:07,564 - train - INFO - test_WER : 0.41203503318541046 +2025-10-15 00:17:07,564 - train - INFO - test_CER : 0.1457842404666151 +2025-10-15 00:37:36,383 - train - INFO - epoch : 25 +2025-10-15 00:37:36,384 - train - INFO - loss : 0.23428807348012926 +2025-10-15 00:37:36,384 - train - INFO - grad_norm : 0.3239304776489735 +2025-10-15 00:37:36,384 - train - INFO - val_loss : 0.44715122440282035 +2025-10-15 00:37:36,384 - train - INFO - val_CER_(Argmax): 0.12522702466393065 +2025-10-15 00:37:36,384 - train - INFO - val_WER_(Argmax): 0.36798136361847184 +2025-10-15 00:37:36,385 - train - INFO - val_WER : 0.36798136361847184 +2025-10-15 00:37:36,385 - train - INFO - val_CER : 0.12516017792671708 +2025-10-15 00:37:36,385 - train - INFO - test_loss : 0.4525063423121848 +2025-10-15 00:37:36,385 - train - INFO - test_CER_(Argmax): 0.12637926517100662 +2025-10-15 00:37:36,385 - train - INFO - test_WER_(Argmax): 0.3720632078294957 +2025-10-15 00:37:36,385 - train - INFO - test_WER : 0.3720632078294957 +2025-10-15 00:37:36,385 - train - INFO - test_CER : 0.1263555695987811 +2025-10-15 00:37:36,962 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 00:42:33,247 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 00:57:31,683 - train - INFO - epoch : 26 +2025-10-15 00:57:31,684 - train - INFO - loss : 0.21956679798662662 +2025-10-15 00:57:31,684 - train - INFO - grad_norm : 0.30377400361001494 +2025-10-15 00:57:31,684 - train - INFO - val_loss : 0.4440635372610653 +2025-10-15 00:57:31,684 - train - INFO - val_CER_(Argmax): 0.12553994948531128 +2025-10-15 00:57:31,684 - train - INFO - val_WER_(Argmax): 0.3683140692106527 +2025-10-15 00:57:31,684 - train - INFO - val_WER : 0.3683140692106527 +2025-10-15 00:57:31,685 - train - INFO - val_CER : 0.12547953415857194 +2025-10-15 00:57:31,685 - train - INFO - test_loss : 0.43620326751615945 +2025-10-15 00:57:31,685 - train - INFO - test_CER_(Argmax): 0.12184782350276652 +2025-10-15 00:57:31,685 - train - INFO - test_WER_(Argmax): 0.36093858955137864 +2025-10-15 00:57:31,685 - train - INFO - test_WER : 0.36093858955137864 +2025-10-15 00:57:31,685 - train - INFO - test_CER : 0.12182641046616761 +2025-10-15 00:58:46,849 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 01:08:54,287 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 01:18:58,969 - train - INFO - epoch : 27 +2025-10-15 01:18:58,970 - train - INFO - loss : 0.23763787642121315 +2025-10-15 01:18:58,970 - train - INFO - grad_norm : 0.33571442432701587 +2025-10-15 01:18:58,970 - train - INFO - val_loss : 0.43435747272828046 +2025-10-15 01:18:58,970 - train - INFO - val_CER_(Argmax): 0.12460465956934305 +2025-10-15 01:18:58,970 - train - INFO - val_WER_(Argmax): 0.3689147427735292 +2025-10-15 01:18:58,970 - train - INFO - val_WER : 0.3689147427735292 +2025-10-15 01:18:58,970 - train - INFO - val_CER : 0.12435221723652463 +2025-10-15 01:18:58,970 - train - INFO - test_loss : 0.42551243159829116 +2025-10-15 01:18:58,971 - train - INFO - test_CER_(Argmax): 0.12051501557557807 +2025-10-15 01:18:58,971 - train - INFO - test_WER_(Argmax): 0.35871069551154516 +2025-10-15 01:18:58,971 - train - INFO - test_WER : 0.35871069551154516 +2025-10-15 01:18:58,971 - train - INFO - test_CER : 0.12040524447063847 +2025-10-15 01:28:06,616 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 01:40:43,152 - train - INFO - epoch : 28 +2025-10-15 01:40:43,152 - train - INFO - loss : 0.2343284837414871 +2025-10-15 01:40:43,152 - train - INFO - grad_norm : 0.32623336228293986 +2025-10-15 01:40:43,152 - train - INFO - val_loss : 0.45126108702491313 +2025-10-15 01:40:43,153 - train - INFO - val_CER_(Argmax): 0.12340088072525852 +2025-10-15 01:40:43,153 - train - INFO - val_WER_(Argmax): 0.36056492495645576 +2025-10-15 01:40:43,153 - train - INFO - val_WER : 0.36056492495645576 +2025-10-15 01:40:43,153 - train - INFO - val_CER : 0.12344191243753086 +2025-10-15 01:40:43,153 - train - INFO - test_loss : 0.44489473068132634 +2025-10-15 01:40:43,153 - train - INFO - test_CER_(Argmax): 0.12100421883695911 +2025-10-15 01:40:43,153 - train - INFO - test_WER_(Argmax): 0.3557473436617356 +2025-10-15 01:40:43,153 - train - INFO - test_WER : 0.3557473436617356 +2025-10-15 01:40:43,153 - train - INFO - test_CER : 0.1209957554068205 +2025-10-15 01:40:43,788 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 01:45:52,826 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 02:02:15,203 - train - INFO - epoch : 29 +2025-10-15 02:02:15,204 - train - INFO - loss : 0.2276789116859436 +2025-10-15 02:02:15,204 - train - INFO - grad_norm : 0.31875048592686656 +2025-10-15 02:02:15,204 - train - INFO - val_loss : 0.4515169536366182 +2025-10-15 02:02:15,204 - train - INFO - val_CER_(Argmax): 0.12343411279442047 +2025-10-15 02:02:15,205 - train - INFO - val_WER_(Argmax): 0.35693396607175665 +2025-10-15 02:02:15,205 - train - INFO - val_WER : 0.35693396607175665 +2025-10-15 02:02:15,205 - train - INFO - val_CER : 0.12296479435758914 +2025-10-15 02:02:15,205 - train - INFO - test_loss : 0.4444568338917523 +2025-10-15 02:02:15,205 - train - INFO - test_CER_(Argmax): 0.12155681713388337 +2025-10-15 02:02:15,205 - train - INFO - test_WER_(Argmax): 0.355160702184518 +2025-10-15 02:02:15,206 - train - INFO - test_WER : 0.355160702184518 +2025-10-15 02:02:15,206 - train - INFO - test_CER : 0.1213823591345638 +2025-10-15 02:02:16,139 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 02:10:06,354 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 02:23:50,140 - train - INFO - epoch : 30 +2025-10-15 02:23:50,141 - train - INFO - loss : 0.23757737919912866 +2025-10-15 02:23:50,141 - train - INFO - grad_norm : 0.3481074444313145 +2025-10-15 02:23:50,141 - train - INFO - val_loss : 0.4355435743051417 +2025-10-15 02:23:50,141 - train - INFO - val_CER_(Argmax): 0.12227846859888573 +2025-10-15 02:23:50,141 - train - INFO - val_WER_(Argmax): 0.3600137657491089 +2025-10-15 02:23:50,141 - train - INFO - val_WER : 0.3600137657491089 +2025-10-15 02:23:50,142 - train - INFO - val_CER : 0.12224294816692473 +2025-10-15 02:23:50,142 - train - INFO - test_loss : 0.4234782213844904 +2025-10-15 02:23:50,142 - train - INFO - test_CER_(Argmax): 0.11799567883835432 +2025-10-15 02:23:50,142 - train - INFO - test_WER_(Argmax): 0.3509967284700314 +2025-10-15 02:23:50,142 - train - INFO - test_WER : 0.3509967284700314 +2025-10-15 02:23:50,142 - train - INFO - test_CER : 0.11798016735938993 +2025-10-15 02:23:50,653 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch30.pth ... +2025-10-15 02:24:02,264 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 02:34:18,495 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 02:45:15,288 - train - INFO - epoch : 31 +2025-10-15 02:45:15,289 - train - INFO - loss : 0.2335624401271343 +2025-10-15 02:45:15,289 - train - INFO - grad_norm : 0.33499817200005055 +2025-10-15 02:45:15,289 - train - INFO - val_loss : 0.42463123097139244 +2025-10-15 02:45:15,289 - train - INFO - val_CER_(Argmax): 0.1214074309645935 +2025-10-15 02:45:15,289 - train - INFO - val_WER_(Argmax): 0.36004170998968105 +2025-10-15 02:45:15,289 - train - INFO - val_WER : 0.36004170998968105 +2025-10-15 02:45:15,289 - train - INFO - val_CER : 0.12133784581497396 +2025-10-15 02:45:15,289 - train - INFO - test_loss : 0.4196070029241283 +2025-10-15 02:45:15,289 - train - INFO - test_CER_(Argmax): 0.1190913597673928 +2025-10-15 02:45:15,289 - train - INFO - test_WER_(Argmax): 0.3543684771257035 +2025-10-15 02:45:15,289 - train - INFO - test_WER : 0.3543684771257035 +2025-10-15 02:45:15,290 - train - INFO - test_CER : 0.11909188803428304 +2025-10-15 03:06:34,594 - train - INFO - epoch : 32 +2025-10-15 03:06:34,595 - train - INFO - loss : 0.19390573374927045 +2025-10-15 03:06:34,595 - train - INFO - grad_norm : 0.2884246703982353 +2025-10-15 03:06:34,595 - train - INFO - val_loss : 0.4251282506129321 +2025-10-15 03:06:34,595 - train - INFO - val_CER_(Argmax): 0.11977977920474925 +2025-10-15 03:06:34,596 - train - INFO - val_WER_(Argmax): 0.35403802314681054 +2025-10-15 03:06:34,596 - train - INFO - val_WER : 0.35403802314681054 +2025-10-15 03:06:34,596 - train - INFO - val_CER : 0.11974593748907238 +2025-10-15 03:06:34,596 - train - INFO - test_loss : 0.4268707185983658 +2025-10-15 03:06:34,596 - train - INFO - test_CER_(Argmax): 0.11750315678556099 +2025-10-15 03:06:34,596 - train - INFO - test_WER_(Argmax): 0.34796513829846193 +2025-10-15 03:06:34,596 - train - INFO - test_WER : 0.34796513829846193 +2025-10-15 03:06:34,596 - train - INFO - test_CER : 0.11745897923149805 +2025-10-15 03:06:35,206 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 03:07:47,566 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 03:17:49,399 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 03:27:58,262 - train - INFO - epoch : 33 +2025-10-15 03:27:58,263 - train - INFO - loss : 0.18811449602246286 +2025-10-15 03:27:58,263 - train - INFO - grad_norm : 0.3027683352679014 +2025-10-15 03:27:58,263 - train - INFO - val_loss : 0.4447146780350629 +2025-10-15 03:27:58,263 - train - INFO - val_CER_(Argmax): 0.1167911485471282 +2025-10-15 03:27:58,263 - train - INFO - val_WER_(Argmax): 0.34777324203356125 +2025-10-15 03:27:58,263 - train - INFO - val_WER : 0.34777324203356125 +2025-10-15 03:27:58,264 - train - INFO - val_CER : 0.11672586478730562 +2025-10-15 03:27:58,264 - train - INFO - test_loss : 0.44641188295876105 +2025-10-15 03:27:58,264 - train - INFO - test_CER_(Argmax): 0.11520872571090347 +2025-10-15 03:27:58,264 - train - INFO - test_WER_(Argmax): 0.34454126380885003 +2025-10-15 03:27:58,264 - train - INFO - test_WER : 0.34454126380885003 +2025-10-15 03:27:58,264 - train - INFO - test_CER : 0.11515496969675332 +2025-10-15 03:27:58,876 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 03:39:06,057 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 03:49:07,853 - train - INFO - epoch : 34 +2025-10-15 03:49:07,854 - train - INFO - loss : 0.19895173616707326 +2025-10-15 03:49:07,854 - train - INFO - grad_norm : 0.3446432762593031 +2025-10-15 03:49:07,854 - train - INFO - val_loss : 0.438978297920788 +2025-10-15 03:49:07,854 - train - INFO - val_CER_(Argmax): 0.12138526768120507 +2025-10-15 03:49:07,854 - train - INFO - val_WER_(Argmax): 0.3567880885491568 +2025-10-15 03:49:07,854 - train - INFO - val_WER : 0.3567880885491568 +2025-10-15 03:49:07,855 - train - INFO - val_CER : 0.12133834485438902 +2025-10-15 03:49:07,855 - train - INFO - test_loss : 0.4366752952337265 +2025-10-15 03:49:07,855 - train - INFO - test_CER_(Argmax): 0.11889285769529587 +2025-10-15 03:49:07,855 - train - INFO - test_WER_(Argmax): 0.3499597095170695 +2025-10-15 03:49:07,855 - train - INFO - test_WER : 0.3499597095170695 +2025-10-15 03:49:07,855 - train - INFO - test_CER : 0.11886283480129935 +2025-10-15 03:54:43,121 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 04:10:10,751 - train - INFO - epoch : 35 +2025-10-15 04:10:10,752 - train - INFO - loss : 0.18583452805876732 +2025-10-15 04:10:10,752 - train - INFO - grad_norm : 0.2995685636997223 +2025-10-15 04:10:10,752 - train - INFO - val_loss : 0.4410898643381455 +2025-10-15 04:10:10,752 - train - INFO - val_CER_(Argmax): 0.11951493903695873 +2025-10-15 04:10:10,752 - train - INFO - val_WER_(Argmax): 0.3514972872290408 +2025-10-15 04:10:10,752 - train - INFO - val_WER : 0.3514972872290408 +2025-10-15 04:10:10,752 - train - INFO - val_CER : 0.11950992517253951 +2025-10-15 04:10:10,752 - train - INFO - test_loss : 0.43057214050758175 +2025-10-15 04:10:10,753 - train - INFO - test_CER_(Argmax): 0.11632696628536299 +2025-10-15 04:10:10,753 - train - INFO - test_WER_(Argmax): 0.34279318429294053 +2025-10-15 04:10:10,753 - train - INFO - test_WER : 0.34279318429294053 +2025-10-15 04:10:10,753 - train - INFO - test_CER : 0.11630289332594408 +2025-10-15 04:10:11,240 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch35.pth ... +2025-10-15 04:31:14,040 - train - INFO - epoch : 36 +2025-10-15 04:31:14,041 - train - INFO - loss : 0.20375656172633172 +2025-10-15 04:31:14,041 - train - INFO - grad_norm : 0.3344234121590853 +2025-10-15 04:31:14,041 - train - INFO - val_loss : 0.4425490912269143 +2025-10-15 04:31:14,042 - train - INFO - val_CER_(Argmax): 0.12107380933012289 +2025-10-15 04:31:14,042 - train - INFO - val_WER_(Argmax): 0.35624895795851813 +2025-10-15 04:31:14,042 - train - INFO - val_WER : 0.35624895795851813 +2025-10-15 04:31:14,042 - train - INFO - val_CER : 0.12094349863299866 +2025-10-15 04:31:14,042 - train - INFO - test_loss : 0.4421439832303582 +2025-10-15 04:31:14,042 - train - INFO - test_CER_(Argmax): 0.1203417035792212 +2025-10-15 04:31:14,042 - train - INFO - test_WER_(Argmax): 0.3539471761721452 +2025-10-15 04:31:14,042 - train - INFO - test_WER : 0.3539471761721452 +2025-10-15 04:31:14,042 - train - INFO - test_CER : 0.12014604748975138 +2025-10-15 04:31:55,800 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 04:38:23,938 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 04:52:26,034 - train - INFO - epoch : 37 +2025-10-15 04:52:26,035 - train - INFO - loss : 0.19795498616993426 +2025-10-15 04:52:26,035 - train - INFO - grad_norm : 0.3208508171141148 +2025-10-15 04:52:26,036 - train - INFO - val_loss : 0.4592762873453252 +2025-10-15 04:52:26,036 - train - INFO - val_CER_(Argmax): 0.1294481027382875 +2025-10-15 04:52:26,036 - train - INFO - val_WER_(Argmax): 0.3774834616766213 +2025-10-15 04:52:26,036 - train - INFO - val_WER : 0.3774834616766213 +2025-10-15 04:52:26,037 - train - INFO - val_CER : 0.1291189030153579 +2025-10-15 04:52:26,037 - train - INFO - test_loss : 0.45307019171191426 +2025-10-15 04:52:26,037 - train - INFO - test_CER_(Argmax): 0.12595557229485574 +2025-10-15 04:52:26,037 - train - INFO - test_WER_(Argmax): 0.36986993718068867 +2025-10-15 04:52:26,038 - train - INFO - test_WER : 0.36986993718068867 +2025-10-15 04:52:26,038 - train - INFO - test_CER : 0.12586377840829696 +2025-10-15 04:54:23,286 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 05:13:56,065 - train - INFO - epoch : 38 +2025-10-15 05:13:56,065 - train - INFO - loss : 0.19663494504988194 +2025-10-15 05:13:56,065 - train - INFO - grad_norm : 0.3321287203580141 +2025-10-15 05:13:56,066 - train - INFO - val_loss : 0.4284120608778561 +2025-10-15 05:13:56,066 - train - INFO - val_CER_(Argmax): 0.11491743825574693 +2025-10-15 05:13:56,066 - train - INFO - val_WER_(Argmax): 0.34405708317994527 +2025-10-15 05:13:56,066 - train - INFO - val_WER : 0.34405708317994527 +2025-10-15 05:13:56,066 - train - INFO - val_CER : 0.11487393368251399 +2025-10-15 05:13:56,066 - train - INFO - test_loss : 0.42923609730674 +2025-10-15 05:13:56,066 - train - INFO - test_CER_(Argmax): 0.11365153750471271 +2025-10-15 05:13:56,066 - train - INFO - test_WER_(Argmax): 0.34081102085255915 +2025-10-15 05:13:56,066 - train - INFO - test_WER : 0.34081102085255915 +2025-10-15 05:13:56,066 - train - INFO - test_CER : 0.11363074827669829 +2025-10-15 05:13:56,668 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 05:23:38,950 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 05:35:12,536 - train - INFO - epoch : 39 +2025-10-15 05:35:12,538 - train - INFO - loss : 0.18985670647728983 +2025-10-15 05:35:12,538 - train - INFO - grad_norm : 0.3164519183150488 +2025-10-15 05:35:12,538 - train - INFO - val_loss : 0.4361394615734325 +2025-10-15 05:35:12,538 - train - INFO - val_CER_(Argmax): 0.11303924020618653 +2025-10-15 05:35:12,538 - train - INFO - val_WER_(Argmax): 0.336099117293402 +2025-10-15 05:35:12,539 - train - INFO - val_WER : 0.336099117293402 +2025-10-15 05:35:12,539 - train - INFO - val_CER : 0.11297259763035278 +2025-10-15 05:35:12,539 - train - INFO - test_loss : 0.4445284605026245 +2025-10-15 05:35:12,539 - train - INFO - test_CER_(Argmax): 0.11357265394174307 +2025-10-15 05:35:12,540 - train - INFO - test_WER_(Argmax): 0.33712349274851 +2025-10-15 05:35:12,540 - train - INFO - test_WER : 0.33712349274851 +2025-10-15 05:35:12,540 - train - INFO - test_CER : 0.11356115257526862 +2025-10-15 05:35:13,651 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 05:40:26,534 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 05:56:08,877 - train - INFO - epoch : 40 +2025-10-15 05:56:08,878 - train - INFO - loss : 0.1758589457720518 +2025-10-15 05:56:08,878 - train - INFO - grad_norm : 0.3102634911984205 +2025-10-15 05:56:08,878 - train - INFO - val_loss : 0.43493274906102347 +2025-10-15 05:56:08,878 - train - INFO - val_CER_(Argmax): 0.1127735913870602 +2025-10-15 05:56:08,878 - train - INFO - val_WER_(Argmax): 0.3333324824481823 +2025-10-15 05:56:08,878 - train - INFO - val_WER : 0.3333324824481823 +2025-10-15 05:56:08,879 - train - INFO - val_CER : 0.11270196248096376 +2025-10-15 05:56:08,879 - train - INFO - test_loss : 0.4304133802652359 +2025-10-15 05:56:08,879 - train - INFO - test_CER_(Argmax): 0.10964191141102644 +2025-10-15 05:56:08,879 - train - INFO - test_WER_(Argmax): 0.3288502833506312 +2025-10-15 05:56:08,879 - train - INFO - test_WER : 0.3288502833506312 +2025-10-15 05:56:08,879 - train - INFO - test_CER : 0.10959007331535237 +2025-10-15 05:56:09,478 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 06:02:47,603 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 06:07:57,641 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 06:17:32,447 - train - INFO - epoch : 41 +2025-10-15 06:17:32,448 - train - INFO - loss : 0.1584758448600769 +2025-10-15 06:17:32,448 - train - INFO - grad_norm : 0.31132353775203225 +2025-10-15 06:17:32,448 - train - INFO - val_loss : 0.4486117271816029 +2025-10-15 06:17:32,449 - train - INFO - val_CER_(Argmax): 0.11394301416811013 +2025-10-15 06:17:32,449 - train - INFO - val_WER_(Argmax): 0.3352175680667114 +2025-10-15 06:17:32,449 - train - INFO - val_WER : 0.3352175680667114 +2025-10-15 06:17:32,449 - train - INFO - val_CER : 0.11388101864339778 +2025-10-15 06:17:32,449 - train - INFO - test_loss : 0.4439057616925821 +2025-10-15 06:17:32,450 - train - INFO - test_CER_(Argmax): 0.11181283918602594 +2025-10-15 06:17:32,450 - train - INFO - test_WER_(Argmax): 0.3330420680106056 +2025-10-15 06:17:32,450 - train - INFO - test_WER : 0.3330420680106056 +2025-10-15 06:17:32,450 - train - INFO - test_CER : 0.11175747260237462 +2025-10-15 06:38:32,900 - train - INFO - epoch : 42 +2025-10-15 06:38:32,901 - train - INFO - loss : 0.15154085498303174 +2025-10-15 06:38:32,901 - train - INFO - grad_norm : 0.31321163214743136 +2025-10-15 06:38:32,901 - train - INFO - val_loss : 0.4170303176431095 +2025-10-15 06:38:32,901 - train - INFO - val_CER_(Argmax): 0.11403384671386754 +2025-10-15 06:38:32,901 - train - INFO - val_WER_(Argmax): 0.339910982840436 +2025-10-15 06:38:32,901 - train - INFO - val_WER : 0.339910982840436 +2025-10-15 06:38:32,901 - train - INFO - val_CER : 0.11396475986393424 +2025-10-15 06:38:32,902 - train - INFO - test_loss : 0.4154184039046125 +2025-10-15 06:38:32,902 - train - INFO - test_CER_(Argmax): 0.11247534045841451 +2025-10-15 06:38:32,902 - train - INFO - test_WER_(Argmax): 0.3328717059031543 +2025-10-15 06:38:32,902 - train - INFO - test_WER : 0.3328717059031543 +2025-10-15 06:38:32,902 - train - INFO - test_CER : 0.1123930726890102 +2025-10-15 06:42:35,159 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 06:49:28,757 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 06:59:48,120 - train - INFO - epoch : 43 +2025-10-15 06:59:48,120 - train - INFO - loss : 0.15414640687406064 +2025-10-15 06:59:48,121 - train - INFO - grad_norm : 0.2877813006937504 +2025-10-15 06:59:48,121 - train - INFO - val_loss : 0.4414702657391043 +2025-10-15 06:59:48,121 - train - INFO - val_CER_(Argmax): 0.12070044864926999 +2025-10-15 06:59:48,121 - train - INFO - val_WER_(Argmax): 0.3537302879447558 +2025-10-15 06:59:48,121 - train - INFO - val_WER : 0.3537302879447558 +2025-10-15 06:59:48,121 - train - INFO - val_CER : 0.12068197627972231 +2025-10-15 06:59:48,121 - train - INFO - test_loss : 0.4438759840843154 +2025-10-15 06:59:48,121 - train - INFO - test_CER_(Argmax): 0.11862870450615648 +2025-10-15 06:59:48,121 - train - INFO - test_WER_(Argmax): 0.34733577061982585 +2025-10-15 06:59:48,121 - train - INFO - test_WER : 0.34733577061982585 +2025-10-15 06:59:48,122 - train - INFO - test_CER : 0.11862613781757472 +2025-10-15 07:20:43,415 - train - INFO - epoch : 44 +2025-10-15 07:20:43,415 - train - INFO - loss : 0.1363305367529392 +2025-10-15 07:20:43,416 - train - INFO - grad_norm : 0.2479817882925272 +2025-10-15 07:20:43,416 - train - INFO - val_loss : 0.4520348629530738 +2025-10-15 07:20:43,416 - train - INFO - val_CER_(Argmax): 0.11507168537958189 +2025-10-15 07:20:43,416 - train - INFO - val_WER_(Argmax): 0.3395088421236314 +2025-10-15 07:20:43,416 - train - INFO - val_WER : 0.3395088421236314 +2025-10-15 07:20:43,417 - train - INFO - val_CER : 0.11507427832449559 +2025-10-15 07:20:43,417 - train - INFO - test_loss : 0.4543668962106472 +2025-10-15 07:20:43,417 - train - INFO - test_CER_(Argmax): 0.11321039333306016 +2025-10-15 07:20:43,417 - train - INFO - test_WER_(Argmax): 0.3319809926281677 +2025-10-15 07:20:43,417 - train - INFO - test_WER : 0.3319809926281677 +2025-10-15 07:20:43,417 - train - INFO - test_CER : 0.1131822929806515 +2025-10-15 07:20:51,021 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 07:23:10,474 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 07:41:51,854 - train - INFO - epoch : 45 +2025-10-15 07:41:51,855 - train - INFO - loss : 0.13987333193421364 +2025-10-15 07:41:51,855 - train - INFO - grad_norm : 0.26614324234426023 +2025-10-15 07:41:51,855 - train - INFO - val_loss : 0.4230976984781377 +2025-10-15 07:41:51,855 - train - INFO - val_CER_(Argmax): 0.11541515342609154 +2025-10-15 07:41:51,855 - train - INFO - val_WER_(Argmax): 0.34157605694474263 +2025-10-15 07:41:51,855 - train - INFO - val_WER : 0.34157605694474263 +2025-10-15 07:41:51,856 - train - INFO - val_CER : 0.11539626185146624 +2025-10-15 07:41:51,856 - train - INFO - test_loss : 0.4274728712512226 +2025-10-15 07:41:51,856 - train - INFO - test_CER_(Argmax): 0.11464870047179306 +2025-10-15 07:41:51,856 - train - INFO - test_WER_(Argmax): 0.3363408865261514 +2025-10-15 07:41:51,856 - train - INFO - test_WER : 0.3363408865261514 +2025-10-15 07:41:51,856 - train - INFO - test_CER : 0.11463597320095606 +2025-10-15 07:41:52,350 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch45.pth ... +2025-10-15 07:49:37,815 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 08:03:12,872 - train - INFO - epoch : 46 +2025-10-15 08:03:12,873 - train - INFO - loss : 0.15541664376180975 +2025-10-15 08:03:12,873 - train - INFO - grad_norm : 0.30704583459763074 +2025-10-15 08:03:12,873 - train - INFO - val_loss : 0.4262906141140882 +2025-10-15 08:03:12,873 - train - INFO - val_CER_(Argmax): 0.10697360241698037 +2025-10-15 08:03:12,873 - train - INFO - val_WER_(Argmax): 0.31894312216255877 +2025-10-15 08:03:12,873 - train - INFO - val_WER : 0.31894312216255877 +2025-10-15 08:03:12,873 - train - INFO - val_CER : 0.10697432792093566 +2025-10-15 08:03:12,874 - train - INFO - test_loss : 0.4236972459205767 +2025-10-15 08:03:12,874 - train - INFO - test_CER_(Argmax): 0.1036966429887784 +2025-10-15 08:03:12,874 - train - INFO - test_WER_(Argmax): 0.31260322305849975 +2025-10-15 08:03:12,874 - train - INFO - test_WER : 0.31260322305849975 +2025-10-15 08:03:12,874 - train - INFO - test_CER : 0.10368049391363919 +2025-10-15 08:03:13,468 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 08:11:04,939 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 08:14:33,659 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 08:24:28,330 - train - INFO - epoch : 47 +2025-10-15 08:24:28,331 - train - INFO - loss : 0.1484343280369912 +2025-10-15 08:24:28,331 - train - INFO - grad_norm : 0.2978131380212966 +2025-10-15 08:24:28,331 - train - INFO - val_loss : 0.41686498592881593 +2025-10-15 08:24:28,331 - train - INFO - val_CER_(Argmax): 0.10715906838873794 +2025-10-15 08:24:28,331 - train - INFO - val_WER_(Argmax): 0.3189208901687594 +2025-10-15 08:24:28,331 - train - INFO - val_WER : 0.3189208901687594 +2025-10-15 08:24:28,331 - train - INFO - val_CER : 0.1070721680237797 +2025-10-15 08:24:28,331 - train - INFO - test_loss : 0.4248736776956698 +2025-10-15 08:24:28,332 - train - INFO - test_CER_(Argmax): 0.10606248870942041 +2025-10-15 08:24:28,332 - train - INFO - test_WER_(Argmax): 0.31392775456406125 +2025-10-15 08:24:28,332 - train - INFO - test_WER : 0.31392775456406125 +2025-10-15 08:24:28,332 - train - INFO - test_CER : 0.10608059527809617 +2025-10-15 08:24:28,964 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 08:45:42,227 - train - INFO - epoch : 48 +2025-10-15 08:45:42,227 - train - INFO - loss : 0.1307941133156419 +2025-10-15 08:45:42,228 - train - INFO - grad_norm : 0.27077246375381947 +2025-10-15 08:45:42,228 - train - INFO - val_loss : 0.42694155994583577 +2025-10-15 08:45:42,228 - train - INFO - val_CER_(Argmax): 0.10582719959543849 +2025-10-15 08:45:42,228 - train - INFO - val_WER_(Argmax): 0.31559900742699704 +2025-10-15 08:45:42,228 - train - INFO - val_WER : 0.31559900742699704 +2025-10-15 08:45:42,228 - train - INFO - val_CER : 0.10572084591343127 +2025-10-15 08:45:42,228 - train - INFO - test_loss : 0.43384504790713146 +2025-10-15 08:45:42,228 - train - INFO - test_CER_(Argmax): 0.10381538597811517 +2025-10-15 08:45:42,228 - train - INFO - test_WER_(Argmax): 0.311420565217544 +2025-10-15 08:45:42,229 - train - INFO - test_WER : 0.311420565217544 +2025-10-15 08:45:42,229 - train - INFO - test_CER : 0.10370990247742001 +2025-10-15 08:45:42,872 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 08:50:42,467 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 08:53:44,773 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 09:07:09,369 - train - INFO - epoch : 49 +2025-10-15 09:07:09,371 - train - INFO - loss : 0.10918797556329612 +2025-10-15 09:07:09,371 - train - INFO - grad_norm : 0.25354656966487366 +2025-10-15 09:07:09,371 - train - INFO - val_loss : 0.45092080375727484 +2025-10-15 09:07:09,371 - train - INFO - val_CER_(Argmax): 0.10998150896043028 +2025-10-15 09:07:09,371 - train - INFO - val_WER_(Argmax): 0.3251663764785473 +2025-10-15 09:07:09,372 - train - INFO - val_WER : 0.3251663764785473 +2025-10-15 09:07:09,372 - train - INFO - val_CER : 0.10883912407170318 +2025-10-15 09:07:09,372 - train - INFO - test_loss : 0.45026707503853775 +2025-10-15 09:07:09,372 - train - INFO - test_CER_(Argmax): 0.1083685399307039 +2025-10-15 09:07:09,372 - train - INFO - test_WER_(Argmax): 0.31903264381702373 +2025-10-15 09:07:09,373 - train - INFO - test_WER : 0.31903264381702373 +2025-10-15 09:07:09,373 - train - INFO - test_CER : 0.10773803375770985 +2025-10-15 09:28:10,598 - train - INFO - epoch : 50 +2025-10-15 09:28:10,598 - train - INFO - loss : 0.10687003966420888 +2025-10-15 09:28:10,598 - train - INFO - grad_norm : 0.2543030245602131 +2025-10-15 09:28:10,599 - train - INFO - val_loss : 0.45245280335931215 +2025-10-15 09:28:10,599 - train - INFO - val_CER_(Argmax): 0.10794419929359747 +2025-10-15 09:28:10,599 - train - INFO - val_WER_(Argmax): 0.319912166248811 +2025-10-15 09:28:10,599 - train - INFO - val_WER : 0.319912166248811 +2025-10-15 09:28:10,599 - train - INFO - val_CER : 0.10779546715487702 +2025-10-15 09:28:10,599 - train - INFO - test_loss : 0.44923570715799566 +2025-10-15 09:28:10,599 - train - INFO - test_CER_(Argmax): 0.10577866073952201 +2025-10-15 09:28:10,599 - train - INFO - test_WER_(Argmax): 0.3148882348303448 +2025-10-15 09:28:10,599 - train - INFO - test_WER : 0.3148882348303448 +2025-10-15 09:28:10,599 - train - INFO - test_CER : 0.10566427550858948 +2025-10-15 09:28:11,086 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch50.pth ... +2025-10-15 09:31:04,615 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 09:33:16,245 - train - INFO - Saving model on keyboard interrupt +2025-10-15 09:33:16,720 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch51.pth ... +2025-10-15 21:28:51,805 - train - INFO - ConformerModel( + (linear_layer_1): Linear(in_features=128, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (conformer_blocks): Sequential( + (0): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (1): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (2): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (3): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (4): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (5): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (6): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (7): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (8): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (9): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + ) + (linear_layer_2): Linear(in_features=256, out_features=28, bias=True) +) +2025-10-15 21:28:51,826 - train - INFO - Loading checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/conformer_30m/checkpoint-epoch51.pth ... +2025-10-15 21:29:48,755 - train - INFO - ConformerModel( + (linear_layer_1): Linear(in_features=128, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (conformer_blocks): Sequential( + (0): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (1): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (2): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (3): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (4): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (5): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (6): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (7): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (8): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (9): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + ) + (linear_layer_2): Linear(in_features=256, out_features=28, bias=True) +) +2025-10-15 21:29:48,772 - train - INFO - Loading checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch51.pth ... +2025-10-15 21:29:49,382 - train - WARNING - Warning: Optimizer or lr_scheduler given in the config file is different from that of the checkpoint. Optimizer and scheduler parameters are not resumed. +2025-10-15 21:29:49,383 - train - INFO - Checkpoint loaded. Resume training from epoch 52 +2025-10-15 21:32:46,047 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 21:36:14,907 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 21:36:28,019 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 21:37:11,355 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 21:37:28,040 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 21:38:05,327 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 21:38:37,856 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 21:59:34,376 - train - INFO - ConformerModel( + (linear_layer_1): Linear(in_features=128, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (conformer_blocks): Sequential( + (0): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (1): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (2): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (3): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (4): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (5): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (6): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (7): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (8): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (9): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + ) + (linear_layer_2): Linear(in_features=256, out_features=28, bias=True) +) +2025-10-15 21:59:34,392 - train - INFO - Loading checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch51.pth ... +2025-10-15 21:59:34,755 - train - WARNING - Warning: Optimizer or lr_scheduler given in the config file is different from that of the checkpoint. Optimizer and scheduler parameters are not resumed. +2025-10-15 21:59:34,756 - train - INFO - Checkpoint loaded. Resume training from epoch 52 +2025-10-15 22:02:34,318 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:06:06,199 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:06:18,792 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:07:20,904 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:07:57,912 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:08:30,148 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:09:56,709 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:10:56,816 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:18:26,896 - train - INFO - epoch : 52 +2025-10-15 22:18:26,897 - train - INFO - loss : 0.5125516548677145 +2025-10-15 22:18:26,897 - train - INFO - grad_norm : 0.5027066829240867 +2025-10-15 22:18:26,897 - train - INFO - val_loss : 0.3037505878524466 +2025-10-15 22:18:26,897 - train - INFO - val_CER_(Argmax): 0.09129654682855147 +2025-10-15 22:18:26,897 - train - INFO - val_WER_(Argmax): 0.28161343918875303 +2025-10-15 22:18:26,897 - train - INFO - val_WER : 0.28161343918875303 +2025-10-15 22:18:26,897 - train - INFO - val_CER : 0.09130862404528663 +2025-10-15 22:18:26,897 - train - INFO - test_loss : 0.30807594836435537 +2025-10-15 22:18:26,897 - train - INFO - test_CER_(Argmax): 0.09054438299605952 +2025-10-15 22:18:26,898 - train - INFO - test_WER_(Argmax): 0.2782349555610264 +2025-10-15 22:18:26,898 - train - INFO - test_WER : 0.2782349555610264 +2025-10-15 22:18:26,898 - train - INFO - test_CER : 0.09054140648346386 +2025-10-15 22:18:27,452 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 22:18:34,798 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:19:35,681 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:20:45,068 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:21:17,097 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:28:13,972 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:28:43,629 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:37:01,553 - train - INFO - epoch : 53 +2025-10-15 22:37:01,554 - train - INFO - loss : 0.4767994123697281 +2025-10-15 22:37:01,554 - train - INFO - grad_norm : 0.5589224849641323 +2025-10-15 22:37:01,554 - train - INFO - val_loss : 0.287550592979232 +2025-10-15 22:37:01,555 - train - INFO - val_CER_(Argmax): 0.08663647521630864 +2025-10-15 22:37:01,555 - train - INFO - val_WER_(Argmax): 0.26653980578276465 +2025-10-15 22:37:01,555 - train - INFO - val_WER : 0.26653980578276465 +2025-10-15 22:37:01,555 - train - INFO - val_CER : 0.08663647998507391 +2025-10-15 22:37:01,555 - train - INFO - test_loss : 0.29402880726212804 +2025-10-15 22:37:01,555 - train - INFO - test_CER_(Argmax): 0.08634445495520625 +2025-10-15 22:37:01,555 - train - INFO - test_WER_(Argmax): 0.26760766956081206 +2025-10-15 22:37:01,555 - train - INFO - test_WER : 0.26760766956081206 +2025-10-15 22:37:01,555 - train - INFO - test_CER : 0.0863314313346999 +2025-10-15 22:37:02,160 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 22:38:36,055 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:41:57,959 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:42:05,218 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:47:32,830 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:55:34,476 - train - INFO - epoch : 54 +2025-10-15 22:55:34,476 - train - INFO - loss : 0.4517164458334446 +2025-10-15 22:55:34,476 - train - INFO - grad_norm : 0.6237075063586235 +2025-10-15 22:55:34,476 - train - INFO - val_loss : 0.28015144562328254 +2025-10-15 22:55:34,476 - train - INFO - val_CER_(Argmax): 0.08501164964665332 +2025-10-15 22:55:34,477 - train - INFO - val_WER_(Argmax): 0.26355578235185695 +2025-10-15 22:55:34,477 - train - INFO - val_WER : 0.26355578235185695 +2025-10-15 22:55:34,477 - train - INFO - val_CER : 0.08501273757687683 +2025-10-15 22:55:34,477 - train - INFO - test_loss : 0.28677379729395563 +2025-10-15 22:55:34,477 - train - INFO - test_CER_(Argmax): 0.08556362216950926 +2025-10-15 22:55:34,477 - train - INFO - test_WER_(Argmax): 0.26548957249503236 +2025-10-15 22:55:34,477 - train - INFO - test_WER : 0.26548957249503236 +2025-10-15 22:55:34,477 - train - INFO - test_CER : 0.08552682603878249 +2025-10-15 22:55:35,084 - train - INFO - Saving current best: model_best.pth ... +2025-10-15 22:58:30,490 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 22:59:07,816 - urllib3.connectionpool - WARNING - Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ReadTimeoutError("HTTPSConnectionPool(host='www.comet.com', port=443): Read timed out. (read timeout=10)")': /clientlib/rest/v2/write/experiment/output +2025-10-15 23:03:27,906 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:14:15,775 - train - INFO - epoch : 55 +2025-10-15 23:14:15,775 - train - INFO - loss : 0.43951718381900884 +2025-10-15 23:14:15,775 - train - INFO - grad_norm : 0.610138273718369 +2025-10-15 23:14:15,775 - train - INFO - val_loss : 0.28494494465681225 +2025-10-15 23:14:15,775 - train - INFO - val_CER_(Argmax): 0.08762042370466316 +2025-10-15 23:14:15,775 - train - INFO - val_WER_(Argmax): 0.26894677637977005 +2025-10-15 23:14:15,776 - train - INFO - val_WER : 0.26894677637977005 +2025-10-15 23:14:15,776 - train - INFO - val_CER : 0.08761403647985697 +2025-10-15 23:14:15,776 - train - INFO - test_loss : 0.284894541583278 +2025-10-15 23:14:15,776 - train - INFO - test_CER_(Argmax): 0.08618727251740227 +2025-10-15 23:14:15,776 - train - INFO - test_WER_(Argmax): 0.2651266555833503 +2025-10-15 23:14:15,776 - train - INFO - test_WER : 0.2651266555833503 +2025-10-15 23:14:15,776 - train - INFO - test_CER : 0.08613433873893521 +2025-10-15 23:14:16,264 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch55.pth ... +2025-10-15 23:17:14,837 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:24:01,730 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:32:58,682 - train - INFO - epoch : 56 +2025-10-15 23:32:58,682 - train - INFO - loss : 0.4343975530564785 +2025-10-15 23:32:58,682 - train - INFO - grad_norm : 0.6229039686918258 +2025-10-15 23:32:58,682 - train - INFO - val_loss : 0.29983231277911215 +2025-10-15 23:32:58,682 - train - INFO - val_CER_(Argmax): 0.08955679162254075 +2025-10-15 23:32:58,683 - train - INFO - val_WER_(Argmax): 0.27534571803696956 +2025-10-15 23:32:58,683 - train - INFO - val_WER : 0.27534571803696956 +2025-10-15 23:32:58,683 - train - INFO - val_CER : 0.08951839197980865 +2025-10-15 23:32:58,683 - train - INFO - test_loss : 0.29415979016233573 +2025-10-15 23:32:58,683 - train - INFO - test_CER_(Argmax): 0.08593879618718768 +2025-10-15 23:32:58,683 - train - INFO - test_WER_(Argmax): 0.27041946687163093 +2025-10-15 23:32:58,683 - train - INFO - test_WER : 0.27041946687163093 +2025-10-15 23:32:58,683 - train - INFO - test_CER : 0.0859091788688324 +2025-10-15 23:34:07,974 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:34:55,062 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:37:46,718 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:51:41,538 - train - INFO - epoch : 57 +2025-10-15 23:51:41,539 - train - INFO - loss : 0.44212916776537897 +2025-10-15 23:51:41,539 - train - INFO - grad_norm : 0.6302911515533924 +2025-10-15 23:51:41,539 - train - INFO - val_loss : 0.3111286821601155 +2025-10-15 23:51:41,539 - train - INFO - val_CER_(Argmax): 0.09671470586458569 +2025-10-15 23:51:41,539 - train - INFO - val_WER_(Argmax): 0.2970807443308442 +2025-10-15 23:51:41,539 - train - INFO - val_WER : 0.2970807443308442 +2025-10-15 23:51:41,539 - train - INFO - val_CER : 0.09666179815724249 +2025-10-15 23:51:41,540 - train - INFO - test_loss : 0.3062010456553914 +2025-10-15 23:51:41,540 - train - INFO - test_CER_(Argmax): 0.09478064453852167 +2025-10-15 23:51:41,540 - train - INFO - test_WER_(Argmax): 0.2917665250029616 +2025-10-15 23:51:41,540 - train - INFO - test_WER : 0.2917665250029616 +2025-10-15 23:51:41,540 - train - INFO - test_CER : 0.09473352732799264 +2025-10-15 23:52:00,536 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:52:02,784 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:52:37,963 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:53:55,377 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:54:35,227 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:55:49,514 - train - WARNING - OOM on batch. Skipping batch. +2025-10-15 23:59:59,392 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:01:41,820 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:11:38,086 - train - INFO - epoch : 58 +2025-10-16 00:11:38,087 - train - INFO - loss : 0.4711415856028322 +2025-10-16 00:11:38,088 - train - INFO - grad_norm : 0.5886639985606898 +2025-10-16 00:11:38,088 - train - INFO - val_loss : 0.3353017005291614 +2025-10-16 00:11:38,088 - train - INFO - val_CER_(Argmax): 0.10235041302713359 +2025-10-16 00:11:38,088 - train - INFO - val_WER_(Argmax): 0.3116658465074636 +2025-10-16 00:11:38,088 - train - INFO - val_WER : 0.3116658465074636 +2025-10-16 00:11:38,088 - train - INFO - val_CER : 0.1023209039028333 +2025-10-16 00:11:38,088 - train - INFO - test_loss : 0.3331021143292839 +2025-10-16 00:11:38,089 - train - INFO - test_CER_(Argmax): 0.09998714099883693 +2025-10-16 00:11:38,089 - train - INFO - test_WER_(Argmax): 0.30518309281193207 +2025-10-16 00:11:38,089 - train - INFO - test_WER : 0.30518309281193207 +2025-10-16 00:11:38,089 - train - INFO - test_CER : 0.10000657190144936 +2025-10-16 00:15:31,484 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:17:40,029 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:21:20,102 - train - INFO - Saving model on keyboard interrupt +2025-10-16 00:21:20,725 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch59.pth ... +2025-10-16 00:25:35,941 - train - INFO - ConformerModel( + (linear_layer_1): Linear(in_features=128, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (conformer_blocks): Sequential( + (0): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (1): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (2): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (3): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (4): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (5): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (6): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (7): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (8): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (9): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + ) + (linear_layer_2): Linear(in_features=256, out_features=28, bias=True) +) +2025-10-16 00:25:35,957 - train - INFO - Loading checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch55.pth ... +2025-10-16 00:25:36,354 - train - WARNING - Warning: Optimizer or lr_scheduler given in the config file is different from that of the checkpoint. Optimizer and scheduler parameters are not resumed. +2025-10-16 00:25:36,354 - train - INFO - Checkpoint loaded. Resume training from epoch 56 +2025-10-16 00:28:38,563 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:32:12,325 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:32:25,198 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:33:27,019 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:34:03,944 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:34:36,076 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:36:03,448 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:37:05,549 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:45:33,019 - train - INFO - epoch : 56 +2025-10-16 00:45:33,019 - train - INFO - loss : 0.3975452154420959 +2025-10-16 00:45:33,020 - train - INFO - grad_norm : 0.5853364494851399 +2025-10-16 00:45:33,020 - train - INFO - val_loss : 0.26601492441617525 +2025-10-16 00:45:33,020 - train - INFO - val_CER_(Argmax): 0.08095286654242909 +2025-10-16 00:45:33,020 - train - INFO - val_WER_(Argmax): 0.25238181992836795 +2025-10-16 00:45:33,020 - train - INFO - val_WER : 0.25238181992836795 +2025-10-16 00:45:33,020 - train - INFO - val_CER : 0.08094647931762287 +2025-10-16 00:45:33,020 - train - INFO - test_loss : 0.2680981419980526 +2025-10-16 00:45:33,020 - train - INFO - test_CER_(Argmax): 0.08014114209410332 +2025-10-16 00:45:33,020 - train - INFO - test_WER_(Argmax): 0.2506958176100273 +2025-10-16 00:45:33,021 - train - INFO - test_WER : 0.2506958176100273 +2025-10-16 00:45:33,021 - train - INFO - test_CER : 0.08013405791041706 +2025-10-16 00:45:33,634 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 00:45:40,678 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:46:41,544 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:47:51,335 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:48:23,005 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 00:55:25,594 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:05:45,591 - train - INFO - epoch : 57 +2025-10-16 01:05:45,591 - train - INFO - loss : 0.3919293682277203 +2025-10-16 01:05:45,591 - train - INFO - grad_norm : 0.5571760645508766 +2025-10-16 01:05:45,592 - train - INFO - val_loss : 0.26085271825502204 +2025-10-16 01:05:45,592 - train - INFO - val_CER_(Argmax): 0.07941454723878524 +2025-10-16 01:05:45,592 - train - INFO - val_WER_(Argmax): 0.24847200659684884 +2025-10-16 01:05:45,592 - train - INFO - val_WER : 0.24847200659684884 +2025-10-16 01:05:45,592 - train - INFO - val_CER : 0.07940334027231717 +2025-10-16 01:05:45,592 - train - INFO - test_loss : 0.2639189895919778 +2025-10-16 01:05:45,592 - train - INFO - test_CER_(Argmax): 0.07906222839036091 +2025-10-16 01:05:45,592 - train - INFO - test_WER_(Argmax): 0.2467830935703038 +2025-10-16 01:05:45,592 - train - INFO - test_WER : 0.2467830935703038 +2025-10-16 01:05:45,592 - train - INFO - test_CER : 0.07904224464832055 +2025-10-16 01:05:46,206 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 01:07:22,051 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:08:02,388 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:10:48,748 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:10:56,086 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:16:24,729 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:25:47,709 - train - INFO - epoch : 58 +2025-10-16 01:25:47,709 - train - INFO - loss : 0.36956372365355494 +2025-10-16 01:25:47,710 - train - INFO - grad_norm : 0.59020700648427 +2025-10-16 01:25:47,710 - train - INFO - val_loss : 0.2571919283696583 +2025-10-16 01:25:47,710 - train - INFO - val_CER_(Argmax): 0.07819992281237179 +2025-10-16 01:25:47,710 - train - INFO - val_WER_(Argmax): 0.24590658841056448 +2025-10-16 01:25:47,710 - train - INFO - val_WER : 0.24590658841056448 +2025-10-16 01:25:47,710 - train - INFO - val_CER : 0.07818871584590371 +2025-10-16 01:25:47,710 - train - INFO - test_loss : 0.26166995834897866 +2025-10-16 01:25:47,710 - train - INFO - test_CER_(Argmax): 0.07749860080767827 +2025-10-16 01:25:47,710 - train - INFO - test_WER_(Argmax): 0.24307131531532164 +2025-10-16 01:25:47,710 - train - INFO - test_WER : 0.24307131531532164 +2025-10-16 01:25:47,711 - train - INFO - test_CER : 0.0774786170656379 +2025-10-16 01:25:48,325 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 01:28:44,667 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:31:32,043 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:33:48,273 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:46:04,253 - train - INFO - epoch : 59 +2025-10-16 01:46:04,253 - train - INFO - loss : 0.33486752002382997 +2025-10-16 01:46:04,254 - train - INFO - grad_norm : 0.5633255804902945 +2025-10-16 01:46:04,254 - train - INFO - val_loss : 0.25902849910678444 +2025-10-16 01:46:04,254 - train - INFO - val_CER_(Argmax): 0.07788373600524053 +2025-10-16 01:46:04,254 - train - INFO - val_WER_(Argmax): 0.24356013677092545 +2025-10-16 01:46:04,254 - train - INFO - val_WER : 0.24356013677092545 +2025-10-16 01:46:04,254 - train - INFO - val_CER : 0.07787252903877245 +2025-10-16 01:46:04,254 - train - INFO - test_loss : 0.26355371505699376 +2025-10-16 01:46:04,254 - train - INFO - test_CER_(Argmax): 0.07704687640540066 +2025-10-16 01:46:04,254 - train - INFO - test_WER_(Argmax): 0.24105306456868927 +2025-10-16 01:46:04,255 - train - INFO - test_WER : 0.24105306456868927 +2025-10-16 01:46:04,255 - train - INFO - test_CER : 0.07702689266336028 +2025-10-16 01:46:04,866 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 01:49:04,404 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:55:55,381 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 01:55:55,714 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:06:10,617 - train - INFO - epoch : 60 +2025-10-16 02:06:10,617 - train - INFO - loss : 0.38520306527614595 +2025-10-16 02:06:10,617 - train - INFO - grad_norm : 0.6394448788464069 +2025-10-16 02:06:10,618 - train - INFO - val_loss : 0.2523955891093055 +2025-10-16 02:06:10,618 - train - INFO - val_CER_(Argmax): 0.07642980123388085 +2025-10-16 02:06:10,618 - train - INFO - val_WER_(Argmax): 0.24021242375268156 +2025-10-16 02:06:10,618 - train - INFO - val_WER : 0.24021242375268156 +2025-10-16 02:06:10,618 - train - INFO - val_CER : 0.07641859426741279 +2025-10-16 02:06:10,618 - train - INFO - test_loss : 0.2567285626110705 +2025-10-16 02:06:10,618 - train - INFO - test_CER_(Argmax): 0.07615406577241463 +2025-10-16 02:06:10,618 - train - INFO - test_WER_(Argmax): 0.23853556294885694 +2025-10-16 02:06:10,619 - train - INFO - test_WER : 0.23853556294885694 +2025-10-16 02:06:10,619 - train - INFO - test_CER : 0.07614019151229019 +2025-10-16 02:06:11,228 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 02:06:43,709 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:07:21,268 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:08:09,949 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:11:04,696 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:26:31,842 - train - INFO - epoch : 61 +2025-10-16 02:26:31,844 - train - INFO - loss : 0.36128015503287314 +2025-10-16 02:26:31,844 - train - INFO - grad_norm : 0.6772882167994976 +2025-10-16 02:26:31,844 - train - INFO - val_loss : 0.25020998798228883 +2025-10-16 02:26:31,844 - train - INFO - val_CER_(Argmax): 0.07584409640850495 +2025-10-16 02:26:31,844 - train - INFO - val_WER_(Argmax): 0.23786885936936988 +2025-10-16 02:26:31,845 - train - INFO - val_WER : 0.23786885936936988 +2025-10-16 02:26:31,845 - train - INFO - val_CER : 0.075829165821292 +2025-10-16 02:26:31,845 - train - INFO - test_loss : 0.25460747815668583 +2025-10-16 02:26:31,845 - train - INFO - test_CER_(Argmax): 0.07530382475604616 +2025-10-16 02:26:31,845 - train - INFO - test_WER_(Argmax): 0.23598896298484998 +2025-10-16 02:26:31,845 - train - INFO - test_WER : 0.23598896298484998 +2025-10-16 02:26:31,845 - train - INFO - test_CER : 0.07528315230877163 +2025-10-16 02:26:32,993 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 02:26:52,328 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:26:54,618 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:27:29,413 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:28:46,742 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:29:27,005 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:30:43,684 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:34:56,804 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:36:39,409 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:43:32,699 - train - INFO - Saving model on keyboard interrupt +2025-10-16 02:43:33,623 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch62.pth ... +2025-10-16 02:49:17,502 - train - INFO - ConformerModel( + (linear_layer_1): Linear(in_features=128, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (conformer_blocks): Sequential( + (0): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (1): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (2): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (3): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (4): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (5): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (6): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (7): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (8): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + (9): ConformerBlock( + (feedforward): FeedForward( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (linear_1): Linear(in_features=256, out_features=1024, bias=True) + (swish): SiLU() + (dropout_1): Dropout(p=0.1, inplace=False) + (linear_2): Linear(in_features=1024, out_features=256, bias=True) + (dropout_2): Dropout(p=0.1, inplace=False) + ) + (multiheadselfattn): MultiHeadSelfAttn( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (multiheadattn): MultiheadAttention( + (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) + ) + (dropout): Dropout(p=0.1, inplace=False) + ) + (convmod): ConvolutionModule( + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + (conv_1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (GLu): GLU(dim=1) + (conv_2): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,)) + (bn): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (swish): SiLU() + (conv_3): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (dp): Dropout(p=0.1, inplace=False) + ) + (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True) + ) + ) + (linear_layer_2): Linear(in_features=256, out_features=28, bias=True) +) +2025-10-16 02:49:17,523 - train - INFO - Loading checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch62.pth ... +2025-10-16 02:49:17,913 - train - WARNING - Warning: Optimizer or lr_scheduler given in the config file is different from that of the checkpoint. Optimizer and scheduler parameters are not resumed. +2025-10-16 02:49:17,914 - train - INFO - Checkpoint loaded. Resume training from epoch 63 +2025-10-16 02:52:15,957 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:55:47,500 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:56:00,240 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:57:01,514 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:57:38,947 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:58:11,042 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 02:59:37,819 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:00:38,610 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:01:03,954 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:02:05,006 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:03:14,868 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:03:46,604 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:12:15,075 - train - INFO - epoch : 63 +2025-10-16 03:12:15,075 - train - INFO - loss : 0.3650146044299082 +2025-10-16 03:12:15,076 - train - INFO - grad_norm : 0.732061750695185 +2025-10-16 03:12:15,076 - train - INFO - val_loss : 0.25124785574999725 +2025-10-16 03:12:15,076 - train - INFO - val_CER_(Argmax): 0.0746809727347122 +2025-10-16 03:12:15,076 - train - INFO - val_WER_(Argmax): 0.2340455252925752 +2025-10-16 03:12:15,077 - train - INFO - val_WER : 0.2340455252925752 +2025-10-16 03:12:15,077 - train - INFO - val_CER : 0.07466787604501343 +2025-10-16 03:12:15,077 - train - INFO - test_loss : 0.6272238085464555 +2025-10-16 03:12:15,077 - train - INFO - test_CER_(Argmax): 0.18450640920311695 +2025-10-16 03:12:15,077 - train - INFO - test_WER_(Argmax): 0.45838625598859056 +2025-10-16 03:12:15,078 - train - INFO - test_WER : 0.45838625598859056 +2025-10-16 03:12:15,078 - train - INFO - test_CER : 0.1844315619994462 +2025-10-16 03:12:16,653 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 03:18:39,991 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:21:29,785 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:21:50,742 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:22:29,530 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:25:13,780 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:25:20,946 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:35:12,338 - train - INFO - epoch : 64 +2025-10-16 03:35:12,339 - train - INFO - loss : 0.34100347862701225 +2025-10-16 03:35:12,339 - train - INFO - grad_norm : 0.6329810723210826 +2025-10-16 03:35:12,339 - train - INFO - val_loss : 0.24926169436763634 +2025-10-16 03:35:12,339 - train - INFO - val_CER_(Argmax): 0.07435775967825689 +2025-10-16 03:35:12,339 - train - INFO - val_WER_(Argmax): 0.23329625474160692 +2025-10-16 03:35:12,339 - train - INFO - val_WER : 0.23329625474160692 +2025-10-16 03:35:12,339 - train - INFO - val_CER : 0.07438597296022 +2025-10-16 03:35:12,340 - train - INFO - test_loss : 0.6233528654794304 +2025-10-16 03:35:12,340 - train - INFO - test_CER_(Argmax): 0.18366725143983528 +2025-10-16 03:35:12,340 - train - INFO - test_WER_(Argmax): 0.45719180165705925 +2025-10-16 03:35:12,340 - train - INFO - test_WER : 0.45719180165705925 +2025-10-16 03:35:12,340 - train - INFO - test_CER : 0.18355957764546682 +2025-10-16 03:35:12,971 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 03:38:54,738 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:42:35,097 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:45:20,154 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:47:35,186 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:48:14,719 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:49:30,920 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 03:58:14,580 - train - INFO - epoch : 65 +2025-10-16 03:58:14,581 - train - INFO - loss : 0.31584615042113295 +2025-10-16 03:58:14,581 - train - INFO - grad_norm : 0.5978777054885421 +2025-10-16 03:58:14,581 - train - INFO - val_loss : 0.2521649480543353 +2025-10-16 03:58:14,581 - train - INFO - val_CER_(Argmax): 0.07417007913966524 +2025-10-16 03:58:14,582 - train - INFO - val_WER_(Argmax): 0.23324526469575566 +2025-10-16 03:58:14,582 - train - INFO - val_WER : 0.23324526469575566 +2025-10-16 03:58:14,582 - train - INFO - val_CER : 0.07415698244996646 +2025-10-16 03:58:14,582 - train - INFO - test_loss : 0.6326038317412747 +2025-10-16 03:58:14,583 - train - INFO - test_CER_(Argmax): 0.1823612314736419 +2025-10-16 03:58:14,583 - train - INFO - test_WER_(Argmax): 0.4545763313591499 +2025-10-16 03:58:14,583 - train - INFO - test_WER : 0.4545763313591499 +2025-10-16 03:58:14,583 - train - INFO - test_CER : 0.18233351503535009 +2025-10-16 03:58:15,691 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 04:02:20,768 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:04:19,315 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:09:07,248 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:09:07,646 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:09:21,702 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:11:07,133 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:11:42,931 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:21:07,208 - train - INFO - epoch : 66 +2025-10-16 04:21:07,210 - train - INFO - loss : 0.35260840369896457 +2025-10-16 04:21:07,210 - train - INFO - grad_norm : 0.6941393242038861 +2025-10-16 04:21:07,210 - train - INFO - val_loss : 0.24919554303315553 +2025-10-16 04:21:07,211 - train - INFO - val_CER_(Argmax): 0.07394414424173303 +2025-10-16 04:21:07,211 - train - INFO - val_WER_(Argmax): 0.2325940054693939 +2025-10-16 04:21:07,211 - train - INFO - val_WER : 0.2325940054693939 +2025-10-16 04:21:07,211 - train - INFO - val_CER : 0.07393104755203425 +2025-10-16 04:21:07,211 - train - INFO - test_loss : 0.6230674550241354 +2025-10-16 04:21:07,212 - train - INFO - test_CER_(Argmax): 0.18231329163386267 +2025-10-16 04:21:07,212 - train - INFO - test_WER_(Argmax): 0.4550589076913371 +2025-10-16 04:21:07,212 - train - INFO - test_WER : 0.4550589076913371 +2025-10-16 04:21:07,212 - train - INFO - test_CER : 0.18230684880807213 +2025-10-16 04:21:08,325 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 04:22:33,557 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:27:24,290 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:27:45,211 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:30:06,656 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:33:36,847 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:34:10,539 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:34:23,483 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:34:37,393 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:43:57,489 - train - INFO - epoch : 67 +2025-10-16 04:43:57,489 - train - INFO - loss : 0.3447438172843991 +2025-10-16 04:43:57,489 - train - INFO - grad_norm : 0.6474652308590558 +2025-10-16 04:43:57,490 - train - INFO - val_loss : 0.24890769882635636 +2025-10-16 04:43:57,490 - train - INFO - val_CER_(Argmax): 0.07402464042128627 +2025-10-16 04:43:57,490 - train - INFO - val_WER_(Argmax): 0.23276514236104118 +2025-10-16 04:43:57,490 - train - INFO - val_WER : 0.23276514236104118 +2025-10-16 04:43:57,490 - train - INFO - val_CER : 0.07401007745592768 +2025-10-16 04:43:57,490 - train - INFO - test_loss : 0.6215816164503292 +2025-10-16 04:43:57,490 - train - INFO - test_CER_(Argmax): 0.18244981545202055 +2025-10-16 04:43:57,490 - train - INFO - test_WER_(Argmax): 0.4545860848775349 +2025-10-16 04:43:57,490 - train - INFO - test_WER : 0.4545860848775349 +2025-10-16 04:43:57,490 - train - INFO - test_CER : 0.18234336381594599 +2025-10-16 04:48:59,867 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:49:49,544 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:52:50,756 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:54:56,214 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:56:32,796 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 04:56:41,395 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:06:56,800 - train - INFO - epoch : 68 +2025-10-16 05:06:56,801 - train - INFO - loss : 0.34905940896332865 +2025-10-16 05:06:56,801 - train - INFO - grad_norm : 0.6677498564575658 +2025-10-16 05:06:56,801 - train - INFO - val_loss : 0.24816725386137312 +2025-10-16 05:06:56,801 - train - INFO - val_CER_(Argmax): 0.07392974273734511 +2025-10-16 05:06:56,801 - train - INFO - val_WER_(Argmax): 0.23159585286071424 +2025-10-16 05:06:56,802 - train - INFO - val_WER : 0.23159585286071424 +2025-10-16 05:06:56,802 - train - INFO - val_CER : 0.07391517977198651 +2025-10-16 05:06:56,802 - train - INFO - test_loss : 0.6195810704815145 +2025-10-16 05:06:56,802 - train - INFO - test_CER_(Argmax): 0.18233763775457032 +2025-10-16 05:06:56,802 - train - INFO - test_WER_(Argmax): 0.4545479979074868 +2025-10-16 05:06:56,802 - train - INFO - test_WER : 0.4545479979074868 +2025-10-16 05:06:56,802 - train - INFO - test_CER : 0.1823116928582287 +2025-10-16 05:06:57,368 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 05:07:50,812 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:09:15,900 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:09:28,324 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:12:10,737 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:12:15,674 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:15:57,493 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:17:11,071 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:17:28,170 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:18:48,935 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:19:16,909 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:20:11,772 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:20:34,940 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:29:55,155 - train - INFO - epoch : 69 +2025-10-16 05:29:55,156 - train - INFO - loss : 0.34450486673013814 +2025-10-16 05:29:55,156 - train - INFO - grad_norm : 0.6989231766177918 +2025-10-16 05:29:55,157 - train - INFO - val_loss : 0.24752564846792005 +2025-10-16 05:29:55,157 - train - INFO - val_CER_(Argmax): 0.07375343535866713 +2025-10-16 05:29:55,157 - train - INFO - val_WER_(Argmax): 0.23090194026229824 +2025-10-16 05:29:55,157 - train - INFO - val_WER : 0.23090194026229824 +2025-10-16 05:29:55,157 - train - INFO - val_CER : 0.07373887239330854 +2025-10-16 05:29:55,157 - train - INFO - test_loss : 0.6168282333077216 +2025-10-16 05:29:55,157 - train - INFO - test_CER_(Argmax): 0.18170050435403806 +2025-10-16 05:29:55,157 - train - INFO - test_WER_(Argmax): 0.45372425677781314 +2025-10-16 05:29:55,157 - train - INFO - test_WER : 0.45372425677781314 +2025-10-16 05:29:55,157 - train - INFO - test_CER : 0.1816881820827073 +2025-10-16 05:29:55,750 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 05:31:10,506 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:32:32,422 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:37:27,413 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:38:39,694 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:39:33,806 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:39:38,992 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:40:03,551 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:43:34,425 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:52:58,744 - train - INFO - epoch : 70 +2025-10-16 05:52:58,745 - train - INFO - loss : 0.3450617952644825 +2025-10-16 05:52:58,745 - train - INFO - grad_norm : 0.6964107640087605 +2025-10-16 05:52:58,745 - train - INFO - val_loss : 0.2472096932205287 +2025-10-16 05:52:58,745 - train - INFO - val_CER_(Argmax): 0.07284972328502712 +2025-10-16 05:52:58,745 - train - INFO - val_WER_(Argmax): 0.22901276614441682 +2025-10-16 05:52:58,745 - train - INFO - val_WER : 0.22901276614441682 +2025-10-16 05:52:58,745 - train - INFO - val_CER : 0.07284273607724429 +2025-10-16 05:52:58,746 - train - INFO - test_loss : 0.6180139366461306 +2025-10-16 05:52:58,746 - train - INFO - test_CER_(Argmax): 0.1804781446442257 +2025-10-16 05:52:58,746 - train - INFO - test_WER_(Argmax): 0.45284049314400276 +2025-10-16 05:52:58,746 - train - INFO - test_WER : 0.45284049314400276 +2025-10-16 05:52:58,746 - train - INFO - test_CER : 0.1804666251609065 +2025-10-16 05:52:59,372 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 05:56:08,152 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:59:33,037 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 05:59:56,321 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:05:25,117 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:07:33,143 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:08:00,984 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:16:10,784 - train - INFO - epoch : 71 +2025-10-16 06:16:10,784 - train - INFO - loss : 0.34330349164692003 +2025-10-16 06:16:10,784 - train - INFO - grad_norm : 0.7429921634532698 +2025-10-16 06:16:10,785 - train - INFO - val_loss : 0.2456764829429713 +2025-10-16 06:16:10,785 - train - INFO - val_CER_(Argmax): 0.07272585291214713 +2025-10-16 06:16:10,785 - train - INFO - val_WER_(Argmax): 0.2281453037552794 +2025-10-16 06:16:10,785 - train - INFO - val_WER : 0.2281453037552794 +2025-10-16 06:16:10,785 - train - INFO - val_CER : 0.07276095324645183 +2025-10-16 06:16:10,785 - train - INFO - test_loss : 0.6153945196039823 +2025-10-16 06:16:10,785 - train - INFO - test_CER_(Argmax): 0.18028797138951858 +2025-10-16 06:16:10,785 - train - INFO - test_WER_(Argmax): 0.45096762978426175 +2025-10-16 06:16:10,785 - train - INFO - test_WER : 0.45096762978426175 +2025-10-16 06:16:10,785 - train - INFO - test_CER : 0.18027404470283673 +2025-10-16 06:16:11,370 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 06:22:36,692 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:22:50,342 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:24:49,221 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:25:27,796 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:27:48,359 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:29:38,242 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:39:03,837 - train - INFO - epoch : 72 +2025-10-16 06:39:03,838 - train - INFO - loss : 0.339138802103322 +2025-10-16 06:39:03,838 - train - INFO - grad_norm : 0.7133045122779981 +2025-10-16 06:39:03,838 - train - INFO - val_loss : 0.2443471114066514 +2025-10-16 06:39:03,839 - train - INFO - val_CER_(Argmax): 0.07240100881127012 +2025-10-16 06:39:03,839 - train - INFO - val_WER_(Argmax): 0.22696484163654596 +2025-10-16 06:39:03,839 - train - INFO - val_WER : 0.22696484163654596 +2025-10-16 06:39:03,839 - train - INFO - val_CER : 0.07238644584591152 +2025-10-16 06:39:03,839 - train - INFO - test_loss : 0.612513683888377 +2025-10-16 06:39:03,840 - train - INFO - test_CER_(Argmax): 0.1791687001827859 +2025-10-16 06:39:03,840 - train - INFO - test_WER_(Argmax): 0.4488865730597953 +2025-10-16 06:39:03,840 - train - INFO - test_WER : 0.4488865730597953 +2025-10-16 06:39:03,840 - train - INFO - test_CER : 0.17915718069946662 +2025-10-16 06:39:04,972 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 06:41:02,810 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:45:51,176 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:47:05,013 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:49:14,158 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 06:49:39,422 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:01:47,234 - train - INFO - epoch : 73 +2025-10-16 07:01:47,235 - train - INFO - loss : 0.3470729561895132 +2025-10-16 07:01:47,235 - train - INFO - grad_norm : 0.7514531427621841 +2025-10-16 07:01:47,235 - train - INFO - val_loss : 0.24378775940700012 +2025-10-16 07:01:47,235 - train - INFO - val_CER_(Argmax): 0.07183435190371569 +2025-10-16 07:01:47,235 - train - INFO - val_WER_(Argmax): 0.2256147255339385 +2025-10-16 07:01:47,235 - train - INFO - val_WER : 0.2256147255339385 +2025-10-16 07:01:47,235 - train - INFO - val_CER : 0.0718197889383571 +2025-10-16 07:01:47,235 - train - INFO - test_loss : 0.6091715346793739 +2025-10-16 07:01:47,236 - train - INFO - test_CER_(Argmax): 0.17911457671301328 +2025-10-16 07:01:47,236 - train - INFO - test_WER_(Argmax): 0.44794237444369694 +2025-10-16 07:01:47,236 - train - INFO - test_WER : 0.44794237444369694 +2025-10-16 07:01:47,236 - train - INFO - test_CER : 0.17910649294741543 +2025-10-16 07:01:47,826 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 07:04:43,210 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:05:38,351 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:07:33,361 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:10:39,941 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:16:21,980 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:24:38,614 - train - INFO - epoch : 74 +2025-10-16 07:24:38,615 - train - INFO - loss : 0.33683984458446503 +2025-10-16 07:24:38,615 - train - INFO - grad_norm : 0.7467496731877327 +2025-10-16 07:24:38,615 - train - INFO - val_loss : 0.2443155725909905 +2025-10-16 07:24:38,616 - train - INFO - val_CER_(Argmax): 0.07227646308735522 +2025-10-16 07:24:38,616 - train - INFO - val_WER_(Argmax): 0.22735330118624314 +2025-10-16 07:24:38,616 - train - INFO - val_WER : 0.22735330118624314 +2025-10-16 07:24:38,616 - train - INFO - val_CER : 0.07227911775285062 +2025-10-16 07:24:38,616 - train - INFO - test_loss : 0.6112303989274162 +2025-10-16 07:24:38,616 - train - INFO - test_CER_(Argmax): 0.17907364584589097 +2025-10-16 07:24:38,616 - train - INFO - test_WER_(Argmax): 0.44814157881320876 +2025-10-16 07:24:38,616 - train - INFO - test_WER : 0.44814157881320876 +2025-10-16 07:24:38,616 - train - INFO - test_CER : 0.17904936512532063 +2025-10-16 07:25:13,751 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:25:37,831 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:28:57,741 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:29:38,177 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:34:16,623 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:37:09,990 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:47:25,461 - train - INFO - epoch : 75 +2025-10-16 07:47:25,461 - train - INFO - loss : 0.33465448658370495 +2025-10-16 07:47:25,461 - train - INFO - grad_norm : 0.7519346656811298 +2025-10-16 07:47:25,462 - train - INFO - val_loss : 0.242401727729223 +2025-10-16 07:47:25,462 - train - INFO - val_CER_(Argmax): 0.07174316270698235 +2025-10-16 07:47:25,462 - train - INFO - val_WER_(Argmax): 0.2253097904291825 +2025-10-16 07:47:25,462 - train - INFO - val_WER : 0.2253097904291825 +2025-10-16 07:47:25,462 - train - INFO - val_CER : 0.07172859974162374 +2025-10-16 07:47:25,462 - train - INFO - test_loss : 0.6091889444054389 +2025-10-16 07:47:25,462 - train - INFO - test_CER_(Argmax): 0.17898326704627793 +2025-10-16 07:47:25,462 - train - INFO - test_WER_(Argmax): 0.4477697155209368 +2025-10-16 07:47:25,462 - train - INFO - test_WER : 0.4477697155209368 +2025-10-16 07:47:25,462 - train - INFO - test_CER : 0.17895496802095612 +2025-10-16 07:47:26,060 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 07:50:39,738 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:50:41,382 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:51:44,271 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:52:08,305 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:57:55,955 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 07:58:00,203 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:00:15,606 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:10:09,966 - train - INFO - epoch : 76 +2025-10-16 08:10:09,967 - train - INFO - loss : 0.32825977680970675 +2025-10-16 08:10:09,967 - train - INFO - grad_norm : 0.7922337020761404 +2025-10-16 08:10:09,967 - train - INFO - val_loss : 0.24259263547984036 +2025-10-16 08:10:09,967 - train - INFO - val_CER_(Argmax): 0.07152227206748903 +2025-10-16 08:10:09,967 - train - INFO - val_WER_(Argmax): 0.22444476221779855 +2025-10-16 08:10:09,967 - train - INFO - val_WER : 0.22444476221779855 +2025-10-16 08:10:09,968 - train - INFO - val_CER : 0.0715126284252316 +2025-10-16 08:10:09,968 - train - INFO - test_loss : 0.60749830001471 +2025-10-16 08:10:09,968 - train - INFO - test_CER_(Argmax): 0.1776300184520875 +2025-10-16 08:10:09,968 - train - INFO - test_WER_(Argmax): 0.4444251261265295 +2025-10-16 08:10:09,968 - train - INFO - test_WER : 0.4444251261265295 +2025-10-16 08:10:09,968 - train - INFO - test_CER : 0.177609227132328 +2025-10-16 08:10:10,533 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 08:11:59,900 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:12:54,045 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:14:34,964 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:21:55,106 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:21:55,451 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:22:06,483 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:33:19,822 - train - INFO - epoch : 77 +2025-10-16 08:33:19,822 - train - INFO - loss : 0.33436640658354394 +2025-10-16 08:33:19,822 - train - INFO - grad_norm : 0.7152383050640222 +2025-10-16 08:33:19,822 - train - INFO - val_loss : 0.24181210639124567 +2025-10-16 08:33:19,823 - train - INFO - val_CER_(Argmax): 0.07168098424058468 +2025-10-16 08:33:19,823 - train - INFO - val_WER_(Argmax): 0.22612596969878818 +2025-10-16 08:33:19,823 - train - INFO - val_WER : 0.22612596969878818 +2025-10-16 08:33:19,823 - train - INFO - val_CER : 0.07166642127522609 +2025-10-16 08:33:19,823 - train - INFO - test_loss : 0.6055028849110311 +2025-10-16 08:33:19,823 - train - INFO - test_CER_(Argmax): 0.17730527841163873 +2025-10-16 08:33:19,823 - train - INFO - test_WER_(Argmax): 0.444182600467874 +2025-10-16 08:33:19,823 - train - INFO - test_WER : 0.444182600467874 +2025-10-16 08:33:19,823 - train - INFO - test_CER : 0.1772793335152971 +2025-10-16 08:35:13,806 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:36:08,863 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:36:27,911 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:37:11,720 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:42:38,767 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:44:59,770 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:56:30,978 - train - INFO - epoch : 78 +2025-10-16 08:56:30,979 - train - INFO - loss : 0.3246919604102571 +2025-10-16 08:56:30,979 - train - INFO - grad_norm : 0.7791670113053154 +2025-10-16 08:56:30,979 - train - INFO - val_loss : 0.24172794463282282 +2025-10-16 08:56:30,979 - train - INFO - val_CER_(Argmax): 0.07153011484291474 +2025-10-16 08:56:30,979 - train - INFO - val_WER_(Argmax): 0.22511833258022118 +2025-10-16 08:56:30,979 - train - INFO - val_WER : 0.22511833258022118 +2025-10-16 08:56:30,979 - train - INFO - val_CER : 0.071512884357283 +2025-10-16 08:56:30,979 - train - INFO - test_loss : 0.6050408068968325 +2025-10-16 08:56:30,980 - train - INFO - test_CER_(Argmax): 0.17660421668923618 +2025-10-16 08:56:30,980 - train - INFO - test_WER_(Argmax): 0.44288891060599644 +2025-10-16 08:56:30,980 - train - INFO - test_WER : 0.44288891060599644 +2025-10-16 08:56:30,980 - train - INFO - test_CER : 0.1765579884047018 +2025-10-16 08:59:27,293 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 08:59:27,869 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:02:30,628 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:07:50,034 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:09:45,540 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:10:48,602 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:19:14,555 - train - INFO - epoch : 79 +2025-10-16 09:19:14,556 - train - INFO - loss : 0.32903695121558024 +2025-10-16 09:19:14,556 - train - INFO - grad_norm : 0.7922589413144372 +2025-10-16 09:19:14,556 - train - INFO - val_loss : 0.24085671488534321 +2025-10-16 09:19:14,556 - train - INFO - val_CER_(Argmax): 0.07120119172424885 +2025-10-16 09:19:14,556 - train - INFO - val_WER_(Argmax): 0.2238828798942847 +2025-10-16 09:19:14,556 - train - INFO - val_WER : 0.2238828798942847 +2025-10-16 09:19:14,556 - train - INFO - val_CER : 0.07118662875889026 +2025-10-16 09:19:14,557 - train - INFO - test_loss : 0.6060074610369546 +2025-10-16 09:19:14,557 - train - INFO - test_CER_(Argmax): 0.17613761186333135 +2025-10-16 09:19:14,557 - train - INFO - test_WER_(Argmax): 0.4415207936896337 +2025-10-16 09:19:14,557 - train - INFO - test_WER : 0.4415207936896337 +2025-10-16 09:19:14,557 - train - INFO - test_CER : 0.1760987778408496 +2025-10-16 09:19:15,154 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 09:21:32,328 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:25:32,846 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:26:24,808 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:27:11,142 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:31:43,693 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:33:07,799 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:42:14,548 - train - INFO - epoch : 80 +2025-10-16 09:42:14,549 - train - INFO - loss : 0.32665394328946445 +2025-10-16 09:42:14,549 - train - INFO - grad_norm : 0.785485486439125 +2025-10-16 09:42:14,549 - train - INFO - val_loss : 0.23916942947967487 +2025-10-16 09:42:14,549 - train - INFO - val_CER_(Argmax): 0.07079503833489652 +2025-10-16 09:42:14,549 - train - INFO - val_WER_(Argmax): 0.2233928609416103 +2025-10-16 09:42:14,549 - train - INFO - val_WER : 0.2233928609416103 +2025-10-16 09:42:14,549 - train - INFO - val_CER : 0.07078805112711367 +2025-10-16 09:42:14,549 - train - INFO - test_loss : 0.5997372880882147 +2025-10-16 09:42:14,549 - train - INFO - test_CER_(Argmax): 0.17529654974260578 +2025-10-16 09:42:14,550 - train - INFO - test_WER_(Argmax): 0.4405526126548696 +2025-10-16 09:42:14,550 - train - INFO - test_WER : 0.4405526126548696 +2025-10-16 09:42:14,550 - train - INFO - test_CER : 0.1752632917210163 +2025-10-16 09:42:15,152 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 09:42:15,591 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:42:48,136 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:43:41,631 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:44:09,545 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:45:05,149 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:49:31,128 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:55:41,850 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 09:55:42,189 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:05:05,706 - train - INFO - epoch : 81 +2025-10-16 10:05:05,707 - train - INFO - loss : 0.32321790718671045 +2025-10-16 10:05:05,707 - train - INFO - grad_norm : 0.8345202321958061 +2025-10-16 10:05:05,707 - train - INFO - val_loss : 0.2388677727431059 +2025-10-16 10:05:05,707 - train - INFO - val_CER_(Argmax): 0.07033447157901968 +2025-10-16 10:05:05,707 - train - INFO - val_WER_(Argmax): 0.22307846231609646 +2025-10-16 10:05:05,707 - train - INFO - val_WER : 0.22307846231609646 +2025-10-16 10:05:05,707 - train - INFO - val_CER : 0.0703199086136611 +2025-10-16 10:05:05,707 - train - INFO - test_loss : 0.599141996429891 +2025-10-16 10:05:05,708 - train - INFO - test_CER_(Argmax): 0.174724708652118 +2025-10-16 10:05:05,708 - train - INFO - test_WER_(Argmax): 0.43903859863372224 +2025-10-16 10:05:05,708 - train - INFO - test_WER : 0.43903859863372224 +2025-10-16 10:05:05,708 - train - INFO - test_CER : 0.1747035251574909 +2025-10-16 10:05:06,299 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 10:09:04,114 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:10:02,193 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:10:55,240 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:14:42,223 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:15:31,459 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:18:16,351 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:19:34,572 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:28:09,798 - train - INFO - epoch : 82 +2025-10-16 10:28:09,799 - train - INFO - loss : 0.3183249425348924 +2025-10-16 10:28:09,799 - train - INFO - grad_norm : 0.8161137418531293 +2025-10-16 10:28:09,799 - train - INFO - val_loss : 0.2384590067985383 +2025-10-16 10:28:09,799 - train - INFO - val_CER_(Argmax): 0.07017255953778744 +2025-10-16 10:28:09,799 - train - INFO - val_WER_(Argmax): 0.22131364034164988 +2025-10-16 10:28:09,799 - train - INFO - val_WER : 0.22131364034164988 +2025-10-16 10:28:09,800 - train - INFO - val_CER : 0.07015799657242883 +2025-10-16 10:28:09,800 - train - INFO - test_loss : 0.5989029520020193 +2025-10-16 10:28:09,800 - train - INFO - test_CER_(Argmax): 0.1743205254276692 +2025-10-16 10:28:09,800 - train - INFO - test_WER_(Argmax): 0.43704721422289666 +2025-10-16 10:28:09,800 - train - INFO - test_WER : 0.43704721422289666 +2025-10-16 10:28:09,800 - train - INFO - test_CER : 0.1743163534871922 +2025-10-16 10:28:10,392 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 10:30:51,295 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:32:11,943 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:33:27,960 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:36:33,747 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:37:19,556 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:40:31,804 - train - WARNING - OOM on batch. Skipping batch. +2025-10-16 10:51:18,473 - train - INFO - epoch : 83 +2025-10-16 10:51:18,473 - train - INFO - loss : 0.3176834620722574 +2025-10-16 10:51:18,473 - train - INFO - grad_norm : 0.8232347652241213 +2025-10-16 10:51:18,474 - train - INFO - val_loss : 0.23727865584871985 +2025-10-16 10:51:18,474 - train - INFO - val_CER_(Argmax): 0.07039769349766453 +2025-10-16 10:51:18,474 - train - INFO - val_WER_(Argmax): 0.2212025348655912 +2025-10-16 10:51:18,474 - train - INFO - val_WER : 0.2212025348655912 +2025-10-16 10:51:18,474 - train - INFO - val_CER : 0.07038313053230592 +2025-10-16 10:51:18,474 - train - INFO - test_loss : 0.5976807195313123 +2025-10-16 10:51:18,474 - train - INFO - test_CER_(Argmax): 0.17417757347516163 +2025-10-16 10:51:18,474 - train - INFO - test_WER_(Argmax): 0.4383814222677989 +2025-10-16 10:51:18,474 - train - INFO - test_WER : 0.4383814222677989 +2025-10-16 10:51:18,474 - train - INFO - test_CER : 0.1741524139310583 +2025-10-16 10:51:19,085 - train - INFO - Saving current best: model_best.pth ... +2025-10-16 10:51:40,238 - train - INFO - Saving model on keyboard interrupt +2025-10-16 10:51:40,720 - train - INFO - Saving checkpoint: /home/nyakovchuk/wave_rover/rover-Conformer-ASR/saved/conformer_30m/checkpoint-epoch84.pth ...