Shikhar committed on
Commit 84f8437 · 1 Parent(s): d876521

Deploy PhoneticXeus Gradio demo (CPU)

README.md CHANGED
@@ -1,12 +1,22 @@
  ---
  title: PhoneticXeus
- emoji: 🚀
+ emoji: "\U0001F4DE"
- colorFrom: yellow
+ colorFrom: blue
- colorTo: pink
+ colorTo: purple
  sdk: gradio
- sdk_version: 6.11.0
+ sdk_version: "5.0"
  app_file: app.py
  pinned: false
+ license: apache-2.0
+ models:
+ - changelinglab/PhoneticXeus
+ hardware: cpu-basic
  ---
  
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # PhoneticXeus -- Multilingual Phone Recognition
+ 
+ Record or upload audio to get an IPA phone transcription.
+ 
+ Based on [PhoneticXeus](https://huggingface.co/changelinglab/PhoneticXeus), a multilingual phone recognition model using self-conditioned CTC on the XEUS speech encoder.
+ 
+ Paper: [An Empirical Recipe for Universal Phone Recognition](https://arxiv.org/abs/2603.29042)
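A minimal sketch of calling the demo programmatically with `gradio_client`; the Space id below is a placeholder, and the `/transcribe` endpoint name assumes Gradio's default naming for the `transcribe` function in app.py:

```python
from gradio_client import Client, handle_file

# Placeholder Space id -- substitute the actual owner/name of this Space.
client = Client("OWNER/PhoneticXeus")

# Returns the two textbox values: space-separated IPA phones and the raw transcript.
phones, raw = client.predict(
    handle_file("sample.wav"),   # any local audio file
    api_name="/transcribe",      # assumed default endpoint name
)
print(phones)
```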
app.py ADDED
@@ -0,0 +1,110 @@
+ import os
+ import sys
+
+ # Ensure vendored src/ is importable
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+ import gradio as gr
+ import torch
+ import torchaudio
+ from huggingface_hub import hf_hub_download
+
+ from src.model.xeusphoneme.builders import build_xeus_pr_inference
+
+ MAX_SECONDS = 60
+ SAMPLE_RATE = 16000
+
+ inference = None
+
+
+ def load_model():
+     ckpt = hf_hub_download(
+         "changelinglab/PhoneticXeus", "checkpoint-22000.ckpt"
+     )
+     vocab = os.path.join(
+         os.path.dirname(__file__),
+         "src", "model", "xeusphoneme", "resources", "ipa_vocab.json",
+     )
+     return build_xeus_pr_inference(
+         work_dir="/tmp/cache/xeus",
+         checkpoint=ckpt,
+         vocab_file=vocab,
+         hf_repo="espnet/xeus",
+         device="cpu",
+     )
+
+
+ def transcribe(audio_path):
+     """Run phone recognition on uploaded/recorded audio."""
+     global inference
+     if audio_path is None:
+         return "", ""
+
+     if inference is None:
+         inference = load_model()
+
+     waveform, sr = torchaudio.load(audio_path)
+     if sr != SAMPLE_RATE:
+         waveform = torchaudio.functional.resample(waveform, sr, SAMPLE_RATE)
+     waveform = waveform.mean(dim=0)  # mono
+     waveform = waveform[: SAMPLE_RATE * MAX_SECONDS]
+
+     if waveform.numel() == 0:
+         return "", ""
+
+     results = inference(waveform)
+
+     processed = results[0]["processed_transcript"]
+     predicted = results[0]["predicted_transcript"]
+     spaced = " ".join(
+         t for t in predicted.split("/")
+         if not (t.startswith("<") and t.endswith(">"))
+     )
+     return spaced, processed
+
+
+ with gr.Blocks(title="PhoneticXeus") as demo:
+     gr.Markdown(
+         "# PhoneticXeus\n"
+         "Multilingual phone recognition -- record or upload audio "
+         "to get an IPA transcription.\n\n"
+         "Model: [changelinglab/PhoneticXeus]"
+         "(https://huggingface.co/changelinglab/PhoneticXeus) "
+         "| Paper: [arXiv 2603.29042]"
+         "(https://arxiv.org/abs/2603.29042)"
+     )
+
+     with gr.Row():
+         audio_input = gr.Audio(
+             sources=["microphone", "upload"],
+             type="filepath",
+             label="Input Audio",
+         )
+
+     btn = gr.Button("Transcribe", variant="primary")
+
+     with gr.Row():
+         phones_output = gr.Textbox(
+             label="IPA Phones (space-separated)",
+             lines=3,
+             show_copy_button=True,
+         )
+         raw_output = gr.Textbox(
+             label="Raw output (concatenated)",
+             lines=3,
+             show_copy_button=True,
+         )
+
+     btn.click(
+         fn=transcribe,
+         inputs=[audio_input],
+         outputs=[phones_output, raw_output],
+     )
+
+     gr.Markdown(
+         "---\n"
+         f"Max audio length: {MAX_SECONDS}s. "
+         "Audio is resampled to 16 kHz mono."
+     )
+
+ demo.launch()
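As a quick illustration of the post-processing in `transcribe()`: the raw transcript is assumed to be slash-delimited with angle-bracket special tokens (the example string below is made up), and the space-separated output simply drops those tokens:

```python
# Hypothetical raw transcript in the slash-delimited format that transcribe() splits on.
predicted = "<sos>/h/ə/l/oʊ/<eos>"

spaced = " ".join(
    t for t in predicted.split("/")
    if not (t.startswith("<") and t.endswith(">"))
)
print(spaced)  # h ə l oʊ
```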
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ torchaudio
+ huggingface_hub
+ pyyaml
+ typeguard
+ packaging
+ numpy
+ gradio>=5.0
src/__init__.py ADDED
File without changes
src/core/__init__.py ADDED
@@ -0,0 +1 @@
+ """Core modules for PhoneticXeus."""
src/core/utils.py ADDED
@@ -0,0 +1,59 @@
+ import logging
+ from huggingface_hub import snapshot_download
+ from huggingface_hub.utils import LocalEntryNotFoundError
+
+
+ def download_hf_snapshot(
+     repo_id: str,
+     work_dir: str,
+     force_download: bool = False,
+     **kwargs,
+ ) -> str:
+     """Download a snapshot from the Hugging Face Hub into `work_dir`.
+
+     Args:
+         repo_id: e.g. "espnet/xeus"
+         work_dir: path to the local directory where the snapshot is stored
+         force_download: if True, force a re-download
+         **kwargs: other snapshot_download arguments
+
+     Returns:
+         The path to the local snapshot folder.
+     """
+     if force_download:
+         logging.info(
+             f"Force-downloading snapshot for {repo_id} into {work_dir}..."
+         )
+         path = snapshot_download(
+             repo_id=repo_id,
+             local_dir=work_dir,
+             force_download=True,
+             local_files_only=False,
+             **kwargs,
+         )
+         logging.info(f"Downloaded snapshot for {repo_id} to {path}")
+         return path
+
+     try:
+         path = snapshot_download(
+             repo_id=repo_id,
+             local_dir=work_dir,
+             local_files_only=True,
+             **kwargs,
+         )
+         logging.info(
+             f"Using existing local snapshot for {repo_id} at {path}"
+         )
+         return path
+     except LocalEntryNotFoundError:
+         logging.info(
+             f"No local snapshot found for {repo_id}. Downloading now..."
+         )
+         path = snapshot_download(
+             repo_id=repo_id,
+             local_dir=work_dir,
+             local_files_only=False,
+             **kwargs,
+         )
+         logging.info(f"Downloaded snapshot for {repo_id} to {path}")
+         return path
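A usage sketch for the helper above (assuming the repo root is on `sys.path`, as app.py arranges): the first call downloads the snapshot, later calls reuse the cached copy.

```python
from src.core.utils import download_hf_snapshot

# Downloads espnet/xeus into the work dir once, then reuses the local snapshot.
xeus_dir = download_hf_snapshot("espnet/xeus", work_dir="/tmp/cache/xeus")
print(xeus_dir)
```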
src/espnet_import/__init__.py ADDED
File without changes
src/espnet_import/attention.py ADDED
@@ -0,0 +1,457 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Multi-Head Attention layer definition."""
8
+
9
+ import logging
10
+ import math
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+ from src.espnet_import.layer_norm import LayerNorm
16
+
17
+ try:
18
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
19
+ from flash_attn.bert_padding import pad_input, unpad_input
20
+ except Exception:
21
+ pass
22
+
23
+
24
+ class MultiHeadedAttention(nn.Module):
25
+ """Multi-Head Attention layer.
26
+
27
+ Args:
28
+ n_head (int): The number of heads.
29
+ n_feat (int): The number of features.
30
+ dropout_rate (float): Dropout rate.
31
+ qk_norm (bool): Normalize q and k before dot product.
32
+ use_flash_attn (bool): Use flash_attn implementation.
33
+ causal (bool): Apply causal attention.
34
+ cross_attn (bool): Cross attention instead of self attention.
35
+ use_sdpa (bool): Use PyTorch's scaled dot product attention.
36
+
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ n_head,
42
+ n_feat,
43
+ dropout_rate,
44
+ qk_norm=False,
45
+ use_flash_attn=False,
46
+ causal=False,
47
+ cross_attn=False,
48
+ use_sdpa=False,
49
+ ):
50
+ """Construct an MultiHeadedAttention object."""
51
+ super(MultiHeadedAttention, self).__init__()
52
+
53
+ assert n_feat % n_head == 0
54
+ # We assume d_v always equals d_k
55
+ self.d_k = n_feat // n_head
56
+ self.h = n_head
57
+ self.linear_q = nn.Linear(n_feat, n_feat)
58
+ self.linear_k = nn.Linear(n_feat, n_feat)
59
+ self.linear_v = nn.Linear(n_feat, n_feat)
60
+ self.linear_out = nn.Linear(n_feat, n_feat)
61
+ self.attn = None
62
+ self.dropout = (
63
+ nn.Dropout(p=dropout_rate) if not use_flash_attn else nn.Identity()
64
+ )
65
+ self.dropout_rate = dropout_rate
66
+
67
+ # LayerNorm for q and k
68
+ self.q_norm = LayerNorm(self.d_k) if qk_norm else nn.Identity()
69
+ self.k_norm = LayerNorm(self.d_k) if qk_norm else nn.Identity()
70
+
71
+ self.use_flash_attn = use_flash_attn
72
+ self.causal = causal # only used with flash_attn
73
+ self.cross_attn = cross_attn # only used with flash_attn
74
+
75
+ self.use_sdpa = use_sdpa
76
+
77
+ def forward_qkv(self, query, key, value, expand_kv=False):
78
+ """Transform query, key and value.
79
+
80
+ Args:
81
+ query (torch.Tensor): Query tensor (#batch, time1, size).
82
+ key (torch.Tensor): Key tensor (#batch, time2, size).
83
+ value (torch.Tensor): Value tensor (#batch, time2, size).
84
+ expand_kv (bool): Used only for partially autoregressive (PAR) decoding.
85
+
86
+ Returns:
87
+ torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
88
+ torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
89
+ torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
90
+
91
+ """
92
+ n_batch = query.size(0)
93
+ q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
94
+
95
+ if expand_kv:
96
+ k_shape = key.shape
97
+ k = (
98
+ self.linear_k(key[:1, :, :])
99
+ .expand(n_batch, k_shape[1], k_shape[2])
100
+ .view(n_batch, -1, self.h, self.d_k)
101
+ )
102
+ v_shape = value.shape
103
+ v = (
104
+ self.linear_v(value[:1, :, :])
105
+ .expand(n_batch, v_shape[1], v_shape[2])
106
+ .view(n_batch, -1, self.h, self.d_k)
107
+ )
108
+ else:
109
+ k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
110
+ v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
111
+
112
+ q = q.transpose(1, 2) # (batch, head, time1, d_k)
113
+ k = k.transpose(1, 2) # (batch, head, time2, d_k)
114
+ v = v.transpose(1, 2) # (batch, head, time2, d_k)
115
+
116
+ q = self.q_norm(q)
117
+ k = self.k_norm(k)
118
+
119
+ return q, k, v
120
+
121
+ def forward_attention(self, value, scores, mask):
122
+ """Compute attention context vector.
123
+
124
+ Args:
125
+ value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
126
+ scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
127
+ mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
128
+
129
+ Returns:
130
+ torch.Tensor: Transformed value (#batch, time1, d_model)
131
+ weighted by the attention score (#batch, time1, time2).
132
+
133
+ """
134
+ n_batch = value.size(0)
135
+ if mask is not None:
136
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
137
+ min_value = torch.finfo(scores.dtype).min
138
+ scores = scores.masked_fill(mask, min_value)
139
+ self.attn = torch.softmax(scores, dim=-1).masked_fill(
140
+ mask, 0.0
141
+ ) # (batch, head, time1, time2)
142
+ else:
143
+ self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
144
+
145
+ p_attn = self.dropout(self.attn)
146
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
147
+ x = (
148
+ x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
149
+ ) # (batch, time1, d_model)
150
+
151
+ return self.linear_out(x) # (batch, time1, d_model)
152
+
153
+ def forward(self, query, key, value, mask, expand_kv=False):
154
+ """Compute scaled dot product attention.
155
+
156
+ Args:
157
+ query (torch.Tensor): Query tensor (#batch, time1, size).
158
+ key (torch.Tensor): Key tensor (#batch, time2, size).
159
+ value (torch.Tensor): Value tensor (#batch, time2, size).
160
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
161
+ (#batch, time1, time2).
162
+ expand_kv (bool): Used only for partially autoregressive (PAR) decoding.
163
+ When set to `True`, `Linear` layers are computed only for the first
164
+ batch. This is useful to reduce the memory usage during decoding
165
+ when the batch size is #beam_size x #mask_count, which can be large.
166
+ Typically, in single waveform inference of PAR, `Linear` layers
167
+ should not be computed for all batches for source-attention.
168
+
169
+ Returns:
170
+ torch.Tensor: Output tensor (#batch, time1, d_model).
171
+ """
172
+ # Use PyTorch's Scaled Dot Product Attention implementation
173
+ if getattr(self, "use_sdpa", False):
174
+ q, k, v = self.forward_qkv(query, key, value, expand_kv)
175
+
176
+ # The shape of mask must be broadcastable to the shape of attention weights
177
+ out = torch.nn.functional.scaled_dot_product_attention(
178
+ q,
179
+ k,
180
+ v,
181
+ mask.unsqueeze(1) if mask is not None else None,
182
+ dropout_p=self.dropout_rate if self.training else 0.0,
183
+ ) # (batch, head, time1, d_k)
184
+
185
+ out = out.transpose(1, 2) # (batch, time1, head, d_k)
186
+ out = out.reshape(out.shape[0], out.shape[1], -1) # (batch, time1, d_model)
187
+ return self.linear_out(out) # (batch, time1, d_model)
188
+
189
+ # Use Flash Attention implementation
190
+ if self.use_flash_attn:
191
+ try:
192
+ # In the causal case, the last row will be the key mask
193
+ key_nonpad_mask = mask[:, -1, :] # (#batch, time2)
194
+ if self.cross_attn:
195
+ # For cross attention, we do not know the query padding
196
+ query_nonpad_mask = torch.ones(
197
+ size=query.shape[:2], dtype=torch.bool, device=query.device
198
+ )
199
+ else:
200
+ query_nonpad_mask = key_nonpad_mask
201
+
202
+ if key_nonpad_mask.eq(0).any():
203
+ # Use variable length implementation if padded
204
+ q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(
205
+ query, query_nonpad_mask
206
+ )[:4]
207
+ k, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(
208
+ key, key_nonpad_mask
209
+ )[:4]
210
+ v, _, _, _ = unpad_input(value, key_nonpad_mask)[:4]
211
+
212
+ q = self.linear_q(q).reshape(-1, self.h, self.d_k)
213
+ k = self.linear_k(k).reshape(-1, self.h, self.d_k)
214
+ v = self.linear_v(v).reshape(-1, self.h, self.d_k)
215
+
216
+ q = self.q_norm(q)
217
+ k = self.k_norm(k)
218
+
219
+ out = flash_attn_varlen_func(
220
+ q,
221
+ k,
222
+ v,
223
+ cu_seqlens_q,
224
+ cu_seqlens_k,
225
+ max_seqlen_q,
226
+ max_seqlen_k,
227
+ dropout_p=self.dropout_rate if self.training else 0.0,
228
+ causal=self.causal,
229
+ ) # (total, nheads, headdim)
230
+
231
+ out = out.reshape(out.shape[0], -1)
232
+ out = self.linear_out(out)
233
+
234
+ out = pad_input(out, indices_q, query.shape[0], query.shape[1])
235
+ return out
236
+
237
+ else:
238
+ # Use fixed length implementation if not padded,
239
+ # which is faster than the variable length implementation
240
+ del key_nonpad_mask
241
+ q, k, v = self.forward_qkv(query, key, value)
242
+
243
+ out = flash_attn_func(
244
+ q.transpose(1, 2),
245
+ k.transpose(1, 2),
246
+ v.transpose(1, 2),
247
+ dropout_p=self.dropout_rate if self.training else 0.0,
248
+ causal=self.causal,
249
+ ) # (batch_size, seqlen, nheads, headdim)
250
+ del q, k, v
251
+
252
+ out = out.reshape(out.shape[0], out.shape[1], -1)
253
+ out = self.linear_out(out)
254
+ return out
255
+
256
+ except Exception as e:
257
+ pass
258
+ self.use_flash_attn = False
259
+
260
+ # Fall back to the default implementation
261
+ q, k, v = self.forward_qkv(query, key, value, expand_kv)
262
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
263
+ return self.forward_attention(v, scores, mask)
264
+
265
+
266
+ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention):
267
+ """Multi-Head Attention layer with relative position encoding (old version).
268
+
269
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
270
+
271
+ Paper: https://arxiv.org/abs/1901.02860
272
+
273
+ Args:
274
+ n_head (int): The number of heads.
275
+ n_feat (int): The number of features.
276
+ dropout_rate (float): Dropout rate.
277
+ zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
278
+
279
+ """
280
+
281
+ def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
282
+ """Construct an RelPositionMultiHeadedAttention object."""
283
+ super().__init__(n_head, n_feat, dropout_rate)
284
+ self.zero_triu = zero_triu
285
+ # linear transformation for positional encoding
286
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
287
+ # these two learnable bias are used in matrix c and matrix d
288
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
289
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
290
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
291
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
292
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
293
+
294
+ def rel_shift(self, x):
295
+ """Compute relative positional encoding.
296
+
297
+ Args:
298
+ x (torch.Tensor): Input tensor (batch, head, time1, time2).
299
+
300
+ Returns:
301
+ torch.Tensor: Output tensor.
302
+
303
+ """
304
+ zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
305
+ x_padded = torch.cat([zero_pad, x], dim=-1)
306
+
307
+ x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
308
+ x = x_padded[:, :, 1:].view_as(x)
309
+
310
+ if self.zero_triu:
311
+ ones = torch.ones((x.size(2), x.size(3)))
312
+ x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :]
313
+
314
+ return x
315
+
316
+ def forward(self, query, key, value, pos_emb, mask):
317
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
318
+
319
+ Args:
320
+ query (torch.Tensor): Query tensor (#batch, time1, size).
321
+ key (torch.Tensor): Key tensor (#batch, time2, size).
322
+ value (torch.Tensor): Value tensor (#batch, time2, size).
323
+ pos_emb (torch.Tensor): Positional embedding tensor (#batch, time1, size).
324
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
325
+ (#batch, time1, time2).
326
+
327
+ Returns:
328
+ torch.Tensor: Output tensor (#batch, time1, d_model).
329
+
330
+ """
331
+ q, k, v = self.forward_qkv(query, key, value)
332
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
333
+
334
+ n_batch_pos = pos_emb.size(0)
335
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
336
+ p = p.transpose(1, 2) # (batch, head, time1, d_k)
337
+
338
+ # (batch, head, time1, d_k)
339
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
340
+ # (batch, head, time1, d_k)
341
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
342
+
343
+ # compute attention score
344
+ # first compute matrix a and matrix c
345
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
346
+ # (batch, head, time1, time2)
347
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
348
+
349
+ # compute matrix b and matrix d
350
+ # (batch, head, time1, time1)
351
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
352
+ matrix_bd = self.rel_shift(matrix_bd)
353
+
354
+ scores = (matrix_ac + matrix_bd) / math.sqrt(
355
+ self.d_k
356
+ ) # (batch, head, time1, time2)
357
+
358
+ return self.forward_attention(v, scores, mask)
359
+
360
+
361
+ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
362
+ """Multi-Head Attention layer with relative position encoding (new implementation).
363
+
364
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
365
+
366
+ Paper: https://arxiv.org/abs/1901.02860
367
+
368
+ Args:
369
+ n_head (int): The number of heads.
370
+ n_feat (int): The number of features.
371
+ dropout_rate (float): Dropout rate.
372
+ zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
373
+
374
+ """
375
+
376
+ def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
377
+ """Construct an RelPositionMultiHeadedAttention object."""
378
+ super().__init__(n_head, n_feat, dropout_rate)
379
+ self.zero_triu = zero_triu
380
+ # linear transformation for positional encoding
381
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
382
+ # these two learnable bias are used in matrix c and matrix d
383
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
384
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
385
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
386
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
387
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
388
+
389
+ def rel_shift(self, x):
390
+ """Compute relative positional encoding.
391
+
392
+ Args:
393
+ x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
394
+ time1 means the length of query vector.
395
+
396
+ Returns:
397
+ torch.Tensor: Output tensor.
398
+
399
+ """
400
+ zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
401
+ x_padded = torch.cat([zero_pad, x], dim=-1)
402
+
403
+ x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
404
+ x = x_padded[:, :, 1:].view_as(x)[
405
+ :, :, :, : x.size(-1) // 2 + 1
406
+ ] # only keep the positions from 0 to time2
407
+
408
+ if self.zero_triu:
409
+ ones = torch.ones((x.size(2), x.size(3)), device=x.device)
410
+ x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :]
411
+
412
+ return x
413
+
414
+ def forward(self, query, key, value, pos_emb, mask):
415
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
416
+
417
+ Args:
418
+ query (torch.Tensor): Query tensor (#batch, time1, size).
419
+ key (torch.Tensor): Key tensor (#batch, time2, size).
420
+ value (torch.Tensor): Value tensor (#batch, time2, size).
421
+ pos_emb (torch.Tensor): Positional embedding tensor
422
+ (#batch, 2*time1-1, size).
423
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
424
+ (#batch, time1, time2).
425
+
426
+ Returns:
427
+ torch.Tensor: Output tensor (#batch, time1, d_model).
428
+
429
+ """
430
+ q, k, v = self.forward_qkv(query, key, value)
431
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
432
+
433
+ n_batch_pos = pos_emb.size(0)
434
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
435
+ p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k)
436
+
437
+ # (batch, head, time1, d_k)
438
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
439
+ # (batch, head, time1, d_k)
440
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
441
+
442
+ # compute attention score
443
+ # first compute matrix a and matrix c
444
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
445
+ # (batch, head, time1, time2)
446
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
447
+
448
+ # compute matrix b and matrix d
449
+ # (batch, head, time1, 2*time1-1)
450
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
451
+ matrix_bd = self.rel_shift(matrix_bd)
452
+
453
+ scores = (matrix_ac + matrix_bd) / math.sqrt(
454
+ self.d_k
455
+ ) # (batch, head, time1, time2)
456
+
457
+ return self.forward_attention(v, scores, mask)
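A minimal shape check for the relative-position attention defined above (a sketch, not part of the vendored file; assumes torch is installed and the repo root is importable). The positional embedding has length 2*time1-1, matching the docstrings:

```python
import torch

from src.espnet_import.attention import RelPositionMultiHeadedAttention

mha = RelPositionMultiHeadedAttention(n_head=4, n_feat=64, dropout_rate=0.0)
x = torch.randn(2, 10, 64)                # (batch, time1, size)
pos_emb = torch.randn(1, 2 * 10 - 1, 64)  # (1, 2*time1-1, size)
out = mha(x, x, x, pos_emb, mask=None)
print(out.shape)                          # torch.Size([2, 10, 64])
```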
src/espnet_import/cgmlp.py ADDED
@@ -0,0 +1,123 @@
1
+ """MLP with convolutional gating (cgMLP) definition.
2
+
3
+ References:
4
+ https://openreview.net/forum?id=RA-zVvZLYIy
5
+ https://arxiv.org/abs/2105.08050
6
+
7
+ """
8
+
9
+ import torch
10
+
11
+ from src.espnet_import.nets_utils import get_activation
12
+ from src.espnet_import.layer_norm import LayerNorm
13
+
14
+
15
+ class ConvolutionalSpatialGatingUnit(torch.nn.Module):
16
+ """Convolutional Spatial Gating Unit (CSGU)."""
17
+
18
+ def __init__(
19
+ self,
20
+ size: int,
21
+ kernel_size: int,
22
+ dropout_rate: float,
23
+ use_linear_after_conv: bool,
24
+ gate_activation: str,
25
+ ):
26
+ super().__init__()
27
+
28
+ n_channels = size // 2 # split input channels
29
+ self.norm = LayerNorm(n_channels)
30
+ self.conv = torch.nn.Conv1d(
31
+ n_channels,
32
+ n_channels,
33
+ kernel_size,
34
+ 1,
35
+ (kernel_size - 1) // 2,
36
+ groups=n_channels,
37
+ )
38
+ if use_linear_after_conv:
39
+ self.linear = torch.nn.Linear(n_channels, n_channels)
40
+ else:
41
+ self.linear = None
42
+
43
+ if gate_activation == "identity":
44
+ self.act = torch.nn.Identity()
45
+ else:
46
+ self.act = get_activation(gate_activation)
47
+
48
+ self.dropout = torch.nn.Dropout(dropout_rate)
49
+
50
+ def espnet_initialization_fn(self):
51
+ torch.nn.init.normal_(self.conv.weight, std=1e-6)
52
+ torch.nn.init.ones_(self.conv.bias)
53
+ if self.linear is not None:
54
+ torch.nn.init.normal_(self.linear.weight, std=1e-6)
55
+ torch.nn.init.ones_(self.linear.bias)
56
+
57
+ def forward(self, x, gate_add=None):
58
+ """Forward method
59
+
60
+ Args:
61
+ x (torch.Tensor): (N, T, D)
62
+ gate_add (torch.Tensor): (N, T, D/2)
63
+
64
+ Returns:
65
+ out (torch.Tensor): (N, T, D/2)
66
+ """
67
+ x_r, x_g = x.chunk(2, dim=-1)
68
+
69
+ x_g = self.norm(x_g) # (N, T, D/2)
70
+ x_g = self.conv(x_g.transpose(1, 2)).transpose(1, 2) # (N, T, D/2)
71
+ if self.linear is not None:
72
+ x_g = self.linear(x_g)
73
+
74
+ if gate_add is not None:
75
+ x_g = x_g + gate_add
76
+
77
+ x_g = self.act(x_g)
78
+ out = x_r * x_g # (N, T, D/2)
79
+ out = self.dropout(out)
80
+ return out
81
+
82
+
83
+ class ConvolutionalGatingMLP(torch.nn.Module):
84
+ """Convolutional Gating MLP (cgMLP)."""
85
+
86
+ def __init__(
87
+ self,
88
+ size: int,
89
+ linear_units: int,
90
+ kernel_size: int,
91
+ dropout_rate: float,
92
+ use_linear_after_conv: bool,
93
+ gate_activation: str,
94
+ ):
95
+ super().__init__()
96
+
97
+ self.channel_proj1 = torch.nn.Sequential(
98
+ torch.nn.Linear(size, linear_units), torch.nn.GELU()
99
+ )
100
+ self.csgu = ConvolutionalSpatialGatingUnit(
101
+ size=linear_units,
102
+ kernel_size=kernel_size,
103
+ dropout_rate=dropout_rate,
104
+ use_linear_after_conv=use_linear_after_conv,
105
+ gate_activation=gate_activation,
106
+ )
107
+ self.channel_proj2 = torch.nn.Linear(linear_units // 2, size)
108
+
109
+ def forward(self, x, mask):
110
+ if isinstance(x, tuple):
111
+ xs_pad, pos_emb = x
112
+ else:
113
+ xs_pad, pos_emb = x, None
114
+
115
+ xs_pad = self.channel_proj1(xs_pad) # size -> linear_units
116
+ xs_pad = self.csgu(xs_pad) # linear_units -> linear_units/2
117
+ xs_pad = self.channel_proj2(xs_pad) # linear_units/2 -> size
118
+
119
+ if pos_emb is not None:
120
+ out = (xs_pad, pos_emb)
121
+ else:
122
+ out = xs_pad
123
+ return out
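A small shape check for the cgMLP block (a sketch, not part of the vendored file; assumes the vendored nets_utils used above is importable): the input is projected up, gated down to half width by the CSGU, and projected back to `size`.

```python
import torch

from src.espnet_import.cgmlp import ConvolutionalGatingMLP

cgmlp = ConvolutionalGatingMLP(
    size=64,
    linear_units=256,
    kernel_size=31,
    dropout_rate=0.0,
    use_linear_after_conv=False,
    gate_activation="identity",
)
x = torch.randn(2, 50, 64)   # (batch, time, size)
y = cgmlp(x, mask=None)      # mask is accepted but unused in forward
print(y.shape)               # torch.Size([2, 50, 64])
```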
src/espnet_import/embedding.py ADDED
@@ -0,0 +1,523 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Positional Encoding Module."""
8
+
9
+ import logging
10
+ import math
11
+
12
+ import torch
13
+ from packaging.version import parse as V
14
+
15
+
16
+ # from espnet2.asr.frontend.cnn import dim_1_layer_norm
17
+ def dim_1_layer_norm(x, eps=1e-05, gamma=None, beta=None):
18
+ """Functional version of Dim1LayerNorm."""
19
+
20
+ B, D, T = x.shape
21
+ mean = torch.mean(x, 1, keepdim=True)
22
+ variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
23
+
24
+ x = (x - mean) * torch.rsqrt(variance + eps)
25
+
26
+ if gamma is not None:
27
+ x = x * gamma.view(1, -1, 1)
28
+ if beta is not None:
29
+ x = x + beta.view(1, -1, 1)
30
+ return x
31
+
32
+
33
+ def _pre_hook(
34
+ state_dict,
35
+ prefix,
36
+ local_metadata,
37
+ strict,
38
+ missing_keys,
39
+ unexpected_keys,
40
+ error_msgs,
41
+ ):
42
+ """Perform pre-hook in load_state_dict for backward compatibility.
43
+
44
+ Note:
45
+ We saved self.pe until v.0.5.2 but we have omitted it later.
46
+ Therefore, we remove the item "pe" from `state_dict` for backward compatibility.
47
+
48
+ """
49
+ k = prefix + "pe"
50
+ if k in state_dict:
51
+ state_dict.pop(k)
52
+
53
+
54
+ class PositionalEncoding(torch.nn.Module):
55
+ """Positional encoding.
56
+
57
+ Args:
58
+ d_model (int): Embedding dimension.
59
+ dropout_rate (float): Dropout rate.
60
+ max_len (int): Maximum input length.
61
+ reverse (bool): Whether to reverse the input position. Only for
62
+ the class LegacyRelPositionalEncoding. We remove it in the current
63
+ class RelPositionalEncoding.
64
+ """
65
+
66
+ def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
67
+ """Construct an PositionalEncoding object."""
68
+ super(PositionalEncoding, self).__init__()
69
+ self.d_model = d_model
70
+ self.reverse = reverse
71
+ self.xscale = math.sqrt(self.d_model)
72
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
73
+ self.pe = None
74
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
75
+ self._register_load_state_dict_pre_hook(_pre_hook)
76
+
77
+ def extend_pe(self, x):
78
+ """Reset the positional encodings."""
79
+ if self.pe is not None:
80
+ if self.pe.size(1) >= x.size(1):
81
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
82
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
83
+ return
84
+ pe = torch.zeros(x.size(1), self.d_model)
85
+ if self.reverse:
86
+ position = torch.arange(
87
+ x.size(1) - 1, -1, -1.0, dtype=torch.float32
88
+ ).unsqueeze(1)
89
+ else:
90
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
91
+ div_term = torch.exp(
92
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
93
+ * -(math.log(10000.0) / self.d_model)
94
+ )
95
+ pe[:, 0::2] = torch.sin(position * div_term)
96
+ pe[:, 1::2] = torch.cos(position * div_term)
97
+ pe = pe.unsqueeze(0)
98
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
99
+
100
+ def forward(self, x: torch.Tensor):
101
+ """Add positional encoding.
102
+
103
+ Args:
104
+ x (torch.Tensor): Input tensor (batch, time, `*`).
105
+
106
+ Returns:
107
+ torch.Tensor: Encoded tensor (batch, time, `*`).
108
+ """
109
+ self.extend_pe(x)
110
+ x = x * self.xscale + self.pe[:, : x.size(1)]
111
+ return self.dropout(x)
112
+
113
+
114
+ class ScaledPositionalEncoding(PositionalEncoding):
115
+ """Scaled positional encoding module.
116
+
117
+ See Sec. 3.2 https://arxiv.org/abs/1809.08895
118
+
119
+ Args:
120
+ d_model (int): Embedding dimension.
121
+ dropout_rate (float): Dropout rate.
122
+ max_len (int): Maximum input length.
123
+
124
+ """
125
+
126
+ def __init__(self, d_model, dropout_rate, max_len=5000):
127
+ """Initialize class."""
128
+ super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len)
129
+ self.alpha = torch.nn.Parameter(torch.tensor(1.0))
130
+
131
+ def reset_parameters(self):
132
+ """Reset parameters."""
133
+ self.alpha.data = torch.tensor(1.0)
134
+
135
+ def forward(self, x):
136
+ """Add positional encoding.
137
+
138
+ Args:
139
+ x (torch.Tensor): Input tensor (batch, time, `*`).
140
+
141
+ Returns:
142
+ torch.Tensor: Encoded tensor (batch, time, `*`).
143
+
144
+ """
145
+ self.extend_pe(x)
146
+ x = x + self.alpha * self.pe[:, : x.size(1)]
147
+ return self.dropout(x)
148
+
149
+
150
+ class LearnableFourierPosEnc(torch.nn.Module):
151
+ """Learnable Fourier Features for Positional Encoding.
152
+
153
+ See https://arxiv.org/pdf/2106.02795.pdf
154
+
155
+ Args:
156
+ d_model (int): Embedding dimension.
157
+ dropout_rate (float): Dropout rate.
158
+ max_len (int): Maximum input length.
159
+ gamma (float): init parameter for the positional kernel variance
160
+ see https://arxiv.org/pdf/2106.02795.pdf.
161
+ apply_scaling (bool): Whether to scale the input before adding the pos encoding.
162
+ hidden_dim (int): if not None, we modulate the pos encodings with
163
+ an MLP whose hidden layer has hidden_dim neurons.
164
+ """
165
+
166
+ def __init__(
167
+ self,
168
+ d_model,
169
+ dropout_rate=0.0,
170
+ max_len=5000,
171
+ gamma=1.0,
172
+ apply_scaling=False,
173
+ hidden_dim=None,
174
+ ):
175
+ """Initialize class."""
176
+ super(LearnableFourierPosEnc, self).__init__()
177
+
178
+ self.d_model = d_model
179
+
180
+ if apply_scaling:
181
+ self.xscale = math.sqrt(self.d_model)
182
+ else:
183
+ self.xscale = 1.0
184
+
185
+ self.dropout = torch.nn.Dropout(dropout_rate)
186
+ self.max_len = max_len
187
+
188
+ self.gamma = gamma
189
+ if self.gamma is None:
190
+ self.gamma = self.d_model // 2
191
+
192
+ assert (
193
+ d_model % 2 == 0
194
+ ), "d_model should be divisible by two in order to use this layer."
195
+ self.w_r = torch.nn.Parameter(torch.empty(1, d_model // 2))
196
+ self._reset() # init the weights
197
+
198
+ self.hidden_dim = hidden_dim
199
+ if self.hidden_dim is not None:
200
+ self.mlp = torch.nn.Sequential(
201
+ torch.nn.Linear(d_model, hidden_dim),
202
+ torch.nn.GELU(),
203
+ torch.nn.Linear(hidden_dim, d_model),
204
+ )
205
+
206
+ def _reset(self):
207
+ self.w_r.data = torch.normal(
208
+ 0, (1 / math.sqrt(self.gamma)), (1, self.d_model // 2)
209
+ )
210
+
211
+ def extend_pe(self, x):
212
+ """Reset the positional encodings."""
213
+ position_v = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1).to(x)
214
+
215
+ cosine = torch.cos(torch.matmul(position_v, self.w_r))
216
+ sine = torch.sin(torch.matmul(position_v, self.w_r))
217
+ pos_enc = torch.cat((cosine, sine), -1)
218
+ pos_enc /= math.sqrt(self.d_model)
219
+
220
+ if self.hidden_dim is None:
221
+ return pos_enc.unsqueeze(0)
222
+ else:
223
+ return self.mlp(pos_enc.unsqueeze(0))
224
+
225
+ def forward(self, x: torch.Tensor):
226
+ """Add positional encoding.
227
+
228
+ Args:
229
+ x (torch.Tensor): Input tensor (batch, time, `*`).
230
+
231
+ Returns:
232
+ torch.Tensor: Encoded tensor (batch, time, `*`).
233
+ """
234
+ pe = self.extend_pe(x)
235
+ x = x * self.xscale + pe
236
+ return self.dropout(x)
237
+
238
+
239
+ class LegacyRelPositionalEncoding(PositionalEncoding):
240
+ """Relative positional encoding module (old version).
241
+
242
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
243
+
244
+ See : Appendix B in https://arxiv.org/abs/1901.02860
245
+
246
+ Args:
247
+ d_model (int): Embedding dimension.
248
+ dropout_rate (float): Dropout rate.
249
+ max_len (int): Maximum input length.
250
+
251
+ """
252
+
253
+ def __init__(self, d_model, dropout_rate, max_len=5000):
254
+ """Initialize class."""
255
+ super().__init__(
256
+ d_model=d_model,
257
+ dropout_rate=dropout_rate,
258
+ max_len=max_len,
259
+ reverse=True,
260
+ )
261
+
262
+ def forward(self, x):
263
+ """Compute positional encoding.
264
+
265
+ Args:
266
+ x (torch.Tensor): Input tensor (batch, time, `*`).
267
+
268
+ Returns:
269
+ torch.Tensor: Encoded tensor (batch, time, `*`).
270
+ torch.Tensor: Positional embedding tensor (1, time, `*`).
271
+
272
+ """
273
+ self.extend_pe(x)
274
+ x = x * self.xscale
275
+ pos_emb = self.pe[:, : x.size(1)]
276
+ return self.dropout(x), self.dropout(pos_emb)
277
+
278
+
279
+ class RelPositionalEncoding(torch.nn.Module):
280
+ """Relative positional encoding module (new implementation).
281
+
282
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
283
+
284
+ See : Appendix B in https://arxiv.org/abs/1901.02860
285
+
286
+ Args:
287
+ d_model (int): Embedding dimension.
288
+ dropout_rate (float): Dropout rate.
289
+ max_len (int): Maximum input length.
290
+
291
+ """
292
+
293
+ def __init__(self, d_model, dropout_rate, max_len=5000):
294
+ """Construct an PositionalEncoding object."""
295
+ super(RelPositionalEncoding, self).__init__()
296
+ self.d_model = d_model
297
+ self.xscale = math.sqrt(self.d_model)
298
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
299
+ self.pe = None
300
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
301
+
302
+ def extend_pe(self, x):
303
+ """Reset the positional encodings."""
304
+ if self.pe is not None:
305
+ # self.pe contains both positive and negative parts
306
+ # the length of self.pe is 2 * input_len - 1
307
+ if self.pe.size(1) >= x.size(1) * 2 - 1:
308
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
309
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
310
+ return
311
+ # Suppose `i` means the position of the query vector and `j` means the
312
+ # position of the key vector. We use positive relative positions when keys
313
+ # are to the left (i>j) and negative relative positions otherwise (i<j).
314
+ pe_positive = torch.zeros(x.size(1), self.d_model)
315
+ pe_negative = torch.zeros(x.size(1), self.d_model)
316
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
317
+ div_term = torch.exp(
318
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
319
+ * -(math.log(10000.0) / self.d_model)
320
+ )
321
+ pe_positive[:, 0::2] = torch.sin(position * div_term)
322
+ pe_positive[:, 1::2] = torch.cos(position * div_term)
323
+ pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
324
+ pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
325
+
326
+ # Reverse the order of positive indices and concat both positive and
327
+ # negative indices. This is used to support the shifting trick
328
+ # as in https://arxiv.org/abs/1901.02860
329
+ pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
330
+ pe_negative = pe_negative[1:].unsqueeze(0)
331
+ pe = torch.cat([pe_positive, pe_negative], dim=1)
332
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
333
+
334
+ def forward(self, x: torch.Tensor):
335
+ """Add positional encoding.
336
+
337
+ Args:
338
+ x (torch.Tensor): Input tensor (batch, time, `*`).
339
+
340
+ Returns:
341
+ torch.Tensor: Encoded tensor (batch, time, `*`).
342
+
343
+ """
344
+ self.extend_pe(x)
345
+ x = x * self.xscale
346
+ pos_emb = self.pe[
347
+ :,
348
+ self.pe.size(1) // 2 - x.size(1) + 1 : self.pe.size(1) // 2 + x.size(1),
349
+ ]
350
+ return self.dropout(x), self.dropout(pos_emb)
351
+
352
+
353
+ class StreamPositionalEncoding(torch.nn.Module):
354
+ """Streaming Positional encoding.
355
+
356
+ Args:
357
+ d_model (int): Embedding dimension.
358
+ dropout_rate (float): Dropout rate.
359
+ max_len (int): Maximum input length.
360
+
361
+ """
362
+
363
+ def __init__(self, d_model, dropout_rate, max_len=5000):
364
+ """Construct an PositionalEncoding object."""
365
+ super(StreamPositionalEncoding, self).__init__()
366
+ self.d_model = d_model
367
+ self.xscale = math.sqrt(self.d_model)
368
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
369
+ self.pe = None
370
+ self.tmp = torch.tensor(0.0).expand(1, max_len)
371
+ self.extend_pe(self.tmp.size(1), self.tmp.device, self.tmp.dtype)
372
+ self._register_load_state_dict_pre_hook(_pre_hook)
373
+
374
+ def extend_pe(self, length, device, dtype):
375
+ """Reset the positional encodings."""
376
+ if self.pe is not None:
377
+ if self.pe.size(1) >= length:
378
+ if self.pe.dtype != dtype or self.pe.device != device:
379
+ self.pe = self.pe.to(dtype=dtype, device=device)
380
+ return
381
+ pe = torch.zeros(length, self.d_model)
382
+ position = torch.arange(0, length, dtype=torch.float32).unsqueeze(1)
383
+ div_term = torch.exp(
384
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
385
+ * -(math.log(10000.0) / self.d_model)
386
+ )
387
+ pe[:, 0::2] = torch.sin(position * div_term)
388
+ pe[:, 1::2] = torch.cos(position * div_term)
389
+ pe = pe.unsqueeze(0)
390
+ self.pe = pe.to(device=device, dtype=dtype)
391
+
392
+ def forward(self, x: torch.Tensor, start_idx: int = 0):
393
+ """Add positional encoding.
394
+
395
+ Args:
396
+ x (torch.Tensor): Input tensor (batch, time, `*`).
397
+
398
+ Returns:
399
+ torch.Tensor: Encoded tensor (batch, time, `*`).
400
+
401
+ """
402
+ self.extend_pe(x.size(1) + start_idx, x.device, x.dtype)
403
+ x = x * self.xscale + self.pe[:, start_idx : start_idx + x.size(1)]
404
+ return self.dropout(x)
405
+
406
+
407
+ class ConvolutionalPositionalEmbedding(torch.nn.Module):
408
+ """Convolutional positional embedding.
409
+
410
+ Used in wav2vec2/HuBERT SSL models.
411
+ https://arxiv.org/abs/1904.11660
412
+
413
+ Args:
414
+ embed_dim (int): Feature dimension of the input Tensor.
415
+ dropout (float): unused
416
+ max_len (int): unused
417
+ num_layers (int): number of conv layers
418
+ kernel_size (int): The number of frames to be use.
419
+ groups (int): The number of groups in feature dimensions.
420
+ weight_norm (str): [new, legacy, none].
421
+ How to init conv weights. Recommended setting is
422
+ none if num_layers > 1.
423
+ """
424
+
425
+ def __init__(
426
+ self,
427
+ embed_dim: int,
428
+ dropout: float,
429
+ max_len: int = 5000,
430
+ num_layers: int = 1,
431
+ kernel_size: int = 128,
432
+ groups: int = 16,
433
+ weight_norm: str = "new",
434
+ use_residual: bool = False,
435
+ ):
436
+ """Initialize Convoluational Positional Embedding."""
437
+ super().__init__()
438
+ self.embed_dim = embed_dim
439
+ self.kernel_size = kernel_size
440
+ self.weight_norm = weight_norm
441
+
442
+ convs = []
443
+ for layer in range(num_layers):
444
+ conv = torch.nn.Conv1d(
445
+ in_channels=embed_dim,
446
+ out_channels=embed_dim,
447
+ kernel_size=kernel_size,
448
+ padding=kernel_size // 2,
449
+ groups=groups,
450
+ )
451
+ if weight_norm != "none" and weight_norm is not None:
452
+ std = math.sqrt((4 * (1.0)) / (kernel_size * embed_dim))
453
+ torch.nn.init.normal_(conv.weight, mean=0, std=std)
454
+ torch.nn.init.constant_(conv.bias, 0)
455
+ # torch.nn.utils.weight_norm leads to weird behavior
456
+ # with copy.deepcopy(). Usually isn't needed,
457
+ # but it's important for models that use EMA
458
+ if weight_norm == "new":
459
+ if V(torch.__version__) >= V("2.2.0"):
460
+ conv = torch.nn.utils.parametrizations.weight_norm(
461
+ conv, name="weight", dim=2
462
+ )
463
+ else:
464
+ weight_norm = "legacy"
465
+ logging.warning(
466
+ "torch.nn.utils.parametrizations.weight_norm is only "
467
+ + "supported for pytorch versions >= 2.2.0. "
468
+ + "Defaulting to torch.nn.utils.weight_norm."
469
+ )
470
+ if weight_norm == "legacy":
471
+ conv = torch.nn.utils.weight_norm(conv, name="weight", dim=2)
472
+ convs.append(conv)
473
+ self.convs = torch.nn.ModuleList(convs)
474
+ self.num_remove: int = 1 if kernel_size % 2 == 0 else 0
475
+ self.use_residual = use_residual
476
+
477
+ def __prepare_scriptable__(self):
478
+ """Prepare Scriptable method."""
479
+ for hook in self.conv._forward_pre_hooks.values():
480
+ # The hook we want to remove is an instance of WeightNorm class, so
481
+ # normally we would do `if isinstance(...)` but this class is not accessible
482
+ # because of shadowing, so we check the module name directly.
483
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
484
+ if (
485
+ hook.__module__ == "torch.nn.utils.weight_norm"
486
+ and hook.__class__.__name__ == "WeightNorm"
487
+ ):
488
+ logging.warning("Removing weight_norm from %s", self.__class__.__name__)
489
+ torch.nn.utils.remove_weight_norm(self.conv)
490
+ return self
491
+
492
+ def forward(self, x):
493
+ """Forward Method.
494
+
495
+ Args:
496
+ x (Tensor): shape ``[batch, frame, feature]``.
497
+
498
+ Returns:
499
+ Tensor: The resulting feature. Shape ``[batch, frame, feature]``.
500
+ """
501
+ if self.use_residual:
502
+ residual = x
503
+
504
+ x = x.transpose(-2, -1)
505
+ for conv in self.convs:
506
+ x = conv(x)
507
+
508
+ # remove extra padding
509
+ if self.num_remove > 0:
510
+ x = x[..., : -self.num_remove]
511
+
512
+ x = torch.nn.functional.gelu(x)
513
+
514
+ # manually normalize if the conv is not parameterized
515
+ # with weight norm
516
+ if self.weight_norm is None or self.weight_norm == "none":
517
+ x = dim_1_layer_norm(x)
518
+
519
+ x = x.transpose(-2, -1)
520
+
521
+ if self.use_residual:
522
+ x = x + residual
523
+ return x
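A quick sketch (not part of the vendored file) of what `RelPositionalEncoding` returns: the scaled input plus a positional embedding of length 2*time-1, as consumed by `RelPositionMultiHeadedAttention`.

```python
import torch

from src.espnet_import.embedding import RelPositionalEncoding

pe = RelPositionalEncoding(d_model=64, dropout_rate=0.0)
x = torch.randn(2, 10, 64)
x_scaled, pos_emb = pe(x)
print(x_scaled.shape, pos_emb.shape)  # torch.Size([2, 10, 64]) torch.Size([1, 19, 64])
```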
src/espnet_import/fastformer.py ADDED
@@ -0,0 +1,153 @@
1
+ """Fastformer attention definition.
2
+
3
+ Reference:
4
+ Wu et al., "Fastformer: Additive Attention Can Be All You Need"
5
+ https://arxiv.org/abs/2108.09084
6
+ https://github.com/wuch15/Fastformer
7
+
8
+ """
9
+
10
+ import numpy
11
+ import torch
12
+
13
+
14
+ class FastSelfAttention(torch.nn.Module):
15
+ """Fast self-attention used in Fastformer."""
16
+
17
+ def __init__(
18
+ self,
19
+ size,
20
+ attention_heads,
21
+ dropout_rate,
22
+ ):
23
+ super().__init__()
24
+ if size % attention_heads != 0:
25
+ raise ValueError(
26
+ f"Hidden size ({size}) is not an integer multiple "
27
+ f"of attention heads ({attention_heads})"
28
+ )
29
+ self.attention_head_size = size // attention_heads
30
+ self.num_attention_heads = attention_heads
31
+
32
+ self.query = torch.nn.Linear(size, size)
33
+ self.query_att = torch.nn.Linear(size, attention_heads)
34
+ self.key = torch.nn.Linear(size, size)
35
+ self.key_att = torch.nn.Linear(size, attention_heads)
36
+ self.transform = torch.nn.Linear(size, size)
37
+ self.dropout = torch.nn.Dropout(dropout_rate)
38
+
39
+ def espnet_initialization_fn(self):
40
+ self.apply(self.init_weights)
41
+
42
+ def init_weights(self, module):
43
+ if isinstance(module, torch.nn.Linear):
44
+ module.weight.data.normal_(mean=0.0, std=0.02)
45
+ if isinstance(module, torch.nn.Linear) and module.bias is not None:
46
+ module.bias.data.zero_()
47
+
48
+ def transpose_for_scores(self, x):
49
+ """Reshape and transpose to compute scores.
50
+
51
+ Args:
52
+ x: (batch, time, size = n_heads * attn_dim)
53
+
54
+ Returns:
55
+ (batch, n_heads, time, attn_dim)
56
+ """
57
+
58
+ new_x_shape = x.shape[:-1] + (
59
+ self.num_attention_heads,
60
+ self.attention_head_size,
61
+ )
62
+ return x.reshape(*new_x_shape).transpose(1, 2)
63
+
64
+ def forward(self, xs_pad, mask):
65
+ """Forward method.
66
+
67
+ Args:
68
+ xs_pad: (batch, time, size = n_heads * attn_dim)
69
+ mask: (batch, 1, time), nonpadding is 1, padding is 0
70
+
71
+ Returns:
72
+ torch.Tensor: (batch, time, size)
73
+ """
74
+
75
+ batch_size, seq_len, _ = xs_pad.shape
76
+ mixed_query_layer = self.query(xs_pad) # (batch, time, size)
77
+ mixed_key_layer = self.key(xs_pad) # (batch, time, size)
78
+
79
+ if mask is not None:
80
+ mask = mask.eq(0) # padding is 1, nonpadding is 0
81
+
82
+ # (batch, n_heads, time)
83
+ query_for_score = (
84
+ self.query_att(mixed_query_layer).transpose(1, 2)
85
+ / self.attention_head_size**0.5
86
+ )
87
+ if mask is not None:
88
+ min_value = float(
89
+ numpy.finfo(
90
+ torch.tensor(0, dtype=query_for_score.dtype).numpy().dtype
91
+ ).min
92
+ )
93
+ query_for_score = query_for_score.masked_fill(mask, min_value)
94
+ query_weight = torch.softmax(query_for_score, dim=-1).masked_fill(mask, 0.0)
95
+ else:
96
+ query_weight = torch.softmax(query_for_score, dim=-1)
97
+
98
+ query_weight = query_weight.unsqueeze(2) # (batch, n_heads, 1, time)
99
+ query_layer = self.transpose_for_scores(
100
+ mixed_query_layer
101
+ ) # (batch, n_heads, time, attn_dim)
102
+
103
+ pooled_query = (
104
+ torch.matmul(query_weight, query_layer)
105
+ .transpose(1, 2)
106
+ .reshape(-1, 1, self.num_attention_heads * self.attention_head_size)
107
+ ) # (batch, 1, size = n_heads * attn_dim)
108
+ pooled_query = self.dropout(pooled_query)
109
+ pooled_query_repeat = pooled_query.repeat(1, seq_len, 1) # (batch, time, size)
110
+
111
+ mixed_query_key_layer = (
112
+ mixed_key_layer * pooled_query_repeat
113
+ ) # (batch, time, size)
114
+
115
+ # (batch, n_heads, time)
116
+ query_key_score = (
117
+ self.key_att(mixed_query_key_layer) / self.attention_head_size**0.5
118
+ ).transpose(1, 2)
119
+ if mask is not None:
120
+ min_value = float(
121
+ numpy.finfo(
122
+ torch.tensor(0, dtype=query_key_score.dtype).numpy().dtype
123
+ ).min
124
+ )
125
+ query_key_score = query_key_score.masked_fill(mask, min_value)
126
+ query_key_weight = torch.softmax(query_key_score, dim=-1).masked_fill(
127
+ mask, 0.0
128
+ )
129
+ else:
130
+ query_key_weight = torch.softmax(query_key_score, dim=-1)
131
+
132
+ query_key_weight = query_key_weight.unsqueeze(2) # (batch, n_heads, 1, time)
133
+ key_layer = self.transpose_for_scores(
134
+ mixed_query_key_layer
135
+ ) # (batch, n_heads, time, attn_dim)
136
+ pooled_key = torch.matmul(
137
+ query_key_weight, key_layer
138
+ ) # (batch, n_heads, 1, attn_dim)
139
+ pooled_key = self.dropout(pooled_key)
140
+
141
+ # NOTE: value = query, due to param sharing
142
+ weighted_value = (pooled_key * query_layer).transpose(
143
+ 1, 2
144
+ ) # (batch, time, n_heads, attn_dim)
145
+ weighted_value = weighted_value.reshape(
146
+ weighted_value.shape[:-2]
147
+ + (self.num_attention_heads * self.attention_head_size,)
148
+ ) # (batch, time, size)
149
+ weighted_value = (
150
+ self.dropout(self.transform(weighted_value)) + mixed_query_layer
151
+ )
152
+
153
+ return weighted_value
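A minimal shape check for the additive attention above (a sketch, not part of the vendored file); the mask marks valid frames with 1, as the forward docstring describes.

```python
import torch

from src.espnet_import.fastformer import FastSelfAttention

attn = FastSelfAttention(size=64, attention_heads=4, dropout_rate=0.0)
x = torch.randn(2, 50, 64)                     # (batch, time, size)
mask = torch.ones(2, 1, 50, dtype=torch.bool)  # all frames are non-padding
y = attn(x, mask)
print(y.shape)                                 # torch.Size([2, 50, 64])
```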
src/espnet_import/label_smoothing_loss.py ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ # from espnet/nets/pytorch_backend/transformer/label_smoothing_loss.py
8
+ """Label smoothing module."""
9
+
10
+ import torch
11
+ from torch import nn
12
+
13
+
14
+ class LabelSmoothingLoss(nn.Module):
15
+ """Label-smoothing loss.
16
+
17
+ :param int size: the number of class
18
+ :param int padding_idx: ignored class id
19
+ :param float smoothing: smoothing rate (0.0 means the conventional CE)
20
+ :param bool normalize_length: normalize loss by sequence length if True
21
+ :param torch.nn.Module criterion: loss function to be smoothed
22
+ """
23
+
24
+ def __init__(
25
+ self,
26
+ size,
27
+ padding_idx,
28
+ smoothing,
29
+ normalize_length=False,
30
+ criterion=nn.KLDivLoss(reduction="none"),
31
+ ):
32
+ """Construct an LabelSmoothingLoss object."""
33
+ super(LabelSmoothingLoss, self).__init__()
34
+ self.criterion = criterion
35
+ self.padding_idx = padding_idx
36
+ self.confidence = 1.0 - smoothing
37
+ self.smoothing = smoothing
38
+ self.size = size
39
+ self.true_dist = None
40
+ self.normalize_length = normalize_length
41
+
42
+ def forward(self, x, target):
43
+ """Compute loss between x and target.
44
+
45
+ :param torch.Tensor x: prediction (batch, seqlen, class)
46
+ :param torch.Tensor target:
47
+ target signal masked with self.padding_id (batch, seqlen)
48
+ :return: scalar float value
49
+ :rtype torch.Tensor
50
+ """
51
+ assert x.size(2) == self.size
52
+ batch_size = x.size(0)
53
+ x = x.view(-1, self.size)
54
+ target = target.view(-1)
55
+ with torch.no_grad():
56
+ true_dist = x.clone()
57
+ true_dist.fill_(self.smoothing / (self.size - 1))
58
+ ignore = target == self.padding_idx # (B,)
59
+ total = len(target) - ignore.sum().item()
60
+ target = target.masked_fill(ignore, 0) # avoid -1 index
61
+ true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
62
+ kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
63
+ denom = total if self.normalize_length else batch_size
64
+ return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
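A toy usage sketch (not part of the vendored file): with smoothing 0.1, each target distribution puts 0.9 on the reference class and spreads the remaining 0.1 over the other classes; positions equal to `padding_idx` are ignored.

```python
import torch

from src.espnet_import.label_smoothing_loss import LabelSmoothingLoss

criterion = LabelSmoothingLoss(size=5, padding_idx=-1, smoothing=0.1)
logits = torch.randn(2, 3, 5)                   # (batch, seqlen, class)
target = torch.tensor([[1, 2, -1], [0, 3, 4]])  # -1 marks a padded position
loss = criterion(logits, target)
print(loss)
```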
src/espnet_import/layer_norm.py ADDED
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+ # from https://github.com/espnet/espnet/blob/master/espnet2/legacy/nets/pytorch_backend/transformer/layer_norm.py
7
+
8
+ """Layer normalization module."""
9
+
10
+ import torch
11
+
12
+
13
+ class LayerNorm(torch.nn.LayerNorm):
14
+ """Layer normalization module.
15
+
16
+ Args:
17
+ nout (int): Output dim size.
18
+ dim (int): Dimension to be normalized.
19
+
20
+ """
21
+
22
+ def __init__(self, nout, dim=-1):
23
+ """Construct an LayerNorm object."""
24
+ super(LayerNorm, self).__init__(nout, eps=1e-12)
25
+ self.dim = dim
26
+
27
+ def forward(self, x):
28
+ """Apply layer normalization.
29
+
30
+ Args:
31
+ x (torch.Tensor): Input tensor.
32
+
33
+ Returns:
34
+ torch.Tensor: Normalized tensor.
35
+
36
+ """
37
+ if self.dim == -1:
38
+ return super(LayerNorm, self).forward(x)
39
+ return (
40
+ super(LayerNorm, self)
41
+ .forward(x.transpose(self.dim, -1))
42
+ .transpose(self.dim, -1)
43
+ )
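A short sketch (not part of the vendored file) of the `dim` argument: it lets the layer normalize a non-last dimension by transposing around the built-in LayerNorm.

```python
import torch

from src.espnet_import.layer_norm import LayerNorm

ln = LayerNorm(64, dim=1)  # normalize the channel dim of (batch, channel, time)
x = torch.randn(2, 64, 100)
print(ln(x).shape)         # torch.Size([2, 64, 100])
```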
src/espnet_import/nets_utils.py ADDED
@@ -0,0 +1,690 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # from https://github.com/espnet/espnet/blob/master/espnet2/legacy/nets/pytorch_backend/nets_utils.py
4
+ """Network related utility tools."""
5
+
6
+ import logging
7
+ from typing import Dict, Optional
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+
13
+ def to_device(m, x):
14
+ """Send tensor into the device of the module.
15
+
16
+ Args:
17
+ m (torch.nn.Module): Torch module.
18
+ x (Tensor): Torch tensor.
19
+
20
+ Returns:
21
+ Tensor: Torch tensor located in the same place as torch module.
22
+
23
+ """
24
+ if isinstance(m, torch.nn.Module):
25
+ device = next(m.parameters()).device
26
+ elif isinstance(m, torch.Tensor):
27
+ device = m.device
28
+ else:
29
+ raise TypeError(
30
+ "Expected torch.nn.Module or torch.tensor, " f"bot got: {type(m)}"
31
+ )
32
+ return x.to(device)
33
+
34
+
35
+ def pad_list(xs, pad_value):
36
+ """Perform padding for the list of tensors.
37
+
38
+ Args:
39
+ xs (List): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
40
+ pad_value (float): Value for padding.
41
+
42
+ Returns:
43
+ Tensor: Padded tensor (B, Tmax, `*`).
44
+
45
+ Examples:
46
+ >>> x = [torch.ones(4), torch.ones(2), torch.ones(1)]
47
+ >>> x
48
+ [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
49
+ >>> pad_list(x, 0)
50
+ tensor([[1., 1., 1., 1.],
51
+ [1., 1., 0., 0.],
52
+ [1., 0., 0., 0.]])
53
+
54
+ """
55
+ n_batch = len(xs)
56
+ max_len = max(x.size(0) for x in xs)
57
+ pad = xs[0].new(n_batch, max_len, *xs[0].size()[1:]).fill_(pad_value)
58
+
59
+ for i in range(n_batch):
60
+ pad[i, : xs[i].size(0)] = xs[i]
61
+
62
+ return pad
63
+
64
+
65
+ @torch.compiler.disable
66
+ def make_pad_mask(lengths, xs=None, length_dim=-1, maxlen=None):
67
+ """Make mask tensor containing indices of padded part.
68
+
69
+ Args:
70
+ lengths (LongTensor or List): Batch of lengths (B,).
71
+ xs (Tensor, optional): The reference tensor.
72
+ If set, masks will be the same shape as this tensor.
73
+ length_dim (int, optional): Dimension indicator of the above tensor.
74
+ See the example.
75
+
76
+ Returns:
77
+ Tensor: Mask tensor containing indices of padded part.
78
+ dtype=torch.uint8 in PyTorch 1.2-
79
+ dtype=torch.bool in PyTorch 1.2+ (including 1.2)
80
+
81
+ Examples:
82
+ With only lengths.
83
+
84
+ >>> lengths = [5, 3, 2]
85
+ >>> make_pad_mask(lengths)
86
+ masks = [[0, 0, 0, 0 ,0],
87
+ [0, 0, 0, 1, 1],
88
+ [0, 0, 1, 1, 1]]
89
+
90
+ With the reference tensor.
91
+
92
+ >>> xs = torch.zeros((3, 2, 4))
93
+ >>> make_pad_mask(lengths, xs)
94
+ tensor([[[0, 0, 0, 0],
95
+ [0, 0, 0, 0]],
96
+ [[0, 0, 0, 1],
97
+ [0, 0, 0, 1]],
98
+ [[0, 0, 1, 1],
99
+ [0, 0, 1, 1]]], dtype=torch.uint8)
100
+ >>> xs = torch.zeros((3, 2, 6))
101
+ >>> make_pad_mask(lengths, xs)
102
+ tensor([[[0, 0, 0, 0, 0, 1],
103
+ [0, 0, 0, 0, 0, 1]],
104
+ [[0, 0, 0, 1, 1, 1],
105
+ [0, 0, 0, 1, 1, 1]],
106
+ [[0, 0, 1, 1, 1, 1],
107
+ [0, 0, 1, 1, 1, 1]]], dtype=torch.uint8)
108
+
109
+ With the reference tensor and dimension indicator.
110
+
111
+ >>> xs = torch.zeros((3, 6, 6))
112
+ >>> make_pad_mask(lengths, xs, 1)
113
+ tensor([[[0, 0, 0, 0, 0, 0],
114
+ [0, 0, 0, 0, 0, 0],
115
+ [0, 0, 0, 0, 0, 0],
116
+ [0, 0, 0, 0, 0, 0],
117
+ [0, 0, 0, 0, 0, 0],
118
+ [1, 1, 1, 1, 1, 1]],
119
+ [[0, 0, 0, 0, 0, 0],
120
+ [0, 0, 0, 0, 0, 0],
121
+ [0, 0, 0, 0, 0, 0],
122
+ [1, 1, 1, 1, 1, 1],
123
+ [1, 1, 1, 1, 1, 1],
124
+ [1, 1, 1, 1, 1, 1]],
125
+ [[0, 0, 0, 0, 0, 0],
126
+ [0, 0, 0, 0, 0, 0],
127
+ [1, 1, 1, 1, 1, 1],
128
+ [1, 1, 1, 1, 1, 1],
129
+ [1, 1, 1, 1, 1, 1],
130
+ [1, 1, 1, 1, 1, 1]]], dtype=torch.uint8)
131
+ >>> make_pad_mask(lengths, xs, 2)
132
+ tensor([[[0, 0, 0, 0, 0, 1],
133
+ [0, 0, 0, 0, 0, 1],
134
+ [0, 0, 0, 0, 0, 1],
135
+ [0, 0, 0, 0, 0, 1],
136
+ [0, 0, 0, 0, 0, 1],
137
+ [0, 0, 0, 0, 0, 1]],
138
+ [[0, 0, 0, 1, 1, 1],
139
+ [0, 0, 0, 1, 1, 1],
140
+ [0, 0, 0, 1, 1, 1],
141
+ [0, 0, 0, 1, 1, 1],
142
+ [0, 0, 0, 1, 1, 1],
143
+ [0, 0, 0, 1, 1, 1]],
144
+ [[0, 0, 1, 1, 1, 1],
145
+ [0, 0, 1, 1, 1, 1],
146
+ [0, 0, 1, 1, 1, 1],
147
+ [0, 0, 1, 1, 1, 1],
148
+ [0, 0, 1, 1, 1, 1],
149
+ [0, 0, 1, 1, 1, 1]]], dtype=torch.uint8)
150
+
151
+ """
152
+ if length_dim == 0:
153
+ raise ValueError("length_dim cannot be 0: {}".format(length_dim))
154
+
155
+ # If the input dimension is 2 or 3,
156
+ # then we use ESPnet-ONNX based implementation for traceable modeling.
157
+ # otherwise we use the traditional implementation for research use.
158
+ if isinstance(lengths, list):
159
+ logging.warning(
160
+ "Using make_pad_mask with a list of lengths is not traceable. "
161
+ + "If you try to trace this function with type(lengths) == list, "
162
+ + "please change the type of lengths to torch.LongTensor."
163
+ )
164
+
165
+ if (
166
+ (xs is None or xs.dim() in (2, 3))
167
+ and length_dim <= 2
168
+ and (not isinstance(lengths, list) and lengths.dim() == 1)
169
+ ):
170
+ return _make_pad_mask_traceable(lengths, xs, length_dim, maxlen)
171
+ else:
172
+ return _make_pad_mask(lengths, xs, length_dim, maxlen)
173
+
174
+
175
+ def _make_pad_mask(lengths, xs=None, length_dim=-1, maxlen=None):
176
+ if not isinstance(lengths, list):
177
+ lengths = lengths.long().tolist()
178
+
179
+ bs = int(len(lengths))
180
+ if maxlen is None:
181
+ if xs is None:
182
+ maxlen = int(max(lengths))
183
+ else:
184
+ maxlen = xs.size(length_dim)
185
+ else:
186
+ assert xs is None, "When maxlen is specified, xs must not be specified."
187
+ assert maxlen >= int(
188
+ max(lengths)
189
+ ), f"maxlen {maxlen} must be >= max(lengths) {max(lengths)}"
190
+
191
+ seq_range = torch.arange(0, maxlen, dtype=torch.int64)
192
+ seq_range_expand = seq_range.unsqueeze(0).expand(bs, maxlen)
193
+ seq_length_expand = seq_range_expand.new(lengths).unsqueeze(-1)
194
+ mask = seq_range_expand >= seq_length_expand
195
+
196
+ if xs is not None:
197
+ assert (
198
+ xs.size(0) == bs
199
+ ), f"The size of x.size(0) {xs.size(0)} must match the batch size {bs}"
200
+
201
+ if length_dim < 0:
202
+ length_dim = xs.dim() + length_dim
203
+ # ind = (:, None, ..., None, :, None, ..., None)
204
+ ind = tuple(
205
+ slice(None) if i in (0, length_dim) else None for i in range(xs.dim())
206
+ )
207
+ mask = mask[ind].expand_as(xs).to(xs.device)
208
+ return mask
209
+
210
+
211
+ def _make_pad_mask_traceable(lengths, xs, length_dim, maxlen=None):
212
+ """Make mask tensor containing indices of padded part.
213
+
214
+ This is a simplified implementation of make_pad_mask without the xs input
215
+ that supports JIT tracing for applications like exporting models to ONNX.
216
+ Dimension length of xs should be 2 or 3
217
+ This function will create torch.ones(maxlen, maxlen).triu(diagonal=1) and
218
+ select rows to create mask tensor.
219
+ """
220
+ if xs is None:
221
+ device = lengths.device
222
+ else:
223
+ device = xs.device
224
+
225
+ if xs is not None and len(xs.shape) == 3:
226
+ if length_dim == 1:
227
+ lengths = lengths.unsqueeze(1).expand(*xs.transpose(1, 2).shape[:2])
228
+ else:
229
+ # Then length_dim is 2 or -1.
230
+ if length_dim not in (-1, 2):
231
+ logging.warning(
232
+ f"Invalid length_dim {length_dim}."
233
+ + "We set it to -1, which is the default value."
234
+ )
235
+ length_dim = -1
236
+ lengths = lengths.unsqueeze(1).expand(*xs.shape[:2])
237
+
238
+ if maxlen is not None:
239
+ assert xs is None
240
+ assert maxlen >= lengths.max()
241
+ elif xs is not None:
242
+ maxlen = xs.shape[length_dim]
243
+ else:
244
+ maxlen = lengths.max()
245
+
246
+ # clip max(length) to maxlen
247
+ lengths = torch.clamp(lengths, max=maxlen).type(torch.long)
248
+
249
+ mask = torch.ones(maxlen + 1, maxlen + 1, dtype=torch.bool, device=device)
250
+ mask = triu_onnx(mask)[1:, :-1] # onnx cannot handle diagonal argument.
251
+ mask = mask[lengths - 1][..., :maxlen]
252
+
253
+ if xs is not None and len(xs.shape) == 3 and length_dim == 1:
254
+ return mask.transpose(1, 2)
255
+ else:
256
+ return mask
257
+
258
+
259
+ def triu_onnx(x):
260
+ """Make TriU for ONNX."""
261
+ arange = torch.arange(x.size(0), device=x.device)
262
+ mask = arange.unsqueeze(-1).expand(-1, x.size(0)) <= arange
263
+ return x * mask
264
+
265
+
266
+ def make_non_pad_mask(lengths, xs=None, length_dim=-1):
267
+ """Make mask tensor containing indices of non-padded part.
268
+
269
+ Args:
270
+ lengths (LongTensor or List): Batch of lengths (B,).
271
+ xs (Tensor, optional): The reference tensor.
272
+ If set, masks will be the same shape as this tensor.
273
+ length_dim (int, optional): Dimension indicator of the above tensor.
274
+ See the example.
275
+
276
+ Returns:
277
+ ByteTensor: mask tensor containing indices of non-padded part.
278
+ dtype=torch.uint8 in PyTorch 1.2-
279
+ dtype=torch.bool in PyTorch 1.2+ (including 1.2)
280
+
281
+ Examples:
282
+ With only lengths.
283
+
284
+ >>> lengths = [5, 3, 2]
285
+ >>> make_non_pad_mask(lengths)
286
+ masks = [[1, 1, 1, 1 ,1],
287
+ [1, 1, 1, 0, 0],
288
+ [1, 1, 0, 0, 0]]
289
+
290
+ With the reference tensor.
291
+
292
+ >>> xs = torch.zeros((3, 2, 4))
293
+ >>> make_non_pad_mask(lengths, xs)
294
+ tensor([[[1, 1, 1, 1],
295
+ [1, 1, 1, 1]],
296
+ [[1, 1, 1, 0],
297
+ [1, 1, 1, 0]],
298
+ [[1, 1, 0, 0],
299
+ [1, 1, 0, 0]]], dtype=torch.uint8)
300
+ >>> xs = torch.zeros((3, 2, 6))
301
+ >>> make_non_pad_mask(lengths, xs)
302
+ tensor([[[1, 1, 1, 1, 1, 0],
303
+ [1, 1, 1, 1, 1, 0]],
304
+ [[1, 1, 1, 0, 0, 0],
305
+ [1, 1, 1, 0, 0, 0]],
306
+ [[1, 1, 0, 0, 0, 0],
307
+ [1, 1, 0, 0, 0, 0]]], dtype=torch.uint8)
308
+
309
+ With the reference tensor and dimension indicator.
310
+
311
+ >>> xs = torch.zeros((3, 6, 6))
312
+ >>> make_non_pad_mask(lengths, xs, 1)
313
+ tensor([[[1, 1, 1, 1, 1, 1],
314
+ [1, 1, 1, 1, 1, 1],
315
+ [1, 1, 1, 1, 1, 1],
316
+ [1, 1, 1, 1, 1, 1],
317
+ [1, 1, 1, 1, 1, 1],
318
+ [0, 0, 0, 0, 0, 0]],
319
+ [[1, 1, 1, 1, 1, 1],
320
+ [1, 1, 1, 1, 1, 1],
321
+ [1, 1, 1, 1, 1, 1],
322
+ [0, 0, 0, 0, 0, 0],
323
+ [0, 0, 0, 0, 0, 0],
324
+ [0, 0, 0, 0, 0, 0]],
325
+ [[1, 1, 1, 1, 1, 1],
326
+ [1, 1, 1, 1, 1, 1],
327
+ [0, 0, 0, 0, 0, 0],
328
+ [0, 0, 0, 0, 0, 0],
329
+ [0, 0, 0, 0, 0, 0],
330
+ [0, 0, 0, 0, 0, 0]]], dtype=torch.uint8)
331
+ >>> make_non_pad_mask(lengths, xs, 2)
332
+ tensor([[[1, 1, 1, 1, 1, 0],
333
+ [1, 1, 1, 1, 1, 0],
334
+ [1, 1, 1, 1, 1, 0],
335
+ [1, 1, 1, 1, 1, 0],
336
+ [1, 1, 1, 1, 1, 0],
337
+ [1, 1, 1, 1, 1, 0]],
338
+ [[1, 1, 1, 0, 0, 0],
339
+ [1, 1, 1, 0, 0, 0],
340
+ [1, 1, 1, 0, 0, 0],
341
+ [1, 1, 1, 0, 0, 0],
342
+ [1, 1, 1, 0, 0, 0],
343
+ [1, 1, 1, 0, 0, 0]],
344
+ [[1, 1, 0, 0, 0, 0],
345
+ [1, 1, 0, 0, 0, 0],
346
+ [1, 1, 0, 0, 0, 0],
347
+ [1, 1, 0, 0, 0, 0],
348
+ [1, 1, 0, 0, 0, 0],
349
+ [1, 1, 0, 0, 0, 0]]], dtype=torch.uint8)
350
+
351
+ """
352
+ return ~make_pad_mask(lengths, xs, length_dim)
353
+
354
+
355
+ def mask_by_length(xs, lengths, fill=0):
356
+ """Mask tensor according to length.
357
+
358
+ Args:
359
+ xs (Tensor): Batch of input tensor (B, `*`).
360
+ lengths (LongTensor or List): Batch of lengths (B,).
361
+ fill (int or float): Value to fill masked part.
362
+
363
+ Returns:
364
+ Tensor: Batch of masked input tensor (B, `*`).
365
+
366
+ Examples:
367
+ >>> x = torch.arange(5).repeat(3, 1) + 1
368
+ >>> x
369
+ tensor([[1, 2, 3, 4, 5],
370
+ [1, 2, 3, 4, 5],
371
+ [1, 2, 3, 4, 5]])
372
+ >>> lengths = [5, 3, 2]
373
+ >>> mask_by_length(x, lengths)
374
+ tensor([[1, 2, 3, 4, 5],
375
+ [1, 2, 3, 0, 0],
376
+ [1, 2, 0, 0, 0]])
377
+
378
+ """
379
+ assert xs.size(0) == len(lengths)
380
+ ret = xs.data.new(*xs.size()).fill_(fill)
381
+ for i, l in enumerate(lengths):
382
+ ret[i, :l] = xs[i, :l]
383
+ return ret
384
+
385
+
386
+ def th_accuracy(pad_outputs, pad_targets, ignore_label):
387
+ """Calculate accuracy.
388
+
389
+ Args:
390
+ pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
391
+ pad_targets (LongTensor): Target label tensors (B, Lmax, D).
392
+ ignore_label (int): Ignore label id.
393
+
394
+ Returns:
395
+ float: Accuracy value (0.0 - 1.0).
396
+
397
+ """
398
+ pad_pred = pad_outputs.view(
399
+ pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)
400
+ ).argmax(2)
401
+ mask = pad_targets != ignore_label
402
+ numerator = torch.sum(
403
+ pad_pred.masked_select(mask) == pad_targets.masked_select(mask)
404
+ )
405
+ denominator = torch.sum(mask)
406
+ return float(numerator) / float(denominator)
407
+
408
+
409
+ def to_torch_tensor(x):
410
+ """Change to torch.Tensor or ComplexTensor from numpy.ndarray.
411
+
412
+ Args:
413
+ x: Inputs. It should be one of numpy.ndarray, Tensor, ComplexTensor, and dict.
414
+
415
+ Returns:
416
+ Tensor or ComplexTensor: Type converted inputs.
417
+
418
+ Examples:
419
+ >>> xs = np.ones(3, dtype=np.float32)
420
+ >>> xs = to_torch_tensor(xs)
421
+ tensor([1., 1., 1.])
422
+ >>> xs = torch.ones(3, 4, 5)
423
+ >>> assert to_torch_tensor(xs) is xs
424
+ >>> xs = {'real': xs, 'imag': xs}
425
+ >>> to_torch_tensor(xs)
426
+ ComplexTensor(
427
+ Real:
428
+ tensor([1., 1., 1.])
429
+ Imag;
430
+ tensor([1., 1., 1.])
431
+ )
432
+
433
+ """
434
+ # If numpy, change to torch tensor
435
+ if isinstance(x, np.ndarray):
436
+ if x.dtype.kind == "c":
437
+ # Dynamically importing because torch_complex requires python3
438
+ from torch_complex.tensor import ComplexTensor
439
+
440
+ return ComplexTensor(x)
441
+ else:
442
+ return torch.from_numpy(x)
443
+
444
+ # If {'real': ..., 'imag': ...}, convert to ComplexTensor
445
+ elif isinstance(x, dict):
446
+ # Dynamically importing because torch_complex requires python3
447
+ from torch_complex.tensor import ComplexTensor
448
+
449
+ if "real" not in x or "imag" not in x:
450
+ raise ValueError("x must have 'real' and 'imag' keys, but got: {}".format(list(x)))
451
+ # Relative importing because of using python3 syntax
452
+ return ComplexTensor(x["real"], x["imag"])
453
+
454
+ # If torch.Tensor, as it is
455
+ elif isinstance(x, torch.Tensor):
456
+ return x
457
+
458
+ else:
459
+ error = (
460
+ "x must be numpy.ndarray, torch.Tensor or a dict like "
461
+ "{{'real': torch.Tensor, 'imag': torch.Tensor}}, "
462
+ "but got {}".format(type(x))
463
+ )
464
+ try:
465
+ from torch_complex.tensor import ComplexTensor
466
+ except Exception:
467
+ # If PY2
468
+ raise ValueError(error)
469
+ else:
470
+ # If PY3
471
+ if isinstance(x, ComplexTensor):
472
+ return x
473
+ else:
474
+ raise ValueError(error)
475
+
476
+
477
+ def get_subsample(train_args, mode, arch):
478
+ """Parse the subsampling factors from the args for the specified `mode` and `arch`.
479
+
480
+ Args:
481
+ train_args: argument Namespace containing options.
482
+ mode: one of ('asr', 'mt', 'st')
483
+ arch: one of ('rnn', 'rnn-t', 'rnn_mix', 'rnn_mulenc', 'transformer')
484
+
485
+ Returns:
486
+ np.ndarray / List[np.ndarray]: subsampling factors.
487
+ """
488
+ if arch == "transformer":
489
+ return np.array([1])
490
+
491
+ elif mode == "mt" and arch == "rnn":
492
+ # +1 means input (+1) and layer outputs (train_args.elayers)
493
+ subsample = np.ones(train_args.elayers + 1, dtype=np.int64)
494
+ logging.warning("Subsampling is not performed for machine translation.")
495
+ logging.info("subsample: " + " ".join([str(x) for x in subsample]))
496
+ return subsample
497
+
498
+ elif (
499
+ (mode == "asr" and arch in ("rnn", "rnn-t"))
500
+ or (mode == "mt" and arch == "rnn")
501
+ or (mode == "st" and arch == "rnn")
502
+ ):
503
+ subsample = np.ones(train_args.elayers + 1, dtype=np.int64)
504
+ if train_args.etype.endswith("p") and not train_args.etype.startswith("vgg"):
505
+ ss = train_args.subsample.split("_")
506
+ for j in range(min(train_args.elayers + 1, len(ss))):
507
+ subsample[j] = int(ss[j])
508
+ else:
509
+ logging.warning(
510
+ "Subsampling is not performed for vgg*. "
511
+ "It is performed in max pooling layers at CNN."
512
+ )
513
+ logging.info("subsample: " + " ".join([str(x) for x in subsample]))
514
+ return subsample
515
+
516
+ elif mode == "asr" and arch == "rnn_mix":
517
+ subsample = np.ones(
518
+ train_args.elayers_sd + train_args.elayers + 1, dtype=np.int64
519
+ )
520
+ if train_args.etype.endswith("p") and not train_args.etype.startswith("vgg"):
521
+ ss = train_args.subsample.split("_")
522
+ for j in range(
523
+ min(train_args.elayers_sd + train_args.elayers + 1, len(ss))
524
+ ):
525
+ subsample[j] = int(ss[j])
526
+ else:
527
+ logging.warning(
528
+ "Subsampling is not performed for vgg*. "
529
+ "It is performed in max pooling layers at CNN."
530
+ )
531
+ logging.info("subsample: " + " ".join([str(x) for x in subsample]))
532
+ return subsample
533
+
534
+ elif mode == "asr" and arch == "rnn_mulenc":
535
+ subsample_list = []
536
+ for idx in range(train_args.num_encs):
537
+ subsample = np.ones(train_args.elayers[idx] + 1, dtype=np.int64)
538
+ if train_args.etype[idx].endswith("p") and not train_args.etype[
539
+ idx
540
+ ].startswith("vgg"):
541
+ ss = train_args.subsample[idx].split("_")
542
+ for j in range(min(train_args.elayers[idx] + 1, len(ss))):
543
+ subsample[j] = int(ss[j])
544
+ else:
545
+ logging.warning(
546
+ "Encoder %d: Subsampling is not performed for vgg*. "
547
+ "It is performed in max pooling layers at CNN.",
548
+ idx + 1,
549
+ )
550
+ logging.info("subsample: " + " ".join([str(x) for x in subsample]))
551
+ subsample_list.append(subsample)
552
+ return subsample_list
553
+
554
+ else:
555
+ raise ValueError("Invalid options: mode={}, arch={}".format(mode, arch))
556
+
557
+
558
+ def rename_state_dict(
559
+ old_prefix: str, new_prefix: str, state_dict: Dict[str, torch.Tensor]
560
+ ):
561
+ """Replace keys of old prefix with new prefix in state dict."""
562
+ # need this list not to break the dict iterator
563
+ old_keys = [k for k in state_dict if k.startswith(old_prefix)]
564
+ if len(old_keys) > 0:
565
+ logging.warning(f"Rename: {old_prefix} -> {new_prefix}")
566
+ for k in old_keys:
567
+ v = state_dict.pop(k)
568
+ new_k = k.replace(old_prefix, new_prefix)
569
+ state_dict[new_k] = v
570
+
571
+ import torch
572
+
573
+
574
+ # from espnet2.legacy.nets.pytorch_backend.conformer.swish import Swish
575
+ class Swish(torch.nn.Module):
576
+ """Construct a Swish object."""
577
+
578
+ def forward(self, x):
579
+ """Return Swish activation function."""
580
+ return x * torch.sigmoid(x)
581
+
582
+
583
+ def get_activation(act):
584
+ """Return activation function."""
585
+
586
+ activation_funcs = {
587
+ "hardtanh": torch.nn.Hardtanh,
588
+ "tanh": torch.nn.Tanh,
589
+ "relu": torch.nn.ReLU,
590
+ "selu": torch.nn.SELU,
591
+ "swish": Swish,
592
+ }
593
+
594
+ return activation_funcs[act]()
595
+
596
+
597
+ def trim_by_ctc_posterior(
598
+ h: torch.Tensor,
599
+ ctc_probs: torch.Tensor,
600
+ masks: torch.Tensor,
601
+ pos_emb: torch.Tensor = None,
602
+ ):
603
+ """Trim the encoder hidden output using CTC posterior.
604
+
605
+ The continuous frames in the tail that confidently represent
606
+ blank symbols are trimmed.
607
+ """
608
+ # Empirical settings
609
+ frame_tolerance = 5
610
+ conf_tolerance = 0.95
611
+ blank_id = 0
612
+
613
+ assert masks.size(1) == 1
614
+ masks = masks.squeeze(1)
615
+ hlens = masks.sum(dim=1)
616
+ assert h.size()[:2] == ctc_probs.size()[:2]
617
+ assert h.size(0) == hlens.size(0)
618
+
619
+ # blank frames
620
+ max_values, max_indices = ctc_probs.max(dim=2)
621
+ blank_masks = torch.logical_and(
622
+ max_values > conf_tolerance, max_indices == blank_id
623
+ )
624
+
625
+ # plus ignored frames
626
+ joint_masks = torch.logical_or(blank_masks, ~masks)
627
+
628
+ # lengths after the trimming
629
+ B, T, _ = h.size()
630
+ frame_idx = torch.where(
631
+ joint_masks, -1, torch.arange(T).unsqueeze(0).repeat(B, 1).to(h.device)
632
+ )
633
+ after_lens = torch.where(
634
+ frame_idx.max(dim=-1)[0] + frame_tolerance + 1 < hlens,
635
+ frame_idx.max(dim=-1)[0] + frame_tolerance + 1,
636
+ hlens,
637
+ )
638
+
639
+ h = h[:, : max(after_lens)]
640
+ masks = ~make_pad_mask(after_lens).to(h.device).unsqueeze(1)
641
+
642
+ if pos_emb is None:
643
+ pos_emb = None
644
+ elif (hlens.max() * 2 - 1).item() == pos_emb.size(1): # RelPositionalEncoding
645
+ pos_emb = pos_emb[
646
+ :, pos_emb.size(1) // 2 - h.size(1) + 1 : pos_emb.size(1) // 2 + h.size(1)
647
+ ]
648
+ else:
649
+ pos_emb = pos_emb[:, : h.size(1)]
650
+
651
+ return h, masks, pos_emb
652
+
653
+
654
+ def roll_tensor(
655
+ x: torch.Tensor,
656
+ lengths: torch.Tensor,
657
+ roll_amounts: Optional[torch.Tensor] = None,
658
+ fixed_intervals: Optional[int] = None,
659
+ ) -> torch.Tensor:
660
+ """Left-roll tensor x by roll_amounts, only within lengths and optionally quantized.
661
+
662
+ Args:
663
+ x: input tensor (B, T, D)
664
+ lengths: lengths of each sequence (B,)
665
+ roll_amounts: random shift amounts (B,). If None, random shift
666
+ amounts are generated.
667
+ fixed_intervals: if not None, roll_amounts are quantized to
668
+ multiples of this.
669
+ Returns:
670
+ rolled_x: rolled tensor (B, T, D)
671
+ Useful to apply roll augmentation to the input, while considering
672
+ the input length for each sample.
673
+ """
674
+ B, T, D = x.shape
675
+
676
+ indices = torch.arange(T).unsqueeze(0).expand(B, T).to(x.device) # (B, T)
677
+ lengths = lengths.unsqueeze(1) # (B, 1)
678
+
679
+ if roll_amounts is None:
680
+ roll_amounts = torch.randint(0, lengths.max(), (B,), device=x.device)
681
+ if fixed_intervals is not None:
682
+ roll_amounts = (roll_amounts // fixed_intervals) * fixed_intervals
683
+ roll_indices = (indices - roll_amounts.unsqueeze(1)) % lengths # (B, T)
684
+ roll_indices = roll_indices.unsqueeze(2).expand(-1, -1, D) # (B, T, D)
685
+
686
+ mask = indices < lengths # (B, T), True if position is valid
687
+ rolled_x = torch.empty_like(x)
688
+ rolled_x[mask] = x.gather(1, roll_indices)[mask]
689
+ rolled_x[~mask] = x[~mask]
690
+ return rolled_x
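Not part of the committed file -- a minimal sketch of the padding utilities above, using list-valued lengths as in the docstring examples:

import torch

xs = [torch.ones(5, 3), torch.ones(3, 3), torch.ones(2, 3)]
lengths = [5, 3, 2]
padded = pad_list(xs, 0.0)               # (3, 5, 3), shorter sequences padded with 0
pad_mask = make_pad_mask(lengths)        # (3, 5), True at padded positions
valid_mask = make_non_pad_mask(lengths)  # elementwise inverse of pad_mask
assert padded.shape == (3, 5, 3)
assert bool(pad_mask[1, 3]) and bool(valid_mask[1, 2])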
src/espnet_import/positionwise_feed_forward.py ADDED
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Positionwise feed forward layer definition."""
8
+
9
+ import torch
10
+
11
+
12
+ class PositionwiseFeedForward(torch.nn.Module):
13
+ """Positionwise feed forward layer.
14
+
15
+ Args:
16
+ idim (int): Input dimension.
17
+ hidden_units (int): The number of hidden units.
18
+ dropout_rate (float): Dropout rate.
19
+
20
+ """
21
+
22
+ def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
23
+ """Construct a PositionwiseFeedForward object."""
24
+ super(PositionwiseFeedForward, self).__init__()
25
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
26
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
27
+ self.dropout = torch.nn.Dropout(dropout_rate)
28
+ self.activation = activation
29
+
30
+ def forward(self, x):
31
+ """Forward function."""
32
+ return self.w_2(self.dropout(self.activation(self.w_1(x))))
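Not part of the committed file -- the layer keeps the model dimension, so a quick shape check looks like:

import torch

ffn = PositionwiseFeedForward(idim=256, hidden_units=1024, dropout_rate=0.1)
x = torch.randn(2, 50, 256)          # (batch, time, idim)
assert ffn(x).shape == (2, 50, 256)  # w_1 expands to 1024, w_2 projects back to idim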
src/espnet_import/repeat.py ADDED
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Repeat the same layer definition."""
8
+
9
+ import torch
10
+
11
+
12
+ class MultiSequential(torch.nn.Sequential):
13
+ """Multi-input multi-output torch.nn.Sequential."""
14
+
15
+ def __init__(self, *args, layer_drop_rate=0.0):
16
+ """Initialize MultiSequential with layer_drop.
17
+
18
+ Args:
19
+ layer_drop_rate (float): Probability of dropping out each fn (layer).
20
+
21
+ """
22
+ super(MultiSequential, self).__init__(*args)
23
+ self.layer_drop_rate = layer_drop_rate
24
+
25
+ def forward(self, *args):
26
+ """Repeat."""
27
+ _probs = torch.empty(len(self)).uniform_()
28
+ for idx, m in enumerate(self):
29
+ if not self.training or (_probs[idx] >= self.layer_drop_rate):
30
+ args = m(*args)
31
+ return args
32
+
33
+
34
+ def repeat(N, fn, layer_drop_rate=0.0):
35
+ """Repeat module N times.
36
+
37
+ Args:
38
+ N (int): Number of repeat times.
39
+ fn (Callable): Function to generate module.
40
+ layer_drop_rate (float): Probability of dropping out each fn (layer).
41
+
42
+ Returns:
43
+ MultiSequential: Repeated model instance.
44
+
45
+ """
46
+ return MultiSequential(*[fn(n) for n in range(N)], layer_drop_rate=layer_drop_rate)
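Not part of the committed file -- a minimal sketch of stacking layers with repeat(); the toy layer below is hypothetical and only mirrors the (x, mask) -> (x, mask) signature that MultiSequential forwards between layers:

import torch

class TinyLayer(torch.nn.Module):
    # Hypothetical stand-in for an encoder layer: takes and returns (x, mask).
    def __init__(self, size):
        super().__init__()
        self.proj = torch.nn.Linear(size, size)

    def forward(self, x, mask):
        return self.proj(x), mask

stack = repeat(3, lambda layer_id: TinyLayer(256), layer_drop_rate=0.0)
x, mask = stack(torch.randn(2, 50, 256), torch.ones(2, 1, 50, dtype=torch.bool))
assert x.shape == (2, 50, 256)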
src/espnet_import/subsampling.py ADDED
@@ -0,0 +1,873 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ # Copyright 2019 Shigeki Karita
5
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
6
+
7
+ """Subsampling layer definition."""
8
+
9
+ import torch
10
+
11
+ from src.espnet_import.embedding import PositionalEncoding
12
+
13
+
14
+ class TooShortUttError(Exception):
15
+ """Raised when the utt is too short for subsampling.
16
+
17
+ Args:
18
+ message (str): Message for error catch
19
+ actual_size (int): the short size that cannot pass the subsampling
20
+ limit (int): the limit size for subsampling
21
+
22
+ """
23
+
24
+ def __init__(self, message, actual_size, limit):
25
+ """Construct a TooShortUttError for error handler."""
26
+ super().__init__(message)
27
+ self.actual_size = actual_size
28
+ self.limit = limit
29
+
30
+
31
+ def check_short_utt(ins, size):
32
+ """Check if the utterance is too short for subsampling."""
33
+ if isinstance(ins, Conv1dSubsampling1) and size < 5:
34
+ return True, 5
35
+ if isinstance(ins, Conv1dSubsampling2) and size < 5:
36
+ return True, 5
37
+ if isinstance(ins, Conv1dSubsampling3) and size < 7:
38
+ return True, 7
39
+ if isinstance(ins, Conv2dSubsampling1) and size < 5:
40
+ return True, 5
41
+ if isinstance(ins, Conv2dSubsampling2) and size < 7:
42
+ return True, 7
43
+ if isinstance(ins, Conv2dSubsampling) and size < 7:
44
+ return True, 7
45
+ if isinstance(ins, Conv2dSubsampling6) and size < 11:
46
+ return True, 11
47
+ if isinstance(ins, Conv2dSubsampling8) and size < 15:
48
+ return True, 15
49
+ return False, -1
50
+
51
+
52
+ def _upgrade_legacy_subsampling_state_dict(state_dict, prefix):
53
+ """Remap legacy nn.Sequential keys for subsampling modules."""
54
+ w_new = prefix + "out.weight"
55
+ b_new = prefix + "out.bias"
56
+ w_old = prefix + "out.0.weight"
57
+ b_old = prefix + "out.0.bias"
58
+
59
+ if w_new not in state_dict and w_old in state_dict:
60
+ state_dict[w_new] = state_dict.pop(w_old)
61
+ elif w_new in state_dict and w_old in state_dict:
62
+ state_dict.pop(w_old)
63
+
64
+ if b_new not in state_dict and b_old in state_dict:
65
+ state_dict[b_new] = state_dict.pop(b_old)
66
+ elif b_new in state_dict and b_old in state_dict:
67
+ state_dict.pop(b_old)
68
+
69
+ old_pos_prefix = prefix + "out.1."
70
+ new_pos_prefix = prefix + "pos_enc."
71
+ for k in list(state_dict.keys()):
72
+ if not k.startswith(old_pos_prefix):
73
+ continue
74
+ new_k = new_pos_prefix + k[len(old_pos_prefix) :]
75
+ if new_k not in state_dict:
76
+ state_dict[new_k] = state_dict[k]
77
+ state_dict.pop(k, None)
78
+
79
+
80
+ class Conv1dSubsampling1(torch.nn.Module):
81
+ """Convolutional 1D subsampling.
82
+
83
+ Args:
84
+ idim (int): Input dimension.
85
+ odim (int): Output dimension.
86
+ dropout_rate (float): Dropout rate.
87
+ pos_enc (torch.nn.Module): Custom position encoding layer.
88
+
89
+ """
90
+
91
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
92
+ """Construct a Conv1dSubsampling1 object."""
93
+ super(Conv1dSubsampling1, self).__init__()
94
+ self.conv = torch.nn.Sequential(
95
+ torch.nn.Conv1d(idim, odim, 3, 1),
96
+ torch.nn.ReLU(),
97
+ torch.nn.Conv1d(odim, odim, 3, 1),
98
+ torch.nn.ReLU(),
99
+ )
100
+ self.out = torch.nn.Linear(odim, odim)
101
+ self.pos_enc = (
102
+ pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
103
+ )
104
+
105
+ def _load_from_state_dict(
106
+ self,
107
+ state_dict,
108
+ prefix,
109
+ local_metadata,
110
+ strict,
111
+ missing_keys,
112
+ unexpected_keys,
113
+ error_msgs,
114
+ ):
115
+ _upgrade_legacy_subsampling_state_dict(state_dict, prefix)
116
+ super()._load_from_state_dict(
117
+ state_dict,
118
+ prefix,
119
+ local_metadata,
120
+ strict,
121
+ missing_keys,
122
+ unexpected_keys,
123
+ error_msgs,
124
+ )
125
+
126
+ def forward(self, x, x_mask, prefix_embeds=None):
127
+ """Subsample x.
128
+
129
+ Args:
130
+ x (torch.Tensor): Input tensor (#batch, time, idim).
131
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
132
+ prefix_embeds (torch.Tensor or None): Prefix token embeddings
133
+ (#batch, prefix_len, odim).
134
+
135
+ Returns:
136
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
137
+ where time' = time - 4.
138
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
139
+ where time' = time - 4.
140
+
141
+ """
142
+ x = x.transpose(2, 1) # (#batch, idim, time)
143
+ x = self.conv(x)
144
+ b, c, t = x.size()
145
+ x = self.out(x.transpose(1, 2).contiguous())
146
+ if x_mask is not None:
147
+ x_mask = x_mask[:, :, :-2:1][:, :, :-2:1]
148
+
149
+ if prefix_embeds is not None:
150
+ x = torch.cat([prefix_embeds, x], dim=1)
151
+ if x_mask is not None:
152
+ x_mask = torch.cat(
153
+ [
154
+ torch.ones(
155
+ x_mask.shape[0],
156
+ 1,
157
+ prefix_embeds.size(1),
158
+ dtype=x_mask.dtype,
159
+ device=x_mask.device,
160
+ ),
161
+ x_mask,
162
+ ],
163
+ dim=-1,
164
+ )
165
+
166
+ x = self.pos_enc(x)
167
+
168
+ return x, x_mask
169
+
170
+ def __getitem__(self, key):
171
+ """Get item.
172
+
173
+ When reset_parameters() is called, if use_scaled_pos_enc is used,
174
+ return the positioning encoding.
175
+
176
+ """
177
+ if key != -1:
178
+ raise NotImplementedError("Support only `-1` (for `reset_parameters`).")
179
+ return self.pos_enc
180
+
181
+
182
+ class Conv1dSubsampling2(torch.nn.Module):
183
+ """Convolutional 1D subsampling (to 1/2 length).
184
+
185
+ Args:
186
+ idim (int): Input dimension.
187
+ odim (int): Output dimension.
188
+ dropout_rate (float): Dropout rate.
189
+ pos_enc (torch.nn.Module): Custom position encoding layer.
190
+
191
+ """
192
+
193
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
194
+ """Construct a Conv1dSubsampling2 object."""
195
+ super(Conv1dSubsampling2, self).__init__()
196
+ self.conv = torch.nn.Sequential(
197
+ torch.nn.Conv1d(idim, odim, 3, 1),
198
+ torch.nn.ReLU(),
199
+ torch.nn.Conv1d(odim, odim, 3, 2),
200
+ torch.nn.ReLU(),
201
+ )
202
+ self.out = torch.nn.Linear(odim, odim)
203
+ self.pos_enc = (
204
+ pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
205
+ )
206
+
207
+ def _load_from_state_dict(
208
+ self,
209
+ state_dict,
210
+ prefix,
211
+ local_metadata,
212
+ strict,
213
+ missing_keys,
214
+ unexpected_keys,
215
+ error_msgs,
216
+ ):
217
+ _upgrade_legacy_subsampling_state_dict(state_dict, prefix)
218
+ super()._load_from_state_dict(
219
+ state_dict,
220
+ prefix,
221
+ local_metadata,
222
+ strict,
223
+ missing_keys,
224
+ unexpected_keys,
225
+ error_msgs,
226
+ )
227
+
228
+ def forward(self, x, x_mask, prefix_embeds=None):
229
+ """Subsample x.
230
+
231
+ Args:
232
+ x (torch.Tensor): Input tensor (#batch, time, idim).
233
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
234
+ prefix_embeds (torch.Tensor or None): Prefix token embeddings
235
+ (#batch, prefix_len, odim).
236
+
237
+ Returns:
238
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
239
+ where time' = time // 2.
240
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
241
+ where time' = time // 2.
242
+
243
+ """
244
+ x = x.transpose(2, 1) # (#batch, idim, time)
245
+ x = self.conv(x)
246
+ b, c, t = x.size()
247
+ x = self.out(x.transpose(1, 2).contiguous())
248
+ if x_mask is not None:
249
+ x_mask = x_mask[:, :, :-2:1][:, :, :-2:2]
250
+
251
+ if prefix_embeds is not None:
252
+ x = torch.cat([prefix_embeds, x], dim=1)
253
+ if x_mask is not None:
254
+ x_mask = torch.cat(
255
+ [
256
+ torch.ones(
257
+ x_mask.shape[0],
258
+ 1,
259
+ prefix_embeds.size(1),
260
+ dtype=x_mask.dtype,
261
+ device=x_mask.device,
262
+ ),
263
+ x_mask,
264
+ ],
265
+ dim=-1,
266
+ )
267
+
268
+ x = self.pos_enc(x)
269
+
270
+ return x, x_mask
271
+
272
+ def __getitem__(self, key):
273
+ """Get item.
274
+
275
+ When reset_parameters() is called, if use_scaled_pos_enc is used,
276
+ return the positioning encoding.
277
+
278
+ """
279
+ if key != -1:
280
+ raise NotImplementedError("Support only `-1` (for `reset_parameters`).")
281
+ return self.pos_enc
282
+
283
+
284
+ class Conv1dSubsampling3(torch.nn.Module):
285
+ """Convolutional 1D subsampling (to 1/3 length).
286
+
287
+ Args:
288
+ idim (int): Input dimension.
289
+ odim (int): Output dimension.
290
+ dropout_rate (float): Dropout rate.
291
+ pos_enc (torch.nn.Module): Custom position encoding layer.
292
+
293
+ """
294
+
295
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
296
+ """Construct a Conv1dSubsampling3 object."""
297
+ super(Conv1dSubsampling3, self).__init__()
298
+ self.conv = torch.nn.Sequential(
299
+ torch.nn.Conv1d(idim, odim, 3, 1),
300
+ torch.nn.ReLU(),
301
+ torch.nn.Conv1d(odim, odim, 5, 3),
302
+ torch.nn.ReLU(),
303
+ )
304
+ self.out = torch.nn.Linear(odim, odim)
305
+ self.pos_enc = (
306
+ pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
307
+ )
308
+
309
+ def _load_from_state_dict(
310
+ self,
311
+ state_dict,
312
+ prefix,
313
+ local_metadata,
314
+ strict,
315
+ missing_keys,
316
+ unexpected_keys,
317
+ error_msgs,
318
+ ):
319
+ _upgrade_legacy_subsampling_state_dict(state_dict, prefix)
320
+ super()._load_from_state_dict(
321
+ state_dict,
322
+ prefix,
323
+ local_metadata,
324
+ strict,
325
+ missing_keys,
326
+ unexpected_keys,
327
+ error_msgs,
328
+ )
329
+
330
+ def forward(self, x, x_mask, prefix_embeds=None):
331
+ """Subsample x.
332
+
333
+ Args:
334
+ x (torch.Tensor): Input tensor (#batch, time, idim).
335
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
336
+ prefix_embeds (torch.Tensor or None): Prefix token embeddings
337
+ (#batch, prefix_len, odim).
338
+
339
+ Returns:
340
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
341
+ where time' = time // 3.
342
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
343
+ where time' = time // 3.
344
+
345
+ """
346
+ x = x.transpose(2, 1) # (#batch, idim, time)
347
+ x = self.conv(x)
348
+ b, c, t = x.size()
349
+ x = self.out(x.transpose(1, 2).contiguous())
350
+ if x_mask is not None:
351
+ x_mask = x_mask[:, :, :-2:1][:, :, :-4:3]
352
+
353
+ if prefix_embeds is not None:
354
+ x = torch.cat([prefix_embeds, x], dim=1)
355
+ if x_mask is not None:
356
+ x_mask = torch.cat(
357
+ [
358
+ torch.ones(
359
+ x_mask.shape[0],
360
+ 1,
361
+ prefix_embeds.size(1),
362
+ dtype=x_mask.dtype,
363
+ device=x_mask.device,
364
+ ),
365
+ x_mask,
366
+ ],
367
+ dim=-1,
368
+ )
369
+
370
+ x = self.pos_enc(x)
371
+
372
+ return x, x_mask
373
+
374
+ def __getitem__(self, key):
375
+ """Get item.
376
+
377
+ When reset_parameters() is called, if use_scaled_pos_enc is used,
378
+ return the positioning encoding.
379
+
380
+ """
381
+ if key != -1:
382
+ raise NotImplementedError("Support only `-1` (for `reset_parameters`).")
383
+ return self.pos_enc
384
+
385
+
386
+ class Conv2dSubsampling(torch.nn.Module):
387
+ """Convolutional 2D subsampling (to 1/4 length).
388
+
389
+ Args:
390
+ idim (int): Input dimension.
391
+ odim (int): Output dimension.
392
+ dropout_rate (float): Dropout rate.
393
+ pos_enc (torch.nn.Module): Custom position encoding layer.
394
+
395
+ """
396
+
397
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
398
+ """Construct a Conv2dSubsampling object."""
399
+ super(Conv2dSubsampling, self).__init__()
400
+ self.conv = torch.nn.Sequential(
401
+ torch.nn.Conv2d(1, odim, 3, 2),
402
+ torch.nn.ReLU(),
403
+ torch.nn.Conv2d(odim, odim, 3, 2),
404
+ torch.nn.ReLU(),
405
+ )
406
+ self.out = torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)
407
+ self.pos_enc = (
408
+ pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
409
+ )
410
+
411
+ def _load_from_state_dict(
412
+ self,
413
+ state_dict,
414
+ prefix,
415
+ local_metadata,
416
+ strict,
417
+ missing_keys,
418
+ unexpected_keys,
419
+ error_msgs,
420
+ ):
421
+ _upgrade_legacy_subsampling_state_dict(state_dict, prefix)
422
+ super()._load_from_state_dict(
423
+ state_dict,
424
+ prefix,
425
+ local_metadata,
426
+ strict,
427
+ missing_keys,
428
+ unexpected_keys,
429
+ error_msgs,
430
+ )
431
+
432
+ def forward(self, x, x_mask, prefix_embeds=None):
433
+ """Subsample x.
434
+
435
+ Args:
436
+ x (torch.Tensor): Input tensor (#batch, time, idim).
437
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
438
+ prefix_embeds (torch.Tensor or None): Prefix token embeddings
439
+ (#batch, prefix_len, odim).
440
+
441
+ Returns:
442
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
443
+ where time' = time // 4.
444
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
445
+ where time' = time // 4.
446
+
447
+ """
448
+ x = x.unsqueeze(1) # (b, c, t, f)
449
+ x = self.conv(x)
450
+ b, c, t, f = x.size()
451
+ x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
452
+ if x_mask is not None:
453
+ x_mask = x_mask[:, :, :-2:2][:, :, :-2:2]
454
+
455
+ if prefix_embeds is not None:
456
+ x = torch.cat([prefix_embeds, x], dim=1)
457
+ if x_mask is not None:
458
+ x_mask = torch.cat(
459
+ [
460
+ torch.ones(
461
+ x_mask.shape[0],
462
+ 1,
463
+ prefix_embeds.size(1),
464
+ dtype=x_mask.dtype,
465
+ device=x_mask.device,
466
+ ),
467
+ x_mask,
468
+ ],
469
+ dim=-1,
470
+ )
471
+
472
+ x = self.pos_enc(x)
473
+
474
+ return x, x_mask
475
+
476
+ # def __getitem__(self, key):
477
+ # """Get item.
478
+
479
+ # When reset_parameters() is called, if use_scaled_pos_enc is used,
480
+ # return the positioning encoding.
481
+
482
+ # """
483
+ # if key != -1:
484
+ # raise NotImplementedError("Support only `-1` (for `reset_parameters`).")
485
+ # return self.out[key]
486
+
487
+
488
+ class Conv2dSubsampling1(torch.nn.Module):
489
+ """Similar to Conv2dSubsampling module, but without any subsampling performed.
490
+
491
+ Args:
492
+ idim (int): Input dimension.
493
+ odim (int): Output dimension.
494
+ dropout_rate (float): Dropout rate.
495
+ pos_enc (torch.nn.Module): Custom position encoding layer.
496
+
497
+ """
498
+
499
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
500
+ """Construct a Conv2dSubsampling1 object."""
501
+ super(Conv2dSubsampling1, self).__init__()
502
+ self.conv = torch.nn.Sequential(
503
+ torch.nn.Conv2d(1, odim, 3, 1),
504
+ torch.nn.ReLU(),
505
+ torch.nn.Conv2d(odim, odim, 3, 1),
506
+ torch.nn.ReLU(),
507
+ )
508
+ self.out = torch.nn.Linear(odim * (idim - 4), odim)
509
+ self.pos_enc = (
510
+ pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
511
+ )
512
+
513
+ def _load_from_state_dict(
514
+ self,
515
+ state_dict,
516
+ prefix,
517
+ local_metadata,
518
+ strict,
519
+ missing_keys,
520
+ unexpected_keys,
521
+ error_msgs,
522
+ ):
523
+ _upgrade_legacy_subsampling_state_dict(state_dict, prefix)
524
+ super()._load_from_state_dict(
525
+ state_dict,
526
+ prefix,
527
+ local_metadata,
528
+ strict,
529
+ missing_keys,
530
+ unexpected_keys,
531
+ error_msgs,
532
+ )
533
+
534
+ def forward(self, x, x_mask, prefix_embeds=None):
535
+ """Pass x through 2 Conv2d layers without subsampling.
536
+
537
+ Args:
538
+ x (torch.Tensor): Input tensor (#batch, time, idim).
539
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
540
+ prefix_embeds (torch.Tensor or None): Prefix token embeddings
541
+ (#batch, prefix_len, odim).
542
+
543
+ Returns:
544
+ torch.Tensor: Subsampled tensor (#batch, time', odim).
545
+ where time' = time - 4.
546
+ torch.Tensor: Subsampled mask (#batch, 1, time').
547
+ where time' = time - 4.
548
+
549
+ """
550
+ x = x.unsqueeze(1) # (b, c, t, f)
551
+ x = self.conv(x)
552
+ b, c, t, f = x.size()
553
+ x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
554
+ if x_mask is not None:
555
+ x_mask = x_mask[:, :, :-4]
556
+
557
+ if prefix_embeds is not None:
558
+ x = torch.cat([prefix_embeds, x], dim=1)
559
+ if x_mask is not None:
560
+ x_mask = torch.cat(
561
+ [
562
+ torch.ones(
563
+ x_mask.shape[0],
564
+ 1,
565
+ prefix_embeds.size(1),
566
+ dtype=x_mask.dtype,
567
+ device=x_mask.device,
568
+ ),
569
+ x_mask,
570
+ ],
571
+ dim=-1,
572
+ )
573
+
574
+ x = self.pos_enc(x)
575
+
576
+ return x, x_mask
577
+
578
+ def __getitem__(self, key):
579
+ """Get item.
580
+
581
+ When reset_parameters() is called, if use_scaled_pos_enc is used,
582
+ return the positioning encoding.
583
+
584
+ """
585
+ if key != -1:
586
+ raise NotImplementedError("Support only `-1` (for `reset_parameters`).")
587
+ return self.pos_enc
588
+
589
+
590
+ class Conv2dSubsampling2(torch.nn.Module):
591
+ """Convolutional 2D subsampling (to 1/2 length).
592
+
593
+ Args:
594
+ idim (int): Input dimension.
595
+ odim (int): Output dimension.
596
+ dropout_rate (float): Dropout rate.
597
+ pos_enc (torch.nn.Module): Custom position encoding layer.
598
+
599
+ """
600
+
601
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
602
+ """Construct a Conv2dSubsampling2 object."""
603
+ super(Conv2dSubsampling2, self).__init__()
604
+ self.conv = torch.nn.Sequential(
605
+ torch.nn.Conv2d(1, odim, 3, 2),
606
+ torch.nn.ReLU(),
607
+ torch.nn.Conv2d(odim, odim, 3, 1),
608
+ torch.nn.ReLU(),
609
+ )
610
+ self.out = torch.nn.Linear(odim * (((idim - 1) // 2 - 2)), odim)
611
+ self.pos_enc = (
612
+ pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
613
+ )
614
+
615
+ def _load_from_state_dict(
616
+ self,
617
+ state_dict,
618
+ prefix,
619
+ local_metadata,
620
+ strict,
621
+ missing_keys,
622
+ unexpected_keys,
623
+ error_msgs,
624
+ ):
625
+ _upgrade_legacy_subsampling_state_dict(state_dict, prefix)
626
+ super()._load_from_state_dict(
627
+ state_dict,
628
+ prefix,
629
+ local_metadata,
630
+ strict,
631
+ missing_keys,
632
+ unexpected_keys,
633
+ error_msgs,
634
+ )
635
+
636
+ def forward(self, x, x_mask, prefix_embeds=None):
637
+ """Subsample x.
638
+
639
+ Args:
640
+ x (torch.Tensor): Input tensor (#batch, time, idim).
641
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
642
+ prefix_embeds (torch.Tensor or None): Prefix token embeddings
643
+ (#batch, prefix_len, odim).
644
+
645
+ Returns:
646
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
647
+ where time' = time // 2.
648
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
649
+ where time' = time // 2.
650
+
651
+ """
652
+ x = x.unsqueeze(1) # (b, c, t, f)
653
+ x = self.conv(x)
654
+ b, c, t, f = x.size()
655
+ x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
656
+ if x_mask is not None:
657
+ x_mask = x_mask[:, :, :-2:2][:, :, :-2:1]
658
+
659
+ if prefix_embeds is not None:
660
+ x = torch.cat([prefix_embeds, x], dim=1)
661
+ if x_mask is not None:
662
+ x_mask = torch.cat(
663
+ [
664
+ torch.ones(
665
+ x_mask.shape[0],
666
+ 1,
667
+ prefix_embeds.size(1),
668
+ dtype=x_mask.dtype,
669
+ device=x_mask.device,
670
+ ),
671
+ x_mask,
672
+ ],
673
+ dim=-1,
674
+ )
675
+
676
+ x = self.pos_enc(x)
677
+
678
+ return x, x_mask
679
+
680
+ def __getitem__(self, key):
681
+ """Get item.
682
+
683
+ When reset_parameters() is called, if use_scaled_pos_enc is used,
684
+ return the positioning encoding.
685
+
686
+ """
687
+ if key != -1:
688
+ raise NotImplementedError("Support only `-1` (for `reset_parameters`).")
689
+ return self.pos_enc
690
+
691
+
692
+ class Conv2dSubsampling6(torch.nn.Module):
693
+ """Convolutional 2D subsampling (to 1/6 length).
694
+
695
+ Args:
696
+ idim (int): Input dimension.
697
+ odim (int): Output dimension.
698
+ dropout_rate (float): Dropout rate.
699
+ pos_enc (torch.nn.Module): Custom position encoding layer.
700
+
701
+ """
702
+
703
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
704
+ """Construct a Conv2dSubsampling6 object."""
705
+ super(Conv2dSubsampling6, self).__init__()
706
+ self.conv = torch.nn.Sequential(
707
+ torch.nn.Conv2d(1, odim, 3, 2),
708
+ torch.nn.ReLU(),
709
+ torch.nn.Conv2d(odim, odim, 5, 3),
710
+ torch.nn.ReLU(),
711
+ )
712
+ self.out = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim)
713
+ self.pos_enc = (
714
+ pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
715
+ )
716
+
717
+ def _load_from_state_dict(
718
+ self,
719
+ state_dict,
720
+ prefix,
721
+ local_metadata,
722
+ strict,
723
+ missing_keys,
724
+ unexpected_keys,
725
+ error_msgs,
726
+ ):
727
+ _upgrade_legacy_subsampling_state_dict(state_dict, prefix)
728
+ super()._load_from_state_dict(
729
+ state_dict,
730
+ prefix,
731
+ local_metadata,
732
+ strict,
733
+ missing_keys,
734
+ unexpected_keys,
735
+ error_msgs,
736
+ )
737
+
738
+ def forward(self, x, x_mask, prefix_embeds=None):
739
+ """Subsample x.
740
+
741
+ Args:
742
+ x (torch.Tensor): Input tensor (#batch, time, idim).
743
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
744
+ prefix_embeds (torch.Tensor or None): Prefix token embeddings
745
+ (#batch, prefix_len, odim).
746
+
747
+ Returns:
748
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
749
+ where time' = time // 6.
750
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
751
+ where time' = time // 6.
752
+
753
+ """
754
+ x = x.unsqueeze(1) # (b, c, t, f)
755
+ x = self.conv(x)
756
+ b, c, t, f = x.size()
757
+ x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
758
+ if x_mask is not None:
759
+ x_mask = x_mask[:, :, :-2:2][:, :, :-4:3]
760
+
761
+ if prefix_embeds is not None:
762
+ x = torch.cat([prefix_embeds, x], dim=1)
763
+ if x_mask is not None:
764
+ x_mask = torch.cat(
765
+ [
766
+ torch.ones(
767
+ x_mask.shape[0],
768
+ 1,
769
+ prefix_embeds.size(1),
770
+ dtype=x_mask.dtype,
771
+ device=x_mask.device,
772
+ ),
773
+ x_mask,
774
+ ],
775
+ dim=-1,
776
+ )
777
+
778
+ x = self.pos_enc(x)
779
+
780
+ return x, x_mask
781
+
782
+
783
+ class Conv2dSubsampling8(torch.nn.Module):
784
+ """Convolutional 2D subsampling (to 1/8 length).
785
+
786
+ Args:
787
+ idim (int): Input dimension.
788
+ odim (int): Output dimension.
789
+ dropout_rate (float): Dropout rate.
790
+ pos_enc (torch.nn.Module): Custom position encoding layer.
791
+
792
+ """
793
+
794
+ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
795
+ """Construct a Conv2dSubsampling8 object."""
796
+ super(Conv2dSubsampling8, self).__init__()
797
+ self.conv = torch.nn.Sequential(
798
+ torch.nn.Conv2d(1, odim, 3, 2),
799
+ torch.nn.ReLU(),
800
+ torch.nn.Conv2d(odim, odim, 3, 2),
801
+ torch.nn.ReLU(),
802
+ torch.nn.Conv2d(odim, odim, 3, 2),
803
+ torch.nn.ReLU(),
804
+ )
805
+ self.out = torch.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim)
806
+ self.pos_enc = (
807
+ pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
808
+ )
809
+
810
+ def _load_from_state_dict(
811
+ self,
812
+ state_dict,
813
+ prefix,
814
+ local_metadata,
815
+ strict,
816
+ missing_keys,
817
+ unexpected_keys,
818
+ error_msgs,
819
+ ):
820
+ _upgrade_legacy_subsampling_state_dict(state_dict, prefix)
821
+ super()._load_from_state_dict(
822
+ state_dict,
823
+ prefix,
824
+ local_metadata,
825
+ strict,
826
+ missing_keys,
827
+ unexpected_keys,
828
+ error_msgs,
829
+ )
830
+
831
+ def forward(self, x, x_mask, prefix_embeds=None):
832
+ """Subsample x.
833
+
834
+ Args:
835
+ x (torch.Tensor): Input tensor (#batch, time, idim).
836
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
837
+ prefix_embeds (torch.Tensor or None): Prefix token embeddings
838
+ (#batch, prefix_len, odim).
839
+
840
+ Returns:
841
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
842
+ where time' = time // 8.
843
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
844
+ where time' = time // 8.
845
+
846
+ """
847
+ x = x.unsqueeze(1) # (b, c, t, f)
848
+ x = self.conv(x)
849
+ b, c, t, f = x.size()
850
+ x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
851
+ if x_mask is not None:
852
+ x_mask = x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
853
+
854
+ if prefix_embeds is not None:
855
+ x = torch.cat([prefix_embeds, x], dim=1)
856
+ if x_mask is not None:
857
+ x_mask = torch.cat(
858
+ [
859
+ torch.ones(
860
+ x_mask.shape[0],
861
+ 1,
862
+ prefix_embeds.size(1),
863
+ dtype=x_mask.dtype,
864
+ device=x_mask.device,
865
+ ),
866
+ x_mask,
867
+ ],
868
+ dim=-1,
869
+ )
870
+
871
+ x = self.pos_enc(x)
872
+
873
+ return x, x_mask
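Not part of the committed file -- a minimal sketch of the 1/4-length Conv2dSubsampling above, assuming the default PositionalEncoding (which returns a single tensor, as in upstream ESPnet):

import torch

sub = Conv2dSubsampling(idim=80, odim=256, dropout_rate=0.1)
feats = torch.randn(2, 100, 80)                  # (batch, time, feature)
mask = torch.ones(2, 1, 100, dtype=torch.bool)   # (batch, 1, time)
out, out_mask = sub(feats, mask)
# two kernel-3 / stride-2 convs: 100 -> 49 -> 24 frames, mask subsampled in step
assert out.shape == (2, 24, 256)
assert out_mask.shape == (2, 1, 24)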
src/model/__init__.py ADDED
File without changes
src/model/powsm/__init__.py ADDED
File without changes
src/model/powsm/ctc.py ADDED
@@ -0,0 +1,230 @@
1
+ from typing import Dict, List, Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from typeguard import typechecked
6
+ from src.utils import RankedLogger
7
+
8
+ log = RankedLogger(__name__, rank_zero_only=True)
9
+
10
+
11
+ class CTC(torch.nn.Module):
12
+ """CTC module.
13
+
14
+ Args:
15
+ odim: dimension of outputs
16
+ encoder_output_size: number of encoder projection units
17
+ dropout_rate: dropout rate (0.0 ~ 1.0)
18
+ ctc_type: builtin or gtnctc
19
+ reduce: reduce the CTC loss into a scalar
20
+ ignore_nan_grad: Same as zero_infinity (kept for backward compatibility)
21
+ zero_infinity: Whether to zero infinite losses and the associated gradients.
22
+ """
23
+
24
+ @typechecked
25
+ def __init__(
26
+ self,
27
+ odim: int,
28
+ encoder_output_size: int,
29
+ dropout_rate: float = 0.0,
30
+ ctc_type: str = "builtin",
31
+ reduce: bool = True,
32
+ ignore_nan_grad: Optional[bool] = None,
33
+ zero_infinity: bool = True,
34
+ brctc_risk_strategy: str = "exp",
35
+ brctc_group_strategy: str = "end",
36
+ brctc_risk_factor: float = 0.0,
37
+ ):
38
+ super().__init__()
39
+ eprojs = encoder_output_size
40
+ self.dropout_rate = dropout_rate
41
+ self.ctc_lo = torch.nn.Linear(eprojs, odim)
42
+ self.ctc_type = ctc_type
43
+ if ignore_nan_grad is not None:
44
+ zero_infinity = ignore_nan_grad
45
+
46
+ if self.ctc_type == "builtin":
47
+ self.ctc_loss = torch.nn.CTCLoss(
48
+ reduction="none", zero_infinity=zero_infinity
49
+ )
50
+ elif self.ctc_type == "builtin2":
51
+ self.ignore_nan_grad = True
52
+ log.warning("builtin2")
53
+ self.ctc_loss = torch.nn.CTCLoss(reduction="none")
54
+
55
+ elif self.ctc_type == "gtnctc":
56
+ raise ImportError("gtnctc requires gtn_ctc which is not bundled here.")
57
+
58
+ elif self.ctc_type == "brctc":
59
+ try:
60
+ import k2 # noqa
61
+ except ImportError:
62
+ raise ImportError("You should install K2 to use Bayes Risk CTC")
63
+
64
+ raise ImportError("brctc requires BayesRiskCTC which is not bundled here.")
65
+ else:
66
+ raise ValueError(
67
+ f'ctc_type must be one of "builtin", "builtin2", "gtnctc", "brctc": {self.ctc_type}'
68
+ )
69
+
70
+ self.reduce = reduce
71
+
72
+ def loss_fn(
73
+ self,
74
+ th_pred,
75
+ th_target,
76
+ th_ilen,
77
+ th_olen,
78
+ lang_sym: Optional[Union[List[str], None]] = None,
79
+ accent_sym: Optional[Union[List[str], None]] = None,
80
+ ) -> torch.Tensor:
81
+ if self.ctc_type in ["builtin", "brctc"]:
82
+ th_pred = th_pred.log_softmax(2).float()
83
+ loss = self.ctc_loss(th_pred, th_target, th_ilen, th_olen)
84
+
85
+ if self.ctc_type == "builtin":
86
+ size = th_pred.size(1)
87
+ else:
88
+ size = loss.size(0) # some invalid examples will be excluded
89
+
90
+ if self.reduce:
91
+ # Batch-size average
92
+ loss = loss.sum() / size
93
+ else:
94
+ loss = loss / size
95
+ return loss
96
+
97
+ # builtin2 ignores nan losses using the logic below, while
98
+ # builtin relies on the zero_infinity flag in pytorch CTC
99
+ elif self.ctc_type == "builtin2":
100
+ th_pred = th_pred.log_softmax(2).float()
101
+ loss = self.ctc_loss(th_pred, th_target, th_ilen, th_olen)
102
+
103
+ if loss.requires_grad and self.ignore_nan_grad:
104
+ # ctc_grad: (L, B, O)
105
+ ctc_grad = loss.grad_fn(torch.ones_like(loss))
106
+ ctc_grad = ctc_grad.sum([0, 2])
107
+ indices = torch.isfinite(ctc_grad)
108
+ size = indices.long().sum()
109
+ if size == 0:
110
+ # Return as is
111
+ log.warning(
112
+ "All samples in this mini-batch got nan grad."
113
+ " Returning nan value instead of CTC loss"
114
+ )
115
+ elif size != th_pred.size(1):
116
+ log.warning(
117
+ f"{th_pred.size(1) - size}/{th_pred.size(1)}"
118
+ " samples got nan grad."
119
+ " These were ignored for CTC loss."
120
+ )
121
+
122
+ # Create mask for target
123
+ target_mask = torch.full(
124
+ [th_target.size(0)],
125
+ 1,
126
+ dtype=torch.bool,
127
+ device=th_target.device,
128
+ )
129
+ s = 0
130
+ for ind, le in enumerate(th_olen):
131
+ if not indices[ind]:
132
+ target_mask[s : s + le] = 0
133
+ s += le
134
+
135
+ # Calc loss again using masked data
136
+ loss = self.ctc_loss(
137
+ th_pred[:, indices, :],
138
+ th_target[target_mask],
139
+ th_ilen[indices],
140
+ th_olen[indices],
141
+ )
142
+ else:
143
+ size = th_pred.size(1)
144
+
145
+ if self.reduce:
146
+ # Batch-size average
147
+ loss = loss.sum() / size
148
+ else:
149
+ loss = loss / size
150
+ return loss
151
+
152
+ elif self.ctc_type == "gtnctc":
153
+ log_probs = torch.nn.functional.log_softmax(th_pred, dim=2)
154
+ return self.ctc_loss(log_probs, th_target, th_ilen, 0, "none")
155
+
156
+ else:
157
+ raise NotImplementedError
158
+
159
+ def forward(
160
+ self,
161
+ hs_pad,
162
+ hlens,
163
+ ys_pad,
164
+ ys_lens,
165
+ lang_sym: Optional[Union[List[str], None]] = None,
166
+ accent_sym: Optional[Union[List[str], None]] = None,
167
+ ):
168
+ """Calculate CTC loss.
169
+
170
+ Args:
171
+ hs_pad: batch of padded hidden state sequences (B, Tmax, D)
172
+ hlens: batch of lengths of hidden state sequences (B)
173
+ ys_pad: batch of padded character id sequence tensor (B, Lmax)
174
+ ys_lens: batch of lengths of character sequence (B)
175
+ lang_sym: optional list of language codes per utterance
176
+ accent_sym: optional list of accent codes per utterance
177
+ """
178
+ # hs_pad: (B, L, NProj) -> ys_hat: (B, L, Nvocab)
179
+ ys_hat = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate))
180
+
181
+ if self.ctc_type == "brctc":
182
+ loss = self.loss_fn(
183
+ ys_hat, ys_pad, hlens, ys_lens, lang_sym=lang_sym, accent_sym=accent_sym
184
+ ).to(device=hs_pad.device, dtype=hs_pad.dtype)
185
+ return loss
186
+
187
+ elif self.ctc_type == "gtnctc":
188
+ # gtn expects list form for ys
189
+ ys_true = [y[y != -1] for y in ys_pad] # parse padded ys
190
+ else:
191
+ # ys_hat: (B, L, D) -> (L, B, D)
192
+ ys_hat = ys_hat.transpose(0, 1)
193
+ # (B, L) -> (BxL,)
194
+ ys_true = torch.cat([ys_pad[i, :l] for i, l in enumerate(ys_lens)])
195
+
196
+ loss = self.loss_fn(
197
+ ys_hat, ys_true, hlens, ys_lens, lang_sym=lang_sym, accent_sym=accent_sym
198
+ ).to(device=hs_pad.device, dtype=hs_pad.dtype)
199
+
200
+ return loss
201
+
202
+ def softmax(self, hs_pad):
203
+ """softmax of frame activations
204
+
205
+ Args:
206
+ Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
207
+ Returns:
208
+ torch.Tensor: softmax applied 3d tensor (B, Tmax, odim)
209
+ """
210
+ return F.softmax(self.ctc_lo(hs_pad), dim=2)
211
+
212
+ def log_softmax(self, hs_pad):
213
+ """log_softmax of frame activations
214
+
215
+ Args:
216
+ Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
217
+ Returns:
218
+ torch.Tensor: log softmax applied 3d tensor (B, Tmax, odim)
219
+ """
220
+ return F.log_softmax(self.ctc_lo(hs_pad), dim=2)
221
+
222
+ def argmax(self, hs_pad):
223
+ """argmax of frame activations
224
+
225
+ Args:
226
+ torch.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
227
+ Returns:
228
+ torch.Tensor: argmax applied 2d tensor (B, Tmax)
229
+ """
230
+ return torch.argmax(self.ctc_lo(hs_pad), dim=2)
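Note that the CTC module above only exposes frame-level activations (`softmax`, `log_softmax`, `argmax`); decoding happens elsewhere in the pipeline. As a reference point, a minimal greedy-decoding sketch over the `argmax` output could look like the following (the helper name is hypothetical and not part of this commit; blank id 0 follows the bundled ipa_vocab.json):

```python
import torch

def greedy_ctc_decode(ctc, hs_pad, blank_id=0):
    """Collapse repeated frame labels and drop blanks (hypothetical helper)."""
    frame_ids = ctc.argmax(hs_pad)              # (B, Tmax)
    hyps = []
    for ids in frame_ids:
        ids = torch.unique_consecutive(ids)     # merge repeated predictions
        hyps.append([int(i) for i in ids if int(i) != blank_id])
    return hyps
```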
src/model/powsm/e_branchformer.py ADDED
@@ -0,0 +1,555 @@
1
+ # Copyright 2022 Kwangyoun Kim (ASAPP inc.)
2
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
3
+
4
+ """E-Branchformer encoder definition.
5
+
6
+ Reference:
7
+ Kwangyoun Kim, Felix Wu, Yifan Peng, Jing Pan,
8
+ Prashant Sridhar, Kyu J. Han, Shinji Watanabe,
9
+ "E-Branchformer: Branchformer with Enhanced merging
10
+ for speech recognition," in SLT 2022.
11
+ """
12
+
13
+ import logging
14
+ from typing import List, Optional, Tuple
15
+
16
+ import torch
17
+ from typeguard import typechecked
18
+
19
+ from src.model.powsm.ctc import CTC
20
+ from src.espnet_import.fastformer import FastSelfAttention
21
+ from src.espnet_import.cgmlp import ConvolutionalGatingMLP
22
+
23
+ from src.espnet_import.nets_utils import get_activation, make_pad_mask
24
+ from src.espnet_import.attention import (
25
+ LegacyRelPositionMultiHeadedAttention,
26
+ MultiHeadedAttention,
27
+ RelPositionMultiHeadedAttention,
28
+ )
29
+ from src.espnet_import.embedding import (
30
+ ConvolutionalPositionalEmbedding,
31
+ LegacyRelPositionalEncoding,
32
+ PositionalEncoding,
33
+ RelPositionalEncoding,
34
+ ScaledPositionalEncoding,
35
+ )
36
+ from src.espnet_import.layer_norm import LayerNorm
37
+ from src.espnet_import.positionwise_feed_forward import PositionwiseFeedForward
38
+ from src.espnet_import.repeat import repeat
39
+ from src.espnet_import.subsampling import (
40
+ Conv1dSubsampling1,
41
+ Conv1dSubsampling2,
42
+ Conv1dSubsampling3,
43
+ Conv2dSubsampling,
44
+ Conv2dSubsampling1,
45
+ Conv2dSubsampling2,
46
+ Conv2dSubsampling6,
47
+ Conv2dSubsampling8,
48
+ TooShortUttError,
49
+ check_short_utt,
50
+ )
51
+
52
+
53
+ class EBranchformerEncoderLayer(torch.nn.Module):
54
+ """E-Branchformer encoder layer module.
55
+
56
+ Args:
57
+ size (int): model dimension
58
+ attn: standard self-attention or efficient attention
59
+ cgmlp: ConvolutionalGatingMLP
60
+ feed_forward: feed-forward module, optional
61
+ feed_forward_macaron: macaron-style feed-forward module, optional
62
+ dropout_rate (float): dropout probability
63
+ merge_conv_kernel (int): kernel size of the depth-wise conv in merge module
64
+ """
65
+
66
+ def __init__(
67
+ self,
68
+ size: int,
69
+ attn: torch.nn.Module,
70
+ cgmlp: torch.nn.Module,
71
+ feed_forward: Optional[torch.nn.Module],
72
+ feed_forward_macaron: Optional[torch.nn.Module],
73
+ dropout_rate: float,
74
+ merge_conv_kernel: int = 3,
75
+ ):
76
+ super().__init__()
77
+
78
+ self.size = size
79
+ self.attn = attn
80
+ self.cgmlp = cgmlp
81
+
82
+ self.feed_forward = feed_forward
83
+ self.feed_forward_macaron = feed_forward_macaron
84
+ self.ff_scale = 1.0
85
+ if self.feed_forward is not None:
86
+ self.norm_ff = LayerNorm(size)
87
+ if self.feed_forward_macaron is not None:
88
+ self.ff_scale = 0.5
89
+ self.norm_ff_macaron = LayerNorm(size)
90
+
91
+ self.norm_mha = LayerNorm(size) # for the MHA module
92
+ self.norm_mlp = LayerNorm(size) # for the MLP module
93
+ self.norm_final = LayerNorm(size) # for the final output of the block
94
+
95
+ self.dropout = torch.nn.Dropout(dropout_rate)
96
+
97
+ self.depthwise_conv_fusion = torch.nn.Conv1d(
98
+ size + size,
99
+ size + size,
100
+ kernel_size=merge_conv_kernel,
101
+ stride=1,
102
+ padding=(merge_conv_kernel - 1) // 2,
103
+ groups=size + size,
104
+ bias=True,
105
+ )
106
+ self.merge_proj = torch.nn.Linear(size + size, size)
107
+
108
+ def forward(self, x_input, mask, cache=None):
109
+ """Compute encoded features.
110
+
111
+ Args:
112
+ x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
113
+ - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
114
+ - w/o pos emb: Tensor (#batch, time, size).
115
+ mask (torch.Tensor): Mask tensor for the input (#batch, 1, time).
116
+ cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
117
+ Returns:
118
+ torch.Tensor: Output tensor (#batch, time, size).
119
+ torch.Tensor: Mask tensor (#batch, time).
120
+ """
121
+
122
+ if cache is not None:
123
+ raise NotImplementedError("cache is not None, which is not tested")
124
+
125
+ if isinstance(x_input, tuple):
126
+ x, pos_emb = x_input[0], x_input[1]
127
+ else:
128
+ x, pos_emb = x_input, None
129
+
130
+ if self.feed_forward_macaron is not None:
131
+ residual = x
132
+ x = self.norm_ff_macaron(x)
133
+ x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x))
134
+
135
+ # Two branches
136
+ x1 = x
137
+ x2 = x
138
+
139
+ # Branch 1: multi-headed attention module
140
+ x1 = self.norm_mha(x1)
141
+
142
+ if isinstance(self.attn, FastSelfAttention):
143
+ x_att = self.attn(x1, mask)
144
+ else:
145
+ if pos_emb is not None:
146
+ x_att = self.attn(x1, x1, x1, pos_emb, mask)
147
+ else:
148
+ x_att = self.attn(x1, x1, x1, mask)
149
+
150
+ x1 = self.dropout(x_att)
151
+
152
+ # Branch 2: convolutional gating mlp
153
+ x2 = self.norm_mlp(x2)
154
+
155
+ if pos_emb is not None:
156
+ x2 = (x2, pos_emb)
157
+ x2 = self.cgmlp(x2, mask)
158
+ if isinstance(x2, tuple):
159
+ x2 = x2[0]
160
+
161
+ x2 = self.dropout(x2)
162
+
163
+ # Merge two branches
164
+ x_concat = torch.cat([x1, x2], dim=-1)
165
+ x_tmp = x_concat.transpose(1, 2)
166
+ x_tmp = self.depthwise_conv_fusion(x_tmp)
167
+ x_tmp = x_tmp.transpose(1, 2)
168
+ x = x + self.dropout(self.merge_proj(x_concat + x_tmp))
169
+
170
+ if self.feed_forward is not None:
171
+ # feed forward module
172
+ residual = x
173
+ x = self.norm_ff(x)
174
+ x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
175
+
176
+ x = self.norm_final(x)
177
+
178
+ if pos_emb is not None:
179
+ return (x, pos_emb), mask
180
+
181
+ return x, mask
182
+
183
+
184
+ class EBranchformerEncoder(torch.nn.Module):
185
+ """E-Branchformer encoder module."""
186
+
187
+ @typechecked
188
+ def __init__(
189
+ self,
190
+ input_size: int,
191
+ output_size: int = 256,
192
+ attention_heads: int = 4,
193
+ attention_layer_type: str = "rel_selfattn",
194
+ pos_enc_layer_type: str = "rel_pos",
195
+ rel_pos_type: str = "latest",
196
+ cgmlp_linear_units: int = 2048,
197
+ cgmlp_conv_kernel: int = 31,
198
+ use_linear_after_conv: bool = False,
199
+ gate_activation: str = "identity",
200
+ num_blocks: int = 12,
201
+ dropout_rate: float = 0.1,
202
+ positional_dropout_rate: float = 0.1,
203
+ attention_dropout_rate: float = 0.0,
204
+ input_layer: Optional[str] = "conv2d",
205
+ zero_triu: bool = False,
206
+ padding_idx: int = -1,
207
+ layer_drop_rate: float = 0.0,
208
+ max_pos_emb_len: int = 5000,
209
+ use_ffn: bool = False,
210
+ macaron_ffn: bool = False,
211
+ ffn_activation_type: str = "swish",
212
+ linear_units: int = 2048,
213
+ positionwise_layer_type: str = "linear",
214
+ merge_conv_kernel: int = 3,
215
+ interctc_layer_idx=None,
216
+ interctc_use_conditioning: bool = False,
217
+ qk_norm: bool = False,
218
+ use_flash_attn: bool = True,
219
+ gradient_checkpoint_layers: List[int] = [],
220
+ ):
221
+ super().__init__()
222
+ self._output_size = output_size
223
+
224
+ if rel_pos_type == "legacy":
225
+ if pos_enc_layer_type == "rel_pos":
226
+ pos_enc_layer_type = "legacy_rel_pos"
227
+ if attention_layer_type == "rel_selfattn":
228
+ attention_layer_type = "legacy_rel_selfattn"
229
+ elif rel_pos_type == "latest":
230
+ assert attention_layer_type != "legacy_rel_selfattn"
231
+ assert pos_enc_layer_type != "legacy_rel_pos"
232
+ else:
233
+ raise ValueError("unknown rel_pos_type: " + rel_pos_type)
234
+
235
+ if pos_enc_layer_type == "abs_pos":
236
+ pos_enc_class = PositionalEncoding
237
+ elif pos_enc_layer_type == "conv":
238
+ pos_enc_class = ConvolutionalPositionalEmbedding
239
+ elif pos_enc_layer_type == "scaled_abs_pos":
240
+ pos_enc_class = ScaledPositionalEncoding
241
+ elif pos_enc_layer_type == "rel_pos":
242
+ assert attention_layer_type == "rel_selfattn"
243
+ pos_enc_class = RelPositionalEncoding
244
+ elif pos_enc_layer_type == "legacy_rel_pos":
245
+ assert attention_layer_type == "legacy_rel_selfattn"
246
+ pos_enc_class = LegacyRelPositionalEncoding
247
+ logging.warning(
248
+ "Using legacy_rel_pos and it will be deprecated in the future."
249
+ )
250
+ else:
251
+ raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
252
+
253
+ if input_layer == "linear":
254
+ self.embed = torch.nn.Sequential(
255
+ torch.nn.Linear(input_size, output_size),
256
+ torch.nn.LayerNorm(output_size),
257
+ torch.nn.Dropout(dropout_rate),
258
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
259
+ )
260
+ elif input_layer == "conv1d1":
261
+ self.embed = Conv1dSubsampling1(
262
+ input_size,
263
+ output_size,
264
+ dropout_rate,
265
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
266
+ )
267
+ elif input_layer == "conv1d2":
268
+ self.embed = Conv1dSubsampling2(
269
+ input_size,
270
+ output_size,
271
+ dropout_rate,
272
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
273
+ )
274
+ elif input_layer == "conv1d3":
275
+ self.embed = Conv1dSubsampling3(
276
+ input_size,
277
+ output_size,
278
+ dropout_rate,
279
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
280
+ )
281
+ elif input_layer == "conv2d":
282
+ self.embed = Conv2dSubsampling(
283
+ input_size,
284
+ output_size,
285
+ dropout_rate,
286
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
287
+ )
288
+ elif input_layer == "conv2d1":
289
+ self.embed = Conv2dSubsampling1(
290
+ input_size,
291
+ output_size,
292
+ dropout_rate,
293
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
294
+ )
295
+ elif input_layer == "conv2d2":
296
+ self.embed = Conv2dSubsampling2(
297
+ input_size,
298
+ output_size,
299
+ dropout_rate,
300
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
301
+ )
302
+ elif input_layer == "conv2d6":
303
+ self.embed = Conv2dSubsampling6(
304
+ input_size,
305
+ output_size,
306
+ dropout_rate,
307
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
308
+ )
309
+ elif input_layer == "conv2d8":
310
+ self.embed = Conv2dSubsampling8(
311
+ input_size,
312
+ output_size,
313
+ dropout_rate,
314
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
315
+ )
316
+ elif input_layer == "embed":
317
+ self.embed = torch.nn.Sequential(
318
+ torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
319
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
320
+ )
321
+ elif isinstance(input_layer, torch.nn.Module):
322
+ self.embed = torch.nn.Sequential(
323
+ input_layer,
324
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
325
+ )
326
+ elif input_layer is None:
327
+ if input_size == output_size:
328
+ self.embed = torch.nn.Sequential(
329
+ pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len)
330
+ )
331
+ else:
332
+ self.embed = torch.nn.Linear(input_size, output_size)
333
+ else:
334
+ raise ValueError("unknown input_layer: " + input_layer)
335
+
336
+ activation = get_activation(ffn_activation_type)
337
+ if positionwise_layer_type == "linear":
338
+ positionwise_layer = PositionwiseFeedForward
339
+ positionwise_layer_args = (
340
+ output_size,
341
+ linear_units,
342
+ dropout_rate,
343
+ activation,
344
+ )
345
+ elif positionwise_layer_type is None:
346
+ logging.warning("no macaron ffn")
347
+ else:
348
+ raise ValueError("Support only linear.")
349
+
350
+ if attention_layer_type == "selfattn":
351
+ # Default to flash attention unless overridden by the user
352
+ if use_flash_attn:
353
+ try:
354
+ import flash_attn_interface # noqa
355
+ except Exception:
356
+ use_flash_attn = False
357
+ encoder_selfattn_layer = MultiHeadedAttention
358
+ encoder_selfattn_layer_args = (
359
+ attention_heads,
360
+ output_size,
361
+ attention_dropout_rate,
362
+ qk_norm,
363
+ use_flash_attn,
364
+ False,
365
+ False,
366
+ )
367
+ elif attention_layer_type == "legacy_rel_selfattn":
368
+ assert pos_enc_layer_type == "legacy_rel_pos"
369
+ encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
370
+ encoder_selfattn_layer_args = (
371
+ attention_heads,
372
+ output_size,
373
+ attention_dropout_rate,
374
+ )
375
+ logging.warning(
376
+ "Using legacy_rel_selfattn and it will be deprecated in the future."
377
+ )
378
+ elif attention_layer_type == "rel_selfattn":
379
+ assert pos_enc_layer_type == "rel_pos"
380
+ encoder_selfattn_layer = RelPositionMultiHeadedAttention
381
+ encoder_selfattn_layer_args = (
382
+ attention_heads,
383
+ output_size,
384
+ attention_dropout_rate,
385
+ zero_triu,
386
+ )
387
+ elif attention_layer_type == "fast_selfattn":
388
+ assert pos_enc_layer_type in ["abs_pos", "scaled_abs_pos"]
389
+ encoder_selfattn_layer = FastSelfAttention
390
+ encoder_selfattn_layer_args = (
391
+ output_size,
392
+ attention_heads,
393
+ attention_dropout_rate,
394
+ )
395
+ else:
396
+ raise ValueError("unknown encoder_attn_layer: " + attention_layer_type)
397
+
398
+ cgmlp_layer = ConvolutionalGatingMLP
399
+ cgmlp_layer_args = (
400
+ output_size,
401
+ cgmlp_linear_units,
402
+ cgmlp_conv_kernel,
403
+ dropout_rate,
404
+ use_linear_after_conv,
405
+ gate_activation,
406
+ )
407
+
408
+ self.encoders = repeat(
409
+ num_blocks,
410
+ lambda lnum: EBranchformerEncoderLayer(
411
+ output_size,
412
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
413
+ cgmlp_layer(*cgmlp_layer_args),
414
+ positionwise_layer(*positionwise_layer_args) if use_ffn else None,
415
+ (
416
+ positionwise_layer(*positionwise_layer_args)
417
+ if use_ffn and macaron_ffn
418
+ else None
419
+ ),
420
+ dropout_rate,
421
+ merge_conv_kernel,
422
+ ),
423
+ layer_drop_rate,
424
+ )
425
+ self.after_norm = LayerNorm(output_size)
426
+
427
+ self.layer_drop_rate = layer_drop_rate
428
+
429
+ if interctc_layer_idx is None:
430
+ interctc_layer_idx = []
431
+ self.interctc_layer_idx = interctc_layer_idx
432
+ if len(interctc_layer_idx) > 0:
433
+ assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks
434
+ self.interctc_use_conditioning = interctc_use_conditioning
435
+ self.conditioning_layer = None
436
+
437
+ # For gradient checkpointing
438
+ # 0 is the embedding layer, 1 is the first encoder layer, etc.
439
+ self.gradient_checkpoint_layers = gradient_checkpoint_layers
440
+ # logging.info(f"Gradient checkpoint layers: {self.gradient_checkpoint_layers}")
441
+
442
+ def output_size(self) -> int:
443
+ return self._output_size
444
+
445
+ def forward(
446
+ self,
447
+ xs_pad: torch.Tensor,
448
+ ilens: torch.Tensor,
449
+ prev_states: torch.Tensor = None,
450
+ masks: torch.Tensor = None,
451
+ ctc: CTC = None,
452
+ max_layer: int = None,
453
+ return_all_hs: bool = False,
454
+ ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
455
+ """Calculate forward propagation.
456
+
457
+ Args:
458
+ xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
459
+ ilens (torch.Tensor): Input length (#batch).
460
+ prev_states (torch.Tensor): Not to be used now.
461
+ ctc (CTC): Intermediate CTC module.
462
+ max_layer (int): If given, only the first max_layer encoder layers are run.
463
+ Returns:
464
+ torch.Tensor: Output tensor (#batch, L, output_size).
465
+ torch.Tensor: Output length (#batch).
466
+ torch.Tensor: Not to be used now.
467
+ """
468
+
469
+ if masks is None:
470
+ masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)
471
+ else:
472
+ masks = ~masks[:, None, :]
473
+
474
+ if (
475
+ isinstance(self.embed, Conv2dSubsampling)
476
+ or isinstance(self.embed, Conv1dSubsampling1)
477
+ or isinstance(self.embed, Conv1dSubsampling2)
478
+ or isinstance(self.embed, Conv1dSubsampling3)
479
+ or isinstance(self.embed, Conv2dSubsampling1)
480
+ or isinstance(self.embed, Conv2dSubsampling2)
481
+ or isinstance(self.embed, Conv2dSubsampling6)
482
+ or isinstance(self.embed, Conv2dSubsampling8)
483
+ ):
484
+ short_status, limit_size = check_short_utt(self.embed, xs_pad.size(1))
485
+ if short_status:
486
+ raise TooShortUttError(
487
+ f"has {xs_pad.size(1)} frames and is too short for subsampling "
488
+ + f"(it needs more than {limit_size} frames), return empty results",
489
+ xs_pad.size(1),
490
+ limit_size,
491
+ )
492
+ if 0 in self.gradient_checkpoint_layers:
493
+ xs_pad, masks = torch.utils.checkpoint.checkpoint(
494
+ self.embed, xs_pad, masks, use_reentrant=False
495
+ )
496
+ else:
497
+ xs_pad, masks = self.embed(xs_pad, masks)
498
+ elif self.embed is not None:
499
+ if 0 in self.gradient_checkpoint_layers:
500
+ xs_pad = torch.utils.checkpoint.checkpoint(
501
+ self.embed, xs_pad, use_reentrant=False
502
+ )
503
+ else:
504
+ xs_pad = self.embed(xs_pad)
505
+
506
+ intermediate_outs = []
507
+ for layer_idx, encoder_layer in enumerate(self.encoders):
508
+ if max_layer is not None and layer_idx >= max_layer:
509
+ break
510
+
511
+ if (
512
+ self.training
513
+ and torch.empty(1).uniform_().item() < self.layer_drop_rate
514
+ ):
515
+ continue
516
+
517
+ if layer_idx + 1 in self.gradient_checkpoint_layers:
518
+ xs_pad, masks = torch.utils.checkpoint.checkpoint(
519
+ encoder_layer, xs_pad, masks, use_reentrant=False
520
+ )
521
+ else:
522
+ xs_pad, masks = encoder_layer(xs_pad, masks)
523
+
524
+ if return_all_hs:
525
+ if isinstance(xs_pad, tuple):
526
+ intermediate_outs.append(xs_pad[0])
527
+ else:
528
+ intermediate_outs.append(xs_pad)
529
+
530
+ elif layer_idx + 1 in self.interctc_layer_idx:
531
+ encoder_out = xs_pad
532
+
533
+ if isinstance(encoder_out, tuple):
534
+ encoder_out = encoder_out[0]
535
+
536
+ intermediate_outs.append((layer_idx + 1, encoder_out))
537
+
538
+ if self.interctc_use_conditioning:
539
+ ctc_out = ctc.softmax(encoder_out)
540
+
541
+ if isinstance(xs_pad, tuple):
542
+ xs_pad = list(xs_pad)
543
+ xs_pad[0] = xs_pad[0] + self.conditioning_layer(ctc_out)
544
+ xs_pad = tuple(xs_pad)
545
+ else:
546
+ xs_pad = xs_pad + self.conditioning_layer(ctc_out)
547
+
548
+ if isinstance(xs_pad, tuple):
549
+ xs_pad = xs_pad[0]
550
+
551
+ xs_pad = self.after_norm(xs_pad)
552
+ olens = masks.squeeze(1).sum(1)
553
+ if len(intermediate_outs) > 0:
554
+ return (xs_pad, intermediate_outs), olens, None
555
+ return xs_pad, olens, None
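For orientation, a shape-level sketch of driving this encoder follows; the sizes are illustrative, not the released configuration, and the vendored `src.espnet_import` modules are assumed to be importable:

```python
import torch

enc = EBranchformerEncoder(
    input_size=80,
    output_size=256,
    num_blocks=2,
    use_ffn=True,
    macaron_ffn=True,
    input_layer="linear",        # no conv subsampling, so olens == ilens
)
enc.eval()

xs = torch.randn(2, 100, 80)     # (batch, frames, feature dim)
ilens = torch.tensor([100, 73])
with torch.no_grad():
    out, olens, _ = enc(xs, ilens)
print(out.shape, olens)          # torch.Size([2, 100, 256]) tensor([100, 73])
```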
src/model/powsm/specaug.py ADDED
@@ -0,0 +1,384 @@
1
+ """SpecAugment module."""
2
+
3
+ from typing import Optional, Sequence, Union
4
+ import math
5
+ from typeguard import typechecked
6
+ import torch
7
+ from src.espnet_import.nets_utils import pad_list
8
+
9
+ DEFAULT_TIME_WARP_MODE = "bicubic"
10
+
11
+
12
+ def time_warp(x: torch.Tensor, window: int = 80, mode: str = DEFAULT_TIME_WARP_MODE):
13
+ """Time warping using torch.interpolate.
14
+
15
+ Args:
16
+ x: (Batch, Time, Freq)
17
+ window: time warp parameter
18
+ mode: Interpolate mode
19
+ """
20
+
21
+ # bicubic interpolation requires a tensor with 4 or more dimensions
22
+ org_size = x.size()
23
+ if x.dim() == 3:
24
+ # x: (Batch, Time, Freq) -> (Batch, 1, Time, Freq)
25
+ x = x[:, None]
26
+
27
+ t = x.shape[2]
28
+ if t - window <= window:
29
+ return x.view(*org_size)
30
+
31
+ center = torch.randint(window, t - window, (1,))[0]
32
+ warped = torch.randint(center - window, center + window, (1,))[0] + 1
33
+
34
+ # left: (Batch, Channel, warped, Freq)
35
+ # right: (Batch, Channel, time - warped, Freq)
36
+ left = torch.nn.functional.interpolate(
37
+ x[:, :, :center], (warped, x.shape[3]), mode=mode, align_corners=False
38
+ )
39
+ right = torch.nn.functional.interpolate(
40
+ x[:, :, center:], (t - warped, x.shape[3]), mode=mode, align_corners=False
41
+ )
42
+
43
+ if x.requires_grad:
44
+ x = torch.cat([left, right], dim=-2)
45
+ else:
46
+ x[:, :, :warped] = left
47
+ x[:, :, warped:] = right
48
+
49
+ return x.view(*org_size)
50
+
51
+
52
+ def mask_along_axis(
53
+ spec: torch.Tensor,
54
+ spec_lengths: torch.Tensor,
55
+ mask_width_range: Sequence[int] = (0, 30),
56
+ dim: int = 1,
57
+ num_mask: int = 2,
58
+ replace_with_zero: bool = True,
59
+ ):
60
+ """Apply mask along the specified direction.
61
+
62
+ Args:
63
+ spec: (Batch, Length, Freq)
64
+ spec_lengths: (Batch,): lengths are not used in this implementation
65
+ mask_width_range: mask widths are sampled uniformly from this range
66
+ """
67
+
68
+ org_size = spec.size()
69
+ if spec.dim() == 4:
70
+ # spec: (Batch, Channel, Length, Freq) -> (Batch * Channel, Length, Freq)
71
+ spec = spec.view(-1, spec.size(2), spec.size(3))
72
+
73
+ B = spec.shape[0]
74
+ # D = Length or Freq
75
+ D = spec.shape[dim]
76
+ # mask_length: (B, num_mask, 1)
77
+ mask_length = torch.randint(
78
+ mask_width_range[0],
79
+ mask_width_range[1],
80
+ (B, num_mask),
81
+ device=spec.device,
82
+ ).unsqueeze(2)
83
+
84
+ # mask_pos: (B, num_mask, 1)
85
+ mask_pos = torch.randint(
86
+ 0, max(1, D - mask_length.max()), (B, num_mask), device=spec.device
87
+ ).unsqueeze(2)
88
+
89
+ # aran: (1, 1, D)
90
+ aran = torch.arange(D, device=spec.device)[None, None, :]
91
+ # mask: (Batch, num_mask, D)
92
+ mask = (mask_pos <= aran) * (aran < (mask_pos + mask_length))
93
+ # Combine the num_mask masks with any(): (Batch, num_mask, D) -> (Batch, D)
94
+ mask = mask.any(dim=1)
95
+ if dim == 1:
96
+ # mask: (Batch, Length, 1)
97
+ mask = mask.unsqueeze(2)
98
+ elif dim == 2:
99
+ # mask: (Batch, 1, Freq)
100
+ mask = mask.unsqueeze(1)
101
+
102
+ if replace_with_zero:
103
+ value = 0.0
104
+ else:
105
+ value = spec.mean()
106
+
107
+ if spec.requires_grad:
108
+ spec = spec.masked_fill(mask, value)
109
+ else:
110
+ spec = spec.masked_fill_(mask, value)
111
+ spec = spec.view(*org_size)
112
+ return spec, spec_lengths
113
+
114
+
115
+ class TimeWarp(torch.nn.Module):
116
+ """Time warping using torch.interpolate.
117
+
118
+ Args:
119
+ window: time warp parameter
120
+ mode: Interpolate mode
121
+ """
122
+
123
+ def __init__(self, window: int = 80, mode: str = DEFAULT_TIME_WARP_MODE):
124
+ super().__init__()
125
+ self.window = window
126
+ self.mode = mode
127
+
128
+ def extra_repr(self):
129
+ return f"window={self.window}, mode={self.mode}"
130
+
131
+ def forward(self, x: torch.Tensor, x_lengths: torch.Tensor = None):
132
+ """Forward function.
133
+
134
+ Args:
135
+ x: (Batch, Time, Freq)
136
+ x_lengths: (Batch,)
137
+ """
138
+
139
+ if x_lengths is None or all(le == x_lengths[0] for le in x_lengths):
140
+ # Note: the same warping is applied to every sample in the batch
141
+ y = time_warp(x, window=self.window, mode=self.mode)
142
+ else:
143
+ # FIXME(kamo): I have no idea to batchify Timewarp
144
+ ys = []
145
+ for i in range(x.size(0)):
146
+ _y = time_warp(
147
+ x[i][None, : x_lengths[i]],
148
+ window=self.window,
149
+ mode=self.mode,
150
+ )[0]
151
+ ys.append(_y)
152
+ y = pad_list(ys, 0.0)
153
+
154
+ return y, x_lengths
155
+
156
+
157
+ class MaskAlongAxis(torch.nn.Module):
158
+ @typechecked
159
+ def __init__(
160
+ self,
161
+ mask_width_range: Union[int, Sequence[int]] = (0, 30),
162
+ num_mask: int = 2,
163
+ dim: Union[int, str] = "time",
164
+ replace_with_zero: bool = True,
165
+ ):
166
+ if isinstance(mask_width_range, int):
167
+ mask_width_range = (0, mask_width_range)
168
+ if len(mask_width_range) != 2:
169
+ raise TypeError(
170
+ f"mask_width_range must be a tuple of int and int values: "
171
+ f"{mask_width_range}",
172
+ )
173
+
174
+ assert mask_width_range[1] > mask_width_range[0]
175
+ if isinstance(dim, str):
176
+ if dim == "time":
177
+ dim = 1
178
+ elif dim == "freq":
179
+ dim = 2
180
+ else:
181
+ raise ValueError("dim must be int, 'time' or 'freq'")
182
+ if dim == 1:
183
+ self.mask_axis = "time"
184
+ elif dim == 2:
185
+ self.mask_axis = "freq"
186
+ else:
187
+ self.mask_axis = "unknown"
188
+
189
+ super().__init__()
190
+ self.mask_width_range = mask_width_range
191
+ self.num_mask = num_mask
192
+ self.dim = dim
193
+ self.replace_with_zero = replace_with_zero
194
+
195
+ def extra_repr(self):
196
+ return (
197
+ f"mask_width_range={self.mask_width_range}, "
198
+ f"num_mask={self.num_mask}, axis={self.mask_axis}"
199
+ )
200
+
201
+ def forward(self, spec: torch.Tensor, spec_lengths: torch.Tensor = None):
202
+ """Forward function.
203
+
204
+ Args:
205
+ spec: (Batch, Length, Freq)
206
+ """
207
+
208
+ return mask_along_axis(
209
+ spec,
210
+ spec_lengths,
211
+ mask_width_range=self.mask_width_range,
212
+ dim=self.dim,
213
+ num_mask=self.num_mask,
214
+ replace_with_zero=self.replace_with_zero,
215
+ )
216
+
217
+
218
+ class MaskAlongAxisVariableMaxWidth(torch.nn.Module):
219
+ """Mask input spec along a specified axis with variable maximum width.
220
+
221
+ Formula:
222
+ max_width = max_width_ratio * seq_len
223
+ """
224
+
225
+ @typechecked
226
+ def __init__(
227
+ self,
228
+ mask_width_ratio_range: Union[float, Sequence[float]] = (0.0, 0.05),
229
+ num_mask: int = 2,
230
+ dim: Union[int, str] = "time",
231
+ replace_with_zero: bool = True,
232
+ ):
233
+ if isinstance(mask_width_ratio_range, float):
234
+ mask_width_ratio_range = (0.0, mask_width_ratio_range)
235
+ if len(mask_width_ratio_range) != 2:
236
+ raise TypeError(
237
+ f"mask_width_ratio_range must be a tuple of float and float values: "
238
+ f"{mask_width_ratio_range}",
239
+ )
240
+
241
+ assert mask_width_ratio_range[1] > mask_width_ratio_range[0]
242
+ if isinstance(dim, str):
243
+ if dim == "time":
244
+ dim = 1
245
+ elif dim == "freq":
246
+ dim = 2
247
+ else:
248
+ raise ValueError("dim must be int, 'time' or 'freq'")
249
+ if dim == 1:
250
+ self.mask_axis = "time"
251
+ elif dim == 2:
252
+ self.mask_axis = "freq"
253
+ else:
254
+ self.mask_axis = "unknown"
255
+
256
+ super().__init__()
257
+ self.mask_width_ratio_range = mask_width_ratio_range
258
+ self.num_mask = num_mask
259
+ self.dim = dim
260
+ self.replace_with_zero = replace_with_zero
261
+
262
+ def extra_repr(self):
263
+ return (
264
+ f"mask_width_ratio_range={self.mask_width_ratio_range}, "
265
+ f"num_mask={self.num_mask}, axis={self.mask_axis}"
266
+ )
267
+
268
+ def forward(self, spec: torch.Tensor, spec_lengths: torch.Tensor = None):
269
+ """Forward function.
270
+
271
+ Args:
272
+ spec: (Batch, Length, Freq)
273
+ """
274
+
275
+ max_seq_len = spec.shape[self.dim]
276
+ min_mask_width = math.floor(max_seq_len * self.mask_width_ratio_range[0])
277
+ min_mask_width = max([0, min_mask_width])
278
+ max_mask_width = math.floor(max_seq_len * self.mask_width_ratio_range[1])
279
+ max_mask_width = min([max_seq_len, max_mask_width])
280
+
281
+ if max_mask_width > min_mask_width:
282
+ return mask_along_axis(
283
+ spec,
284
+ spec_lengths,
285
+ mask_width_range=(min_mask_width, max_mask_width),
286
+ dim=self.dim,
287
+ num_mask=self.num_mask,
288
+ replace_with_zero=self.replace_with_zero,
289
+ )
290
+ return spec, spec_lengths
291
+
292
+
293
+ class SpecAug(torch.nn.Module):
294
+ """Implementation of SpecAug.
295
+
296
+ Reference:
297
+ Daniel S. Park et al.
298
+ "SpecAugment: A Simple Data
299
+ Augmentation Method for Automatic Speech Recognition"
300
+
301
+ .. warning::
302
+ When using cuda mode, time_warp doesn't have reproducibility
303
+ due to `torch.nn.functional.interpolate`.
304
+
305
+ """
306
+
307
+ def __init__(
308
+ self,
309
+ apply_time_warp: bool = True,
310
+ time_warp_window: int = 5,
311
+ time_warp_mode: str = "bicubic",
312
+ apply_freq_mask: bool = True,
313
+ freq_mask_width_range: Union[int, Sequence[int]] = (0, 20),
314
+ num_freq_mask: int = 2,
315
+ apply_time_mask: bool = True,
316
+ time_mask_width_range: Optional[Union[int, Sequence[int]]] = None,
317
+ time_mask_width_ratio_range: Optional[Union[float, Sequence[float]]] = None,
318
+ num_time_mask: int = 2,
319
+ replace_with_zero: bool = True,
320
+ ):
321
+ if not apply_time_warp and not apply_time_mask and not apply_freq_mask:
322
+ raise ValueError(
323
+ "Either one of time_warp, time_mask, or freq_mask should be applied"
324
+ )
325
+ if (
326
+ apply_time_mask
327
+ and (time_mask_width_range is not None)
328
+ and (time_mask_width_ratio_range is not None)
329
+ ):
330
+ raise ValueError(
331
+ 'Either one of "time_mask_width_range" or '
332
+ '"time_mask_width_ratio_range" can be used'
333
+ )
334
+ super().__init__()
335
+ self.apply_time_warp = apply_time_warp
336
+ self.apply_freq_mask = apply_freq_mask
337
+ self.apply_time_mask = apply_time_mask
338
+
339
+ if apply_time_warp:
340
+ self.time_warp = TimeWarp(window=time_warp_window, mode=time_warp_mode)
341
+ else:
342
+ self.time_warp = None
343
+
344
+ if apply_freq_mask:
345
+ self.freq_mask = MaskAlongAxis(
346
+ dim="freq",
347
+ mask_width_range=freq_mask_width_range,
348
+ num_mask=num_freq_mask,
349
+ replace_with_zero=replace_with_zero,
350
+ )
351
+ else:
352
+ self.freq_mask = None
353
+
354
+ if apply_time_mask:
355
+ if time_mask_width_range is not None:
356
+ self.time_mask = MaskAlongAxis(
357
+ dim="time",
358
+ mask_width_range=time_mask_width_range,
359
+ num_mask=num_time_mask,
360
+ replace_with_zero=replace_with_zero,
361
+ )
362
+ elif time_mask_width_ratio_range is not None:
363
+ self.time_mask = MaskAlongAxisVariableMaxWidth(
364
+ dim="time",
365
+ mask_width_ratio_range=time_mask_width_ratio_range,
366
+ num_mask=num_time_mask,
367
+ replace_with_zero=replace_with_zero,
368
+ )
369
+ else:
370
+ raise ValueError(
371
+ 'Either one of "time_mask_width_range" or '
372
+ '"time_mask_width_ratio_range" should be used.'
373
+ )
374
+ else:
375
+ self.time_mask = None
376
+
377
+ def forward(self, x, x_lengths=None):
378
+ if self.time_warp is not None:
379
+ x, x_lengths = self.time_warp(x, x_lengths)
380
+ if self.freq_mask is not None:
381
+ x, x_lengths = self.freq_mask(x, x_lengths)
382
+ if self.time_mask is not None:
383
+ x, x_lengths = self.time_mask(x, x_lengths)
384
+ return x, x_lengths
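A minimal sketch of applying this module to a batch of log-mel features during training; the parameter values below are illustrative, not the training configuration:

```python
import torch

specaug = SpecAug(
    apply_time_warp=True,
    time_warp_window=5,
    apply_freq_mask=True,
    freq_mask_width_range=(0, 10),
    num_freq_mask=2,
    apply_time_mask=True,
    time_mask_width_range=(0, 20),
    num_time_mask=2,
)

feats = torch.randn(4, 300, 80)                 # (batch, frames, mel bins)
lengths = torch.tensor([300, 250, 280, 300])
aug, lengths = specaug(feats, lengths)          # shapes are preserved
```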
src/model/powsm/utils.py ADDED
@@ -0,0 +1,80 @@
1
+ import dataclasses
2
+ import warnings
3
+
4
+ import numpy as np
5
+ import torch
6
+
7
+
8
+ def force_gatherable(data, device):
9
+ """Change object to gatherable in torch.nn.DataParallel recursively
10
+
11
+ DataParallel places these restrictions on the returned value:
12
+ The object must be
13
+ - torch.cuda.Tensor
14
+ - with 1 or more dimensions (a 0-dim tensor triggers a warning),
15
+ or a list, tuple, dict.
16
+
17
+ """
18
+ if isinstance(data, dict):
19
+ return {k: force_gatherable(v, device) for k, v in data.items()}
20
+ # DataParallel can't handle NamedTuple well
21
+ elif isinstance(data, tuple) and type(data) is not tuple:
22
+ return type(data)(*[force_gatherable(o, device) for o in data])
23
+ elif isinstance(data, (list, tuple, set)):
24
+ return type(data)(force_gatherable(v, device) for v in data)
25
+ elif isinstance(data, np.ndarray):
26
+ return force_gatherable(torch.from_numpy(data), device)
27
+ elif isinstance(data, torch.Tensor):
28
+ if data.dim() == 0:
29
+ # To 1-dim array
30
+ data = data[None]
31
+ return data.to(device)
32
+ elif isinstance(data, float):
33
+ return torch.tensor([data], dtype=torch.float, device=device)
34
+ elif isinstance(data, int):
35
+ return torch.tensor([data], dtype=torch.long, device=device)
36
+ elif data is None:
37
+ return None
38
+ else:
39
+ warnings.warn(f"{type(data)} may not be gatherable by DataParallel")
40
+ return data
41
+
42
+
43
+ def to_device(data, device=None, dtype=None, non_blocking=False, copy=False):
44
+ """Change the device of object recursively"""
45
+ if isinstance(data, dict):
46
+ return {
47
+ k: to_device(v, device, dtype, non_blocking, copy) for k, v in data.items()
48
+ }
49
+ elif dataclasses.is_dataclass(data) and not isinstance(data, type):
50
+ return type(data)(
51
+ *[
52
+ to_device(v, device, dtype, non_blocking, copy)
53
+ for v in dataclasses.astuple(data)
54
+ ]
55
+ )
56
+ # Probably a NamedTuple (a tuple subclass); rebuild it via its own constructor.
57
+ elif isinstance(data, tuple) and type(data) is not tuple:
58
+ return type(data)(
59
+ *[to_device(o, device, dtype, non_blocking, copy) for o in data]
60
+ )
61
+ elif isinstance(data, (list, tuple)):
62
+ return type(data)(to_device(v, device, dtype, non_blocking, copy) for v in data)
63
+ elif isinstance(data, np.ndarray):
64
+ return to_device(torch.from_numpy(data), device, dtype, non_blocking, copy)
65
+ elif isinstance(data, torch.Tensor):
66
+ if dtype is not None:
67
+ dtype = str(dtype).removeprefix("torch.")
68
+ cur_dtype = str(data.dtype).removeprefix("torch.")
69
+
70
+ if not (
71
+ ("int" in dtype and "int" in cur_dtype)
72
+ or ("float" in dtype and "float" in cur_dtype)
73
+ ):
74
+ dtype = None # avoid conversion between int and float.
75
+ else:
76
+ dtype = getattr(torch, dtype)
77
+
78
+ return data.to(device, dtype, non_blocking, copy)
79
+ else:
80
+ return data
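A small sketch of how these helpers behave; the batch keys and values are illustrative, not taken from the training code:

```python
import numpy as np
import torch

batch = {
    "speech": np.zeros(16000, dtype=np.float32),
    "speech_lengths": 16000,
    "lang_sym": None,
}
batch = to_device(batch, device="cpu", dtype=torch.float32)
# np.ndarray -> torch.Tensor; plain ints and None are passed through unchanged

loss = torch.tensor(1.23)
stats = force_gatherable({"loss": loss}, device="cpu")
# 0-dim tensors are promoted to shape (1,) so DataParallel can gather them
```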
src/model/xeusphoneme/__init__.py ADDED
File without changes
src/model/xeusphoneme/builders.py ADDED
@@ -0,0 +1,307 @@
1
+ import copy
2
+ from pathlib import Path
3
+ from typing import Dict, Optional, Tuple
4
+ import argparse
5
+ import yaml
6
+ import json
7
+ import torch
8
+
9
+ from src.model.powsm.specaug import SpecAug
10
+ from src.model.powsm.e_branchformer import EBranchformerEncoder
11
+ from src.model.xeusphoneme.cnn_frontend import CNNFrontend as Wav2VecCNN
12
+ from src.model.xeusphoneme.linear_layer import LinearProjection
13
+ from src.core.utils import download_hf_snapshot
14
+ from src.model.xeusphoneme.xeuspr_model import XeusPRModel
15
+ from src.model.xeusphoneme.xeuspr_inference import XeusPRInference
16
+ from src.model.powsm.ctc import CTC
17
+ from src.utils import RankedLogger
18
+
19
+
20
+ log = RankedLogger(__name__, rank_zero_only=False)
21
+
22
+
23
+ class XeusPRTokenizer:
24
+ """Tokenizer that maps IPA phones to IDs using the xeuspr ipa_vocab.json."""
25
+
26
+ def __init__(self, vocab_file: str):
27
+ with open(vocab_file) as f:
28
+ self.vocab: Dict[str, int] = json.load(f)
29
+ self.unk_id = self.vocab.get("<unk>", 0)
30
+
31
+ def tokens2ids(self, tokens) -> list:
32
+ return [self.vocab.get(t, self.unk_id) for t in tokens]
33
+
34
+
35
+
36
+ def build_xeus_pr(
37
+ config_file: str,
38
+ checkpoint: Optional[str] = None,
39
+ vocab_file: Optional[str] = None,
40
+ ctc_config: Optional[dict] = None,
41
+ weighted_sum: bool = False,
42
+ interctc_layer_idx: Optional[list] = None,
43
+ interctc_weight: float = 0.0,
44
+ interctc_use_conditioning: bool = False,
45
+ interctc_ctc_type: str = "phone",
46
+ ctc_aux_config: Optional[dict] = None,
47
+ decoder_config: Optional[dict] = None,
48
+ ctc_weight: float = 1.0,
49
+ ) -> XeusPRModel:
50
+ """Build Xeus PR model from config and optional checkpoint.
51
+
52
+ Args:
53
+ config_file: Path to config yaml file
54
+ checkpoint: Path to model checkpoint (pretrained or fully trained)
55
+ vocab_file: Path to vocabulary file. If None, use vocab in config.
56
+ ctc_config: Optional dict of CTC config
57
+ weighted_sum: Whether to use weighted sum of transformer layers
58
+
59
+ Returns:
60
+ XeusPRModel
61
+ """
62
+ with open(config_file, "r", encoding="utf-8") as f:
63
+ args = argparse.Namespace(**yaml.safe_load(f))
64
+ if vocab_file is not None:
65
+ with open(vocab_file) as f:
66
+ tok2id = json.load(f)
67
+ id2tok = {v: k for k, v in tok2id.items()}
68
+ token_list = [id2tok[i] for i in range(len(id2tok))]
69
+ elif isinstance(args.token_list, str):
70
+ with open(args.token_list, encoding="utf-8") as f:
71
+ token_list = [line.rstrip() for line in f]
72
+ else:
73
+ token_list = list(args.token_list)
74
+ vocab_size = len(token_list)
75
+ log.info(f"Vocabulary size: {vocab_size}")
76
+
77
+ assert (
78
+ getattr(args, "frontend") == "wav2vec_cnn"
79
+ ), "Config must specify wav2vec_cnn frontend"
80
+ frontend = Wav2VecCNN(**args.frontend_conf)
81
+ input_size = frontend.output_size()
82
+
83
+ specaug = None
84
+ if hasattr(args, "specaug") and args.specaug == "specaug":
85
+ specaug = SpecAug(**args.specaug_conf)
86
+
87
+ normalize = None
88
+ assert (
89
+ getattr(args, "preencoder") == "linear"
90
+ ), "Config must specify linear preencoder"
91
+ preencoder = LinearProjection(input_size=input_size, **args.preencoder_conf)
92
+ input_size = preencoder.output_size()
93
+ assert (
94
+ args.encoder == "e_branchformer"
95
+ ), f"Only e_branchformer supported, got {args.encoder}"
96
+ encoder_conf = dict(args.encoder_conf)
97
+ if interctc_layer_idx:
98
+ encoder_conf["interctc_layer_idx"] = interctc_layer_idx
99
+ if interctc_use_conditioning:
100
+ encoder_conf["interctc_use_conditioning"] = True
101
+ encoder = EBranchformerEncoder(input_size=input_size, **encoder_conf)
102
+
103
+ ctc_config = ctc_config or getattr(args, "ctc_conf", {})
104
+ ctc_config_orig = copy.deepcopy(ctc_config)
105
+ # Build CTC
106
+ ctc = CTC(
107
+ odim=vocab_size,
108
+ encoder_output_size=encoder.output_size(),
109
+ **ctc_config,
110
+ )
111
+
112
+ # Build optional aux CTC (orthographic vocabulary)
113
+ ctc_aux = None
114
+ if ctc_aux_config is not None:
115
+ import sentencepiece as spm
116
+
117
+ ctc_aux_config = dict(ctc_aux_config) # copy to avoid mutating caller's dict
118
+ sp = spm.SentencePieceProcessor()
119
+ sp.load(ctc_aux_config.pop("vocab_file"))
120
+ aux_vocab_size = sp.get_piece_size()
121
+ ctc_aux = CTC(
122
+ odim=aux_vocab_size,
123
+ encoder_output_size=encoder.output_size(),
124
+ ctc_type="builtin",
125
+ **ctc_aux_config,
126
+ )
127
+ log.info(f"Built aux CTC with vocab size {aux_vocab_size}")
128
+
129
+ # Build optional attention decoder
130
+ decoder = None
131
+ if decoder_config:
132
+ from src.model.powsm.transformer_decoder import TransformerDecoder
133
+
134
+ decoder = TransformerDecoder(
135
+ vocab_size=vocab_size,
136
+ encoder_output_size=encoder.output_size(),
137
+ **decoder_config,
138
+ )
139
+
140
+ # Build model
141
+ model = XeusPRModel(
142
+ encoder=encoder,
143
+ ctc=ctc,
144
+ token_list=token_list,
145
+ frontend=frontend,
146
+ specaug=specaug,
147
+ normalize=normalize,
148
+ preencoder=preencoder,
149
+ ignore_id=getattr(args, "ignore_id", -1),
150
+ sym_blank=getattr(args, "sym_blank", "<blank>"),
151
+ freeze_frontend=checkpoint is not None,
152
+ weighted_sum=weighted_sum,
153
+ interctc_weight=interctc_weight,
154
+ interctc_use_conditioning=interctc_use_conditioning,
155
+ interctc_ctc_type=interctc_ctc_type,
156
+ ctc_aux=ctc_aux,
157
+ decoder=decoder,
158
+ ctc_weight=ctc_weight,
159
+ )
160
+
161
+ if checkpoint:
162
+ state_dict = torch.load(checkpoint, map_location="cpu", weights_only=False)
163
+ if "state_dict" in state_dict:
164
+ # convert to standard xeus style checkpoint
165
+ state_dict = state_dict["state_dict"] # for finetuned lightning checkpoints
166
+ state_dict = {
167
+ k.replace("net.", ""): v
168
+ for k, v in state_dict.items()
169
+ if k.startswith("net.")
170
+ }
171
+ load_info = model.load_state_dict(state_dict, strict=False)
172
+ log.info(f"Loaded checkpoint: {checkpoint} with load info: {load_info}")
173
+ print(f"Loaded checkpoint: {checkpoint} with load info: {load_info}")
174
+
175
+ model.training_args = args
176
+ model._net_config = {
177
+ "ctc_config": ctc_config_orig,
178
+ "weighted_sum": weighted_sum,
179
+ "interctc_layer_idx": interctc_layer_idx,
180
+ "interctc_weight": interctc_weight,
181
+ "interctc_use_conditioning": interctc_use_conditioning,
182
+ "interctc_ctc_type": interctc_ctc_type,
183
+ "ctc_aux_config": ctc_aux_config,
184
+ "decoder_config": decoder_config,
185
+ "ctc_weight": ctc_weight,
186
+ }
187
+ return model
188
+
189
+
190
+ def build_xeus_pr_from_hf(
191
+ *,
192
+ work_dir: str,
193
+ hf_repo: Optional[str] = None,
194
+ force: bool = False,
195
+ config_file: Optional[str] = None,
196
+ checkpoint: Optional[str] = None,
197
+ vocab_file: Optional[str] = None,
198
+ ctc_config: Optional[dict] = None,
199
+ load_ckpt: bool = True,
200
+ weighted_sum: bool = False,
201
+ interctc_layer_idx: Optional[list] = None,
202
+ interctc_weight: float = 0.0,
203
+ interctc_use_conditioning: bool = False,
204
+ interctc_ctc_type: str = "phone",
205
+ ctc_aux_config: Optional[dict] = None,
206
+ decoder_config: Optional[dict] = None,
207
+ ctc_weight: float = 1.0,
208
+ ) -> XeusPRModel:
209
+ """Build Xeus PR model from local files or HuggingFace repo.
210
+
211
+ Args:
212
+ work_dir: Directory to store downloaded files from HF repo
213
+ hf_repo: HuggingFace repo name (e.g., "username/xeus-pr")
214
+ If None, load from local files only
215
+ force: Whether to force re-download from HF repo
216
+ config_file: Path to config file. If None, use default path in work_dir.
217
+ Takes precedence over hf_repo download.
218
+ checkpoint: Path to checkpoint file. If None, use default path in work_dir.
219
+ Takes precedence over hf_repo download.
220
+ vocab_file: Path to vocabulary file. If None, use path in config.
221
+ ctc_config: Optional dict of CTC config
222
+ load_ckpt: Whether to load checkpoint weights
223
+ weighted_sum: Whether to use weighted sum of transformer layers
224
+ Returns:
225
+ XeusPRModel
226
+ """
227
+ # Default relative paths in HF repo
228
+ REL_CONFIG = "model/config.yaml"
229
+ REL_CKPT = "model/xeus_checkpoint_new.pth"
230
+
231
+ # Download from HF if repo specified
232
+ if hf_repo:
233
+ log.info(f"Downloading snapshot from HuggingFace: {hf_repo}")
234
+ download_hf_snapshot(
235
+ repo_id=hf_repo,
236
+ force_download=force,
237
+ work_dir=work_dir,
238
+ )
239
+
240
+ # Resolve file paths
241
+ root = Path(work_dir)
242
+ cfg = config_file or str(root / REL_CONFIG)
243
+ ckpt = checkpoint or str(root / REL_CKPT)
244
+
245
+ # Verify files exist
246
+ assert Path(cfg).exists(), f"Config file not found: {cfg}"
247
+ if not load_ckpt:
248
+ ckpt = None
249
+ else:
250
+ assert Path(ckpt).exists(), f"Checkpoint file not found: {ckpt}"
251
+
252
+ log.info(f"Building model from config: {cfg}")
253
+ log.info(f"Loading checkpoint: {ckpt}")
254
+
255
+ return build_xeus_pr(
256
+ config_file=cfg,
257
+ checkpoint=ckpt,
258
+ vocab_file=vocab_file,
259
+ ctc_config=ctc_config,
260
+ weighted_sum=weighted_sum,
261
+ interctc_layer_idx=interctc_layer_idx,
262
+ interctc_weight=interctc_weight,
263
+ interctc_use_conditioning=interctc_use_conditioning,
264
+ interctc_ctc_type=interctc_ctc_type,
265
+ ctc_aux_config=ctc_aux_config,
266
+ decoder_config=decoder_config,
267
+ ctc_weight=ctc_weight,
268
+ )
269
+
270
+
271
+ def build_xeus_pr_inference(
272
+ work_dir: str,
273
+ checkpoint: str,
274
+ vocab_file: str,
275
+ device,
276
+ config_file: Optional[str] = None,
277
+ hf_repo: Optional[str] = None,
278
+ force_download: bool = False,
279
+ dtype: str = "float32",
280
+ ctc_config: Optional[dict] = None,
281
+ weighted_sum: bool = False,
282
+ interctc_layer_idx: Optional[list] = None,
283
+ interctc_weight: float = 0.0,
284
+ interctc_use_conditioning: bool = False,
285
+ interctc_ctc_type: str = "phone",
286
+ ctc_aux_config: Optional[dict] = None,
287
+ decoder_config: Optional[dict] = None,
288
+ ) -> XeusPRInference:
289
+ model = build_xeus_pr_from_hf(
290
+ work_dir=work_dir,
291
+ hf_repo=hf_repo,
292
+ force=force_download,
293
+ config_file=config_file,
294
+ checkpoint=checkpoint,
295
+ vocab_file=vocab_file,
296
+ ctc_config=ctc_config,
297
+ load_ckpt=True,
298
+ weighted_sum=weighted_sum,
299
+ interctc_layer_idx=interctc_layer_idx,
300
+ interctc_weight=interctc_weight,
301
+ interctc_use_conditioning=interctc_use_conditioning,
302
+ interctc_ctc_type=interctc_ctc_type,
303
+ ctc_aux_config=ctc_aux_config,
304
+ decoder_config=decoder_config,
305
+ )
306
+ inference_obj = XeusPRInference(model, device=device, dtype=dtype)
307
+ return inference_obj
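A quick sketch of the tokenizer defined above, using the ipa_vocab.json bundled in this commit (the example phones are arbitrary):

```python
tok = XeusPRTokenizer("src/model/xeusphoneme/resources/ipa_vocab.json")
print(tok.tokens2ids(["p", "ɾ", "i"]))   # [88, 89, 189]
print(tok.tokens2ids(["not-a-phone"]))   # [3], unknown symbols map to the <unk> id
```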
src/model/xeusphoneme/cnn_frontend.py ADDED
@@ -0,0 +1,261 @@
1
+ from typing import List, Literal, Optional, Tuple, Union
2
+
3
+ import torch
4
+ from torch import Tensor, nn
5
+ from torch.nn import Module
6
+ from torch.nn import functional as F
7
+
8
+
9
+ def dim_1_layer_norm(x, eps=1e-05, gamma=None, beta=None):
10
+ """Functional version of Dim1LayerNorm."""
11
+
12
+ B, D, T = x.shape
13
+ mean = torch.mean(x, 1, keepdim=True)
14
+ variance = torch.mean((x - mean) ** 2, 1, keepdim=True)
15
+
16
+ x = (x - mean) * torch.rsqrt(variance + eps)
17
+
18
+ if gamma is not None:
19
+ x = x * gamma.view(1, -1, 1)
20
+ if beta is not None:
21
+ x = x + beta.view(1, -1, 1)
22
+ return x
23
+
24
+
25
+ class Dim1LayerNorm(Module):
26
+ def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, bias=True):
27
+ """LayerNorm on middle dim.
28
+
29
+ It assumes the input is shape B, D, T
30
+ to avoid transposing.
31
+ Faster than TransposedLayerNorm, but
32
+ may lead to minor numerical differences.
33
+ """
34
+ super().__init__()
35
+ self.normalized_shape = normalized_shape
36
+ self.eps = eps
37
+ self.elementwise_affine = elementwise_affine
38
+
39
+ self.weight = None
40
+ self.bias = None
41
+ if elementwise_affine:
42
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
43
+ if bias:
44
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
45
+
46
+ def forward(self, x):
47
+ assert x.size(1) == self.normalized_shape
48
+ return dim_1_layer_norm(x, self.eps, self.weight, self.bias)
49
+
50
+
51
+ class TransposedLayerNorm(nn.LayerNorm):
52
+ """Layer norm with transpose"""
53
+
54
+ def forward(self, input: Tensor) -> Tensor:
55
+ x = input.transpose(-2, -1)
56
+ x = nn.functional.layer_norm(
57
+ x, self.normalized_shape, self.weight, self.bias, self.eps
58
+ )
59
+ x = x.transpose(-2, -1)
60
+ return x
61
+
62
+
63
+ class ConvLayerBlock(Module):
64
+ """Convolution unit of FeatureExtractor"""
65
+
66
+ def __init__(
67
+ self,
68
+ in_channels: int,
69
+ out_channels: int,
70
+ kernel_size: int,
71
+ stride: int,
72
+ bias: bool,
73
+ layer_norm: Optional[Module],
74
+ conv_mode: str,
75
+ ):
76
+ super().__init__()
77
+ self.kernel_size = kernel_size
78
+ self.stride = stride
79
+ self.layer_norm = layer_norm
80
+
81
+ if conv_mode == "standard":
82
+ self.conv = nn.Conv1d(
83
+ in_channels=in_channels,
84
+ out_channels=out_channels,
85
+ kernel_size=kernel_size,
86
+ stride=stride,
87
+ bias=bias,
88
+ )
89
+ elif conv_mode == "depth_only":
90
+ self.conv = nn.Conv1d(
91
+ in_channels=in_channels,
92
+ out_channels=out_channels,
93
+ kernel_size=kernel_size,
94
+ stride=stride,
95
+ bias=bias,
96
+ groups=in_channels,
97
+ )
98
+ elif conv_mode == "depth_sep":
99
+ self.conv = nn.Sequential(
100
+ nn.Conv1d(
101
+ in_channels=in_channels,
102
+ out_channels=in_channels,
103
+ kernel_size=kernel_size,
104
+ stride=stride,
105
+ bias=bias,
106
+ groups=in_channels,
107
+ ),
108
+ nn.Conv1d(
109
+ in_channels=in_channels,
110
+ out_channels=out_channels,
111
+ kernel_size=1,
112
+ stride=1,
113
+ bias=bias,
114
+ ),
115
+ )
116
+ nn.init.kaiming_normal_(self.conv.weight)
117
+
118
+ def forward(
119
+ self,
120
+ x: Tensor,
121
+ length: Optional[Tensor],
122
+ ) -> Tuple[Tensor, Optional[Tensor]]:
123
+ """ConvLayerBlock Forward.
124
+
125
+ Args:
126
+ x (Tensor): Shape: ``[batch, in_channels, in_frame]``.
127
+ length (Tensor or None, optional): Shape ``[batch, ]``.
128
+ Returns:
129
+ Tensor: Shape ``[batch, out_channels, out_frames]``.
130
+ Optional[Tensor]: Shape ``[batch, ]``.
131
+ """
132
+ x = self.conv(x)
133
+ if self.layer_norm is not None:
134
+ x = self.layer_norm(x)
135
+ x = nn.functional.gelu(x)
136
+
137
+ if length is not None:
138
+ length = (
139
+ torch.div(length - self.kernel_size, self.stride, rounding_mode="floor")
140
+ + 1
141
+ )
142
+ # When input length is 0, the resulting length can be negative.
143
+ length = torch.max(torch.zeros_like(length), length)
144
+ return x, length
145
+
146
+
147
+ class CNNFrontend(Module):
148
+ """Convolutional feature extractor.
149
+
150
+ Typically used in SSL models.
151
+ Uses raw waveforms as input.
152
+ """
153
+
154
+ def __init__(
155
+ self,
156
+ norm_mode: str,
157
+ conv_mode: str,
158
+ bias: bool,
159
+ shapes: List[Tuple[int, int, int]] = [
160
+ (512, 10, 5),
161
+ (512, 3, 2),
162
+ (512, 3, 2),
163
+ (512, 3, 2),
164
+ (512, 3, 2),
165
+ (512, 2, 2),
166
+ (512, 2, 2),
167
+ ],
168
+ fs: Union[int, str] = 16000,
169
+ normalize_audio: bool = False,
170
+ normalize_output: bool = False,
171
+ layer_norm_cls: Literal["transposed", "dim1"] = "transposed",
172
+ ):
173
+
174
+ super().__init__()
175
+
176
+ if norm_mode not in ["group_norm", "layer_norm"]:
177
+ raise ValueError("Invalid norm mode")
178
+
179
+ if conv_mode not in ["standard", "depth_only", "depth_sep"]:
180
+ raise ValueError("Invalid cnn mode")
181
+
182
+ self.output_channels = shapes[-1][0]
183
+ self.normalize_audio = normalize_audio
184
+
185
+ if layer_norm_cls == "dim1":
186
+ layer_norm_func = Dim1LayerNorm
187
+ else:
188
+ layer_norm_func = TransposedLayerNorm
189
+
190
+ blocks = []
191
+ in_channels = 1
192
+ self.downsampling_factor = 1
193
+ for i, (out_channels, kernel_size, stride) in enumerate(shapes):
194
+ normalization = None
195
+ if norm_mode == "group_norm" and i == 0:
196
+ normalization = nn.GroupNorm(
197
+ num_groups=out_channels,
198
+ num_channels=out_channels,
199
+ affine=True,
200
+ )
201
+ elif norm_mode == "layer_norm":
202
+ normalization = layer_norm_func(
203
+ normalized_shape=out_channels,
204
+ )
205
+ blocks.append(
206
+ ConvLayerBlock(
207
+ in_channels=in_channels,
208
+ out_channels=out_channels,
209
+ kernel_size=kernel_size,
210
+ stride=stride,
211
+ bias=bias,
212
+ layer_norm=normalization,
213
+ conv_mode=conv_mode,
214
+ )
215
+ )
216
+ in_channels = out_channels
217
+ self.downsampling_factor *= stride
218
+ self.layers = nn.Sequential(*blocks)
219
+
220
+ if normalize_output:
221
+ self.final_norm = nn.LayerNorm(self.output_channels)
222
+ else:
223
+ self.final_norm = nn.Identity()
224
+
225
+ def output_size(self) -> int:
226
+ return self.output_channels
227
+
228
+ def forward(
229
+ self,
230
+ x: Tensor,
231
+ length: Optional[Tensor],
232
+ ) -> Tuple[Tensor, Optional[Tensor]]:
233
+ """CNNFrontend Forward.
234
+
235
+ Args:
236
+ x (Tensor):
237
+ Input Tensor representing a batch of audio,
238
+ shape: ``[batch, time]``.
239
+ length (Tensor or None, optional):
240
+ Valid length of each input sample. shape: ``[batch, ]``.
241
+
242
+ Returns:
243
+ Tensor:
244
+ The resulting feature, shape: ``[batch, frame, feature]``
245
+ Optional[Tensor]:
246
+ Valid length of each output sample. shape: ``[batch, ]``.
247
+ """
248
+ if x.ndim != 2:
249
+ raise ValueError(
250
+ f"Expected the input to be 2D (batch, time). Found: {list(x.shape)}"
251
+ )
252
+
253
+ if self.normalize_audio:
254
+ x = F.layer_norm(x, x.shape)
255
+
256
+ x = x.unsqueeze(1) # (batch, channel==1, frame)
257
+ for layer in self.layers:
258
+ x, length = layer(x, length) # (batch, feature, frame)
259
+ x = x.transpose(1, 2) # (batch, frame, feature)
260
+ x = self.final_norm(x)
261
+ return x, length
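The default shape stack mirrors the wav2vec 2.0 feature extractor: strides 5·2·2·2·2·2·2 give a total downsampling factor of 320, i.e. one frame per 20 ms at 16 kHz. A quick sketch (the norm/conv settings below are illustrative):

```python
import torch

frontend = CNNFrontend(norm_mode="layer_norm", conv_mode="standard", bias=False)
wav = torch.randn(1, 16000)                     # one second of 16 kHz audio
lengths = torch.tensor([16000])
feats, out_lengths = frontend(wav, lengths)
print(frontend.downsampling_factor)             # 320
print(feats.shape, out_lengths)                 # torch.Size([1, 49, 512]) tensor([49])
```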
src/model/xeusphoneme/linear_layer.py ADDED
@@ -0,0 +1,21 @@
1
+ """Linear Projection."""
2
+
3
+ from typing import Tuple
4
+ import torch
5
+
6
+
7
+ class LinearProjection(torch.nn.Module):
8
+ def __init__(self, input_size: int, output_size: int, dropout: float = 0.0):
9
+ super().__init__()
10
+ self.output_dim = output_size
11
+ self.linear_out = torch.nn.Linear(input_size, output_size)
12
+ self.dropout = torch.nn.Dropout(dropout)
13
+
14
+ def forward(
15
+ self, input: torch.Tensor, input_lengths: torch.Tensor
16
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
17
+ output = self.linear_out(self.dropout(input))
18
+ return output, input_lengths # no state in this layer
19
+
20
+ def output_size(self) -> int:
21
+ return self.output_dim
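A one-line usage sketch, projecting the 512-d CNN features to an assumed encoder width of 1024 (sizes illustrative):

```python
import torch

proj = LinearProjection(input_size=512, output_size=1024, dropout=0.1)
feats = torch.randn(2, 49, 512)
out, lengths = proj(feats, torch.tensor([49, 40]))
print(out.shape)                                # torch.Size([2, 49, 1024])
```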
src/model/xeusphoneme/resources/ipa_vocab.json ADDED
@@ -0,0 +1,430 @@
1
+ {
2
+ "<blank>": 0,
3
+ "<sos>": 1,
4
+ "<eos>": 2,
5
+ "<unk>": 3,
6
+ "ʈ": 4,
7
+ "ʎː": 5,
8
+ "cː": 6,
9
+ "œ̞": 7,
10
+ "ʔʲ": 8,
11
+ "o̤": 9,
12
+ "ɠ": 10,
13
+ "ø": 11,
14
+ "kˀ": 12,
15
+ "e̝": 13,
16
+ "ʈ͡ʂ": 14,
17
+ "ɡʰ": 15,
18
+ "ɟ": 16,
19
+ "z": 17,
20
+ "ʃˠ": 18,
21
+ "vˠ": 19,
22
+ "ǃʰ": 20,
23
+ "dʷ": 21,
24
+ "ĩ": 22,
25
+ "nˠ": 23,
26
+ "ə": 24,
27
+ "t͡ʃʰ": 25,
28
+ "d̤": 26,
29
+ "fʲ": 27,
30
+ "xʷ": 28,
31
+ "ɛ̃": 29,
32
+ "ʃʰ": 30,
33
+ "ʃ̩": 31,
34
+ "ɤˀ": 32,
35
+ "əː": 33,
36
+ "ɛ̯": 34,
37
+ "ɞ": 35,
38
+ "yː": 36,
39
+ "fʷ": 37,
40
+ "ẽ": 38,
41
+ "rˤ": 39,
42
+ "ɒ": 40,
43
+ "ɲː": 41,
44
+ "j": 42,
45
+ "f": 43,
46
+ "ɲ̥": 44,
47
+ "ʃː": 45,
48
+ "l": 46,
49
+ "ʒ̩": 47,
50
+ "ɛ̝": 48,
51
+ "ð̞": 49,
52
+ "ʃʲ": 50,
53
+ "ɛ": 51,
54
+ "ɟː": 52,
55
+ "ʌ": 53,
56
+ "ʍ": 54,
57
+ "kʰ": 55,
58
+ "p͡f": 56,
59
+ "ɜː": 57,
60
+ "ɘ": 58,
61
+ "bʷ": 59,
62
+ "sː": 60,
63
+ "ɡː": 61,
64
+ "o̝": 62,
65
+ "cʼ": 63,
66
+ "tʰ": 64,
67
+ "kʷ": 65,
68
+ "ŋ̥": 66,
69
+ "r̝": 67,
70
+ "ɸː": 68,
71
+ "u̝": 69,
72
+ "ṳ": 70,
73
+ "β̞": 71,
74
+ "ɾː": 72,
75
+ "ɔˤ": 73,
76
+ "ʎ": 74,
77
+ "ʊ̃": 75,
78
+ "pˀ": 76,
79
+ "m̩": 77,
80
+ "ɕː": 78,
81
+ "ɪ̯": 79,
82
+ "ɖʰ": 80,
83
+ "ɰ": 81,
84
+ "t̠": 82,
85
+ "t͡ʃʲ": 83,
86
+ "ɡ̤": 84,
87
+ "j̩": 85,
88
+ "ɭ̩": 86,
89
+ "ŋ̰": 87,
90
+ "p": 88,
91
+ "ɾ": 89,
92
+ "sʲ": 90,
93
+ "ɲ̤": 91,
94
+ "cʰ": 92,
95
+ "a̯": 93,
96
+ "ɡʷ": 94,
97
+ "t͡s": 95,
98
+ "ɨ̯": 96,
99
+ "n̩": 97,
100
+ "ʌː": 98,
101
+ "ɤ": 99,
102
+ "l̩": 100,
103
+ "l̴": 101,
104
+ "pʲ": 102,
105
+ "k": 103,
106
+ "jː": 104,
107
+ "ɛ̈": 105,
108
+ "t͡ʃː": 106,
109
+ "dˠ": 107,
110
+ "ɱ̩": 108,
111
+ "ɯː": 109,
112
+ "kʼ": 110,
113
+ "ɑ̯": 111,
114
+ "zʷ": 112,
115
+ "çː": 113,
116
+ "ã": 114,
117
+ "sˠ": 115,
118
+ "s̻": 116,
119
+ "ɐ": 117,
120
+ "ɸʷ": 118,
121
+ "ɔ̃": 119,
122
+ "bˠ": 120,
123
+ "ʈː": 121,
124
+ "ʂ": 122,
125
+ "ɑ": 123,
126
+ "ë": 124,
127
+ "ɸ": 125,
128
+ "ɮʲ": 126,
129
+ "nː": 127,
130
+ "mʷ": 128,
131
+ "ǁ": 129,
132
+ "ʒ": 130,
133
+ "jˠ": 131,
134
+ "d": 132,
135
+ "tː": 133,
136
+ "ɤ̆": 134,
137
+ "s̺": 135,
138
+ "mː": 136,
139
+ "ɻ": 137,
140
+ "l̪": 138,
141
+ "ɜ": 139,
142
+ "ɓ": 140,
143
+ "ü": 141,
144
+ "lʲ": 142,
145
+ "tˠ": 143,
146
+ "ŋː": 144,
147
+ "ŋʲ": 145,
148
+ "h̩": 146,
149
+ "qʷ": 147,
150
+ "tʼ": 148,
151
+ "ə̯": 149,
152
+ "t͡sʲː": 150,
153
+ "m̤": 151,
154
+ "ɕʰ": 152,
155
+ "nʲ": 153,
156
+ "rˠ": 154,
157
+ "ɖ̤": 155,
158
+ "ø̈": 156,
159
+ "ɯˀ": 157,
160
+ "mʲ": 158,
161
+ "n̥": 159,
162
+ "mˤ": 160,
163
+ "ʒʲ": 161,
164
+ "æ": 162,
165
+ "tʷ": 163,
166
+ "d̪": 164,
167
+ "ʔ": 165,
168
+ "a̠": 166,
169
+ "ɾˠ": 167,
170
+ "ʉ": 168,
171
+ "ɔ̯": 169,
172
+ "zʲ": 170,
173
+ "ɳː": 171,
174
+ "t͡sː": 172,
175
+ "æ̯": 173,
176
+ "r̤": 174,
177
+ "ɑː": 175,
178
+ "ɘː": 176,
179
+ "ə˞": 177,
180
+ "zˤ": 178,
181
+ "õ": 179,
182
+ "əˀ": 180,
183
+ "e": 181,
184
+ "nˤ": 182,
185
+ "u": 183,
186
+ "ɑ̃": 184,
187
+ "o": 185,
188
+ "ħ": 186,
189
+ "ŋ": 187,
190
+ "mˠ": 188,
191
+ "i": 189,
192
+ "rʲ": 190,
193
+ "ɔ": 191,
194
+ "xʰ": 192,
195
+ "dˤ": 193,
196
+ "s̩": 194,
197
+ "t͡ɕʰ": 195,
198
+ "ɔ̈": 196,
199
+ "ĕ": 197,
200
+ "ɴ": 198,
201
+ "k͡x": 199,
202
+ "d͡ʒ": 200,
203
+ "dʲ": 201,
204
+ "æ̞": 202,
205
+ "ɡ̃": 203,
206
+ "uː": 204,
207
+ "pʰ": 205,
208
+ "ʁ": 206,
209
+ "n̪": 207,
210
+ "zˠ": 208,
211
+ "ø̞": 209,
212
+ "ɔː": 210,
213
+ "ɳ": 211,
214
+ "vʲ": 212,
215
+ "œ̃": 213,
216
+ "ɾ̝": 214,
217
+ "ũ": 215,
218
+ "ĭ": 216,
219
+ "ɐ̯": 217,
220
+ "ʁ̝": 218,
221
+ "qʼ": 219,
222
+ "β": 220,
223
+ "pʼ": 221,
224
+ "ɡ͡b": 222,
225
+ "oː": 223,
226
+ "ɲ": 224,
227
+ "j̃": 225,
228
+ "l̠": 226,
229
+ "a": 227,
230
+ "d͡ʑ": 228,
231
+ "œː": 229,
232
+ "t̪": 230,
233
+ "zː": 231,
234
+ "ʁ̩": 232,
235
+ "ɔ̤": 233,
236
+ "œ": 234,
237
+ "dʰ": 235,
238
+ "lː": 236,
239
+ "z̤": 237,
240
+ "sʰ": 238,
241
+ "ʏ̯": 239,
242
+ "ð": 240,
243
+ "r̩": 241,
244
+ "n̤": 242,
245
+ "ɭʲ": 243,
246
+ "ɭː": 244,
247
+ "ə̃": 245,
248
+ "ä": 246,
249
+ "ʀ": 247,
250
+ "æː": 248,
251
+ "ɡʲ": 249,
252
+ "ɪ̃": 250,
253
+ "lˠ": 251,
254
+ "ʊː": 252,
255
+ "cʲ": 253,
256
+ "ă": 254,
257
+ "d͡ʒː": 255,
258
+ "i̯": 256,
259
+ "ʉː": 257,
260
+ "t͡ɕː": 258,
261
+ "ɬ": 259,
262
+ "fˀ": 260,
263
+ "bʲ": 261,
264
+ "ɐ̃": 262,
265
+ "ɣ̤": 263,
266
+ "xʲ": 264,
267
+ "ɛ̆": 265,
268
+ "θ": 266,
269
+ "ɵː": 267,
270
+ "ɨ̞": 268,
271
+ "ɡ": 269,
272
+ "ð̠": 270,
273
+ "l̤": 271,
274
+ "w̃": 272,
275
+ "ɹ": 273,
276
+ "ɣʲ": 274,
277
+ "wˠ": 275,
278
+ "u̯": 276,
279
+ "wː": 277,
280
+ "ʐ": 278,
281
+ "ɵ": 279,
282
+ "ðˠ": 280,
283
+ "t͡ʃʼ": 281,
284
+ "pʷ": 282,
285
+ "v̤": 283,
286
+ "ǀʰ": 284,
287
+ "x": 285,
288
+ "ɥ": 286,
289
+ "ʂː": 287,
290
+ "r": 288,
291
+ "o̞": 289,
292
+ "ðˤ": 290,
293
+ "ɨ̃": 291,
294
+ "ʊ": 292,
295
+ "ʙ": 293,
296
+ "b̤": 294,
297
+ "ŋ̤": 295,
298
+ "kʲ": 296,
299
+ "ʏː": 297,
300
+ "ʄ": 298,
301
+ "eː": 299,
302
+ "ɗ": 300,
303
+ "ʏ̈": 301,
304
+ "ɛˤ": 302,
305
+ "w": 303,
306
+ "pː": 304,
307
+ "ɖ": 305,
308
+ "ɧ": 306,
309
+ "h": 307,
310
+ "ǁʰ": 308,
311
+ "hʲ": 309,
312
+ "ʃ": 310,
313
+ "ɑ̈": 311,
314
+ "d͡z": 312,
315
+ "bˤ": 313,
316
+ "k͡p": 314,
317
+ "ð̩": 315,
318
+ "n̠": 316,
319
+ "bː": 317,
320
+ "f̩": 318,
321
+ "wʲ": 319,
322
+ "o̯": 320,
323
+ "ʁː": 321,
324
+ "pˠ": 322,
325
+ "kː": 323,
326
+ "ɪˤ": 324,
327
+ "ʑː": 325,
328
+ "ʌ̃": 326,
329
+ "ɪː": 327,
330
+ "ǃ": 328,
331
+ "ç": 329,
332
+ "s": 330,
333
+ "hː": 331,
334
+ "rː": 332,
335
+ "tˤ": 333,
336
+ "ɦʲ": 334,
337
+ "ŋ̩": 335,
338
+ "m̥": 336,
339
+ "ɖː": 337,
340
+ "ɭ": 338,
341
+ "mˀ": 339,
342
+ "n": 340,
343
+ "iː": 341,
344
+ "æ̝": 342,
345
+ "xː": 343,
346
+ "i̤": 344,
347
+ "ɽ̤": 345,
348
+ "ɶ": 346,
349
+ "ˀs": 347,
350
+ "l̥": 348,
351
+ "ɱ": 349,
352
+ "e̞": 350,
353
+ "ʋ": 351,
354
+ "y̯": 352,
355
+ "lˤ": 353,
356
+ "ö": 354,
357
+ "a̝": 355,
358
+ "ɶː": 356,
359
+ "t͡sʼ": 357,
360
+ "s̠": 358,
361
+ "t͡sʲ": 359,
362
+ "ɪ": 360,
363
+ "y̆": 361,
364
+ "ɤː": 362,
365
+ "ɟʰ": 363,
366
+ "ʒː": 364,
367
+ "tʲ": 365,
368
+ "ɕ": 366,
369
+ "ɨ": 367,
370
+ "c": 368,
371
+ "t͡ʃ": 369,
372
+ "ʑ": 370,
373
+ "ʝ": 371,
374
+ "ʋ̥": 372,
375
+ "ɢ": 373,
376
+ "ɛː": 374,
377
+ "b": 375,
378
+ "øː": 376,
379
+ "ǀ": 377,
380
+ "ʏ": 378,
381
+ "i̝": 379,
382
+ "ʊ̯": 380,
383
+ "ʊˤ": 381,
384
+ "ɐˤ": 382,
385
+ "r̥": 383,
386
+ "t͡sʰ": 384,
387
+ "aː": 385,
388
+ "t͡ɬ": 386,
389
+ "ʋː": 387,
390
+ "sˤ": 388,
391
+ "s̪": 389,
392
+ "dː": 390,
393
+ "ɪ̈": 391,
394
+ "ɨː": 392,
395
+ "ɽʷ": 393,
396
+ "ʕ": 394,
397
+ "ɒː": 395,
398
+ "χ": 396,
399
+ "fˠ": 397,
400
+ "ɯ": 398,
401
+ "hˠ": 399,
402
+ "jˤ": 400,
403
+ "tˀ": 401,
404
+ "ɣ": 402,
405
+ "y": 403,
406
+ "ɦ": 404,
407
+ "ʈʰ": 405,
408
+ "t͡ɕ": 406,
409
+ "vː": 407,
410
+ "m": 408,
411
+ "ɮ": 409,
412
+ "e̤": 410,
413
+ "ʋʲ": 411,
414
+ "æ̃": 412,
415
+ "v": 413,
416
+ "ɽ": 414,
417
+ "t": 415,
418
+ "a̤": 416,
419
+ "e̯": 417,
420
+ "ɜ˞": 418,
421
+ "q": 419,
422
+ "bʰ": 420,
423
+ "t͡sˠ": 421,
424
+ "ʂʰ": 422,
425
+ "fː": 423,
426
+ "sʷ": 424,
427
+ "ɾʲ": 425,
428
+ "w̤": 426,
429
+ "fˤ": 427
430
+ }
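For orientation, a minimal sketch of turning this file into the id-ordered token list the model expects; the vocabulary maps each token to its CTC output id, with `<blank>` at index 0:

    import json

    with open("src/model/xeusphoneme/resources/ipa_vocab.json") as f:
        vocab = json.load(f)  # {token: id}, IPA phones plus <blank>/<sos>/<eos>/<unk>

    token_list = [tok for tok, _ in sorted(vocab.items(), key=lambda kv: kv[1])]
    id2token = {idx: tok for tok, idx in vocab.items()}
    assert token_list[0] == "<blank>" and len(token_list) == len(vocab)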
src/model/xeusphoneme/xeuspr_inference.py ADDED
@@ -0,0 +1,86 @@
+ # Compatible with the distributed inference API; uses a greedy CTC inference strategy.
+ # python -m src.model.xeusphoneme.xeuspr_inference
+ import torch
+ import numpy as np
+ import torch.nn.functional as F
+ from typing import Union, List, Dict, Any, Optional
+
+ from src.recipe.phone_recognition.greedy_ctc_strategy import GreedyCTCInference
+
+
+ class XeusPRInference:
+     """Greedy inference for Xeus Phoneme Recognition model."""
+
+     def __init__(
+         self,
+         model: torch.nn.Module,
+         device: str = "cpu",
+         dtype: str = "float32",
+     ):
+         self.device = device
+         self.dtype = getattr(torch, dtype)
+         self.model = model.to(device=self.device, dtype=self.dtype).eval()
+
+         self.token_list = model.token_list
+         self.blank_id = model.get_blank_id()
+         self.ignore_id = getattr(model, "ignore_id", -1)
+         self.inference_strategy = GreedyCTCInference(
+             token_list=self.token_list, blank_id=self.blank_id
+         )
+
+     @torch.no_grad()
+     def __call__(
+         self, speech: Union[torch.Tensor, np.ndarray], **kwargs
+     ) -> List[Dict[str, Any]]:
+         """
+         Perform greedy inference.
+         Args:
+             speech: Input speech of shape (nsamples,) or (batch, nsamples)
+         Returns:
+             List of result dicts matching the Powsm API
+         """
+         # 1. Prepare input
+         if isinstance(speech, np.ndarray):
+             speech = torch.from_numpy(speech)
+
+         if speech.dim() == 1:
+             speech = speech.unsqueeze(0)
+
+         speech = speech.to(device=self.device, dtype=self.dtype)
+         speech_lengths = torch.full(
+             (speech.size(0),), speech.size(1), device=self.device, dtype=torch.long
+         )
+         results = self.inference_strategy(
+             model=self.model,
+             speech=speech,
+             speech_lengths=speech_lengths,
+             **kwargs,
+         )
+         return results
+
+
+ if __name__ == "__main__":
+     from src.model.xeusphoneme.builders import build_xeus_pr_inference
+
+     # Example usage
+     ckpt_path = "path/to/checkpoints/last.ckpt"
+     work_dir = "path/to/exp/cache/xeus"
+     vocab_file = "src/model/xeusphoneme/resources/ipa_vocab.json"
+     device = "cpu" if not torch.cuda.is_available() else "cuda:0"
+     inference_obj = build_xeus_pr_inference(
+         work_dir=work_dir,
+         checkpoint=ckpt_path,
+         vocab_file=vocab_file,
+         hf_repo="espnet/xeus",
+         config_file=None,
+         device=device,
+         force_download=False,
+     )
+     import torchaudio
+
+     speechpath = "path/to/test_audio.wav"
+     speech = torchaudio.load(speechpath)[0].squeeze(0)
+     # speech = speech[: 16000 * 40]  # keep at most 40 seconds of audio
+     # dummy_speech = np.random.randn(16000 * 5).astype(np.float32)  # 5 seconds of audio
+     results = inference_obj(speech=speech)
+     print(results)
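For reference, the structure of `results` as produced by the greedy CTC strategy further below (the phone strings here are illustrative only):

    example = [
        {
            "predicted_transcript": "h/ɛ/l/oː",  # all decoded tokens joined with "/"
            "processed_transcript": "hɛloː",     # special <...> tokens stripped out
        }
    ]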
src/model/xeusphoneme/xeuspr_model.py ADDED
@@ -0,0 +1,378 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2025 William Chen. Adapted from ESPnet.
+ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+ """Xeus Phoneme Recognition Model.
+
+ Usage:
+     python -m src.model.xeusphoneme.xeuspr_model \
+         --work_dir path/to/cache/xeus
+ """
12
+ from typing import Any, Dict, Optional, Tuple, Union
13
+ import argparse
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ import torchaudio
18
+ from src.model.powsm.utils import force_gatherable
19
+ from src.espnet_import.nets_utils import make_pad_mask, pad_list, th_accuracy
20
+ from src.espnet_import.label_smoothing_loss import LabelSmoothingLoss
21
+
22
+ try:
23
+ from src.recipe.phone_recognition.error_calculator import (
24
+ ErrorCalculator,
25
+ )
26
+ except ImportError:
27
+
28
+ class ErrorCalculator:
29
+ """No-op stub when rapidfuzz/panphon are unavailable."""
30
+
31
+ def __init__(self, *args, **kwargs):
32
+ pass
33
+
34
+ def __call__(self, *args, **kwargs):
35
+ return {}
36
+
37
+
38
+ from src.model.powsm.ctc import CTC
39
+ from src.utils import RankedLogger
40
+
41
+ log = RankedLogger(__name__, rank_zero_only=False)
42
+
43
+
44
+ class XeusPRModel(torch.nn.Module):
45
+ """Encoder-only CTC model for phone recognition using Xeus pretrained weights."""
46
+
47
+ def __init__(
48
+ self,
49
+ encoder: Any,
50
+ ctc: CTC,
51
+ token_list: Union[Tuple, list],
52
+ frontend: Optional[Any] = None,
53
+ specaug: Optional[Any] = None,
54
+ normalize: Optional[Any] = None,
55
+ preencoder: Optional[Any] = None,
56
+ ignore_id: int = -1,
57
+ sym_blank: str = "<blank>",
58
+ freeze_frontend: bool = True,
59
+ weighted_sum: bool = False,
60
+ interctc_weight: float = 0.0,
61
+ interctc_use_conditioning: bool = False,
62
+ interctc_ctc_type: str = "phone",
63
+ ctc_aux: Optional[Any] = None,
64
+ decoder: Optional[Any] = None,
65
+ ctc_weight: float = 1.0,
66
+ lsm_weight: float = 0.0,
67
+ sym_sos: str = "<sos>",
68
+ sym_eos: str = "<eos>",
69
+ **kwargs,
70
+ ):
71
+ super().__init__()
72
+ self.frontend = frontend
73
+ self.specaug = specaug
74
+ self.normalize = normalize
75
+ self.preencoder = preencoder
76
+ self.encoder = encoder
77
+ self.ctc = ctc
78
+ self.ctc_aux = ctc_aux
79
+ self.interctc_ctc_type = interctc_ctc_type
80
+ if interctc_use_conditioning:
81
+ vocab_size_cond = (
82
+ ctc_aux.ctc_lo.out_features
83
+ if interctc_ctc_type == "ortho" and ctc_aux is not None
84
+ else len(token_list)
85
+ )
86
+ self.encoder.conditioning_layer = torch.nn.Linear(
87
+ vocab_size_cond, encoder.output_size()
88
+ )
89
+ self.encoder.interctc_use_conditioning = True
90
+ self.token_list = list(token_list)
91
+ self.ignore_id = ignore_id
92
+ self.blank_id = token_list.index(sym_blank) if sym_blank in token_list else 0
93
+ sym_space = kwargs.get("sym_space", "<space>")
94
+ self.freeze_frontend = freeze_frontend
95
+ self.error_calculator = ErrorCalculator(
96
+ token_list,
97
+ blank_id=self.blank_id,
98
+ sym_space=sym_space,
99
+ ignore_id=ignore_id,
100
+ log_phone_metrics=True,
101
+ )
102
+
103
+ self.decoder = decoder
104
+ self.ctc_weight = ctc_weight
105
+ if decoder is not None:
106
+ self.sos = token_list.index(sym_sos)
107
+ self.eos = token_list.index(sym_eos)
108
+ self.criterion_att = LabelSmoothingLoss(
109
+ size=len(token_list),
110
+ padding_idx=ignore_id,
111
+ smoothing=lsm_weight,
112
+ normalize_length=False,
113
+ )
114
+
115
+ self.weighted_sum = weighted_sum
116
+ if self.weighted_sum:
117
+ n_layers = encoder.num_blocks
118
+ assert (
119
+ n_layers is not None and n_layers > 0
120
+ ), "Cannot infer number of encoder layers for weighted_sum"
121
+ self.layer_weights = torch.nn.Parameter(torch.zeros(int(n_layers)))
122
+ self.interctc_weight = interctc_weight
123
+ self.sampling_rate = 16000
124
+
125
+ def points_by_frames(self) -> int:
126
+ """Samples per encoder frame (CNN downsampling factor)."""
127
+ return self.frontend.downsampling_factor
128
+
129
+ @torch.no_grad()
130
+ def forced_align(self, speech, speech_lengths, text, text_lengths, utt_id=None):
131
+ """CTC forced alignment via torchaudio.functional.forced_align (batch size 1)."""
132
+ assert speech.shape[0] == 1, "forced_align requires batch size 1"
133
+ text = text[:, : text_lengths.max()]
134
+ logits, logit_lengths = self.ctc_logits(speech, speech_lengths)
135
+ log_probs = F.log_softmax(logits, dim=-1)
136
+ align_label, align_prob = torchaudio.functional.forced_align(
137
+ log_probs, text, logit_lengths, text_lengths, blank=self.blank_id
138
+ )
139
+ return align_label, align_prob
140
+
141
+ def collect_feats(
142
+ self, speech: torch.Tensor, speech_lengths: torch.Tensor, **kwargs
143
+ ) -> Dict[str, torch.Tensor]:
144
+ """Extract features for stats collection."""
145
+ feats, feats_lengths = self._extract_feats(speech, speech_lengths)
146
+ return {"feats": feats, "feats_lengths": feats_lengths}
147
+
148
+ def forward(self, speech, speech_lengths, text, text_lengths, **kwargs):
149
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
150
+
151
+ intermediate_outs = None
152
+ if isinstance(encoder_out, tuple):
153
+ intermediate_outs = encoder_out[1]
154
+ encoder_out = encoder_out[0]
155
+
156
+ loss_ctc, stats = self._calc_ctc_loss(
157
+ encoder_out, encoder_out_lens, text, text_lengths, **kwargs
158
+ )
159
+
160
+ if self.interctc_weight > 0.0 and intermediate_outs:
161
+ if self.interctc_ctc_type == "ortho" and self.ctc_aux is not None:
162
+ ctc_inter = self.ctc_aux
163
+ ys_inter = kwargs.get("asr_text_tokens")
164
+ ys_inter_lens = kwargs.get("asr_text_length")
165
+ else:
166
+ ctc_inter = self.ctc
167
+ ys_inter = torch.where(text == -1, self.ignore_id, text)[
168
+ :, : text_lengths.max()
169
+ ]
170
+ ys_inter_lens = text_lengths
171
+
172
+ if ys_inter is not None and ys_inter_lens is not None:
173
+ loss_interctc = 0.0
174
+ for layer_idx, intermediate_out in intermediate_outs:
175
+ loss_ic = ctc_inter(
176
+ intermediate_out,
177
+ encoder_out_lens,
178
+ ys_inter,
179
+ ys_inter_lens,
180
+ )
181
+ loss_interctc = loss_interctc + loss_ic
182
+ stats[f"loss_interctc_layer{layer_idx}"] = loss_ic.detach()
183
+ loss_interctc = loss_interctc / len(intermediate_outs)
184
+ loss_ctc = (
185
+ 1 - self.interctc_weight
186
+ ) * loss_ctc + self.interctc_weight * loss_interctc
187
+
188
+ # Attention branch
189
+ if self.ctc_weight < 1.0 and self.decoder is not None:
190
+ loss_att, acc_att = self._calc_att_loss(
191
+ encoder_out, encoder_out_lens, text, text_lengths
192
+ )
193
+ stats["loss_att"] = loss_att.detach()
194
+ stats["acc_att"] = acc_att
195
+ loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att
196
+ else:
197
+ loss = loss_ctc
198
+
199
+ loss, stats, weight = force_gatherable(
200
+ (loss, stats, speech.shape[0]), loss.device
201
+ )
202
+ return {"loss": loss, "stats": stats, "weight": weight}
203
+
204
+ def _extract_feats(
205
+ self, speech: torch.Tensor, speech_lengths: torch.Tensor
206
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
207
+ """Extract features using frontend."""
208
+ speech = speech[:, : speech_lengths.max()]
209
+ return (
210
+ self.frontend(speech, speech_lengths)
211
+ if self.frontend
212
+ else (speech, speech_lengths)
213
+ )
214
+
215
+ def _apply_preprocessing(
216
+ self, speech: torch.Tensor, speech_lengths: torch.Tensor
217
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
218
+ """Apply frontend, specaug, normalize, and preencoder."""
219
+ speech, speech_lengths = self._extract_feats(speech, speech_lengths)
220
+
221
+ if self.specaug and self.training:
222
+ speech, speech_lengths = self.specaug(speech, speech_lengths)
223
+
224
+ if self.normalize:
225
+ speech, speech_lengths = self.normalize(speech, speech_lengths)
226
+
227
+ if self.preencoder:
228
+ speech, speech_lengths = self.preencoder(speech, speech_lengths)
229
+
230
+ return speech, speech_lengths
231
+
232
+ def encode(
233
+ self, speech: torch.Tensor, speech_lengths: torch.Tensor
234
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
235
+ """Encode speech to frame-level representations.
236
+
237
+ When weighted_sum=True, returns a weighted sum of all encoder layers.
238
+ Otherwise, calls the encoder without return_all_hs; if interctc_layer_idx
239
+ is configured on the encoder, returns (final_out, [(layer_idx, tensor), ...]).
240
+ """
241
+ speech, speech_lengths = self._apply_preprocessing(speech, speech_lengths)
242
+ pad_masks = make_pad_mask(speech_lengths).to(speech.device)
243
+ if self.weighted_sum:
244
+ encoder_out, encoder_out_lens, _ = self.encoder(
245
+ speech, speech_lengths, masks=pad_masks, return_all_hs=True
246
+ )
247
+ hs_list = encoder_out[1]
248
+ assert len(hs_list) == self.layer_weights.numel()
249
+ w = torch.softmax(self.layer_weights, dim=0).to(
250
+ hs_list[0].device, hs_list[0].dtype
251
+ )
252
+ hs = torch.stack(hs_list, dim=0) # (L, B, T, D)
253
+ return (w.view(-1, 1, 1, 1) * hs).sum(0), encoder_out_lens
254
+ else:
255
+ ctc_for_encoder = (
256
+ self.ctc_aux
257
+ if self.interctc_ctc_type == "ortho" and self.ctc_aux is not None
258
+ else self.ctc
259
+ )
260
+ encoder_out, encoder_out_lens, _ = self.encoder(
261
+ speech, speech_lengths, masks=pad_masks, ctc=ctc_for_encoder
262
+ )
263
+ return encoder_out, encoder_out_lens
264
+
265
+ def ctc_collapse_batch(self, x: torch.Tensor, max_length: int, pad: int = -1):
266
+ B, T = x.shape
267
+ blank = self.blank_id
268
+ x_prev = torch.cat(
269
+ [torch.full((B, 1), blank, device=x.device, dtype=x.dtype), x[:, :-1]],
270
+ dim=1,
271
+ )
272
+ keep = (x != blank) & ((x_prev == blank) | (x != x_prev))
273
+ pos = keep.long().cumsum(1) - 1
274
+ lengths = keep.sum(1)
275
+ out = torch.full((B, T), pad, device=x.device, dtype=x.dtype)
276
+ # Compute batch indices and output positions for kept elements
277
+ batch_idx = (
278
+ torch.arange(B, device=x.device, dtype=torch.long).unsqueeze(1).expand_as(x)
279
+ )
280
+ output_pos = pos.clone()
281
+ # Only use positions where keep is True
282
+ batch_idx_keep = batch_idx[keep]
283
+ output_pos_keep = output_pos[keep]
284
+ # Flatten the output and set values at correct positions
285
+ flat_out = out.view(-1)
286
+ flat_idx = batch_idx_keep * T + output_pos_keep
287
+ flat_out[flat_idx] = x[keep]
288
+ out = flat_out.view(B, T)
289
+ ##### Trim to max_length from ground truth lengths
290
+ out = out[:, :max_length]
291
+ lengths = torch.clamp(lengths, max=max_length)
292
+ return out, lengths
293
+
294
+ def _calc_att_loss(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens):
295
+ ys_pad = torch.where(ys_pad == -1, self.ignore_id, ys_pad)
296
+ ys = [y[y != self.ignore_id][:l] for y, l in zip(ys_pad, ys_pad_lens)]
297
+ _sos = ys_pad.new([self.sos])
298
+ _eos = ys_pad.new([self.eos])
299
+ ys_in = [torch.cat([_sos, y]) for y in ys]
300
+ ys_out = [torch.cat([y, _eos]) for y in ys]
301
+ ys_in_pad = pad_list(ys_in, self.eos)
302
+ ys_out_pad = pad_list(ys_out, self.ignore_id)
303
+ ys_in_lens = torch.tensor([len(y) for y in ys_in], device=ys_pad.device)
304
+
305
+ decoder_out, _ = self.decoder(
306
+ encoder_out, encoder_out_lens, ys_in_pad, ys_in_lens
307
+ )
308
+ loss_att = self.criterion_att(decoder_out, ys_out_pad)
309
+ acc_att = th_accuracy(
310
+ decoder_out.view(-1, len(self.token_list)),
311
+ ys_out_pad,
312
+ ignore_label=self.ignore_id,
313
+ )
314
+ return loss_att, acc_att
315
+
316
+ def _calc_ctc_loss(
317
+ self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, **kwargs
318
+ ):
319
+ ys_pad = torch.where(ys_pad == -1, self.ignore_id, ys_pad)
320
+ ys_pad = ys_pad[:, : ys_pad_lens.max()]
321
+ loss_ctc = self.ctc(
322
+ encoder_out,
323
+ encoder_out_lens,
324
+ ys_pad,
325
+ ys_pad_lens,
326
+ lang_sym=kwargs.get("lang_sym"),
327
+ accent_sym=kwargs.get("accent_sym"),
328
+ )
329
+ stats = {}
330
+ assert self.error_calculator is not None, "ErrorCalculator not initialized"
331
+ if not self.training: # err calc, slow?
332
+ with torch.no_grad():
333
+ ys_hat = self.ctc.argmax(encoder_out).data # greedy-top1
334
+ metrics = self.error_calculator(
335
+ ys_hat.cpu(), ys_pad.cpu(), ys_pad_lens.cpu()
336
+ )
337
+ for k, v in metrics.items():
338
+ stats[k + "_ctc"] = v
339
+ return loss_ctc, stats
340
+
341
+ def ctc_logits(
342
+ self, speech: torch.Tensor, speech_lengths: torch.Tensor
343
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
344
+ """Get CTC logits for inference."""
345
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
346
+ if isinstance(encoder_out, tuple):
347
+ encoder_out = encoder_out[0]
348
+ return self.ctc.ctc_lo(encoder_out), encoder_out_lens
349
+
350
+ def encoder_output_size(self) -> int:
351
+ return self.encoder.output_size()
352
+
353
+ def get_blank_id(self) -> int:
354
+ return self.blank_id
355
+
356
+ def get_frontend(self):
357
+ return self.frontend
358
+
359
+ def get_trainable_parameters(self):
360
+ trainable_params = {"head": [], "encoder": []}
361
+ for n, p in self.named_parameters():
362
+ if (
363
+ n.startswith("ctc")
364
+ or n.startswith("decoder")
365
+ or n.startswith("criterion_att")
366
+ ):
367
+ trainable_params["head"].append(p)
368
+ elif n.startswith("encoder"):
369
+ trainable_params["encoder"].append(p)
370
+ elif n.startswith("frontend"):
371
+ if self.freeze_frontend:
372
+ p.requires_grad = False
373
+ else:
374
+ trainable_params["encoder"].append(p)
375
+ else:
376
+ # freeze other parts:
377
+ p.requires_grad = False
378
+ return trainable_params
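A minimal sketch of feeding these parameter groups to an optimizer; `model` and the learning rates below are placeholders, not values taken from this repo:

    import torch

    # model: an instantiated XeusPRModel (hypothetical here)
    groups = model.get_trainable_parameters()  # {"head": [...], "encoder": [...]}
    optimizer = torch.optim.AdamW(
        [
            {"params": groups["head"], "lr": 1e-3},
            {"params": groups["encoder"], "lr": 1e-5},
        ],
        weight_decay=0.01,
    )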
src/recipe/__init__.py ADDED
File without changes
src/recipe/phone_recognition/__init__.py ADDED
File without changes
src/recipe/phone_recognition/greedy_ctc_strategy.py ADDED
@@ -0,0 +1,63 @@
+ import torch
+ from typing import List, Dict, Any, Union
+
+
+ def ctc_collapse_vectorized(
+     ids: torch.Tensor, blank_id: int, ignore_id: int = -1
+ ) -> List[List[int]]:
+     """Optimized CTC collapse for batch tensors."""
+     mask = torch.ones_like(ids, dtype=torch.bool)
+     mask[:, 1:] = ids[:, 1:] != ids[:, :-1]
+     mask &= ids != blank_id
+     if ignore_id != -1:
+         mask &= ids != ignore_id
+
+     return [ids[i][mask[i]].tolist() for i in range(ids.size(0))]
+
+
+ class GreedyCTCInference:
+     """A scalable inference engine for any CTC-based phone recognizer."""
+
+     def __init__(self, token_list: List[str], blank_id: int):
+         self.token_list = token_list
+         self.blank_id = blank_id
+
+     @torch.no_grad()
+     def __call__(
+         self,
+         model: torch.nn.Module,
+         speech: torch.Tensor,
+         speech_lengths: torch.Tensor,
+         **kwargs
+     ) -> List[Dict[str, Any]]:
+         # 1. Standardized forward pass
+         # Works as long as model has .encode() and .ctc
+         encoder_out, _ = model.encode(speech, speech_lengths)
+         if isinstance(encoder_out, tuple):
+             encoder_out = encoder_out[0]
+         logits = model.ctc.ctc_lo(encoder_out)
+
+         # 2. Greedy search
+         y_hat = torch.argmax(logits, dim=-1)
+
+         # 3. Collapse
+         collapsed_ids = ctc_collapse_vectorized(y_hat, self.blank_id)
+
+         # 4. Map to text
+         results = []
+         for ids in collapsed_ids:
+             tokens = [self.token_list[i] for i in ids]
+             raw_text = "/".join(tokens)
+             # Filter special tokens
+             clean_tokens = [
+                 t for t in tokens if not (t.startswith("<") and t.endswith(">"))
+             ]
+             processed = "".join(clean_tokens).strip()  # replace(self.sym_space, " ")
+
+             results.append(
+                 {
+                     "processed_transcript": processed,
+                     "predicted_transcript": raw_text,
+                 }
+             )
+         return results
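A toy check of the collapse rule above (merge repeats, then drop blanks); with blank_id = 0 the frame sequence below reduces to [5, 3, 5]:

    import torch
    from src.recipe.phone_recognition.greedy_ctc_strategy import ctc_collapse_vectorized

    ids = torch.tensor([[0, 5, 5, 0, 3, 3, 0, 5]])   # per-frame argmax ids, blank = 0
    print(ctc_collapse_vectorized(ids, blank_id=0))  # [[5, 3, 5]]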
src/utils/__init__.py ADDED
@@ -0,0 +1 @@
+ from src.utils.pylogger import RankedLogger
src/utils/pylogger.py ADDED
@@ -0,0 +1,23 @@
+ import logging
+ from typing import Mapping, Optional
+
+
+ class RankedLogger(logging.LoggerAdapter):
+     """Simplified logger for single-process inference (no Lightning)."""
+
+     def __init__(
+         self,
+         name: str = __name__,
+         rank_zero_only: bool = False,
+         extra: Optional[Mapping[str, object]] = None,
+     ) -> None:
+         logger = logging.getLogger(name)
+         super().__init__(logger=logger, extra=extra)
+         self.rank_zero_only = rank_zero_only
+
+     def log(
+         self, level: int, msg: str, rank: Optional[int] = None, *args, **kwargs
+     ) -> None:
+         if self.isEnabledFor(level):
+             msg, kwargs = self.process(msg, kwargs)
+             self.logger.log(level, msg, *args, **kwargs)
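A minimal usage sketch; in this single-process variant the `rank` argument and `rank_zero_only` flag are stored but never filter anything:

    import logging
    from src.utils import RankedLogger

    logging.basicConfig(level=logging.INFO)
    log = RankedLogger(__name__, rank_zero_only=True)
    log.info("checkpoint loaded")  # behaves like a plain logging.LoggerAdapter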