Upload 12 files
Browse files- CKPT.yaml +4 -0
- SLU2.py +1345 -0
- brain.ckpt +3 -0
- counter.ckpt +3 -0
- hyperparams.yaml +170 -0
- labelencoder.txt +113 -0
- lr_annealing.ckpt +3 -0
- lr_annealing_wav2vec.ckpt +3 -0
- model.ckpt +3 -0
- optimizer.ckpt +3 -0
- optimizer_wav2vec.ckpt +3 -0
- wav2vec.ckpt +3 -0
CKPT.yaml
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# yamllint disable
|
| 2 |
+
COER: 35.85329341317365
|
| 3 |
+
end-of-epoch: true
|
| 4 |
+
unixtime: 1701399679.8773978
|
SLU2.py
ADDED
|
@@ -0,0 +1,1345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
""" Specifies the inference interfaces for Automatic speech Recognition (ASR) modules.
|
| 2 |
+
|
| 3 |
+
Authors:
|
| 4 |
+
* Aku Rouhe 2021
|
| 5 |
+
* Peter Plantinga 2021
|
| 6 |
+
* Loren Lugosch 2020
|
| 7 |
+
* Mirco Ravanelli 2020
|
| 8 |
+
* Titouan Parcollet 2021
|
| 9 |
+
* Abdel Heba 2021
|
| 10 |
+
* Andreas Nautsch 2022, 2023
|
| 11 |
+
* Pooneh Mousavi 2023
|
| 12 |
+
* Sylvain de Langen 2023, 2024
|
| 13 |
+
* Adel Moumen 2023, 2024
|
| 14 |
+
* Pradnya Kandarkar 2023
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import functools
|
| 18 |
+
import itertools
|
| 19 |
+
from dataclasses import dataclass
|
| 20 |
+
from typing import Any, List, Optional, Tuple
|
| 21 |
+
|
| 22 |
+
import sentencepiece
|
| 23 |
+
import torch
|
| 24 |
+
import torchaudio
|
| 25 |
+
from tqdm import tqdm
|
| 26 |
+
|
| 27 |
+
import speechbrain
|
| 28 |
+
from speechbrain.inference.interfaces import Pretrained
|
| 29 |
+
from speechbrain.utils.data_utils import split_path
|
| 30 |
+
from speechbrain.utils.dynamic_chunk_training import DynChunkTrainConfig
|
| 31 |
+
from speechbrain.utils.fetching import fetch
|
| 32 |
+
from speechbrain.utils.streaming import split_fixed_chunks
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class EncoderDecoderASR(Pretrained):
|
| 36 |
+
"""A ready-to-use Encoder-Decoder ASR model
|
| 37 |
+
|
| 38 |
+
The class can be used either to run only the encoder (encode()) to extract
|
| 39 |
+
features or to run the entire encoder-decoder model
|
| 40 |
+
(transcribe()) to transcribe speech. The given YAML must contain the fields
|
| 41 |
+
specified in the *_NEEDED[] lists.
|
| 42 |
+
|
| 43 |
+
Arguments
|
| 44 |
+
---------
|
| 45 |
+
*args : tuple
|
| 46 |
+
**kwargs : dict
|
| 47 |
+
Arguments are forwarded to ``Pretrained`` parent class.
|
| 48 |
+
|
| 49 |
+
Example
|
| 50 |
+
-------
|
| 51 |
+
>>> from speechbrain.inference.ASR import EncoderDecoderASR
|
| 52 |
+
>>> tmpdir = getfixture("tmpdir")
|
| 53 |
+
>>> asr_model = EncoderDecoderASR.from_hparams(
|
| 54 |
+
... source="speechbrain/asr-crdnn-rnnlm-librispeech",
|
| 55 |
+
... savedir=tmpdir,
|
| 56 |
+
... ) # doctest: +SKIP
|
| 57 |
+
>>> asr_model.transcribe_file("tests/samples/single-mic/example2.flac") # doctest: +SKIP
|
| 58 |
+
"MY FATHER HAS REVEALED THE CULPRIT'S NAME"
|
| 59 |
+
"""
|
| 60 |
+
|
| 61 |
+
HPARAMS_NEEDED = ["tokenizer"]
|
| 62 |
+
MODULES_NEEDED = ["encoder", "decoder"]
|
| 63 |
+
|
| 64 |
+
def __init__(self, *args, **kwargs):
|
| 65 |
+
super().__init__(*args, **kwargs)
|
| 66 |
+
self.tokenizer = self.hparams.tokenizer
|
| 67 |
+
self.transducer_beam_search = False
|
| 68 |
+
self.transformer_beam_search = False
|
| 69 |
+
if hasattr(self.hparams, "transducer_beam_search"):
|
| 70 |
+
self.transducer_beam_search = self.hparams.transducer_beam_search
|
| 71 |
+
if hasattr(self.hparams, "transformer_beam_search"):
|
| 72 |
+
self.transformer_beam_search = self.hparams.transformer_beam_search
|
| 73 |
+
|
| 74 |
+
def transcribe_file(self, path, **kwargs):
|
| 75 |
+
"""Transcribes the given audiofile into a sequence of words.
|
| 76 |
+
|
| 77 |
+
Arguments
|
| 78 |
+
---------
|
| 79 |
+
path : str
|
| 80 |
+
Path to audio file which to transcribe.
|
| 81 |
+
**kwargs : dict
|
| 82 |
+
Arguments forwarded to ``load_audio``.
|
| 83 |
+
|
| 84 |
+
Returns
|
| 85 |
+
-------
|
| 86 |
+
str
|
| 87 |
+
The audiofile transcription produced by this ASR system.
|
| 88 |
+
"""
|
| 89 |
+
waveform = self.load_audio(path, **kwargs)
|
| 90 |
+
# Fake a batch:
|
| 91 |
+
batch = waveform.unsqueeze(0)
|
| 92 |
+
rel_length = torch.tensor([1.0])
|
| 93 |
+
predicted_words, predicted_tokens = self.transcribe_batch(
|
| 94 |
+
batch, rel_length
|
| 95 |
+
)
|
| 96 |
+
return predicted_words[0]
|
| 97 |
+
|
| 98 |
+
def encode_batch(self, wavs, wav_lens):
|
| 99 |
+
"""Encodes the input audio into a sequence of hidden states
|
| 100 |
+
|
| 101 |
+
The waveforms should already be in the model's desired format.
|
| 102 |
+
You can call:
|
| 103 |
+
``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)``
|
| 104 |
+
to get a correctly converted signal in most cases.
|
| 105 |
+
|
| 106 |
+
Arguments
|
| 107 |
+
---------
|
| 108 |
+
wavs : torch.Tensor
|
| 109 |
+
Batch of waveforms [batch, time, channels] or [batch, time]
|
| 110 |
+
depending on the model.
|
| 111 |
+
wav_lens : torch.Tensor
|
| 112 |
+
Lengths of the waveforms relative to the longest one in the
|
| 113 |
+
batch, tensor of shape [batch]. The longest one should have
|
| 114 |
+
relative length 1.0 and others len(waveform) / max_length.
|
| 115 |
+
Used for ignoring padding.
|
| 116 |
+
|
| 117 |
+
Returns
|
| 118 |
+
-------
|
| 119 |
+
torch.Tensor
|
| 120 |
+
The encoded batch
|
| 121 |
+
"""
|
| 122 |
+
wavs = wavs.float()
|
| 123 |
+
wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
|
| 124 |
+
encoder_out = self.mods.encoder(wavs, wav_lens)
|
| 125 |
+
if self.transformer_beam_search:
|
| 126 |
+
encoder_out = self.mods.transformer.encode(encoder_out, wav_lens)
|
| 127 |
+
return encoder_out
|
| 128 |
+
|
| 129 |
+
def transcribe_batch(self, wavs, wav_lens):
|
| 130 |
+
"""Transcribes the input audio into a sequence of words
|
| 131 |
+
|
| 132 |
+
The waveforms should already be in the model's desired format.
|
| 133 |
+
You can call:
|
| 134 |
+
``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)``
|
| 135 |
+
to get a correctly converted signal in most cases.
|
| 136 |
+
|
| 137 |
+
Arguments
|
| 138 |
+
---------
|
| 139 |
+
wavs : torch.Tensor
|
| 140 |
+
Batch of waveforms [batch, time, channels] or [batch, time]
|
| 141 |
+
depending on the model.
|
| 142 |
+
wav_lens : torch.Tensor
|
| 143 |
+
Lengths of the waveforms relative to the longest one in the
|
| 144 |
+
batch, tensor of shape [batch]. The longest one should have
|
| 145 |
+
relative length 1.0 and others len(waveform) / max_length.
|
| 146 |
+
Used for ignoring padding.
|
| 147 |
+
|
| 148 |
+
Returns
|
| 149 |
+
-------
|
| 150 |
+
list
|
| 151 |
+
Each waveform in the batch transcribed.
|
| 152 |
+
tensor
|
| 153 |
+
Each predicted token id.
|
| 154 |
+
"""
|
| 155 |
+
with torch.no_grad():
|
| 156 |
+
wav_lens = wav_lens.to(self.device)
|
| 157 |
+
encoder_out = self.encode_batch(wavs, wav_lens)
|
| 158 |
+
if self.transducer_beam_search:
|
| 159 |
+
inputs = [encoder_out]
|
| 160 |
+
else:
|
| 161 |
+
inputs = [encoder_out, wav_lens]
|
| 162 |
+
predicted_tokens, _, _, _ = self.mods.decoder(*inputs)
|
| 163 |
+
predicted_words = [
|
| 164 |
+
self.tokenizer.decode_ids(token_seq)
|
| 165 |
+
for token_seq in predicted_tokens
|
| 166 |
+
]
|
| 167 |
+
return predicted_words, predicted_tokens
|
| 168 |
+
|
| 169 |
+
def forward(self, wavs, wav_lens):
|
| 170 |
+
"""Runs full transcription - note: no gradients through decoding"""
|
| 171 |
+
return self.transcribe_batch(wavs, wav_lens)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
class EncoderASR(Pretrained):
|
| 175 |
+
"""A ready-to-use Encoder ASR model
|
| 176 |
+
|
| 177 |
+
The class can be used either to run only the encoder (encode()) to extract
|
| 178 |
+
features or to run the entire encoder + decoder function model
|
| 179 |
+
(transcribe()) to transcribe speech. The given YAML must contain the fields
|
| 180 |
+
specified in the *_NEEDED[] lists.
|
| 181 |
+
|
| 182 |
+
Arguments
|
| 183 |
+
---------
|
| 184 |
+
*args : tuple
|
| 185 |
+
**kwargs : dict
|
| 186 |
+
Arguments are forwarded to ``Pretrained`` parent class.
|
| 187 |
+
|
| 188 |
+
Example
|
| 189 |
+
-------
|
| 190 |
+
>>> from speechbrain.inference.ASR import EncoderASR
|
| 191 |
+
>>> tmpdir = getfixture("tmpdir")
|
| 192 |
+
>>> asr_model = EncoderASR.from_hparams(
|
| 193 |
+
... source="speechbrain/asr-wav2vec2-commonvoice-fr",
|
| 194 |
+
... savedir=tmpdir,
|
| 195 |
+
... ) # doctest: +SKIP
|
| 196 |
+
>>> asr_model.transcribe_file("samples/audio_samples/example_fr.wav") # doctest: +SKIP
|
| 197 |
+
"""
|
| 198 |
+
|
| 199 |
+
HPARAMS_NEEDED = ["tokenizer", "decoding_function"]
|
| 200 |
+
MODULES_NEEDED = ["encoder"]
|
| 201 |
+
|
| 202 |
+
def __init__(self, *args, **kwargs):
|
| 203 |
+
super().__init__(*args, **kwargs)
|
| 204 |
+
|
| 205 |
+
self.tokenizer = self.hparams.tokenizer
|
| 206 |
+
self.set_decoding_function()
|
| 207 |
+
|
| 208 |
+
def set_decoding_function(self):
|
| 209 |
+
"""Set the decoding function based on the parameters defined in the hyperparameter file.
|
| 210 |
+
|
| 211 |
+
The decoding function is determined by the `decoding_function` specified in the hyperparameter file.
|
| 212 |
+
It can be either a functools.partial object representing a decoding function or an instance of
|
| 213 |
+
`speechbrain.decoders.ctc.CTCBaseSearcher` for beam search decoding.
|
| 214 |
+
|
| 215 |
+
Raises:
|
| 216 |
+
ValueError: If the decoding function is neither a functools.partial nor an instance of
|
| 217 |
+
speechbrain.decoders.ctc.CTCBaseSearcher.
|
| 218 |
+
|
| 219 |
+
Note:
|
| 220 |
+
- For greedy decoding (functools.partial), the provided `decoding_function` is assigned directly.
|
| 221 |
+
- For CTCBeamSearcher decoding, an instance of the specified `decoding_function` is created, and
|
| 222 |
+
additional parameters are added based on the tokenizer type.
|
| 223 |
+
"""
|
| 224 |
+
# Greedy Decoding case
|
| 225 |
+
if isinstance(self.hparams.decoding_function, functools.partial):
|
| 226 |
+
self.decoding_function = self.hparams.decoding_function
|
| 227 |
+
# CTCBeamSearcher case
|
| 228 |
+
else:
|
| 229 |
+
# 1. check if the decoding function is an instance of speechbrain.decoders.CTCBaseSearcher
|
| 230 |
+
if issubclass(
|
| 231 |
+
self.hparams.decoding_function,
|
| 232 |
+
speechbrain.decoders.ctc.CTCBaseSearcher,
|
| 233 |
+
):
|
| 234 |
+
# If so, we need to retrieve the vocab list from the tokenizer.
|
| 235 |
+
# We also need to check if the tokenizer is a sentencepiece or a CTCTextEncoder.
|
| 236 |
+
if isinstance(
|
| 237 |
+
self.tokenizer, speechbrain.dataio.encoder.CTCTextEncoder
|
| 238 |
+
):
|
| 239 |
+
ind2lab = self.tokenizer.ind2lab
|
| 240 |
+
vocab_list = [ind2lab[x] for x in range(len(ind2lab))]
|
| 241 |
+
elif isinstance(
|
| 242 |
+
self.tokenizer, sentencepiece.SentencePieceProcessor
|
| 243 |
+
):
|
| 244 |
+
vocab_list = [
|
| 245 |
+
self.tokenizer.id_to_piece(i)
|
| 246 |
+
for i in range(self.tokenizer.vocab_size())
|
| 247 |
+
]
|
| 248 |
+
else:
|
| 249 |
+
raise ValueError(
|
| 250 |
+
"The tokenizer must be sentencepiece or CTCTextEncoder"
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
# We can now instantiate the decoding class and add all the parameters
|
| 254 |
+
if hasattr(self.hparams, "test_beam_search"):
|
| 255 |
+
opt_beam_search_params = self.hparams.test_beam_search
|
| 256 |
+
# check if the kenlm_model_path is provided and fetch it if necessary
|
| 257 |
+
if "kenlm_model_path" in opt_beam_search_params:
|
| 258 |
+
source, fl = split_path(
|
| 259 |
+
opt_beam_search_params["kenlm_model_path"]
|
| 260 |
+
)
|
| 261 |
+
kenlm_model_path = str(
|
| 262 |
+
fetch(
|
| 263 |
+
fl, source=source, savedir=self.hparams.savedir
|
| 264 |
+
)
|
| 265 |
+
)
|
| 266 |
+
# we need to update the kenlm_model_path in the opt_beam_search_params
|
| 267 |
+
opt_beam_search_params["kenlm_model_path"] = (
|
| 268 |
+
kenlm_model_path
|
| 269 |
+
)
|
| 270 |
+
else:
|
| 271 |
+
opt_beam_search_params = {}
|
| 272 |
+
self.decoding_function = self.hparams.decoding_function(
|
| 273 |
+
**opt_beam_search_params, vocab_list=vocab_list
|
| 274 |
+
)
|
| 275 |
+
else:
|
| 276 |
+
raise ValueError(
|
| 277 |
+
"The decoding function must be an instance of speechbrain.decoders.CTCBaseSearcher"
|
| 278 |
+
)
|
| 279 |
+
|
| 280 |
+
def transcribe_file(self, path, **kwargs):
|
| 281 |
+
"""Transcribes the given audiofile into a sequence of words.
|
| 282 |
+
|
| 283 |
+
Arguments
|
| 284 |
+
---------
|
| 285 |
+
path : str
|
| 286 |
+
Path to audio file which to transcribe.
|
| 287 |
+
**kwargs : dict
|
| 288 |
+
Arguments forwarded to ``load_audio``.
|
| 289 |
+
|
| 290 |
+
Returns
|
| 291 |
+
-------
|
| 292 |
+
str
|
| 293 |
+
The audiofile transcription produced by this ASR system.
|
| 294 |
+
"""
|
| 295 |
+
waveform = self.load_audio(path, **kwargs)
|
| 296 |
+
# Fake a batch:
|
| 297 |
+
batch = waveform.unsqueeze(0)
|
| 298 |
+
rel_length = torch.tensor([1.0])
|
| 299 |
+
predicted_words, predicted_tokens = self.transcribe_batch(
|
| 300 |
+
batch, rel_length
|
| 301 |
+
)
|
| 302 |
+
return str(predicted_words[0])
|
| 303 |
+
|
| 304 |
+
def encode_batch(self, wavs, wav_lens):
|
| 305 |
+
"""Encodes the input audio into a sequence of hidden states
|
| 306 |
+
|
| 307 |
+
The waveforms should already be in the model's desired format.
|
| 308 |
+
You can call:
|
| 309 |
+
``normalized = EncoderASR.normalizer(signal, sample_rate)``
|
| 310 |
+
to get a correctly converted signal in most cases.
|
| 311 |
+
|
| 312 |
+
Arguments
|
| 313 |
+
---------
|
| 314 |
+
wavs : torch.Tensor
|
| 315 |
+
Batch of waveforms [batch, time, channels] or [batch, time]
|
| 316 |
+
depending on the model.
|
| 317 |
+
wav_lens : torch.Tensor
|
| 318 |
+
Lengths of the waveforms relative to the longest one in the
|
| 319 |
+
batch, tensor of shape [batch]. The longest one should have
|
| 320 |
+
relative length 1.0 and others len(waveform) / max_length.
|
| 321 |
+
Used for ignoring padding.
|
| 322 |
+
|
| 323 |
+
Returns
|
| 324 |
+
-------
|
| 325 |
+
torch.Tensor
|
| 326 |
+
The encoded batch
|
| 327 |
+
"""
|
| 328 |
+
wavs = wavs.float()
|
| 329 |
+
wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
|
| 330 |
+
encoder_out = self.mods.wav2vec(wavs, wav_lens)
|
| 331 |
+
x = self.mods.dec(encoder_out)
|
| 332 |
+
logits = self.mods.output_lin(x)
|
| 333 |
+
p_ctc = self.hparams.softmax(logits)
|
| 334 |
+
return p_ctc
|
| 335 |
+
|
| 336 |
+
def transcribe_batch(self, wavs, wav_lens):
|
| 337 |
+
"""Transcribes the input audio into a sequence of words
|
| 338 |
+
|
| 339 |
+
The waveforms should already be in the model's desired format.
|
| 340 |
+
You can call:
|
| 341 |
+
``normalized = EncoderASR.normalizer(signal, sample_rate)``
|
| 342 |
+
to get a correctly converted signal in most cases.
|
| 343 |
+
|
| 344 |
+
Arguments
|
| 345 |
+
---------
|
| 346 |
+
wavs : torch.Tensor
|
| 347 |
+
Batch of waveforms [batch, time, channels] or [batch, time]
|
| 348 |
+
depending on the model.
|
| 349 |
+
wav_lens : torch.Tensor
|
| 350 |
+
Lengths of the waveforms relative to the longest one in the
|
| 351 |
+
batch, tensor of shape [batch]. The longest one should have
|
| 352 |
+
relative length 1.0 and others len(waveform) / max_length.
|
| 353 |
+
Used for ignoring padding.
|
| 354 |
+
|
| 355 |
+
Returns
|
| 356 |
+
-------
|
| 357 |
+
list
|
| 358 |
+
Each waveform in the batch transcribed.
|
| 359 |
+
tensor
|
| 360 |
+
Each predicted token id.
|
| 361 |
+
"""
|
| 362 |
+
with torch.no_grad():
|
| 363 |
+
wav_lens = wav_lens.to(self.device)
|
| 364 |
+
encoder_out = self.encode_batch(wavs, wav_lens)
|
| 365 |
+
predictions = self.decoding_function(encoder_out, wav_lens)
|
| 366 |
+
print(predictions)
|
| 367 |
+
is_ctc_text_encoder_tokenizer = isinstance(
|
| 368 |
+
self.tokenizer, speechbrain.dataio.encoder.CTCTextEncoder
|
| 369 |
+
)
|
| 370 |
+
self.tokenizer.load('sample_data/SLU/labelencoder.txt')
|
| 371 |
+
if isinstance(self.hparams.decoding_function, functools.partial):
|
| 372 |
+
if is_ctc_text_encoder_tokenizer:
|
| 373 |
+
predicted_words = [
|
| 374 |
+
"".join(self.tokenizer.decode_ndim(token_seq))
|
| 375 |
+
for token_seq in predictions
|
| 376 |
+
]
|
| 377 |
+
else:
|
| 378 |
+
predicted_words = [
|
| 379 |
+
self.tokenizer.decode_ids(token_seq)
|
| 380 |
+
for token_seq in predictions
|
| 381 |
+
]
|
| 382 |
+
else:
|
| 383 |
+
predicted_words = [hyp[0].text for hyp in predictions]
|
| 384 |
+
|
| 385 |
+
return predicted_words, predictions
|
| 386 |
+
|
| 387 |
+
def forward(self, wavs, wav_lens):
|
| 388 |
+
"""Runs the encoder"""
|
| 389 |
+
return self.encode_batch(wavs, wav_lens)
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
@dataclass
|
| 393 |
+
class ASRWhisperSegment:
|
| 394 |
+
"""A single chunk of audio for Whisper ASR streaming.
|
| 395 |
+
|
| 396 |
+
This object is intended to be mutated as streaming progresses and passed across calls
|
| 397 |
+
to the lower-level APIs such as `encode_chunk`, `decode_chunk`, etc.
|
| 398 |
+
|
| 399 |
+
Attributes
|
| 400 |
+
----------
|
| 401 |
+
start : float
|
| 402 |
+
The start time of the audio chunk.
|
| 403 |
+
end : float
|
| 404 |
+
The end time of the audio chunk.
|
| 405 |
+
chunk : torch.Tensor
|
| 406 |
+
The audio chunk, shape [time, channels].
|
| 407 |
+
lang_id : str
|
| 408 |
+
The language identifier associated with the audio chunk.
|
| 409 |
+
words : str
|
| 410 |
+
The predicted words for the audio chunk.
|
| 411 |
+
tokens : List[int]
|
| 412 |
+
The predicted tokens for the audio chunk.
|
| 413 |
+
prompt : List[str]
|
| 414 |
+
The prompt associated with the audio chunk.
|
| 415 |
+
avg_log_probs : float
|
| 416 |
+
The average log probability associated with the prediction.
|
| 417 |
+
no_speech_prob : float
|
| 418 |
+
The probability of no speech in the audio chunk.
|
| 419 |
+
"""
|
| 420 |
+
|
| 421 |
+
start: float
|
| 422 |
+
end: float
|
| 423 |
+
chunk: torch.Tensor
|
| 424 |
+
lang_id: Optional[str] = None
|
| 425 |
+
words: Optional[str] = None
|
| 426 |
+
tokens: Optional[List[str]] = None
|
| 427 |
+
prompt: Optional[List[str]] = None
|
| 428 |
+
avg_log_probs: Optional[float] = None
|
| 429 |
+
no_speech_prob: Optional[float] = None
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
class WhisperASR(Pretrained):
|
| 433 |
+
"""A ready-to-use Whisper ASR model.
|
| 434 |
+
|
| 435 |
+
The class can be used to run the entire encoder-decoder whisper model.
|
| 436 |
+
The set of tasks supported are: ``transcribe``, ``translate``, and ``lang_id``.
|
| 437 |
+
The given YAML must contains the fields specified in the *_NEEDED[] lists.
|
| 438 |
+
|
| 439 |
+
Arguments
|
| 440 |
+
---------
|
| 441 |
+
*args : tuple
|
| 442 |
+
**kwargs : dict
|
| 443 |
+
Arguments are forwarded to ``Pretrained`` parent class.
|
| 444 |
+
|
| 445 |
+
Example
|
| 446 |
+
-------
|
| 447 |
+
>>> from speechbrain.inference.ASR import WhisperASR
|
| 448 |
+
>>> tmpdir = getfixture("tmpdir")
|
| 449 |
+
>>> asr_model = WhisperASR.from_hparams(source="speechbrain/asr-whisper-medium-commonvoice-it", savedir=tmpdir,) # doctest: +SKIP
|
| 450 |
+
>>> hyp = asr_model.transcribe_file("speechbrain/asr-whisper-medium-commonvoice-it/example-it.wav") # doctest: +SKIP
|
| 451 |
+
>>> hyp # doctest: +SKIP
|
| 452 |
+
buongiorno a tutti e benvenuti a bordo
|
| 453 |
+
>>> _, probs = asr_model.detect_language_file("speechbrain/asr-whisper-medium-commonvoice-it/example-it.wav") # doctest: +SKIP
|
| 454 |
+
>>> print(f"Detected language: {max(probs[0], key=probs[0].get)}") # doctest: +SKIP
|
| 455 |
+
Detected language: it
|
| 456 |
+
"""
|
| 457 |
+
|
| 458 |
+
HPARAMS_NEEDED = ["language", "sample_rate"]
|
| 459 |
+
MODULES_NEEDED = ["whisper", "decoder"]
|
| 460 |
+
TASKS = ["transcribe", "translate", "lang_id"]
|
| 461 |
+
|
| 462 |
+
def __init__(self, *args, **kwargs):
|
| 463 |
+
super().__init__(*args, **kwargs)
|
| 464 |
+
self.tokenizer = self.hparams.whisper.tokenizer
|
| 465 |
+
|
| 466 |
+
@torch.no_grad()
|
| 467 |
+
def detect_language_file(self, path: str):
|
| 468 |
+
"""Detects the language of the given audiofile.
|
| 469 |
+
This method only works on input_file of 30 seconds or less.
|
| 470 |
+
|
| 471 |
+
Arguments
|
| 472 |
+
---------
|
| 473 |
+
path : str
|
| 474 |
+
Path to audio file which to transcribe.
|
| 475 |
+
|
| 476 |
+
Returns
|
| 477 |
+
-------
|
| 478 |
+
language_tokens : torch.Tensor
|
| 479 |
+
The detected language tokens.
|
| 480 |
+
language_probs : dict
|
| 481 |
+
The probabilities of the detected language tokens.
|
| 482 |
+
|
| 483 |
+
Raises
|
| 484 |
+
------
|
| 485 |
+
ValueError
|
| 486 |
+
If the model doesn't have language tokens.
|
| 487 |
+
"""
|
| 488 |
+
wavs = self.load_audio(path).float().to(self.device).unsqueeze(0)
|
| 489 |
+
mel = self.mods.whisper._get_mel(wavs)
|
| 490 |
+
language_tokens, language_probs = self.mods.whisper.detect_language(mel)
|
| 491 |
+
return language_tokens, language_probs
|
| 492 |
+
|
| 493 |
+
@torch.no_grad()
|
| 494 |
+
def detect_language_batch(self, wav: torch.Tensor):
|
| 495 |
+
"""Detects the language of the given wav Tensor.
|
| 496 |
+
This method only works on wav files of 30 seconds or less.
|
| 497 |
+
|
| 498 |
+
Arguments
|
| 499 |
+
---------
|
| 500 |
+
wav : torch.tensor
|
| 501 |
+
Batch of waveforms [batch, time, channels].
|
| 502 |
+
|
| 503 |
+
Returns
|
| 504 |
+
-------
|
| 505 |
+
language_tokens : torch.Tensor of shape (batch_size,)
|
| 506 |
+
ids of the most probable language tokens, which appears after the startoftranscript token.
|
| 507 |
+
language_probs : List[Dict[str, float]]
|
| 508 |
+
list of dictionaries containing the probability distribution over all languages.
|
| 509 |
+
|
| 510 |
+
Raises
|
| 511 |
+
------
|
| 512 |
+
ValueError
|
| 513 |
+
If the model doesn't have language tokens.
|
| 514 |
+
|
| 515 |
+
Example
|
| 516 |
+
-------
|
| 517 |
+
>>> from speechbrain.inference.ASR import WhisperASR
|
| 518 |
+
>>> import torchaudio
|
| 519 |
+
>>> tmpdir = getfixture("tmpdir")
|
| 520 |
+
>>> asr_model = WhisperASR.from_hparams(
|
| 521 |
+
... source="speechbrain/asr-whisper-medium-commonvoice-it",
|
| 522 |
+
... savedir=tmpdir,
|
| 523 |
+
... ) # doctest: +SKIP
|
| 524 |
+
>>> wav, _ = torchaudio.load("your_audio") # doctest: +SKIP
|
| 525 |
+
>>> language_tokens, language_probs = asr_model.detect_language(wav) # doctest: +SKIP
|
| 526 |
+
"""
|
| 527 |
+
mel = self.mods.whisper._get_mel(wav)
|
| 528 |
+
language_tokens, language_probs = self.mods.whisper.detect_language(mel)
|
| 529 |
+
return language_tokens, language_probs
|
| 530 |
+
|
| 531 |
+
@torch.no_grad()
|
| 532 |
+
def _detect_language(self, mel: torch.Tensor, task: str):
|
| 533 |
+
"""Detects the language of the given mel spectrogram.
|
| 534 |
+
|
| 535 |
+
Arguments
|
| 536 |
+
---------
|
| 537 |
+
mel : torch.tensor
|
| 538 |
+
Batch of mel spectrograms [batch, time, channels].
|
| 539 |
+
task : str
|
| 540 |
+
The task to perform.
|
| 541 |
+
|
| 542 |
+
Returns
|
| 543 |
+
-------
|
| 544 |
+
language_tokens : Tensor, shape = (n_audio,)
|
| 545 |
+
ids of the most probable language tokens, which appears after the startoftranscript token.
|
| 546 |
+
language_probs : List[Dict[str, float]], length = n_audio
|
| 547 |
+
list of dictionaries containing the probability distribution over all languages.
|
| 548 |
+
"""
|
| 549 |
+
languages = [self.mods.whisper.language] * mel.shape[0]
|
| 550 |
+
lang_probs = None
|
| 551 |
+
|
| 552 |
+
if self.mods.whisper.language is None or task == "lang_id":
|
| 553 |
+
lang_tokens, lang_probs = self.mods.whisper.detect_language(mel)
|
| 554 |
+
languages = [max(probs, key=probs.get) for probs in lang_probs]
|
| 555 |
+
self.mods.decoder.set_lang_tokens(lang_tokens)
|
| 556 |
+
return languages, lang_probs
|
| 557 |
+
|
| 558 |
+
def _get_audio_stream(
|
| 559 |
+
self, streamer: "torchaudio.io.StreamReader", frames_per_chunk: int
|
| 560 |
+
):
|
| 561 |
+
"""From a :class:`torchaudio.io.StreamReader`, identifies the audio
|
| 562 |
+
stream and returns an iterable stream of chunks (after resampling and
|
| 563 |
+
downmixing to mono).
|
| 564 |
+
|
| 565 |
+
Arguments
|
| 566 |
+
---------
|
| 567 |
+
streamer : torchaudio.io.StreamReader
|
| 568 |
+
The stream object. Must hold exactly one source stream of an
|
| 569 |
+
audio type.
|
| 570 |
+
frames_per_chunk : int
|
| 571 |
+
The number of frames per chunk. For a streaming model, this should
|
| 572 |
+
be determined from the DynChunkTrain configuration.
|
| 573 |
+
|
| 574 |
+
Yields
|
| 575 |
+
------
|
| 576 |
+
chunks from streamer
|
| 577 |
+
"""
|
| 578 |
+
|
| 579 |
+
stream_infos = [
|
| 580 |
+
streamer.get_src_stream_info(i)
|
| 581 |
+
for i in range(streamer.num_src_streams)
|
| 582 |
+
]
|
| 583 |
+
|
| 584 |
+
audio_stream_infos = [
|
| 585 |
+
(i, stream_info)
|
| 586 |
+
for i, stream_info in enumerate(stream_infos)
|
| 587 |
+
if stream_info.media_type == "audio"
|
| 588 |
+
]
|
| 589 |
+
|
| 590 |
+
if len(audio_stream_infos) != 1:
|
| 591 |
+
raise ValueError(
|
| 592 |
+
f"Expected stream to have only 1 stream (with any number of channels), got {len(audio_stream_infos)} (with streams: {stream_infos})"
|
| 593 |
+
)
|
| 594 |
+
|
| 595 |
+
# find the index of the first (and only) audio stream
|
| 596 |
+
audio_stream_index = audio_stream_infos[0][0]
|
| 597 |
+
|
| 598 |
+
# output stream #0
|
| 599 |
+
streamer.add_basic_audio_stream(
|
| 600 |
+
frames_per_chunk=frames_per_chunk,
|
| 601 |
+
stream_index=audio_stream_index,
|
| 602 |
+
sample_rate=self.audio_normalizer.sample_rate,
|
| 603 |
+
format="fltp", # torch.float32
|
| 604 |
+
num_channels=1,
|
| 605 |
+
)
|
| 606 |
+
|
| 607 |
+
for (chunk,) in streamer.stream():
|
| 608 |
+
chunk = chunk.squeeze(-1) # we deal with mono, remove that dim
|
| 609 |
+
chunk = chunk.unsqueeze(0) # create a fake batch dim
|
| 610 |
+
yield chunk
|
| 611 |
+
|
| 612 |
+
@torch.no_grad()
|
| 613 |
+
def transcribe_file_streaming(
|
| 614 |
+
self,
|
| 615 |
+
path: str,
|
| 616 |
+
task: Optional[str] = None,
|
| 617 |
+
initial_prompt: Optional[str] = None,
|
| 618 |
+
logprob_threshold: Optional[float] = -1.0,
|
| 619 |
+
no_speech_threshold=0.6,
|
| 620 |
+
condition_on_previous_text: bool = False,
|
| 621 |
+
verbose: bool = False,
|
| 622 |
+
use_torchaudio_streaming: bool = False,
|
| 623 |
+
chunk_size: Optional[int] = 30,
|
| 624 |
+
**kwargs,
|
| 625 |
+
):
|
| 626 |
+
"""Transcribes the given audiofile into a sequence of words.
|
| 627 |
+
This method supports the following tasks: ``transcribe``, ``translate``, and ``lang_id``.
|
| 628 |
+
It can process an input audio file longer than 30 seconds by splitting it into chunk_size-second segments.
|
| 629 |
+
|
| 630 |
+
Arguments
|
| 631 |
+
---------
|
| 632 |
+
path : str
|
| 633 |
+
URI/path to the audio to transcribe. When
|
| 634 |
+
``use_torchaudio_streaming`` is ``False``, uses SB fetching to allow
|
| 635 |
+
fetching from HF or a local file. When ``True``, resolves the URI
|
| 636 |
+
through ffmpeg, as documented in
|
| 637 |
+
:class:`torchaudio.io.StreamReader`.
|
| 638 |
+
task : Optional[str]
|
| 639 |
+
The task to perform. If None, the default task is the one passed in the Whisper model.
|
| 640 |
+
initial_prompt : Optional[str]
|
| 641 |
+
The initial prompt to condition the model on.
|
| 642 |
+
logprob_threshold : Optional[float]
|
| 643 |
+
The log probability threshold to continue decoding the current segment.
|
| 644 |
+
no_speech_threshold : float
|
| 645 |
+
The threshold to skip decoding segment if the no_speech_prob is higher than this value.
|
| 646 |
+
condition_on_previous_text : bool
|
| 647 |
+
If True, the model will be condition on the last 224 tokens.
|
| 648 |
+
verbose : bool
|
| 649 |
+
If True, print the transcription of each segment.
|
| 650 |
+
use_torchaudio_streaming : bool
|
| 651 |
+
Whether the audio file can be loaded in a streaming fashion. If not,
|
| 652 |
+
transcription is still performed through chunks of audio, but the
|
| 653 |
+
entire audio file is fetched and loaded at once.
|
| 654 |
+
This skips the usual fetching method and instead resolves the URI
|
| 655 |
+
using torchaudio (via ffmpeg).
|
| 656 |
+
chunk_size : Optional[int]
|
| 657 |
+
The size of the chunks to split the audio into. The default
|
| 658 |
+
chunk size is 30 seconds which corresponds to the maximal length
|
| 659 |
+
that the model can process in one go.
|
| 660 |
+
**kwargs : dict
|
| 661 |
+
Arguments forwarded to ``load_audio``
|
| 662 |
+
|
| 663 |
+
Yields
|
| 664 |
+
------
|
| 665 |
+
ASRWhisperSegment
|
| 666 |
+
A new ASRWhisperSegment instance initialized with the provided parameters.
|
| 667 |
+
"""
|
| 668 |
+
if task is not None:
|
| 669 |
+
if task in self.TASKS:
|
| 670 |
+
if task != "lang_id":
|
| 671 |
+
self.mods.decoder.set_task(task)
|
| 672 |
+
else:
|
| 673 |
+
raise ValueError(
|
| 674 |
+
f"Task {task} not supported. Supported tasks are {self.TASKS}"
|
| 675 |
+
)
|
| 676 |
+
|
| 677 |
+
# create chunks of chunk_size seconds
|
| 678 |
+
num_frames_per_chunk = chunk_size * self.hparams.sample_rate
|
| 679 |
+
if use_torchaudio_streaming:
|
| 680 |
+
streamer = torchaudio.io.StreamReader(path)
|
| 681 |
+
segments = self._get_audio_stream(streamer, num_frames_per_chunk)
|
| 682 |
+
else:
|
| 683 |
+
waveform = self.load_audio(path, **kwargs)
|
| 684 |
+
batch = waveform.unsqueeze(0)
|
| 685 |
+
segments = split_fixed_chunks(batch, num_frames_per_chunk)
|
| 686 |
+
|
| 687 |
+
rel_length = torch.tensor([1.0])
|
| 688 |
+
|
| 689 |
+
all_tokens = []
|
| 690 |
+
prompt_reset_since = 0
|
| 691 |
+
if initial_prompt is not None:
|
| 692 |
+
initial_prompt_tokens = self.whisper.tokenizer.encode(
|
| 693 |
+
" " + initial_prompt.strip()
|
| 694 |
+
)
|
| 695 |
+
all_tokens.extend(initial_prompt_tokens)
|
| 696 |
+
else:
|
| 697 |
+
initial_prompt_tokens = []
|
| 698 |
+
|
| 699 |
+
for i, segment in enumerate(tqdm(segments, disable=verbose)):
|
| 700 |
+
# move the segment on the device
|
| 701 |
+
segment = segment.to(self.device)
|
| 702 |
+
|
| 703 |
+
# extract mel spectrogram
|
| 704 |
+
mel_segment = self.mods.whisper._get_mel(segment)
|
| 705 |
+
|
| 706 |
+
start = i * chunk_size
|
| 707 |
+
end = (i + 1) * chunk_size
|
| 708 |
+
|
| 709 |
+
encoder_out = self.mods.whisper.forward_encoder(mel_segment)
|
| 710 |
+
languages, _ = self._detect_language(mel_segment, task)
|
| 711 |
+
|
| 712 |
+
if task == "lang_id":
|
| 713 |
+
yield ASRWhisperSegment(
|
| 714 |
+
start=start,
|
| 715 |
+
end=end,
|
| 716 |
+
chunk=segment,
|
| 717 |
+
lang_id=languages[0],
|
| 718 |
+
)
|
| 719 |
+
continue
|
| 720 |
+
|
| 721 |
+
prompt = all_tokens[prompt_reset_since:]
|
| 722 |
+
self.mods.decoder.set_prompt(prompt)
|
| 723 |
+
|
| 724 |
+
predicted_tokens, _, scores, _ = self.mods.decoder(
|
| 725 |
+
encoder_out, rel_length
|
| 726 |
+
)
|
| 727 |
+
avg_log_probs = scores.sum() / (len(predicted_tokens[0]) + 1)
|
| 728 |
+
|
| 729 |
+
if no_speech_threshold is not None:
|
| 730 |
+
should_skip = (
|
| 731 |
+
self.mods.decoder.no_speech_probs[0] > no_speech_threshold
|
| 732 |
+
)
|
| 733 |
+
if (
|
| 734 |
+
logprob_threshold is not None
|
| 735 |
+
and avg_log_probs > logprob_threshold
|
| 736 |
+
):
|
| 737 |
+
# don't skip if the logprob is high enough, despite the no_speech_prob
|
| 738 |
+
should_skip = False
|
| 739 |
+
|
| 740 |
+
if should_skip:
|
| 741 |
+
yield ASRWhisperSegment(
|
| 742 |
+
start=start,
|
| 743 |
+
end=end,
|
| 744 |
+
chunk=segment,
|
| 745 |
+
lang_id=languages[0],
|
| 746 |
+
words="",
|
| 747 |
+
tokens=[],
|
| 748 |
+
prompt=prompt,
|
| 749 |
+
avg_log_probs=avg_log_probs.item(),
|
| 750 |
+
no_speech_prob=self.mods.decoder.no_speech_probs[0],
|
| 751 |
+
)
|
| 752 |
+
continue
|
| 753 |
+
|
| 754 |
+
predicted_words = [
|
| 755 |
+
self.tokenizer.decode(t, skip_special_tokens=True).strip()
|
| 756 |
+
for t in predicted_tokens
|
| 757 |
+
]
|
| 758 |
+
|
| 759 |
+
yield ASRWhisperSegment(
|
| 760 |
+
start=start,
|
| 761 |
+
end=end,
|
| 762 |
+
chunk=segment,
|
| 763 |
+
lang_id=languages[0],
|
| 764 |
+
words=predicted_words[0],
|
| 765 |
+
tokens=predicted_tokens[0],
|
| 766 |
+
prompt=prompt,
|
| 767 |
+
avg_log_probs=avg_log_probs.item(),
|
| 768 |
+
no_speech_prob=self.mods.decoder.no_speech_probs[0],
|
| 769 |
+
)
|
| 770 |
+
|
| 771 |
+
all_tokens.extend(predicted_tokens[0])
|
| 772 |
+
|
| 773 |
+
if (
|
| 774 |
+
not condition_on_previous_text
|
| 775 |
+
or self.mods.decoder.temperature > 0.5
|
| 776 |
+
):
|
| 777 |
+
prompt_reset_since = len(all_tokens)
|
| 778 |
+
|
| 779 |
+
def transcribe_file(
|
| 780 |
+
self,
|
| 781 |
+
path: str,
|
| 782 |
+
task: Optional[str] = None,
|
| 783 |
+
initial_prompt: Optional[str] = None,
|
| 784 |
+
logprob_threshold: Optional[float] = -1.0,
|
| 785 |
+
no_speech_threshold=0.6,
|
| 786 |
+
condition_on_previous_text: bool = False,
|
| 787 |
+
verbose: bool = False,
|
| 788 |
+
use_torchaudio_streaming: bool = False,
|
| 789 |
+
chunk_size: Optional[int] = 30,
|
| 790 |
+
**kwargs,
|
| 791 |
+
) -> List[ASRWhisperSegment]:
|
| 792 |
+
"""Run the Whisper model using the specified task on the given audio file and return the ``ASRWhisperSegment`` objects
|
| 793 |
+
for each segment.
|
| 794 |
+
|
| 795 |
+
This method supports the following tasks: ``transcribe``, ``translate``, and ``lang_id``.
|
| 796 |
+
It can process an input audio file longer than 30 seconds by splitting it into chunk_size-second segments.
|
| 797 |
+
|
| 798 |
+
Arguments
|
| 799 |
+
---------
|
| 800 |
+
path : str
|
| 801 |
+
URI/path to the audio to transcribe. When
|
| 802 |
+
``use_torchaudio_streaming`` is ``False``, uses SB fetching to allow
|
| 803 |
+
fetching from HF or a local file. When ``True``, resolves the URI
|
| 804 |
+
through ffmpeg, as documented in
|
| 805 |
+
:class:`torchaudio.io.StreamReader`.
|
| 806 |
+
task : Optional[str]
|
| 807 |
+
The task to perform. If None, the default task is the one passed in the Whisper model.
|
| 808 |
+
It can be one of the following: ``transcribe``, ``translate``, ``lang_id``.
|
| 809 |
+
initial_prompt : Optional[str]
|
| 810 |
+
The initial prompt to condition the model on.
|
| 811 |
+
logprob_threshold : Optional[float]
|
| 812 |
+
The log probability threshold to continue decoding the current segment.
|
| 813 |
+
no_speech_threshold : float
|
| 814 |
+
The threshold to skip decoding segment if the no_speech_prob is higher than this value.
|
| 815 |
+
condition_on_previous_text : bool
|
| 816 |
+
If True, the model will be condition on the last 224 tokens.
|
| 817 |
+
verbose : bool
|
| 818 |
+
If True, print the details of each segment.
|
| 819 |
+
use_torchaudio_streaming : bool
|
| 820 |
+
Whether the audio file can be loaded in a streaming fashion. If not,
|
| 821 |
+
transcription is still performed through chunks of audio, but the
|
| 822 |
+
entire audio file is fetched and loaded at once.
|
| 823 |
+
This skips the usual fetching method and instead resolves the URI
|
| 824 |
+
using torchaudio (via ffmpeg).
|
| 825 |
+
chunk_size : Optional[int]
|
| 826 |
+
The size of the chunks to split the audio into. The default
|
| 827 |
+
chunk size is 30 seconds which corresponds to the maximal length
|
| 828 |
+
that the model can process in one go.
|
| 829 |
+
**kwargs : dict
|
| 830 |
+
Arguments forwarded to ``load_audio``
|
| 831 |
+
|
| 832 |
+
Returns
|
| 833 |
+
-------
|
| 834 |
+
results : list
|
| 835 |
+
A list of ``WhisperASRChunk`` objects, each containing the task result.
|
| 836 |
+
"""
|
| 837 |
+
results = []
|
| 838 |
+
for whisper_segment in self.transcribe_file_streaming(
|
| 839 |
+
path,
|
| 840 |
+
task=task,
|
| 841 |
+
initial_prompt=initial_prompt,
|
| 842 |
+
logprob_threshold=logprob_threshold,
|
| 843 |
+
no_speech_threshold=no_speech_threshold,
|
| 844 |
+
condition_on_previous_text=condition_on_previous_text,
|
| 845 |
+
verbose=verbose,
|
| 846 |
+
use_torchaudio_streaming=use_torchaudio_streaming,
|
| 847 |
+
chunk_size=chunk_size,
|
| 848 |
+
**kwargs,
|
| 849 |
+
):
|
| 850 |
+
results.append(whisper_segment)
|
| 851 |
+
if verbose:
|
| 852 |
+
pred = (
|
| 853 |
+
whisper_segment.words
|
| 854 |
+
if task != "lang_id"
|
| 855 |
+
else whisper_segment.lang_id
|
| 856 |
+
)
|
| 857 |
+
print(
|
| 858 |
+
f"[{whisper_segment.start}s --> {whisper_segment.end}s] {pred}"
|
| 859 |
+
)
|
| 860 |
+
return results
|
| 861 |
+
|
| 862 |
+
def encode_batch(self, wavs, wav_lens):
|
| 863 |
+
"""Encodes the input audio into a sequence of hidden states
|
| 864 |
+
|
| 865 |
+
The waveforms should already be in the model's desired format.
|
| 866 |
+
You can call:
|
| 867 |
+
``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)``
|
| 868 |
+
to get a correctly converted signal in most cases.
|
| 869 |
+
|
| 870 |
+
Arguments
|
| 871 |
+
---------
|
| 872 |
+
wavs : torch.tensor
|
| 873 |
+
Batch of waveforms [batch, time, channels].
|
| 874 |
+
wav_lens : torch.tensor
|
| 875 |
+
Lengths of the waveforms relative to the longest one in the
|
| 876 |
+
batch, tensor of shape [batch]. The longest one should have
|
| 877 |
+
relative length 1.0 and others len(waveform) / max_length.
|
| 878 |
+
Used for ignoring padding.
|
| 879 |
+
|
| 880 |
+
Returns
|
| 881 |
+
-------
|
| 882 |
+
torch.tensor
|
| 883 |
+
The encoded batch
|
| 884 |
+
"""
|
| 885 |
+
wavs = wavs.to(device=self.device, dtype=torch.float32)
|
| 886 |
+
mel = self.mods.whisper._get_mel(wavs)
|
| 887 |
+
encoder_out = self.mods.whisper.forward_encoder(mel)
|
| 888 |
+
return encoder_out
|
| 889 |
+
|
| 890 |
+
@torch.no_grad()
|
| 891 |
+
def transcribe_batch(self, wavs, wav_lens):
|
| 892 |
+
"""Transcribes the input audio into a sequence of words
|
| 893 |
+
|
| 894 |
+
The waveforms should already be in the model's desired format.
|
| 895 |
+
You can call:
|
| 896 |
+
``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)``
|
| 897 |
+
to get a correctly converted signal in most cases.
|
| 898 |
+
|
| 899 |
+
Arguments
|
| 900 |
+
---------
|
| 901 |
+
wavs : torch.tensor
|
| 902 |
+
Batch of waveforms [batch, time, channels].
|
| 903 |
+
wav_lens : torch.tensor
|
| 904 |
+
Lengths of the waveforms relative to the longest one in the
|
| 905 |
+
batch, tensor of shape [batch]. The longest one should have
|
| 906 |
+
relative length 1.0 and others len(waveform) / max_length.
|
| 907 |
+
Used for ignoring padding.
|
| 908 |
+
|
| 909 |
+
Returns
|
| 910 |
+
-------
|
| 911 |
+
list
|
| 912 |
+
Each waveform in the batch transcribed.
|
| 913 |
+
tensor
|
| 914 |
+
Each predicted token id.
|
| 915 |
+
"""
|
| 916 |
+
wav_lens = wav_lens.float().to(self.device)
|
| 917 |
+
encoder_out = self.encode_batch(wavs, wav_lens)
|
| 918 |
+
predicted_tokens, _, _, _ = self.mods.decoder(encoder_out, wav_lens)
|
| 919 |
+
predicted_words = [
|
| 920 |
+
self.tokenizer.decode(t, skip_special_tokens=True).strip()
|
| 921 |
+
for t in predicted_tokens
|
| 922 |
+
]
|
| 923 |
+
if self.hparams.normalized_transcripts:
|
| 924 |
+
predicted_words = [
|
| 925 |
+
self.tokenizer.normalize(text).split(" ")
|
| 926 |
+
for text in predicted_words
|
| 927 |
+
]
|
| 928 |
+
|
| 929 |
+
return predicted_words, predicted_tokens
|
| 930 |
+
|
| 931 |
+
def forward(self, wavs, wav_lens):
|
| 932 |
+
"""Runs full transcription - note: no gradients through decoding"""
|
| 933 |
+
return self.transcribe_batch(wavs, wav_lens)
|
| 934 |
+
|
| 935 |
+
|
| 936 |
+
@dataclass
|
| 937 |
+
class ASRStreamingContext:
|
| 938 |
+
"""Streaming metadata, initialized by
|
| 939 |
+
:meth:`~StreamingASR.make_streaming_context` (see there for details on
|
| 940 |
+
initialization of fields here).
|
| 941 |
+
|
| 942 |
+
This object is intended to be mutate: the same object should be passed
|
| 943 |
+
across calls as streaming progresses (namely when using the lower-level
|
| 944 |
+
:meth:`~StreamingASR.encode_chunk`, etc. APIs).
|
| 945 |
+
|
| 946 |
+
Holds some references to opaque streaming contexts, so the context is
|
| 947 |
+
model-agnostic to an extent."""
|
| 948 |
+
|
| 949 |
+
config: DynChunkTrainConfig
|
| 950 |
+
"""Dynamic chunk training configuration used to initialize the streaming
|
| 951 |
+
context. Cannot be modified on the fly."""
|
| 952 |
+
|
| 953 |
+
fea_extractor_context: Any
|
| 954 |
+
"""Opaque feature extractor streaming context."""
|
| 955 |
+
|
| 956 |
+
encoder_context: Any
|
| 957 |
+
"""Opaque encoder streaming context."""
|
| 958 |
+
|
| 959 |
+
decoder_context: Any
|
| 960 |
+
"""Opaque decoder streaming context."""
|
| 961 |
+
|
| 962 |
+
tokenizer_context: Optional[List[Any]]
|
| 963 |
+
"""Opaque streaming context for the tokenizer. Initially `None`. Initialized
|
| 964 |
+
to a list of tokenizer contexts once batch size can be determined."""
|
| 965 |
+
|
| 966 |
+
|
| 967 |
+
class StreamingASR(Pretrained):
|
| 968 |
+
"""A ready-to-use, streaming-capable ASR model.
|
| 969 |
+
|
| 970 |
+
Arguments
|
| 971 |
+
---------
|
| 972 |
+
*args : tuple
|
| 973 |
+
**kwargs : dict
|
| 974 |
+
Arguments are forwarded to ``Pretrained`` parent class.
|
| 975 |
+
|
| 976 |
+
Example
|
| 977 |
+
-------
|
| 978 |
+
>>> from speechbrain.inference.ASR import StreamingASR
|
| 979 |
+
>>> from speechbrain.utils.dynamic_chunk_training import DynChunkTrainConfig
|
| 980 |
+
>>> tmpdir = getfixture("tmpdir")
|
| 981 |
+
>>> asr_model = StreamingASR.from_hparams(source="speechbrain/asr-conformer-streaming-librispeech", savedir=tmpdir,) # doctest: +SKIP
|
| 982 |
+
>>> asr_model.transcribe_file("speechbrain/asr-conformer-streaming-librispeech/test-en.wav", DynChunkTrainConfig(24, 8)) # doctest: +SKIP
|
| 983 |
+
"""
|
| 984 |
+
|
| 985 |
+
HPARAMS_NEEDED = [
|
| 986 |
+
"fea_streaming_extractor",
|
| 987 |
+
"make_decoder_streaming_context",
|
| 988 |
+
"decoding_function",
|
| 989 |
+
"make_tokenizer_streaming_context",
|
| 990 |
+
"tokenizer_decode_streaming",
|
| 991 |
+
]
|
| 992 |
+
MODULES_NEEDED = ["enc", "proj_enc"]
|
| 993 |
+
|
| 994 |
+
def __init__(self, *args, **kwargs):
|
| 995 |
+
super().__init__(*args, **kwargs)
|
| 996 |
+
|
| 997 |
+
self.filter_props = self.hparams.fea_streaming_extractor.properties
|
| 998 |
+
|
| 999 |
+
def _get_audio_stream(
|
| 1000 |
+
self, streamer: "torchaudio.io.StreamReader", frames_per_chunk: int
|
| 1001 |
+
):
|
| 1002 |
+
"""From a :class:`torchaudio.io.StreamReader`, identifies the audio
|
| 1003 |
+
stream and returns an iterable stream of chunks (after resampling and
|
| 1004 |
+
downmixing to mono).
|
| 1005 |
+
|
| 1006 |
+
Arguments
|
| 1007 |
+
---------
|
| 1008 |
+
streamer : torchaudio.io.StreamReader
|
| 1009 |
+
The stream object. Must hold exactly one source stream of an
|
| 1010 |
+
audio type.
|
| 1011 |
+
frames_per_chunk : int
|
| 1012 |
+
The number of frames per chunk. For a streaming model, this should
|
| 1013 |
+
be determined from the DynChunkTrain configuration.
|
| 1014 |
+
|
| 1015 |
+
Yields
|
| 1016 |
+
------
|
| 1017 |
+
chunks from streamer
|
| 1018 |
+
"""
|
| 1019 |
+
|
| 1020 |
+
stream_infos = [
|
| 1021 |
+
streamer.get_src_stream_info(i)
|
| 1022 |
+
for i in range(streamer.num_src_streams)
|
| 1023 |
+
]
|
| 1024 |
+
|
| 1025 |
+
audio_stream_infos = [
|
| 1026 |
+
(i, stream_info)
|
| 1027 |
+
for i, stream_info in enumerate(stream_infos)
|
| 1028 |
+
if stream_info.media_type == "audio"
|
| 1029 |
+
]
|
| 1030 |
+
|
| 1031 |
+
if len(audio_stream_infos) != 1:
|
| 1032 |
+
raise ValueError(
|
| 1033 |
+
f"Expected stream to have only 1 stream (with any number of channels), got {len(audio_stream_infos)} (with streams: {stream_infos})"
|
| 1034 |
+
)
|
| 1035 |
+
|
| 1036 |
+
# find the index of the first (and only) audio stream
|
| 1037 |
+
audio_stream_index = audio_stream_infos[0][0]
|
| 1038 |
+
|
| 1039 |
+
# output stream #0
|
| 1040 |
+
streamer.add_basic_audio_stream(
|
| 1041 |
+
frames_per_chunk=frames_per_chunk,
|
| 1042 |
+
stream_index=audio_stream_index,
|
| 1043 |
+
sample_rate=self.audio_normalizer.sample_rate,
|
| 1044 |
+
format="fltp", # torch.float32
|
| 1045 |
+
num_channels=1,
|
| 1046 |
+
)
|
| 1047 |
+
|
| 1048 |
+
for (chunk,) in streamer.stream():
|
| 1049 |
+
chunk = chunk.squeeze(-1) # we deal with mono, remove that dim
|
| 1050 |
+
chunk = chunk.unsqueeze(0) # create a fake batch dim
|
| 1051 |
+
yield chunk
|
| 1052 |
+
|
| 1053 |
+
def transcribe_file_streaming(
|
| 1054 |
+
self,
|
| 1055 |
+
path,
|
| 1056 |
+
dynchunktrain_config: DynChunkTrainConfig,
|
| 1057 |
+
use_torchaudio_streaming: bool = True,
|
| 1058 |
+
**kwargs,
|
| 1059 |
+
):
|
| 1060 |
+
"""Transcribes the given audio file into a sequence of words, in a
|
| 1061 |
+
streaming fashion, meaning that text is being yield from this
|
| 1062 |
+
generator, in the form of strings to concatenate.
|
| 1063 |
+
|
| 1064 |
+
Arguments
|
| 1065 |
+
---------
|
| 1066 |
+
path : str
|
| 1067 |
+
URI/path to the audio to transcribe. When
|
| 1068 |
+
``use_torchaudio_streaming`` is ``False``, uses SB fetching to allow
|
| 1069 |
+
fetching from HF or a local file. When ``True``, resolves the URI
|
| 1070 |
+
through ffmpeg, as documented in
|
| 1071 |
+
:class:`torchaudio.io.StreamReader`.
|
| 1072 |
+
dynchunktrain_config : DynChunkTrainConfig
|
| 1073 |
+
Streaming configuration. Sane values and how much time chunks
|
| 1074 |
+
actually represent is model-dependent.
|
| 1075 |
        use_torchaudio_streaming : bool
            Whether the audio file can be loaded in a streaming fashion. If not,
            transcription is still performed through chunks of audio, but the
            entire audio file is fetched and loaded at once.
            This skips the usual fetching method and instead resolves the URI
            using torchaudio (via ffmpeg).
        **kwargs : dict
            Arguments forwarded to ``load_audio``

        Yields
        ------
        generator of str
            An iterator yielding transcribed chunks (strings). There is a yield
            for every chunk, even if the transcribed string for that chunk is an
            empty string.
        """

        chunk_size = self.get_chunk_size_frames(dynchunktrain_config)

        if use_torchaudio_streaming:
            streamer = torchaudio.io.StreamReader(path)
            chunks = self._get_audio_stream(streamer, chunk_size)
        else:
            waveform = self.load_audio(path, **kwargs)
            batch = waveform.unsqueeze(0)  # create batch dim
            chunks = split_fixed_chunks(batch, chunk_size)

        rel_length = torch.tensor([1.0])
        context = self.make_streaming_context(dynchunktrain_config)

        final_chunks = [
            torch.zeros((1, chunk_size), device=self.device)
        ] * self.hparams.fea_streaming_extractor.get_recommended_final_chunk_count(
            chunk_size
        )

        for chunk in itertools.chain(chunks, final_chunks):
            predicted_words = self.transcribe_chunk(context, chunk, rel_length)
            yield predicted_words[0]

    def transcribe_file(
        self,
        path,
        dynchunktrain_config: DynChunkTrainConfig,
        use_torchaudio_streaming: bool = True,
    ):
        """Transcribes the given audio file into a sequence of words.

        Arguments
        ---------
        path : str
            URI/path to the audio to transcribe. When
            ``use_torchaudio_streaming`` is ``False``, uses SB fetching to allow
            fetching from HF or a local file. When ``True``, resolves the URI
            through ffmpeg, as documented in
            :class:`torchaudio.io.StreamReader`.
        dynchunktrain_config : DynChunkTrainConfig
            Streaming configuration. Sane values and how much time chunks
            actually represent are model-dependent.
        use_torchaudio_streaming : bool
            Whether the audio file can be loaded in a streaming fashion. If not,
            transcription is still performed through chunks of audio, but the
            entire audio file is fetched and loaded at once.
            This skips the usual fetching method and instead resolves the URI
            using torchaudio (via ffmpeg).

        Returns
        -------
        str
            The audio file transcription produced by this ASR system.
        """

        pred = ""

        for text_chunk in self.transcribe_file_streaming(
            path, dynchunktrain_config, use_torchaudio_streaming
        ):
            pred += text_chunk

        return pred

    def make_streaming_context(self, dynchunktrain_config: DynChunkTrainConfig):
        """Create a blank streaming context to be passed around for chunk
        encoding/transcription.

        Arguments
        ---------
        dynchunktrain_config : DynChunkTrainConfig
            Streaming configuration. Sane values and how much time chunks
            actually represent are model-dependent.

        Returns
        -------
        ASRStreamingContext
        """

        return ASRStreamingContext(
            config=dynchunktrain_config,
            fea_extractor_context=self.hparams.fea_streaming_extractor.make_streaming_context(),
            encoder_context=self.mods.enc.make_streaming_context(
                dynchunktrain_config
            ),
            decoder_context=self.hparams.make_decoder_streaming_context(),
            tokenizer_context=None,
        )

    def get_chunk_size_frames(
        self, dynchunktrain_config: DynChunkTrainConfig
    ) -> int:
        """Returns the chunk size in actual audio samples, i.e. the exact
        expected length along the time dimension of an input chunk tensor (as
        passed to :meth:`~StreamingASR.encode_chunk` and similar low-level
        streaming functions).

        Arguments
        ---------
        dynchunktrain_config : DynChunkTrainConfig
            The streaming configuration to determine the chunk frame count of.

        Returns
        -------
        int
            The chunk size, in audio samples.
        """

        return (self.filter_props.stride - 1) * dynchunktrain_config.chunk_size

    @torch.no_grad()
    def encode_chunk(
        self,
        context: ASRStreamingContext,
        chunk: torch.Tensor,
        chunk_len: Optional[torch.Tensor] = None,
    ):
        """Encoding of a batch of audio chunks into a batch of encoded
        sequences.
        For full speech-to-text offline transcription, use `transcribe_batch` or
        `transcribe_file`.
        Must be called over a given context in the correct order of chunks over
        time.

        Arguments
        ---------
        context : ASRStreamingContext
            Mutable streaming context object, which must be specified and reused
            across calls when streaming.
            You can obtain an initial context by calling
            `asr.make_streaming_context(config)`.
        chunk : torch.Tensor
            The tensor for an audio chunk of shape `[batch size, time]`.
            The time dimension must strictly match
            `asr.get_chunk_size_frames(config)`.
            The waveform is expected to be in the model's expected format (i.e.
            the sampling rate must be correct).
        chunk_len : torch.Tensor, optional
            The relative chunk length tensor of shape `[batch size]`. This is to
            be used when the audio in one of the chunks of the batch is ending
            within this chunk.
            If unspecified, equivalent to `torch.ones((batch_size,))`.

        Returns
        -------
        torch.Tensor
            Encoded output, of a model-dependent shape."""

        if chunk_len is None:
            chunk_len = torch.ones((chunk.size(0),))

        chunk = chunk.float()
        chunk, chunk_len = chunk.to(self.device), chunk_len.to(self.device)

        assert chunk.shape[-1] <= self.get_chunk_size_frames(context.config)

        x = self.hparams.fea_streaming_extractor(
            chunk, context=context.fea_extractor_context, lengths=chunk_len
        )
        x = self.mods.enc.forward_streaming(x, context.encoder_context)
        x = self.mods.proj_enc(x)
        return x

    @torch.no_grad()
    def decode_chunk(
        self, context: ASRStreamingContext, x: torch.Tensor
    ) -> Tuple[List[str], List[List[int]]]:
        """Decodes the output of the encoder into tokens and the associated
        transcription.
        Must be called over a given context in the correct order of chunks over
        time.

        Arguments
        ---------
        context : ASRStreamingContext
            Mutable streaming context object, which should be the same object
            that was passed to `encode_chunk`.
        x : torch.Tensor
            The output of `encode_chunk` for a given chunk.

        Returns
        -------
        list of str
            Decoded transcriptions of length `batch_size`. The decoded strings
            can be of 0-length.
        list of list of output token hypotheses
            List of length `batch_size`, each holding a list of tokens of any
            length `>=0`.
        """
        tokens = self.hparams.decoding_function(x, context.decoder_context)

        # initialize token context for real now that we know the batch size
        if context.tokenizer_context is None:
            context.tokenizer_context = [
                self.hparams.make_tokenizer_streaming_context()
                for _ in range(len(tokens))
            ]

        words = [
            self.hparams.tokenizer_decode_streaming(
                self.hparams.tokenizer, cur_tokens, context.tokenizer_context[i]
            )
            for i, cur_tokens in enumerate(tokens)
        ]

        return words, tokens

    def transcribe_chunk(
        self,
        context: ASRStreamingContext,
        chunk: torch.Tensor,
        chunk_len: Optional[torch.Tensor] = None,
    ):
        """Transcription of a batch of audio chunks into transcribed text.
        Must be called over a given context in the correct order of chunks over
        time.

        Arguments
        ---------
        context : ASRStreamingContext
            Mutable streaming context object, which must be specified and reused
            across calls when streaming.
            You can obtain an initial context by calling
            `asr.make_streaming_context(config)`.
        chunk : torch.Tensor
            The tensor for an audio chunk of shape `[batch size, time]`.
            The time dimension must strictly match
            `asr.get_chunk_size_frames(config)`.
            The waveform is expected to be in the model's expected format (i.e.
            the sampling rate must be correct).
        chunk_len : torch.Tensor, optional
            The relative chunk length tensor of shape `[batch size]`. This is to
            be used when the audio in one of the chunks of the batch is ending
            within this chunk.
            If unspecified, equivalent to `torch.ones((batch_size,))`.

        Returns
        -------
        list of str
            Transcribed string for each batch element; may be of length zero.
        """

        if chunk_len is None:
            chunk_len = torch.ones((chunk.size(0),))

        chunk = chunk.float()
        chunk, chunk_len = chunk.to(self.device), chunk_len.to(self.device)

        x = self.encode_chunk(context, chunk, chunk_len)
        words, _ = self.decode_chunk(context, x)

        return words
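The streaming interface above is meant to be driven in a fixed order: build a context with `make_streaming_context`, then feed equally sized chunks to `transcribe_chunk` (or `encode_chunk`/`decode_chunk`) while reusing the same context. The following is a minimal usage sketch, not part of the uploaded files: it assumes an already-constructed object `asr` exposing these methods, that `DynChunkTrainConfig` is importable from `speechbrain.utils.dynamic_chunk_training` (as in recent SpeechBrain releases), and the file name and chunk-size values are placeholders.

import torch
from speechbrain.utils.dynamic_chunk_training import DynChunkTrainConfig

# Model-dependent streaming configuration (values here are illustrative only)
config = DynChunkTrainConfig(chunk_size=24, left_context_size=4)

# High-level use: transcribe a whole file chunk by chunk, printing text as it arrives
for text_chunk in asr.transcribe_file_streaming("example.wav", config):
    print(text_chunk, end="", flush=True)

# Low-level use: drive the chunk API manually (e.g. for a live audio source)
context = asr.make_streaming_context(config)
chunk_size = asr.get_chunk_size_frames(config)
chunk = torch.zeros((1, chunk_size))  # placeholder chunk; real audio samples go here
words = asr.transcribe_chunk(context, chunk)  # list of str, one entry per batch element
print(words[0])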
brain.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:33809a026a2c1febce7b03c8aafaee4ddfc851b2c70f180f8c06bf1017f4df5c
size 46
counter.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:95aebc97bc646c67fdcd923a5965b001f3c8a5c4d3a77075112e12a3a311d760
size 3
hyperparams.yaml
ADDED
@@ -0,0 +1,170 @@
# Data parameters:
# With data_parallel batch_size is split into N jobs.
# With DDP batch_size is multiplied by N jobs.
batch_size: 6
test_batch_size: 2
# We remove utterances longer than 90s in the train/dev/test sets as
# longer sentences certainly correspond to "open microphones".
avoid_if_longer_than: 90.0
avoid_if_smaller_than: 0.0
dataloader_options:
    batch_size: 6
    num_workers: 6
    shuffle: true
test_dataloader_options:
    batch_size: 2
    num_workers: 3

# Feature parameters:
sample_rate: 16000
feats_dim: 1024

# Training parameters:
number_of_epochs: 80
lr: 1
lr_wav2vec: 0.0001
annealing_factor: 0.8
annealing_factor_wav2vec: 0.9
improvement_threshold: 0.0025
improvement_threshold_wav2vec: 0.0025
patient: 0
patient_wav2vec: 0
sorting: random

# Model parameters:
activation: &id001 !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 0
rnn_layers: 0
dnn_blocks: 1
rnn_neurons: 0
dnn_neurons: 1024

# Wav2Vec parameters:
freeze: false

# Decoding parameters:
blank_index: 0

# Outputs:
output_neurons: 113

# ------ Functions and classes

epoch_counter: &id008 !new:speechbrain.utils.epoch_loop.EpochCounter

    limit: 80

wav2vec: &id002 !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: microsoft/wavlm-large
    output_norm: true
    freeze: false
    save_path: results/TARIC_SLU_wav2vec_wavLM_with_intent_criterion_a100_copie/1212/save/wav2vec.pt

dec: &id003 !new:speechbrain.lobes.models.VanillaNN.VanillaNN
    input_shape: [null, null, 1024]
    activation: *id001
    dnn_blocks: 1
    dnn_neurons: 1024

output_lin: &id004 !new:speechbrain.nnet.linear.Linear

    input_size: 1024
    n_neurons: 113
    bias: true

softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: true

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: 0

modules:
    wav2vec: *id002
    dec: *id003
    output_lin: *id004
model: &id005 !new:torch.nn.ModuleList
- [*id003, *id004]
model_wav2vec: !new:torch.nn.ModuleList
- [*id002]
opt_class: !name:torch.optim.Adadelta
    lr: 1
    rho: 0.95
    eps: 1.e-8

opt_class_wav2vec: !name:torch.optim.Adam
    lr: 0.0001

lr_annealing: &id006 !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: 1
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0

lr_annealing_wav2vec: &id007 !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: 0.0001
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: results/TARIC_SLU_wav2vec_wavLM_with_intent_criterion_a100_copie/1212/save
    recoverables:
        model: *id005
        wav2vec: *id002
        lr_annealing: *id006
        lr_annealing_wav2vec: *id007
        counter: *id008
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: results/TARIC_SLU_wav2vec_wavLM_with_intent_criterion_a100_copie/1212/train_log.txt

ctc_computer: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.ctc_loss
        blank_index: 0
        reduction: batch

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    merge_tokens: true

coer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    extract_concepts_values: true
    keep_values: false
    tag_in: <
    tag_out: >

cver_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    extract_concepts_values: true
    keep_values: true
    tag_in: <
    tag_out: >

tokenizer: !new:speechbrain.dataio.encoder.CTCTextEncoder

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        wav2vec: !ref <wav2vec>
        tokenizer: !ref <tokenizer>
    paths:
        model: !ref /content/sample_data/SLU/model.cpkt
        wav2vec: !ref /content/sample_data/SLU/wav2vec.cpkt
        tokenizer: !ref /content/sample_data/SLU/label_encoder.txt

decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
    blank_id: 0

# Tag list:
tag_list: <politeness>, <directives_query>, <directives_answer>, <age>, <age_req>,
    <age_ticket>, <an>, <answer>, <arrival_time>, <card_price>, <card_type>, <city>,
    <city_name_arrival>, <city_name_before>, <city_name_departure>, <city_name_direction>,
    <class_number>, <class_type>, <command_task>, <comparatif_age>, <comparatif_distance>,
    <comparatif_price>, <comparatif_time>, <coreference_city>, <coreference_departure>,
    <date>, <day>, <departure_time>, <discount_gain>, <discount_pourcent>, <duration>,
    <duration_req>, <existance>, <existance_req>, <hour_req>, <money_exchange>, <month>,
    <negation>, <number>, <number_class>, <number_of_train>, <number_req>, <object>,
    <option>, <other_transport>, <part_price>, <part_time>, <period_day>, <period_year>,
    <person_name>, <price_req>, <rang>, <ref_object>, <ref_person>, <ref_time>, <relative_day>,
    <relative_time>, <state>, <tarif>, <task>, <ticket_number>, <ticket_price>, <ticket_type>,
    <time>, <train_type>
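As a rough sketch of how this file is consumed: the YAML is loaded with `hyperpyyaml`, which instantiates the WavLM front end, the small DNN decoder and the `Pretrainer`; the pretrainer then fetches the checkpoints listed under `paths` and loads them into the corresponding loadables. The snippet below is an assumption-laden illustration, not part of the upload: it assumes a local copy named `hyperparams.yaml` and that the `/content/sample_data/SLU/...` paths above (Colab-specific) have been adjusted to reachable locations.

from hyperpyyaml import load_hyperpyyaml

with open("hyperparams.yaml") as f:
    hparams = load_hyperpyyaml(f)

# Fetch the files declared under `pretrainer.paths` and load their parameters
# into the `model`, `wav2vec` and `tokenizer` loadables defined above.
pretrainer = hparams["pretrainer"]
pretrainer.collect_files()
pretrainer.load_collected()

wav2vec = hparams["wav2vec"]  # WavLM-large encoder (trainable, freeze: false)
dec = hparams["dec"]          # single DNN block of 1024 neurons on top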
labelencoder.txt
ADDED
@@ -0,0 +1,113 @@
'<politeness>' => 109
'_' => 1
'A' => 2
'y' => 3
't' => 4
'f' => 5
'D' => 6
'l' => 7
'x' => 8
'w' => 9
'<directives_query>' => 10
'm' => 11
'E' => 12
'<hour_req>' => 13
'q' => 14
'3' => 15
'>' => 16
'b' => 17
'h' => 18
'<object>' => 19
'r' => 20
'n' => 21
'<directives_answer>' => 22
'<departure_time>' => 23
's' => 24
'<existance_req>' => 25
'v' => 26
'<ref_object>' => 27
'H' => 28
'd' => 29
'<relative_time>' => 30
'<answer>' => 31
'k' => 32
'ç' => 33
'<coreference_departure>' => 34
'<existance>' => 35
'<ticket_number>' => 36
'z' => 37
'<city_name_arrival>' => 38
'S' => 39
'j' => 40
'<train_type>' => 41
'9' => 42
'g' => 43
'<arrival_time>' => 44
'<command_task>' => 45
'T' => 46
'<ticket_price>' => 47
'<discount_gain>' => 48
'<discount_pourcent>' => 49
'<number_of_train>' => 50
'<person_name>' => 51
'<comparatif_time>' => 52
'<card_type>' => 53
'<relative_day>' => 54
'<negation>' => 55
'<price_req>' => 56
'<class_type>' => 57
'<money_exchange>' => 58
'<card_price>' => 59
'<ticket_type>' => 60
'<city_name_direction>' => 61
'<other_transport>' => 62
'Z' => 63
'7' => 64
'<age_ticket>' => 65
'<comparatif_age>' => 66
'<age>' => 67
'<tarif>' => 68
'<rang>' => 69
'<part_time>' => 70
'<period_day>' => 71
'<duration_req>' => 72
'<number>' => 73
'<part_price>' => 74
'ڥ' => 75
'<day>' => 76
'<coreference_city>' => 77
'<ref_time>' => 78
'<state>' => 79
'<city_name_departure>' => 80
'<comparatif_price>' => 81
'<duration>' => 82
'.' => 83
'<city_name_before>' => 84
'<date>' => 85
'<ref_person>' => 86
'<comparatif_distance>' => 87
'<number_req>' => 88
'<age_req>' => 89
'<option>' => 90
'<time>' => 91
'<an>' => 92
'<period_year>' => 93
'<month>' => 94
'$' => 95
'i' => 96
'e' => 97
'c' => 98
'u' => 99
'a' => 100
'p' => 101
'o' => 102
'<class_number>' => 103
'<directives_answer_request>' => 104
'<task>' => 105
'<city>' => 106
'<directives_request>' => 107
'<number_class>' => 108
'<blank>' => 0
================
'starting_index' => 0
'blank_label' => '<blank>'
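The label encoder maps each CTC output index to either a character or an SLU concept tag (the `<...>` entries), with index 0 reserved for the CTC blank. Below is a small sketch of reading it back with SpeechBrain's `CTCTextEncoder`; it assumes the `load()` method and `ind2lab` attribute of the installed SpeechBrain version, and the index list is purely illustrative.

from speechbrain.dataio.encoder import CTCTextEncoder

encoder = CTCTextEncoder()
encoder.load("labelencoder.txt")  # local copy of the file above

# Map predicted CTC indices back to characters / concept tags
predicted_indices = [109, 11, 20, 18, 17, 100]
labels = [encoder.ind2lab[i] for i in predicted_indices]
print(labels)  # according to the mapping above: ['<politeness>', 'm', 'r', 'h', 'b', 'a']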
lr_annealing.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8c4ea943b3cc3d6c91aa6843cf37362ffcad693e8f4cddfb85159458cc445598
size 697
lr_annealing_wav2vec.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9043595d8cb86f5dc698ec4c3880a6eba4ba0994c1389703069a1ddac323e905
size 713
model.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:94ad8f0789775a5708c8a5c365e1f5d7442270963566248075043d606570884d
size 4663251
optimizer.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3a18feb3922345456cb19d72567f0145816f4e7936d4e07917d35e50103c7bd0
size 9326243
optimizer_wav2vec.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c2acedf6d0996452544892ba315e242de4ef1bb38fef3609e355a1b7d3e51903
size 2524050533
wav2vec.ckpt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5e85d339d968c46bb6acb664586d8a11fcfa247f7f77546735a040649a47d8f4
size 1262004913