### Installing and importing dependencies

In [None]:
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install matplotlib>=3.3.2

!python -m pip install git+https://github.com/NVIDIA/NeMo.git@1fa961ba03ab5f8c91b278640e29807079373372#egg=nemo_toolkit[all]
!python -m pip install pyannote.audio==3.2.0

Collecting wget
 Downloading wget-3.2.zip (10 kB)
 Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
 Building wheel for wget (setup.py) ... [?25l[?25hdone
 Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=bbb22603efb56b962ad6a42332b0daf8d137a1446f5e46d6af4c4c2fb216fbdd
 Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsndfile1 is already the newest version (1.0.31-2ubuntu0.1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
The following additional packages will be installed:
 libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base libsox3 libwavpack1
Suggested packages:
 libsox-fmt-all
The following NEW packages will be installed:
 libo

In [3]:
import torch
import torchaudio
from nemo.collections.asr.models import EncDecCTCModel
from nemo.collections.asr.modules.audio_preprocessing import (
 AudioToMelSpectrogramPreprocessor as NeMoAudioToMelSpectrogramPreprocessor,
)
from nemo.collections.asr.parts.preprocessing.features import (
 FilterbankFeaturesTA as NeMoFilterbankFeaturesTA,
)

### Downloading config, weights and audio example

In [None]:
import locale

locale.getpreferredencoding = lambda: "UTF-8"

# Loading weights, config and example wav for CTC-model
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_weights.ckpt
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_config.yaml
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/example.wav
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/long_example.wav

--2024-05-28 07:16:36-- https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/example.wav
Resolving n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)... 37.230.193.192
Connecting to n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)|37.230.193.192|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 361324 (353K) [application/octet-stream]
Saving to: ‘example.wav’


2024-05-28 07:16:38 (646 KB/s) - ‘example.wav’ saved [361324/361324]



### Adding modules for feature extraction

In [4]:
class FilterbankFeaturesTA(NeMoFilterbankFeaturesTA):
    """NeMo torchaudio featurizer with its mel-spectrogram extractor rebuilt.

    After the parent constructor runs, ``_mel_spec_extractor`` is replaced by a
    ``torchaudio.transforms.MelSpectrogram`` built from the same keyword
    arguments plus two extra knobs exposed here.

    Args:
        mel_scale: Mel scale name forwarded to ``MelSpectrogram`` (default ``"htk"``).
        wkwargs: Optional keyword arguments forwarded to the window function.
        **kwargs: Forwarded to the parent constructor. ``nfilt``, ``window``,
            ``mel_norm`` and ``n_fft`` are required here; ``highfreq`` and
            ``lowfreq`` are optional.
    """

    def __init__(self, mel_scale: str = "htk", wkwargs=None, **kwargs):
        # These two arguments are removed before delegating to the parent.
        # NOTE(review): presumably the parent does not accept them — confirm against NeMo.
        for dropped_key in ("window_size", "window_stride"):
            kwargs.pop(dropped_key, None)

        super().__init__(**kwargs)

        # Sample rate, window length and hop length come from attributes the
        # parent constructor has set; everything else comes from kwargs.
        mel_extractor = torchaudio.transforms.MelSpectrogram(
            sample_rate=self._sample_rate,
            n_fft=kwargs["n_fft"],
            win_length=self.win_length,
            hop_length=self.hop_length,
            f_min=kwargs.get("lowfreq", 0),
            f_max=kwargs.get("highfreq"),
            n_mels=kwargs["nfilt"],
            window_fn=self.torch_windows[kwargs["window"]],
            wkwargs=wkwargs,
            norm=kwargs["mel_norm"],
            mel_scale=mel_scale,
        )
        self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = mel_extractor


class AudioToMelSpectrogramPreprocessor(NeMoAudioToMelSpectrogramPreprocessor):
    """NeMo mel-spectrogram preprocessor that uses the local ``FilterbankFeaturesTA``.

    Args:
        mel_scale: Mel scale name passed through to ``FilterbankFeaturesTA``.
        **kwargs: Forwarded unchanged to the parent constructor; must contain
            ``features``, which is handed to the featurizer under the name ``nfilt``.
    """

    def __init__(self, mel_scale: str = "htk", **kwargs):
        super().__init__(**kwargs)

        # The featurizer expects the value under `nfilt`; this class receives it as `features`.
        kwargs["nfilt"] = kwargs.pop("features")

        # Overwrite `self.featurizer` with the customised torchaudio featurizer.
        # Deprecated arguments still present in kwargs are kept for config compatibility.
        self.featurizer = FilterbankFeaturesTA(mel_scale=mel_scale, **kwargs)

### Transcription example with the CTC model

In [10]:
# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the model from its YAML config, then fill in the checkpoint weights.
model = EncDecCTCModel.from_config_file("./ctc_model_config.yaml")
# The checkpoint is first loaded onto the CPU, whatever the target device is.
ckpt = torch.load("./ctc_model_weights.ckpt", map_location="cpu")
# strict=False: key mismatches between checkpoint and model are tolerated.
model.load_state_dict(ckpt, strict=False)

# Switch to evaluation mode and move the model to the selected device.
model = model.eval().to(device)

[NeMo I 2024-05-28 20:34:54 features:305] PADDING: 0


In [6]:
# Transcribe the short example clip; the cell result is the list of recognised texts.
model.transcribe(["example.wav"])

Transcribing: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.26it/s]


['ничьих не требуя похвал счастлив уж я надеждой сладкой что дева с трепетом любви посмотрит может быть украдкой на песни грешные мои у лукоморья дуб зеленый']

### Long-form transcription example


In [7]:
from io import BytesIO
from typing import List, Tuple

import numpy as np
from pyannote.audio import Pipeline
from pydub import AudioSegment

Helper functions for audio segmentation

In [8]:
def audiosegment_to_numpy(audiosegment: AudioSegment) -> np.ndarray:
    """Convert a pydub ``AudioSegment`` to a float32 numpy array scaled to [-1.0, 1.0).

    Args:
        audiosegment: Segment to convert. Its ``channels`` and ``sample_width``
            (bytes per sample) attributes decide the output shape and scaling.

    Returns:
        A C-ordered float32 array. Mono audio yields shape ``(n_samples,)``;
        audio with ``C > 1`` channels yields ``(n_frames, C)``.
    """
    samples = np.array(audiosegment.get_array_of_samples())

    # Samples are interleaved by channel, so each row becomes one frame.
    channels = audiosegment.channels
    if channels > 1:
        samples = samples.reshape((-1, channels))

    # Normalise by the full-scale value of the signed integer sample type
    # (32768 for 16-bit audio) rather than assuming 16-bit input.
    full_scale = float(1 << (8 * audiosegment.sample_width - 1))
    samples = samples.astype(np.float32, order="C") / full_scale
    return samples


def segment_audio(
    audio_path: str,
    pipeline: Pipeline,
    max_duration: float = 22.0,
    min_duration: float = 15.0,
    new_chunk_threshold: float = 0.2,
) -> Tuple[List[np.ndarray], List[List[float]]]:
    """Split a WAV file into ASR-sized chunks along speech-activity boundaries.

    The pipeline's speech regions are visited in order and merged into a
    running chunk. The running chunk is closed and a new one started when either
    (a) it is already longer than ``min_duration`` and the gap before the next
    region exceeds ``new_chunk_threshold``, or (b) extending it to the end of
    the next region would make it longer than ``max_duration``.

    Notes:
        * A single speech region longer than ``max_duration`` is not split, so
          a chunk can exceed ``max_duration``.
        * The running chunk starts at 0 s, so the first chunk normally begins
          at the start of the file rather than at the first speech region.

    Args:
        audio_path: Path to the WAV file.
        pipeline: Callable pyannote pipeline; its result must provide
            ``get_timeline().support()``.
        max_duration: Upper bound, in seconds, used by rule (b).
        min_duration: Lower bound, in seconds, used by rule (a).
        new_chunk_threshold: Silence gap, in seconds, used by rule (a).

    Returns:
        ``(segments, boundaries)``: the chunk waveforms produced by
        ``audiosegment_to_numpy`` and the matching ``[start, end]`` pairs in seconds.
    """
    # Prepare audio for pyannote vad pipeline
    audio = AudioSegment.from_wav(audio_path)
    audio_bytes = BytesIO()
    audio.export(audio_bytes, format="wav")
    audio_bytes.seek(0)

    # Process audio with pipeline to obtain segments with speech activity
    sad_segments = pipeline({"uri": "filename", "audio": audio_bytes})

    # len(audio) is in milliseconds; computed once instead of per region.
    audio_duration = len(audio) / 1000

    segments = []
    boundaries = []

    def append_chunk(chunk_start, chunk_end):
        """Store audio[chunk_start:chunk_end] (seconds) unless the span is empty."""
        # Without this guard, a first speech region ending later than
        # max_duration would emit an empty leading chunk with boundary [0, 0].
        if chunk_end <= chunk_start:
            return
        # AudioSegment slicing is indexed in milliseconds.
        segments.append(
            audiosegment_to_numpy(audio[chunk_start * 1000 : chunk_end * 1000])
        )
        boundaries.append([chunk_start, chunk_end])

    curr_duration = 0
    curr_start = 0
    curr_end = 0

    # Concat segments from pipeline into chunks for asr according to max/min duration
    for segment in sad_segments.get_timeline().support():
        # Clamp the region to the actual extent of the audio.
        start = max(0, segment.start)
        end = min(audio_duration, segment.end)

        long_enough_and_gap = (
            curr_duration > min_duration and start - curr_end > new_chunk_threshold
        )
        would_exceed_max = curr_duration + (end - curr_end) > max_duration
        if long_enough_and_gap or would_exceed_max:
            append_chunk(curr_start, curr_end)
            curr_start = start

        curr_end = end
        curr_duration = curr_end - curr_start

    # Flush whatever is left in the running chunk.
    append_chunk(curr_start, curr_end)

    return segments, boundaries

Initializing pyannote VAD pipeline and using it for segmentation

In [11]:
import os

# Hugging Face access token for the pyannote model. It is read from the
# HF_TOKEN environment variable so the secret is never saved in the notebook;
# when the variable is unset this falls back to an empty string.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Initialize pyannote pipeline
pipeline = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection", use_auth_token=HF_TOKEN
)
# NOTE(review): pyannote.audio 3.x appears to return None (after printing a hint)
# when the model cannot be downloaded — confirm. Fail here with a clear message
# instead of an AttributeError on `.to` below.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/voice-activity-detection'. Set the HF_TOKEN "
        "environment variable to a Hugging Face token that has access to the model."
    )
pipeline = pipeline.to(torch.device(device))

# Segment audio
segments, boundaries = segment_audio("./long_example.wav", pipeline)

Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../libs_cache/pyannote_cache/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e755bb7223f76666bba4/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.2.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.3.0+cu121. Bad things might happen unless you revert torch to 1.x.


In [15]:
# Transcribing segments
# BATCH_SIZE is forwarded to `transcribe` as its `batch_size` argument.
BATCH_SIZE = 10
# `segments` holds the numpy chunks returned by `segment_audio` in an earlier cell.
transcriptions = model.transcribe(segments, batch_size=BATCH_SIZE)

Transcribing: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3.23it/s]


In [16]:
def format_time(seconds):
    """Format a non-negative duration in seconds as ``MM:SS:CC``.

    ``CC`` is hundredths of a second. Durations of one hour or more are
    rendered as ``HH:MM:SS:CC``.

    The value is rounded to the nearest hundredth and then split with integer
    arithmetic. Truncating the fractional part after a float subtraction would
    turn 16.83 into ``00:16:82``.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        The formatted time string.
    """
    total_centiseconds = int(round(seconds * 100))
    total_seconds, centiseconds = divmod(total_centiseconds, 100)
    total_minutes, full_seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    if hours > 0:
        return f"{hours:02}:{minutes:02}:{full_seconds:02}:{centiseconds:02}"
    return f"{minutes:02}:{full_seconds:02}:{centiseconds:02}"


# Print every chunk's transcription, prefixed with its formatted time span.
for boundary, transcription in zip(boundaries, transcriptions):
    boundary_0, boundary_1 = (format_time(edge) for edge in boundary[:2])
    print(f"[{boundary_0} - {boundary_1}]: {transcription}\n")

[00:00:00 - 00:16:83]: вечерня отошла давно но в кельях тихо и темно уже и сам эгумин строгий свои молитвы прекратил и кости ветхие склонил перекрестясь на одр убогий кругом и сон и тишина но церкви дверь отворена

[00:17:10 - 00:32:61]: трепещет луч лампады и тускло озаряет он и темную живопись икон и возлощенные оклады и раздается в тишине то тяжкий вздох то шепот важный и мрачно дремлет в вашине старинный свод

[00:32:95 - 00:49:33]: глухой и влажный стоят за клиросом чернец и грешник неподвижны оба и шепот их как глаз из гроба и грешник бледен как мертвец монах несчастный полно перестань

[00:49:82 - 01:05:74]: ужасна исповедь злодея заплачена тобою дань тому кто в злобе пламенее лукаво грешника блюдет и к вечной гибели ведет смирись опомнись время время раскаянья покров

[01:05:97 - 01:10:90]: я разрешу тебя грехов сложи мучительное бремя

