### Installing and importing dependencies

In [None]:
!pip install wget
!apt-get install sox libsndfile1 ffmpeg

!python -m pip install git+https://github.com/NVIDIA/NeMo.git@1fa961ba03ab5f8c91b278640e29807079373372#egg=nemo_toolkit[all]

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=fb233af0965c5da90b8babdcb0fbd51095c2a135ec877618013ed9078dced85b
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsndfile1 is already the newest version (1.0.31-2ubuntu0.1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
The following additional packages will be installed:
  libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base libsox3 libwavpack1
Suggested packages:
  libsox-fmt-all
The following NEW packages will be installed

In [None]:
import hydra
import soundfile as sf
import torch
from omegaconf import OmegaConf

### Downloading config, weights and audio example

In [None]:
import locale

# Make `locale.getpreferredencoding` always report UTF-8.
# NOTE(review): presumably a workaround for shell magics failing with a
# "UTF-8 locale is required" error in hosted runtimes - confirm it is still needed.
# The replacement keeps the stdlib signature (optional `do_setlocale`), so
# callers using `locale.getpreferredencoding(False)` keep working.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

# Loading weights, config and example wav for CTC-model
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ssl_model_weights.ckpt
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/encoder_config.yaml
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/example.wav

--2024-05-28 07:12:41--  https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/example.wav
Resolving n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)... 37.230.193.192
Connecting to n-ws-q0bez.s3pd12.sbercloud.ru (n-ws-q0bez.s3pd12.sbercloud.ru)|37.230.193.192|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 361324 (353K) [application/octet-stream]
Saving to: ‘example.wav’


2024-05-28 07:12:42 (583 KB/s) - ‘example.wav’ saved [361324/361324]



### Model instantiation and inference

In [None]:
class SpecScaler(torch.nn.Module):
    """Element-wise log scaling with clamping.

    Values are clamped to the range [1e-9, 1e9] before the natural logarithm
    is taken, so zero or negative inputs produce a finite result instead of
    ``-inf``/``nan``.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``log(clamp(x, 1e-9, 1e9))`` as a new tensor.

        The clamp is out-of-place: the caller's tensor is left unmodified,
        and leaf tensors that require grad are accepted (an in-place clamp
        would raise on those).
        """
        return torch.log(x.clamp(1e-9, 1e9))


# Run on GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
encoder_config = "encoder_config.yaml"
model_weights = "ssl_model_weights.ckpt"
audio_path = "example.wav"

conf = OmegaConf.load(encoder_config)

# Build the encoder from the config and load the checkpoint into it.
encoder = hydra.utils.instantiate(conf.encoder)
# NOTE(review): torch.load unpickles the file; the checkpoint is downloaded
# from a remote host, so only load it if that source is trusted (consider
# `weights_only=True` if the installed torch version supports it).
ckpt = torch.load(model_weights, map_location="cpu")
encoder.load_state_dict(ckpt, strict=True)
encoder.to(device)
# Inference mode for the module: disables train-time behaviour (e.g. dropout)
# so the encoded output is deterministic.
encoder.eval()

feature_extractor = hydra.utils.instantiate(conf.feature_extractor)

# NOTE(review): the sample rate returned by sf.read is discarded; this assumes
# the audio already matches what the feature extractor expects - confirm.
audio_signal, _ = sf.read(audio_path, dtype="float32")
features = feature_extractor(torch.tensor(audio_signal).float())
features = features.to(device)

# No gradients are needed for a forward-only pass; skipping autograd
# bookkeeping saves memory.
with torch.no_grad():
    encoded, _ = encoder(
        audio_signal=features.unsqueeze(0),  # add a leading batch dimension
        length=torch.tensor([features.shape[-1]]).to(device),
    )
print(f"encoded signal shape: {encoded.shape}")

encoded signal shape: torch.Size([1, 768, 283])
