File size: 1,704 Bytes
30f8290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
import sys
import torch

import numpy as np
import torch.nn as nn

# Put the current working directory on the module search path. Presumably this
# lets the project-local `main.*` package (imported lazily inside the
# Spectrogram class) resolve when the process is launched from the repository
# root — TODO confirm against how the project is started.
sys.path.append(os.getcwd())

class Spectrogram(nn.Module):
    """Magnitude spectrogram of batched, multi-channel audio.

    The module folds the channel axis into the batch axis, runs a short-time
    Fourier transform, takes the magnitude, clamps it from below and finally
    restores the separate batch and channel axes.

    Args:
        hop_length: Hop between consecutive STFT frames, in samples.
        win_length: Length of the Hann analysis window, in samples.
        n_fft: FFT size. When ``None`` the window length is used.
        clamp: Lower bound applied to every magnitude value.

    Attributes:
        window: Hann window of ``win_length`` samples, registered as a
            non-persistent buffer (it follows ``.to(...)`` but is not written
            to ``state_dict()``).
    """

    def __init__(self, hop_length, win_length, n_fft=None, clamp=1e-10):
        super().__init__()
        self.hop_length = hop_length
        self.win_length = win_length
        # Fall back to the window length when no explicit FFT size is given.
        self.n_fft = n_fft if n_fft is not None else win_length
        self.clamp = clamp

        hann = torch.hann_window(win_length)
        self.register_buffer("window", hann, persistent=False)

    def _backend_magnitude(self, flat_audio):
        """Return the magnitude from the project STFT backend, built on first use."""
        if not hasattr(self, "stft"):
            # Imported lazily so the backend package is only required on the
            # devices that actually take this path.
            from main.library.backends.utils import STFT

            backend = STFT(
                filter_length=self.n_fft,
                hop_length=self.hop_length,
                win_length=self.win_length,
            )
            self.stft = backend.to(flat_audio.device)

        # NOTE(review): the meaning of the second argument (1e-9) is defined
        # by STFT.transform, which is not visible here — presumably an epsilon.
        return self.stft.transform(flat_audio, 1e-9)

    def _torch_magnitude(self, flat_audio, center):
        """Return the magnitude of ``torch.stft`` applied to 2-D audio."""
        spectrum = torch.stft(
            flat_audio,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window,
            center=center,
            pad_mode="reflect",
            return_complex=True,
        )
        real_part = spectrum.real
        imag_part = spectrum.imag
        return torch.sqrt(real_part.pow(2) + imag_part.pow(2))

    def forward(self, audio, center=True):
        """Compute the clamped magnitude spectrogram of ``audio``.

        Args:
            audio: 3-D tensor unpacked as ``(batch, channels, samples)``.
            center: Forwarded to ``torch.stft``. It is not used when the
                tensor lives on an ``ocl`` / ``privateuseone`` device, where
                the project STFT backend is called instead.

        Returns:
            A 4-D tensor ``(batch, channels, dim1, dim2)`` where ``dim1`` and
            ``dim2`` are the last two axes of the transposed magnitude. On the
            ``torch.stft`` path that is ``(frames, n_fft // 2 + 1)``; the
            backend path presumably yields the same layout — TODO confirm.
        """
        batch, channels, num_samples = audio.shape
        flat_audio = audio.reshape(batch * channels, num_samples)

        device_name = str(flat_audio.device)
        if device_name.startswith(("ocl", "privateuseone")):
            magnitude = self._backend_magnitude(flat_audio)
        else:
            magnitude = self._torch_magnitude(flat_audio, center)

        # Swap the last two axes, then floor the values at ``self.clamp``.
        swapped = magnitude.transpose(1, 2)
        floored = torch.clamp(swapped, min=self.clamp, max=np.inf)

        dim1, dim2 = floored.shape[1], floored.shape[2]
        return floored.reshape(batch, channels, dim1, dim2)