Upload 3 files

Browse files

Files changed (3) hide show

logMelSpectrogram.py +175 -0
sadModel.py +32 -0
speech_detection.py +76 -0

logMelSpectrogram.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import torch
+import numpy as np
+from matplotlib import pyplot as plt
+from typing import Optional
+class logMelSpectrogram:
+    def __init__(
+            self,
+            frame_rate_s: int = 30,
+            stride_s: int = 10,
+            n_fft: Optional[int] = None,
+            n_mels: Optional[int] = 40,
+            top_db: int = 80,
+            pre_emph_coef: float = 0.95,
+            device: Optional[str] = None
+    ):
+        self.frame_rate_s = frame_rate_s
+        self.stride_s = stride_s
+        self.n_fft = n_fft
+        self.n_mels = n_mels
+        self.log_mel_spec_is_computed = False
+        self.top_db = top_db
+        self.pre_emph_coef = pre_emph_coef
+        if not device:
+            self.device = "cuda" if torch.cuda.is_available() else (
+                "mps" if torch.mps.is_available() else "cpu"
+            )
+        self.device = device
+        torch.set_default_device(device)
+        torch.set_default_dtype(torch.float32)
+    def transform(
+            self,
+            samples: np.array,
+            sr: int,
+    ):
+        self.samples = torch.from_numpy(samples)
+        self.sr = sr
+        if self.samples.shape[0] < 2:
+            raise ValueError("Samples should be longer than two")
+        # pre emphasis
+        # it's necessary to compensate the audio roll off
+        # meaning it amplifies the difference between current signal
+        # and previous one
+        pre_emph_samples = torch.cat([
+            self.samples[0:1],
+            self.samples[1:] - self.pre_emph_coef * self.samples[:-1]
+        ], dim=0)
+        # framing
+        # it's needed to turn the audio into descrete overlapping chunks
+        stride = self.sr * self.stride_s // 1000
+        frame_rate = self.sr * self.frame_rate_s // 1000
+        chunks = pre_emph_samples.unfold(0, frame_rate, stride).contiguous()
+        num_of_frames = chunks.shape[0]
+        # hann window to smooth out the edges
+        # as i understand, it is necessary to
+        # smooth out the edges of chunks to avoid
+        # sudden drops and rises in volume
+        n = torch.arange(frame_rate)
+        hanning_weights = 0.5 - 0.5 * torch.cos(2 * torch.pi * n / (frame_rate - 1))
+        weighted_chunks = chunks * hanning_weights
+        # applying fast fourier transform
+        # to decompose "raw" audio into underlying frequencies
+        # only positive frequencies are taken, because negative freqs
+        # dont bring new information
+        # so there are about "half" (n_fft / 2 + 1) extracted
+        if not self.n_fft:
+            self.n_fft = 2 ** torch.ceil(torch.log2(torch.tensor(frame_rate, dtype=torch.float32))).to(torch.int32)
+        fft_chunks = torch.fft.rfft(weighted_chunks, n=self.n_fft)
+        power_spec = (2 / self.n_fft ** 2) * torch.abs(fft_chunks) ** 2
+        # herz to mels converter and vice versa
+        def hz_to_mel(hz):
+            return 2595 * torch.log10(1 + hz / 700)
+        def mel_to_hz(m):
+            return 700 * (10 ** (m / 2595) - 1)
+        fmax = self.sr / 2
+        fmin = 0
+        # here we create mels scale
+        mels = torch.linspace(
+            hz_to_mel(torch.tensor(fmin)),
+            hz_to_mel(torch.tensor(fmax)),
+            self.n_mels + 2
+        )
+        # converting linear mels to hz thus
+        # introducing non-linearity
+        hz_points = mel_to_hz(mels)
+        bins = torch.floor((self.n_fft + 1) * hz_points / self.sr).to(torch.int32)
+        # building triangular filters
+        # that are overlapping and gain "energy" with the increase of hz
+        # simulating human hearing that is better at distinguishing between lower
+        # freqs than higher ones
+        # so as the hz rises the filter becomes bigger
+        # and, if one might say, less sensitive
+        k = torch.arange(self.n_fft // 2 + 1).unsqueeze(0)
+        f_left = bins[:-2].unsqueeze(1)
+        f_center = bins[1:-1].unsqueeze(1)
+        f_right = bins[2:].unsqueeze(1)
+        up = (k - f_left) / torch.clamp(f_center - f_left, min=1e-8)      # (n_mels, bins)
+        down = (f_right - k) / torch.clamp(f_right - f_center, min=1e-8) # (n_mels, bins)
+        filters = torch.clamp(torch.minimum(up, down), min=0.0)
+        mel_spec = torch.matmul(filters, power_spec.T)
+        # converting mel spectogram to log scale
+        mel_spec = torch.clamp(mel_spec, min=1e-10)
+        log_mel_spec = 10 * torch.log10(mel_spec)
+        # normalising
+        log_mel_spec = torch.clamp(
+            log_mel_spec,
+            min=torch.max(log_mel_spec) - self.top_db
+        )
+        self.log_mel_spec = log_mel_spec
+        self.log_mel_spec_is_computed = True
+        return log_mel_spec
+    def plot_waveform(self):
+        plt.figure(figsize=(10, 4))
+        cpu_samples = self.samples.cpu().numpy()
+        plt.plot(np.arange(cpu_samples.shape[0]) / self.sr, cpu_samples)
+        plt.title("Waveform")
+        plt.xlabel("Time (s)")
+        plt.ylabel("Amplitude")
+        plt.show()
+    def plot_log_mel_spec(self, cmap="magma_r"):
+        if not self.log_mel_spec_is_computed:
+            raise ValueError("run compute() before plotting log mel spectogram")
+        plt.figure(figsize=(10, 4))
+        spec_to_plot = self.log_mel_spec.cpu().numpy()
+        plt.imshow(spec_to_plot, origin="lower", aspect="auto", cmap=cmap)
+        plt.title("Log-Mel Spectrogram (dB)")
+        plt.xlabel("Time frames")
+        plt.ylabel("Mel bins")
+        plt.colorbar()
+        plt.show()

sadModel.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import torch
+from torch import nn
+class sadModel(nn.Module):
+    def __init__(self, input_dim=40, hidden_dim=64, num_layers=1, output_dim=800):
+        super(sadModel, self).__init__()
+        # GRU expects input: (seq_len, batch, input_size)
+        self.gru = nn.GRU(
+            input_size=input_dim,
+            hidden_size=hidden_dim,
+            num_layers=num_layers,
+            batch_first=True,
+            bidirectional=True
+        )
+        self.fc = nn.Linear(hidden_dim * 2 * 400, output_dim)  # 2 for bidirectional
+    def forward(self, x):
+        # x: (batch, 1, 40, 400) -> remove channel dim and permute
+        x = x.squeeze(1).permute(0, 2, 1)  # (batch, 400, 40)
+        # pass through gru
+        out, _ = self.gru(x)  # out: (batch, 400, hidden_dim*2)
+        # flatten time dimension
+        out = out.contiguous().view(out.size(0), -1)  # (batch, 400*hidden_dim*2)
+        out = self.fc(out)  # (batch, 800)
+        return out

speech_detection.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from pydub import AudioSegment
+AudioSegment.converter = "/usr/bin/ffmpeg"
+import numpy as np
+import torch
+import torch.nn.functional as F
+import matplotlib.pyplot as plt
+from typing import Optional
+class detectSpeech:
+    def __init__(
+        self,
+        model_class,
+        logMelSpectrogram,
+        model_path: str,
+        stride_s: int = 25,
+        frame_rate_s: int = 25,
+        device: Optional[str] = None,
+        threshold: float = 0.5,
+        batch_size: int = 32,
+        sr: int = 16000
+    ):
+        if device is None:
+            self.device = "cuda" if torch.cuda.is_available() else (
+                "mps" if torch.mps.is_available() else "cpu"
+            )
+        else:
+            self.device = device
+        self.model_path = model_path
+        self.model = model_class.to(self.device)
+        self.model.load_state_dict(torch.load(self.model_path, weights_only=True))
+        self.model.eval()
+        self.log_mel_spec = logMelSpectrogram
+        self.sr = sr
+        self.stride = sr * stride_s // 1000
+        self.frame_rate = sr * frame_rate_s // 1000
+    def detect(
+        self,
+        audio_path: str
+    ):
+        audio = AudioSegment.from_file(audio_path)
+        audio = audio.set_channels(1).set_frame_rate(self.sr)
+        samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
+        log_mel = self.log_mel_spec.transform(samples=samples, sr=self.sr).to(self.device)
+        chunks_mel = log_mel.unfold(dimension=1, size=self.frame_rate, step=self.stride)
+        chunks_mel = chunks_mel.permute(1, 0, 2)
+        chunks_mel = F.normalize(chunks_mel).unsqueeze(1)
+        with torch.no_grad():
+            outputs = self.model.forward(chunks_mel)
+            outputs = torch.sigmoid(outputs)
+            outputs = (outputs >= 0.5).int()
+            onset, offset = torch.split(outputs, 400, dim=1)
+        return torch.flatten(onset), torch.flatten(offset)