Voice Activity Detection
English
File size: 1,944 Bytes
d1124fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77


from pydub import AudioSegment
AudioSegment.converter = "/usr/bin/ffmpeg"

import numpy as np

import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt

from typing import Optional

class detectSpeech:
    """Frame-level speech onset/offset detector built around a pre-trained model.

    The detector loads weights into a caller-supplied ``torch.nn.Module``,
    converts an audio file into a log-mel representation through a
    caller-supplied transform object, slices that representation into
    fixed-size windows and thresholds the model's sigmoid outputs.
    """

    # Width of each half of the model output. ``detect`` splits the output
    # along dim 1 into pieces of this size and expects exactly two pieces.
    # NOTE(review): 400 equals the default frame length
    # (16000 * 25 // 1000); presumably the model emits 2 * 400 logits per
    # window -- confirm before using a non-default frame length.
    _OUTPUT_SPLIT = 400

    def __init__(
        self,
        model_class,
        logMelSpectrogram,
        model_path: str,
        stride_s: int = 25,
        frame_rate_s: int = 25,
        device: Optional[str] = None,
        threshold: float = 0.5,
        batch_size: int = 32,
        sr: int = 16000
    ) -> None:
        """Load the model weights and store the windowing configuration.

        Args:
            model_class: An already-constructed ``torch.nn.Module`` (despite
                the name, an instance, since ``.to`` and ``load_state_dict``
                are called on it directly).
            logMelSpectrogram: Object exposing
                ``transform(samples=..., sr=...)`` that returns a tensor.
            model_path: Path to a state-dict checkpoint for ``model_class``.
            stride_s: Hop between consecutive windows. Interpreted as
                milliseconds: it is converted with ``sr * stride_s // 1000``.
            frame_rate_s: Window length, converted the same way as
                ``stride_s``.
            device: Torch device string. When ``None``, picks ``"cuda"``,
                then ``"mps"``, then ``"cpu"``, whichever is available first.
            threshold: Probability at or above which an output is marked 1.
            batch_size: Maximum number of windows per forward pass. A value
                that is ``None`` or below 1 runs all windows in one pass.
            sr: Sample rate the audio is resampled to before the transform.
        """
        if device is None:
            # torch.backends.mps is the long-standing availability probe;
            # guard with getattr so very old torch builds fall through to CPU.
            mps_backend = getattr(torch.backends, "mps", None)
            if torch.cuda.is_available():
                device = "cuda"
            elif mps_backend is not None and mps_backend.is_available():
                device = "mps"
            else:
                device = "cpu"
        self.device = device

        self.model_path = model_path

        self.model = model_class.to(self.device)
        # map_location lets a checkpoint saved on one device type (e.g. CUDA)
        # be loaded on a host that only has another (e.g. CPU).
        state_dict = torch.load(
            self.model_path, map_location=self.device, weights_only=True
        )
        self.model.load_state_dict(state_dict)
        self.model.eval()

        self.log_mel_spec = logMelSpectrogram

        self.threshold = threshold
        self.batch_size = batch_size

        self.sr = sr
        self.stride = sr * stride_s // 1000
        self.frame_rate = sr * frame_rate_s // 1000

    def detect(
        self,
        audio_path: str
    ):
        """Run onset/offset detection on an audio file.

        The file is decoded with pydub, down-mixed to one channel and
        resampled to ``self.sr``. Samples are cast to ``float32`` without
        rescaling before being handed to the log-mel transform.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            A pair ``(onset, offset)`` of flattened ``int32`` tensors holding
            0/1 decisions.
        """
        audio = AudioSegment.from_file(audio_path)
        mono = audio.set_channels(1).set_frame_rate(self.sr)
        samples = np.array(mono.get_array_of_samples(), dtype=np.float32)

        log_mel = self.log_mel_spec.transform(samples=samples, sr=self.sr)
        return self._classify_frames(log_mel)

    def _classify_frames(self, log_mel):
        """Window a 2-D log-mel tensor and threshold the model outputs.

        Args:
            log_mel: 2-D tensor; windows are cut along dim 1.
                Assumes layout (n_mels, time) -- TODO confirm against the
                transform implementation.

        Returns:
            ``(onset, offset)`` as flattened ``int32`` tensors.
        """
        log_mel = log_mel.to(self.device)

        # (d0, time) -> (d0, n_windows, frame) -> (n_windows, d0, frame)
        windows = log_mel.unfold(
            dimension=1, size=self.frame_rate, step=self.stride
        )
        windows = windows.permute(1, 0, 2)

        # L2-normalise along dim 1 (F.normalize default), then add the
        # singleton dim the model receives at position 1.
        windows = F.normalize(windows).unsqueeze(1)

        step = self.batch_size
        if step is None or step < 1:
            step = windows.shape[0]

        with torch.no_grad():
            # Call the module itself (not .forward) so registered hooks run,
            # and feed it bounded batches to cap activation memory.
            logits = torch.cat(
                [self.model(batch) for batch in torch.split(windows, step, dim=0)],
                dim=0,
            )
            probabilities = torch.sigmoid(logits)
            decisions = (probabilities >= self.threshold).int()

            onset, offset = torch.split(decisions, self._OUTPUT_SPLIT, dim=1)

        return torch.flatten(onset), torch.flatten(offset)