File size: 1,944 Bytes
d1124fa | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
from pydub import AudioSegment
# Point pydub at an explicit converter binary. The path is hard-coded, so it
# must be changed on hosts where ffmpeg is installed somewhere else.
AudioSegment.converter = "/usr/bin/ffmpeg"
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from typing import Optional
class detectSpeech:
    """Run a trained onset/offset model over an audio file.

    The audio is decoded with pydub, turned into a log-mel representation by
    the supplied ``logMelSpectrogram`` object, cut into fixed-size windows and
    fed to the model. The sigmoid of the model output is thresholded into two
    flat 0/1 tensors: onsets and offsets.
    """

    def __init__(
        self,
        model_class,
        logMelSpectrogram,
        model_path: str,
        stride_s: int = 25,
        frame_rate_s: int = 25,
        device: Optional[str] = None,
        threshold: float = 0.5,
        batch_size: int = 32,
        sr: int = 16000
    ):
        """Load the model weights and precompute the windowing parameters.

        Args:
            model_class: Model object to run. ``.to(device)``,
                ``.load_state_dict(...)`` and ``.eval()`` are called on it
                directly, so despite the name it must be an instance.
            logMelSpectrogram: Object exposing ``transform(samples=, sr=)``
                that returns a tensor.
            model_path: Path of the state-dict checkpoint to load.
            stride_s: Hop between windows. Converted with
                ``sr * stride_s // 1000``, i.e. treated as milliseconds.
            frame_rate_s: Window length, converted the same way as
                ``stride_s``.
            device: Torch device string. When ``None`` the first available of
                ``"cuda"``, ``"mps"``, ``"cpu"`` is used.
            threshold: Probability at or above which an output counts as 1.
            batch_size: Maximum number of windows per forward pass. A
                non-positive or ``None`` value means a single pass.
            sr: Sample rate the audio is resampled to.
        """
        if device is None:
            if torch.cuda.is_available():
                device = "cuda"
            elif torch.backends.mps.is_available():
                # torch.backends.mps is the long-standing availability probe;
                # torch.mps.is_available only exists in recent releases.
                device = "mps"
            else:
                device = "cpu"
        self.device = device
        self.model_path = model_path
        self.model = model_class.to(self.device)
        # map_location lets a checkpoint saved on another device (e.g. CUDA)
        # load on whichever device was selected above.
        state_dict = torch.load(
            self.model_path, map_location=self.device, weights_only=True
        )
        self.model.load_state_dict(state_dict)
        self.model.eval()
        self.log_mel_spec = logMelSpectrogram
        self.sr = sr
        self.threshold = threshold
        self.batch_size = batch_size
        self.stride = sr * stride_s // 1000
        self.frame_rate = sr * frame_rate_s // 1000

    def detect(
        self,
        audio_path: str
    ):
        """Detect onsets and offsets in the audio file at ``audio_path``.

        The file is downmixed to one channel and resampled to ``self.sr``.
        Sample values are cast to float32 without rescaling before being
        handed to the log-mel transform.

        Returns:
            A ``(onset, offset)`` pair of flat integer tensors of 0/1 values.
        """
        audio = AudioSegment.from_file(audio_path)
        audio = audio.set_channels(1).set_frame_rate(self.sr)
        samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
        log_mel = self.log_mel_spec.transform(samples=samples, sr=self.sr)
        return self._predict(log_mel.to(self.device))

    def _predict(self, log_mel):
        """Window ``log_mel``, run the model in batches and threshold it."""
        # Slide a window of ``frame_rate`` elements along dim 1 and move the
        # window index to the front so that it acts as the batch dimension.
        # NOTE(review): assumes log_mel is 2-D (features, time) - confirm.
        chunks = log_mel.unfold(
            dimension=1, size=self.frame_rate, step=self.stride
        )
        chunks = chunks.permute(1, 0, 2)
        # F.normalize defaults to an L2 norm over dim 1; the extra axis is
        # inserted afterwards at position 1.
        chunks = F.normalize(chunks).unsqueeze(1)

        if self.batch_size and self.batch_size > 0:
            step = self.batch_size
        else:
            step = chunks.shape[0]
        with torch.no_grad():
            # Calling the model (rather than .forward) keeps hooks working.
            logits = torch.cat(
                [self.model(batch) for batch in torch.split(chunks, step)],
                dim=0,
            )
        decisions = (torch.sigmoid(logits) >= self.threshold).int()
        # NOTE(review): the split width 400 is hard-coded; unpacking into two
        # parts only works when the model emits 401-800 columns (presumably
        # 800: onsets then offsets) - confirm against the model definition.
        onset, offset = torch.split(decisions, 400, dim=1)
        return torch.flatten(onset), torch.flatten(offset)
|