| from pydub import AudioSegment | |
| AudioSegment.converter = "/usr/bin/ffmpeg" | |
| import numpy as np | |
| import torch | |
| import torch.nn.functional as F | |
| import matplotlib.pyplot as plt | |
| from typing import Optional | |
| class detectSpeech: | |
| def __init__( | |
| self, | |
| model_class, | |
| logMelSpectrogram, | |
| model_path: str, | |
| stride_s: int = 25, | |
| frame_rate_s: int = 25, | |
| device: Optional[str] = None, | |
| threshold: float = 0.5, | |
| batch_size: int = 32, | |
| sr: int = 16000 | |
| ): | |
| if device is None: | |
| self.device = "cuda" if torch.cuda.is_available() else ( | |
| "mps" if torch.mps.is_available() else "cpu" | |
| ) | |
| else: | |
| self.device = device | |
| self.model_path = model_path | |
| self.model = model_class.to(self.device) | |
| self.model.load_state_dict(torch.load(self.model_path, weights_only=True)) | |
| self.model.eval() | |
| self.log_mel_spec = logMelSpectrogram | |
| self.sr = sr | |
| self.stride = sr * stride_s // 1000 | |
| self.frame_rate = sr * frame_rate_s // 1000 | |
| def detect( | |
| self, | |
| audio_path: str | |
| ): | |
| audio = AudioSegment.from_file(audio_path) | |
| audio = audio.set_channels(1).set_frame_rate(self.sr) | |
| samples = np.array(audio.get_array_of_samples(), dtype=np.float32) | |
| log_mel = self.log_mel_spec.transform(samples=samples, sr=self.sr).to(self.device) | |
| chunks_mel = log_mel.unfold(dimension=1, size=self.frame_rate, step=self.stride) | |
| chunks_mel = chunks_mel.permute(1, 0, 2) | |
| chunks_mel = F.normalize(chunks_mel).unsqueeze(1) | |
| with torch.no_grad(): | |
| outputs = self.model.forward(chunks_mel) | |
| outputs = torch.sigmoid(outputs) | |
| outputs = (outputs >= 0.5).int() | |
| onset, offset = torch.split(outputs, 400, dim=1) | |
| return torch.flatten(onset), torch.flatten(offset) | |