ebook2audiobook / lib /classes /background_detector.py
priteshmistry's picture
Upload 22 files
aa7ea23 verified
import os
import numpy as np
import librosa
from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection
from lib.conf import tts_dir
from lib.models import default_voice_detection_model
class BackgroundDetector:
def __init__(self, wav_file: str):
self.wav_file = wav_file
model = Model.from_pretrained(default_voice_detection_model, cache_dir=tts_dir)
self.pipeline = VoiceActivityDetection(segmentation=model)
hyper_params = {
# onset/offset activation thresholds
"onset": 0.5, "offset": 0.5,
# remove speech regions shorter than that many seconds.
"min_duration_on": 0.0,
# fill non-speech regions shorter than that many seconds.
"min_duration_off": 0.0
}
self.pipeline.instantiate(hyper_params)
def detect(self, vad_ratio_thresh: float=0.05):
diarization = self.pipeline(self.wav_file)
speech_segments = [(s.start, s.end) for s in diarization.get_timeline()]
total_duration = librosa.get_duration(path=self.wav_file)
speech_time = sum(end - start for start, end in speech_segments)
non_speech_ratio = 1 - (speech_time / total_duration)
status = non_speech_ratio > vad_ratio_thresh
report = {
'non_speech_ratio': non_speech_ratio,
'background_detected': status
}
return status, report