File size: 1,446 Bytes
aa7ea23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import os
import numpy as np
import librosa

from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection
from lib.conf import tts_dir
from lib.models import default_voice_detection_model

class BackgroundDetector:
    """Estimate how much of a WAV file is *not* speech.

    A pyannote ``VoiceActivityDetection`` pipeline is run over the file and
    the fraction of the total duration not covered by detected speech
    segments is compared against a threshold.
    """

    def __init__(self, wav_file: str):
        """Build the voice-activity pipeline for ``wav_file``.

        Args:
            wav_file: Path of the audio file to analyse; it is passed as-is
                to the pipeline and to ``librosa.get_duration``.
        """
        self.wav_file = wav_file
        model = Model.from_pretrained(default_voice_detection_model, cache_dir=tts_dir)
        self.pipeline = VoiceActivityDetection(segmentation=model)
        hyper_params = {
            # onset/offset activation thresholds
            "onset": 0.5,
            "offset": 0.5,
            # remove speech regions shorter than that many seconds.
            "min_duration_on": 0.0,
            # fill non-speech regions shorter than that many seconds.
            "min_duration_off": 0.0,
        }
        self.pipeline.instantiate(hyper_params)

    @staticmethod
    def _non_speech_ratio(speech_segments, total_duration: float) -> float:
        """Return the fraction of ``total_duration`` not covered by speech.

        Args:
            speech_segments: Iterable of ``(start, end)`` pairs in seconds.
            total_duration: Length of the audio in seconds.

        Returns:
            A built-in ``float`` clamped to ``[0.0, 1.0]``. A non-positive
            ``total_duration`` yields ``0.0`` instead of dividing by zero:
            there is no audio in which background could be present.
        """
        if total_duration <= 0:
            return 0.0
        speech_time = sum(end - start for start, end in speech_segments)
        ratio = 1 - (speech_time / total_duration)
        # Detected segments can overrun the measured file length slightly,
        # which would otherwise push the ratio below zero.
        return float(min(1.0, max(0.0, ratio)))

    def detect(self, vad_ratio_thresh: float = 0.05) -> tuple:
        """Run voice-activity detection and report the non-speech share.

        Args:
            vad_ratio_thresh: Background is reported when the non-speech
                ratio is strictly greater than this value.

        Returns:
            ``(status, report)`` where ``status`` is a ``bool`` and
            ``report`` is a dict with keys ``'non_speech_ratio'`` (float)
            and ``'background_detected'`` (the same value as ``status``).
        """
        vad_output = self.pipeline(self.wav_file)
        speech_segments = [(s.start, s.end) for s in vad_output.get_timeline()]
        total_duration = librosa.get_duration(path=self.wav_file)
        non_speech_ratio = self._non_speech_ratio(speech_segments, total_duration)
        # Cast to a plain bool so the report holds only built-in types.
        status = bool(non_speech_ratio > vad_ratio_thresh)
        report = {
            'non_speech_ratio': non_speech_ratio,
            'background_detected': status,
        }
        return status, report