File size: 5,618 Bytes
1d8403e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# ==================================================================================================
# DEEPFAKE AUDIO - encoder/audio.py (Acoustic Signal Processing)
# ==================================================================================================
# 
# πŸ“ DESCRIPTION
# This module implements the acoustic primitives required for the Speaker Encoder. 
# It handles waveform normalization, resampling, and most importantly, the 
# transformation of raw time-domain signals into frequency-domain Mel-Spectrograms. 
# It also integrates Voice Activity Detection (VAD) via 'webrtcvad' to ensure that 
# only active speech segments are passed to the neural distillation layers.
#
# πŸ‘€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# 🀝🏻 CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# πŸ”— PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# πŸ“œ LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================

from scipy.ndimage import binary_dilation
from encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
from warnings import warn
import numpy as np
import librosa
import struct

# --- VAD INITIALIZATION ---
# 'webrtcvad' is an optional native extension. When it is missing we degrade
# gracefully: the module still loads and trim_long_silences() is bypassed by
# preprocess_wav(). Catch only ImportError — the original bare 'except:' would
# also swallow KeyboardInterrupt/SystemExit and hide unrelated failures.
try:
    import webrtcvad
except ImportError:
    warn("⚠️ Scholarly Warning: 'webrtcvad' not detected. Noise removal and silence trimming will be bypassed.")
    webrtcvad = None

# Largest positive value of a signed 16-bit PCM sample (2**15 - 1 = 32767),
# used to scale float waveforms in [-1, 1] to int16 for the VAD.
int16_max = (2 ** 15) - 1

def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None,
                   normalize: bool = True,
                   trim_silence: bool = True):
    """
    Orchestrates the acoustic normalization pipeline.

    1. Loads the signal from disk (str/Path) or accepts an in-memory buffer.
    2. Resamples to the training sampling rate when the source rate differs.
    3. Normalizes volume to the target dBFS (increase-only).
    4. Trims non-speech intervals (only when 'webrtcvad' is available).

    :param fpath_or_wav: path to an audio file, or a waveform as a numpy array
    :param source_sr: sampling rate of the waveform when passed as an array;
        ignored (overwritten by the file's own rate) when loading from disk
    :param normalize: apply volume normalization toward audio_norm_target_dBFS
    :param trim_silence: remove long silences via VAD (no-op if webrtcvad is None)
    :return: the preprocessed waveform as a float numpy array at sampling_rate
    """
    # Defensive input handling: tuple-form isinstance covers both path types.
    if isinstance(fpath_or_wav, (str, Path)):
        # sr=None preserves the file's native rate; we resample ourselves below.
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
    else:
        wav = fpath_or_wav

    # Frequency alignment: resample only when the rates actually differ.
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(y=wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Amplitude normalization (never attenuates: increase_only=True).
    if normalize:
        wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)

    # Temporal compression (silence removal) — skipped when VAD is unavailable.
    if webrtcvad and trim_silence:
        wav = trim_long_silences(wav)

    return wav

def wav_to_mel_spectrogram(wav):
    """
    Distills a time-domain waveform into a frequency-domain Mel-Spectrogram.

    Frame geometry is derived from the millisecond-valued window parameters
    (mel_window_length / mel_window_step) at the configured sampling rate.
    The result is the primary input for the Speaker Encoder network.

    :param wav: waveform as a float numpy array at sampling_rate
    :return: float32 array of shape (num_frames, mel_n_channels)
    """
    # Convert the millisecond window parameters into sample counts.
    fft_size = int(sampling_rate * mel_window_length / 1000)
    hop_size = int(sampling_rate * mel_window_step / 1000)
    mel = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=fft_size,
        hop_length=hop_size,
        n_mels=mel_n_channels,
    )
    # Transpose so that time is the leading axis (frames, channels).
    return mel.astype(np.float32).T

def trim_long_silences(wav):
    """
    Excises long non-speech intervals using WebRTC Voice Activity Detection.

    The waveform is split into fixed-size windows, each window is classified
    as speech/non-speech by the VAD on a 16-bit PCM rendering, the per-window
    flags are smoothed with a moving average, and the resulting boolean mask
    is dilated so speech boundaries are preserved before it is applied.

    :param wav: waveform as a float numpy array at sampling_rate
    :return: the waveform with long silences removed (length <= input length)
    """
    # Window size in samples, derived from the millisecond VAD window length.
    window_size = (vad_window_length * sampling_rate) // 1000
    # Drop the trailing partial window so the signal divides evenly.
    wav = wav[:len(wav) - (len(wav) % window_size)]

    # Render as 16-bit PCM bytes — the only format webrtcvad accepts.
    pcm = struct.pack(f"{len(wav)}h", *np.round(wav * int16_max).astype(np.int16))

    # Classify each window (mode=3 is the most aggressive filtering level).
    detector = webrtcvad.Vad(mode=3)
    flags = np.array([
        detector.is_speech(pcm[2 * start: 2 * (start + window_size)],
                           sample_rate=sampling_rate)
        for start in range(0, len(wav), window_size)
    ])

    # Moving-average smoothing over the binary flags.
    def _smooth(values, width):
        padded = np.concatenate((np.zeros((width - 1) // 2),
                                 values,
                                 np.zeros(width // 2)))
        running = np.cumsum(padded, dtype=float)
        running[width:] = running[width:] - running[:-width]
        return running[width - 1:] / width

    mask = np.round(_smooth(flags, vad_moving_average_width)).astype(bool)

    # Morphological dilation keeps short pauses adjacent to speech.
    mask = binary_dilation(mask, np.ones(vad_max_silence_length + 1))
    # Expand per-window flags back to per-sample resolution.
    mask = np.repeat(mask, window_size)

    return wav[mask]

def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    """
    Calibrates the signal's energy level to a target Decibel Full Scale (dBFS).

    :param wav: waveform as a float numpy array
    :param target_dBFS: desired RMS level in dBFS
    :param increase_only: never attenuate — return wav unchanged if it is
        already louder than the target
    :param decrease_only: never amplify — return wav unchanged if it is
        already quieter than the target
    :raises ValueError: if both direction flags are set simultaneously
    :return: the rescaled waveform (or the original when the flag blocks it)
    """
    if increase_only and decrease_only:
        raise ValueError("Conflict: Both increase and decrease flags are active.")

    # Current RMS level in dBFS, and the gain (in dB) needed to hit the target.
    current_dBFS = 10 * np.log10(np.mean(wav ** 2))
    delta_dB = target_dBFS - current_dBFS

    # Respect the directional constraints: skip a change in the wrong direction.
    blocked = (increase_only and delta_dB < 0) or (decrease_only and delta_dB > 0)
    if blocked:
        return wav

    # Convert the dB gain to a linear amplitude factor.
    return wav * (10 ** (delta_dB / 20))