# ==================================================================================================
# DEEPFAKE AUDIO - vocoder/audio.py (Signal Processing Engine)
# ==================================================================================================
#
# DESCRIPTION
# This module provides low-level signal processing utilities for the vocoder.
# It handles waveform normalization, Mel-Spectrogram conversion, Mu-Law
# encoding/decoding, and pre-emphasis filtering, ensuring audio data is
# properly conditioned for neural generation.
#
# AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
import math
import numpy as np
import librosa
import librosa.filters
import vocoder.hparams as hp
from scipy.signal import lfilter
import soundfile as sf
def label_2_float(x, bits):
    """Linguistic Mapping: Converts discrete labels back to floating point amplitudes.

    Maps labels in [0, 2**bits - 1] linearly onto [-1.0, 1.0].
    """
    span = 2 ** bits - 1.
    return (2 * x) / span - 1.
def float_2_label(x, bits):
    """Categorical Ingestion: Maps floating point samples to discrete bit-depth labels.

    Args:
        x: waveform samples, array-like, each in [-1.0, 1.0].
        bits: target bit depth; output labels span [0, 2**bits - 1].

    Returns:
        Float-valued labels clipped to [0, 2**bits - 1].

    Raises:
        ValueError: if any sample lies outside [-1.0, 1.0].
    """
    x = np.asarray(x)
    # Explicit validation instead of `assert`: asserts are stripped when
    # Python runs with -O, silently admitting out-of-range samples.
    # np.max(np.abs(...)) also accepts scalars/lists, unlike abs(x).max().
    if np.max(np.abs(x)) > 1.0:
        raise ValueError("Input samples must lie within [-1.0, 1.0]")
    levels = 2 ** bits - 1
    x = (x + 1.) * levels / 2
    return x.clip(0, levels)
def load_wav(path):
    """IO Gateway: Loads an audio file at the canonical vocoder sampling rate."""
    # librosa.load returns (waveform, sample_rate); only the waveform is needed.
    wav, _ = librosa.load(str(path), sr=hp.sample_rate)
    return wav
def save_wav(x, path):
    """IO Gateway: Persists a waveform array to the filesystem."""
    samples = x.astype(np.float32)
    sf.write(path, samples, hp.sample_rate)
def split_signal(x):
    """Binary Decomposition: Splits a 16-bit signal into coarse and fine 8-bit components.

    Shifts the signed range [-2**15, 2**15 - 1] up to [0, 2**16 - 1],
    then returns the high byte (coarse) and low byte (fine).
    """
    shifted = x + 2 ** 15
    return shifted // 256, shifted % 256
def combine_signal(coarse, fine):
    """Binary Restoration: Reconstructs a 16-bit signal from coarse and fine components.

    Inverse of split_signal: reassembles high/low bytes and shifts the
    result back into the signed 16-bit range.
    """
    unsigned = coarse * 256 + fine
    return unsigned - 2 ** 15
def encode_16bits(x):
    """Bit-depth Scaling: Forces signal into the signed 16-bit integer range.

    Expects float samples nominally in [-1.0, 1.0]; out-of-range values
    are clipped rather than wrapped.
    """
    scaled = x * 2 ** 15
    clipped = np.clip(scaled, -2 ** 15, 2 ** 15 - 1)
    return clipped.astype(np.int16)
mel_basis = None


def linear_to_mel(spectrogram):
    """Neural Translation: Maps a linear spectrogram to the psychoacoustic Mel scale."""
    global mel_basis
    # Build the filterbank lazily on first call and cache it at module level.
    if mel_basis is None:
        mel_basis = build_mel_basis()
    return np.dot(mel_basis, spectrogram)
def build_mel_basis():
    """Linguistic Filter: Constructs the Mel-filterbank matrix.

    Returns:
        A (num_mels, 1 + n_fft // 2) matrix projecting linear STFT
        magnitudes onto the Mel scale.
    """
    # librosa >= 0.10 removed positional arguments from filters.mel;
    # keyword arguments are accepted by both old and new versions.
    return librosa.filters.mel(
        sr=hp.sample_rate,
        n_fft=hp.n_fft,
        n_mels=hp.num_mels,
        fmin=hp.fmin,
    )
def normalize(S):
    """Dynamic Range Compression: Scales decibel spectrograms to the [0, 1] interval.

    Values at min_level_db map to 0, values at 0 dB map to 1; everything
    outside is clipped.
    """
    scaled = (S - hp.min_level_db) / -hp.min_level_db
    return np.clip(scaled, 0, 1)
def denormalize(S):
    """Dynamic Range Expansion: Reverses normalization for waveform reconstruction.

    Inverse of normalize: maps [0, 1] back onto [min_level_db, 0] dB.
    """
    clamped = np.clip(S, 0, 1)
    return clamped * -hp.min_level_db + hp.min_level_db
def amp_to_db(x):
    """Logarithmic Scaling: Converts linear amplitudes to decibels.

    Amplitudes are floored at 1e-5 (-100 dB) to avoid taking log of zero.
    """
    floored = np.maximum(1e-5, x)
    return 20 * np.log10(floored)
def db_to_amp(x):
    """Linear Scaling: Converts decibels back to linear amplitudes.

    Inverse of amp_to_db: amplitude = 10 ** (db / 20).
    """
    exponent = x * 0.05
    return np.power(10.0, exponent)
def spectrogram(y):
    """Signal Extraction: Computes a normalized linear spectrogram via STFT."""
    magnitudes = np.abs(stft(y))
    # Shift by the reference level before squashing into [0, 1].
    db = amp_to_db(magnitudes) - hp.ref_level_db
    return normalize(db)
def melspectrogram(y):
    """Signal Extraction: Computes a normalized Mel-Spectrogram from a waveform."""
    magnitudes = np.abs(stft(y))
    mel = linear_to_mel(magnitudes)
    return normalize(amp_to_db(mel))
def stft(y):
    """Wavelet Analysis: Performs Short-Time Fourier Transform.

    FFT size, hop, and window length all come from the shared hparams module.
    """
    return librosa.stft(
        y=y,
        n_fft=hp.n_fft,
        hop_length=hp.hop_length,
        win_length=hp.win_length,
    )
def pre_emphasis(x):
    """Spectral Shaping: Enhances high-frequency signals before processing.

    FIR filter y[n] = x[n] - preemphasis * x[n-1].
    """
    numerator = [1, -hp.preemphasis]
    denominator = [1]
    return lfilter(numerator, denominator, x)
def de_emphasis(x):
    """Spectral Shaping: Reverses pre-emphasis during post-processing.

    IIR filter y[n] = x[n] + preemphasis * y[n-1], the inverse of pre_emphasis.
    """
    numerator = [1]
    denominator = [1, -hp.preemphasis]
    return lfilter(numerator, denominator, x)
def encode_mu_law(x, mu):
    """Non-linear Quantization: Applies Mu-Law companding logic.

    Compresses amplitudes in [-1, 1] logarithmically, then quantizes the
    result onto integer-valued labels in [0, mu - 1].
    """
    mu -= 1
    compressed = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
    return np.floor((compressed + 1) / 2 * mu + 0.5)
def decode_mu_law(y, mu, from_labels=True):
    """Non-linear Expansion: Reverses Mu-Law companding to retrieve amplitudes.

    When from_labels is True, y holds discrete labels in [0, mu - 1] and
    is first mapped back onto floats in [-1, 1]; otherwise y is taken as
    already-compressed amplitudes.
    """
    if from_labels:
        y = label_2_float(y, math.log2(mu))
    mu -= 1
    return np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1)