File size: 3,520 Bytes
1d8403e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# ==================================================================================================
# DEEPFAKE AUDIO - vocoder/inference.py (Neural Waveform Synthesizer)
# ==================================================================================================
# 
# 📝 DESCRIPTION
# This module provides the high-level API for vocoder inference. It encapsulates 
# the WaveRNN model, handles hardware acceleration (CUDA), and provides the 
# entry point for transforming Mel-Spectrograms into audible speech waveforms.
#
# 👤 AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# 🤝🏻 CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# 🔗 PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# 📜 LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================

from vocoder.models.fatchord_version import WaveRNN
from vocoder import hparams as hp
import torch

# Module-level state shared by the functions below; both names are assigned by load_model().
_model = None   # Global singleton for the active WaveRNN instance
_device = None  # torch.device chosen by load_model(); defined here so the name always exists

def load_model(weights_fpath, verbose=True):
    """
    Neural Wake-up: Initializes the WaveRNN architecture and loads pre-trained weights.

    The network is constructed and populated in local variables and is only published
    to the module-level ``_model`` / ``_device`` globals after the weights have been
    loaded and the model switched to eval mode. If any step fails, the exception
    propagates and the previously published model (or ``None``) is left untouched, so
    ``_model`` never refers to a network with uninitialised weights.

    :param weights_fpath: Checkpoint location, passed straight to ``torch.load``. The
        loaded object is indexed with ``'model_state'`` to obtain the state dict.
    :param verbose: When true, print progress messages for the build and load steps
    :return: None; the result is stored in the module-level ``_model`` and ``_device``
    """
    global _model, _device

    if verbose:
        print("Building Wave-RNN Architecture...")
    model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode
    )

    # Hardware Optimization: Prefer CUDA for high-performance generation
    if torch.cuda.is_available():
        model = model.cuda()
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    if verbose:
        print("Loading model weights from: %s" % weights_fpath)
    # SECURITY: weights_only=False allows torch.load to unpickle arbitrary objects,
    # which can execute code embedded in the file. Only load checkpoints from a
    # trusted source.
    checkpoint = torch.load(weights_fpath, map_location=device, weights_only=False)
    model.load_state_dict(checkpoint['model_state'])
    model.eval()

    # Publish only now that the model is fully initialised.
    _model = model
    _device = device

def is_loaded():
    """Readiness Probe: Returns True when the module-level vocoder instance is set, else False."""
    if _model is None:
        return False
    return True

def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800, progress_callback=None):
    """
    Waveform Synthesis Phase:
    Feeds a Mel-Spectrogram through the loaded WaveRNN and returns what it generates.

    :param mel: Mel-Spectrogram input (numpy array); a leading axis is added before
        it is converted to a tensor
    :param normalize: When true, divide the spectrogram by ``hp.mel_max_abs_value`` first
    :param batched: Forwarded to the model's ``generate``; described by the authors as
        using parallel generation for speed
    :param target: Forwarded to ``generate``; described as the chunk size for batched synthesis
    :param overlap: Forwarded to ``generate``; described as the samples used for smooth
        blending between chunks
    :param progress_callback: Forwarded unchanged to ``generate``; its contract is defined there
    :return: The value returned by ``generate`` (described by the authors as the
        synthesized waveform, a numpy array)
    :raises Exception: If no model has been loaded yet
    """
    # Guard clause: nothing can be synthesized without an initialized vocoder.
    if _model is None:
        raise Exception("Operational Error: Wave-RNN must be loaded before inference.")

    scaled_mel = mel / hp.mel_max_abs_value if normalize else mel

    # Index with None to prepend an axis, then wrap the array as a tensor.
    mel_tensor = torch.from_numpy(scaled_mel[None, ...])
    return _model.generate(mel_tensor, batched, target, overlap, hp.mu_law, progress_callback)