File size: 8,495 Bytes
1d8403e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# ==================================================================================================
# DEEPFAKE AUDIO - encoder/preprocess.py (Acoustic Feature Extraction Engine)
# ==================================================================================================
# 
# πŸ“ DESCRIPTION
# This module implements the internal orchestration for dataset preprocessing. 
# It provides highly-parallelized functions to traverse raw speech corpora 
# (LibriSpeech, VoxCeleb), extract speaker-specific metadata, and materialize 
# normalized Mel-Spectrograms onto the disk for high-throughput training.
#
# πŸ‘€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# 🀝🏻 CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# πŸ”— PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# πŸ“œ LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================

from datetime import datetime
from functools import partial
from multiprocessing import Pool
from pathlib import Path
from typing import Optional, Tuple

import numpy as np
from tqdm import tqdm

# --- INTERNAL SIGNAL UTILITIES ---
from encoder import audio
from encoder.config import librispeech_datasets, anglophone_nationalites
from encoder.params_data import *

# File extensions searched (recursively) when collecting a speaker's utterances.
_AUDIO_EXTENSIONS = ("wav", "flac", "m4a", "mp3")

class DatasetLog:
    """
    Experimental Ledger: Records the metadata and parameter state of a
    preprocessing run for reproducibility.

    The log file stays open for the lifetime of the instance; call
    :meth:`finalize` to write the summary statistics and close it.
    """
    def __init__(self, root, name):
        # Slashes in the dataset name would create subdirectories in the log
        # filename, so they are flattened to underscores.
        self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
        # Maps a statistic name (e.g. "duration") to the list of observed values.
        self.sample_data = dict()

        start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
        self.write_line("🚀 Initiating Preprocessing Cycle for: %s on %s" % (name, start_time))
        self.write_line("-----")
        self._log_params()

    def _log_params(self):
        """Archives the effective hyperparameters in the log."""
        # Imported here (not at module level) to snapshot the params at log time.
        from encoder import params_data
        self.write_line("Acoustic Hyperparameters:")
        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
            value = getattr(params_data, param_name)
            self.write_line("\t%s: %s" % (param_name, value))
        self.write_line("-----")

    def write_line(self, line):
        """Writes a single line to the log file, appending a newline."""
        self.text_file.write("%s\n" % line)

    def add_sample(self, **kwargs):
        """Accumulates statistical distributions of processed samples."""
        for param_name, value in kwargs.items():
            # setdefault replaces the former `if not k in d` membership test:
            # idiomatic, and a single dict lookup instead of two.
            self.sample_data.setdefault(param_name, []).append(value)

    def finalize(self):
        """Computes and writes dataset-wide statistics before closing."""
        self.write_line("Neural Statistical Profile:")
        for param_name, values in self.sample_data.items():
            self.write_line("\t%s:" % param_name)
            self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
            self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
        self.write_line("-----")
        end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
        self.write_line("🤝🏻 Cycle Completed on %s" % end_time)
        self.text_file.close()

def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
    """Integrity check and log initialization for a specific corpus."""
    dataset_root = datasets_root.joinpath(dataset_name)
    if not dataset_root.exists():
        print("⚠️ Scholarly Alert: Dataset %s not found. Skipping." % dataset_root)
        return None, None
    return dataset_root, DatasetLog(out_dir, dataset_name)

def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, skip_existing: bool):
    """
    Utterance Orchestration: Processes all vocal samples for a single speaker.
    Extracts Mel-Spectrograms and archives source mappings.

    :param speaker_dir: directory holding one speaker's raw audio files
    :param datasets_root: root of all datasets (used to derive the speaker name)
    :param out_dir: output root under which <speaker_name>/ is created
    :param skip_existing: when True, utterances already listed in _sources.txt
        are not re-processed, and the index file is appended to instead of rewritten
    :return: list of processed utterance durations, in seconds
    """
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # Resilience: Load existing sources index. A set (not a dict) — only
    # membership tests are performed on it.
    existing_fnames = set()
    if sources_fpath.exists():
        try:
            with sources_fpath.open("r") as sources_file:
                # Each line is "<out_fname>,<in_fpath>"; only the fname is needed.
                existing_fnames = {line.split(",")[0] for line in sources_file}
        except OSError:
            # Best-effort: an unreadable index simply means nothing can be skipped.
            existing_fnames = set()

    audio_durs = []
    # "a" preserves the index when resuming; "w" rebuilds it from scratch.
    # The context manager guarantees the file is closed even if processing raises.
    with sources_fpath.open("a" if skip_existing else "w") as sources_file:
        for extension in _AUDIO_EXTENSIONS:
            for in_fpath in speaker_dir.glob("**/*.%s" % extension):
                out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
                out_fname = out_fname.replace(".%s" % extension, ".npy")

                if skip_existing and out_fname in existing_fnames:
                    continue

                # Signal Normalization
                wav = audio.preprocess_wav(in_fpath)
                if len(wav) == 0:
                    continue

                # Spectral Analysis: discard clips too short to yield a partial.
                frames = audio.wav_to_mel_spectrogram(wav)
                if len(frames) < partials_n_frames:
                    continue

                # Materialization
                out_fpath = speaker_out_dir.joinpath(out_fname)
                np.save(out_fpath, frames)
                sources_file.write("%s,%s\n" % (out_fname, in_fpath))
                audio_durs.append(len(wav) / sampling_rate)

    return audio_durs

def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger):
    """Degree of parallelism management for speaker-level processing."""
    print("🤝🏻 Processing %d speakers from: %s" % (len(speaker_dirs), dataset_name))

    # Bind the invariant arguments once; each worker call varies only in speaker_dir.
    process_one = partial(_preprocess_speaker,
                          datasets_root=datasets_root,
                          out_dir=out_dir,
                          skip_existing=skip_existing)
    with Pool(4) as pool:
        progress = tqdm(pool.imap(process_one, speaker_dirs),
                        dataset_name, len(speaker_dirs), unit="speakers")
        for durations in progress:
            for duration in durations:
                logger.add_sample(duration=duration)

    logger.finalize()

def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
    """Pipeline for LibriSpeech: A massive corpus of read English speech."""
    for dataset_name in librispeech_datasets["train"]["other"]:
        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
        if dataset_root is None:
            continue

        # Each immediate child of the corpus root is one speaker's folder.
        _preprocess_speaker_dirs(list(dataset_root.glob("*")), dataset_name,
                                 datasets_root, out_dir, skip_existing, logger)

def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
    """Pipeline for VoxCeleb1: Celebrity voices with multi-national diversity."""
    dataset_name = "VoxCeleb1"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if not dataset_root: return

    # Meta-Guided Filtering for Anglophone Speakers: read the tab-separated
    # metadata file, dropping its header row.
    with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
        metadata = [line.split("\t") for line in metafile][1:]

    # Column 0 is the speaker id, column 3 its nationality (dict keeps the
    # last entry per id, matching the original row-overwrite semantics).
    nationalities = {line[0]: line[3] for line in metadata}
    # A set gives O(1) membership below; the former list made the directory
    # filter accidentally quadratic in the number of speakers.
    keep_speaker_ids = {s_id for s_id, nat in nationalities.items()
                        if nat.lower() in anglophone_nationalites}

    speaker_dirs = [d for d in dataset_root.joinpath("wav").glob("*")
                    if d.name in keep_speaker_ids]
    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
                             skip_existing, logger)

def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
    """Pipeline for VoxCeleb2: Broadscale celebrity speech data."""
    dataset_name = "VoxCeleb2"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if dataset_root is None:
        return

    # VoxCeleb2 nests each speaker's audio under dev/aac/<speaker_id>/.
    speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
                             skip_existing, logger)