# ==================================================================================================
# DEEPFAKE AUDIO - encoder/visualizations.py (Neural Telemetry & Projections)
# ==================================================================================================
#
# DESCRIPTION
# This module implements the visual diagnostic engine for training monitoring.
# It utilizes Visdom for real-time loss/EER plotting and UMAP (Uniform Manifold
# Approximation and Projection) to visualize the high-dimensional speaker
# identity space, allowing researchers to observe the clustering of d-vectors.
#
# AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
from datetime import datetime
from time import perf_counter as timer
import numpy as np
import umap
import visdom
# --- PROJECT CORE MODULES ---
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
# --- AESTHETIC CONFIGURATION (Categorical Color Map) ---
# 13 distinct categorical RGB colors (one per speaker cluster), normalized to [0, 1].
colormap = np.array([
    [76, 255, 0],
    [0, 127, 70],
    [255, 0, 0],
    [255, 217, 38],
    [0, 135, 255],
    [165, 0, 165],
    [255, 167, 255],
    [0, 255, 255],
    [255, 96, 38],
    [142, 76, 0],
    [33, 0, 127],
    [0, 0, 0],
    [183, 183, 183],
], dtype=float) / 255
class Visualizations:
    """
    Experimental Dashboard:
    Provides a real-time window into the model's convergence and identity manifold.

    Streams rolling loss/EER averages to a Visdom server and renders UMAP
    projections of speaker embeddings. When constructed with ``disabled=True``
    the server handshake is skipped and all methods degrade to console-only
    (or no-op) behavior.
    """

    def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
        """
        :param env_name: base name of the Visdom environment; a timestamp is
            appended so repeated runs never collide (None -> timestamp only).
        :param update_every: report (print + plot) rolling averages every N steps.
        :param server: URL of a running Visdom server.
        :param disabled: if True, never contact the server.
        """
        # Rolling metric window, cleared after every report.
        self.last_update_timestamp = timer()
        self.update_every = update_every
        self.step_times, self.losses, self.eers = [], [], []

        # Operational mode: when disabled, skip all Visdom state entirely.
        self.disabled = disabled
        if self.disabled:
            return

        # Temporal versioning of the environment name.
        now = str(datetime.now().strftime("%d-%m %Hh%M"))
        self.env_name = now if env_name is None else "%s (%s)" % (env_name, now)

        # Visdom server handshake; fail fast with an actionable message.
        try:
            self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
        except ConnectionError:
            raise Exception("Technical Blocker: Visdom server not detected. Start it via 'visdom' CLI.")

        # Dashboard window handles, created lazily on first plot.
        self.loss_win = self.eer_win = self.implementation_win = self.projection_win = None
        self.implementation_string = ""

    def log_params(self):
        """Archives model and data hyperparameters in the Visdom environment."""
        if self.disabled:
            return
        from encoder import params_data, params_model

        # Build the HTML snippet with join() instead of quadratic `+=` growth.
        parts = ["<b>Neural Architecture Parameters</b>:<br>"]
        parts += ["\t%s: %s<br>" % (name, getattr(params_model, name))
                  for name in dir(params_model) if not name.startswith("__")]
        parts.append("<br><b>Acoustic Signal Parameters</b>:<br>")
        parts += ["\t%s: %s<br>" % (name, getattr(params_data, name))
                  for name in dir(params_data) if not name.startswith("__")]
        self.vis.text("".join(parts), opts={"title": "Hyperparameters (Static Snapshot)"})

    def log_dataset(self, dataset: SpeakerVerificationDataset):
        """Documents the corpus profile being utilized for training."""
        if self.disabled:
            return
        ds_string = "<b>Corpus Overview</b>: %d Registered Speakers<br>" % len(dataset.speakers)
        ds_string += dataset.get_logs().replace("\n", "<br>")
        self.vis.text(ds_string, opts={"title": "Dataset Manifest"})

    def log_implementation(self, params):
        """Metadata regarding the execution environment (e.g., CUDA device).

        :param params: mapping of label -> value, rendered as an HTML list.
        """
        if self.disabled:
            return
        impl_str = "".join("<b>%s</b>: %s<br>" % (p, v) for p, v in params.items())
        # Keep the raw string so update() can append live latency stats to it.
        self.implementation_string = impl_str
        self.implementation_win = self.vis.text(impl_str, opts={"title": "Training Environment"})

    def update(self, loss, eer, step):
        """Accumulates one step's metrics; every `update_every` steps, prints
        and plots rolling averages, then resets the accumulation window.

        :param loss: scalar training loss for this step.
        :param eer: scalar equal-error-rate for this step.
        :param step: global optimization step counter.
        """
        now = timer()
        self.step_times.append(1000 * (now - self.last_update_timestamp))
        self.last_update_timestamp = now
        self.losses.append(loss)
        self.eers.append(eer)
        # flush=True so the heartbeat dot appears even through buffered stdout.
        print(".", end="", flush=True)

        # Stratified reporting: only emit every `update_every` steps.
        if step % self.update_every != 0:
            return

        t_str = "Step Latency: mean %dms, std %dms" % (
            int(np.mean(self.step_times)), int(np.std(self.step_times)))
        print("\nStep %d | Loss: %.4f | EER: %.4f | %s" %
              (step, np.mean(self.losses), np.mean(self.eers), t_str))

        if not self.disabled:
            # Loss convergence plot (lazily created; appended thereafter).
            self.loss_win = self.vis.line(
                [np.mean(self.losses)], [step], win=self.loss_win,
                update="append" if self.loss_win else None,
                opts=dict(legend=["Avg. GE2E Loss"], xlabel="Step", ylabel="Loss",
                          title="Training Loss"))
            # Error rate plot.
            self.eer_win = self.vis.line(
                [np.mean(self.eers)], [step], win=self.eer_win,
                update="append" if self.eer_win else None,
                opts=dict(legend=["Avg. EER"], xlabel="Step", ylabel="EER",
                          title="Equal Error Rate"))
            # Refresh the environment pane with live latency stats.
            if self.implementation_win:
                self.vis.text(self.implementation_string + ("<b>%s</b>" % t_str),
                              win=self.implementation_win,
                              opts={"title": "Training Environment"})

        # Reset the rolling window regardless of plotting mode.
        self.losses.clear()
        self.eers.clear()
        self.step_times.clear()

    def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, max_speakers=10):
        """
        Manifold Mapping:
        Projects high-dimensional d-vectors onto a 2D plane for visual cluster analysis.

        :param embeds: 2D array of embeddings, grouped contiguously by speaker
            (utterances_per_speaker rows per speaker) — assumed, confirm at caller.
        :param utterances_per_speaker: rows per speaker within `embeds`.
        :param step: global step, used in the plot title.
        :param out_fpath: optional path; when set, the figure is saved to disk.
        :param max_speakers: cap on speakers drawn (also capped by palette size).
        """
        # Imported lazily so headless training without plotting stays cheap.
        import matplotlib.pyplot as plt

        # Never draw more speakers than we have distinct colors for.
        max_speakers = min(max_speakers, len(colormap))
        embeds = embeds[:max_speakers * utterances_per_speaker]

        # One color per speaker, repeated across that speaker's utterances.
        n_speakers = len(embeds) // utterances_per_speaker
        colors = [colormap[i] for i in np.repeat(np.arange(n_speakers), utterances_per_speaker)]

        # UMAP non-linear dimensionality reduction to 2D.
        reducer = umap.UMAP()
        projected = reducer.fit_transform(embeds)
        plt.scatter(projected[:, 0], projected[:, 1], c=colors)
        plt.gca().set_aspect("equal", "datalim")
        plt.title("Identity Manifold (UMAP) - Step %d" % step)
        if not self.disabled:
            self.projection_win = self.vis.matplot(plt, win=self.projection_win)
        if out_fpath:
            plt.savefig(out_fpath)
        plt.clf()

    def save(self):
        """Persists the Visdom environment state to the server."""
        if not self.disabled:
            self.vis.save([self.env_name])