Upload folder using huggingface_hub

4698bfc verified 22 days ago

7.17 kB

	"""
	Dataset for CosyVoice-style TTS flow matching.

	Each training sample comes from a HuggingFace dataset (saved to disk) with:
	file_path (str) – path to audio file
	text (str) – transcription (unused here)
	codes (list) – SEMACS VQ codes at 12.5 Hz

	For each sample we produce:
	mel (T_mel, n_mels) log-mel spectrogram at 24 kHz (target for flow matching)
	codes (num_q, T_codes) VQ conditioning codes
	wav_16k (T_wav,) raw 16 kHz waveform for on-the-fly speaker extraction

	Speaker embeddings are extracted in the training loop on GPU (batched via CAM++ fbank →
	campplus forward), so no precomputation step is needed.

	Mel spectrogram parameters default to the Vocos 24 kHz configuration:
	sample_rate = 24000
	n_mels = 100
	n_fft = 1024
	hop_length = 256 → frame rate ≈ 93.75 Hz
	f_max = 12000
	"""

	from typing import Dict, List, Union

	import numpy as np
	import torch
	import torchaudio
	from datasets import concatenate_datasets, load_from_disk
	from torch.utils.data import Dataset


	class TTSDataset(Dataset):
	"""
	VQ codes + mel spectrogram dataset. Speaker embeddings are NOT extracted
	here — the raw 16 kHz waveform is returned so the training loop can run
	batched CAM++ inference on GPU.

	Args:
	dataset_path HuggingFace dataset directory, or list of directories
	sample_rate Target sample rate for mel (24000)
	n_mels Mel bins (100 for Vocos 24 kHz)
	n_fft FFT size
	hop_length Hop size in samples
	f_min/f_max Mel filterbank range
	max_duration Clip audio longer than this (seconds)
	max_wav_16k Max 16 kHz samples kept for speaker extraction (10 s default)
	"""

	def __init__(
	self,
	dataset_path: Union[str, List[str]],
	sample_rate: int = 24000,
	n_mels: int = 100,
	n_fft: int = 1024,
	hop_length: int = 256,
	f_min: float = 0.0,
	f_max: float = 12000.0,
	max_duration: float = 30.0,
	max_wav_16k: int = 160_000, # 10 s at 16 kHz
	):
	if isinstance(dataset_path, (list, tuple)):
	datasets = [load_from_disk(p) for p in dataset_path]
	self.dataset = concatenate_datasets(datasets)
	print(f"[TTSDataset] Loaded {len(dataset_path)} datasets, "
	f"{len(self.dataset):,} total samples")
	else:
	self.dataset = load_from_disk(dataset_path)
	print(f"[TTSDataset] Loaded {len(self.dataset):,} samples from {dataset_path}")

	self.sample_rate = sample_rate
	self.hop_length = hop_length
	self.max_frames = int(max_duration * sample_rate / hop_length)
	self.max_wav_16k = max_wav_16k

	# Must match Vocos `charactr/vocos-mel-24khz` MelSpectrogramFeatures:
	# power=1 (amplitude, not power), default norm=None, default mel_scale="htk"
	self.mel_transform = torchaudio.transforms.MelSpectrogram(
	sample_rate=sample_rate,
	n_fft=n_fft,
	hop_length=hop_length,
	n_mels=n_mels,
	f_min=f_min,
	f_max=f_max,
	power=1,
	)

	def __len__(self) -> int:
	return len(self.dataset)

	def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
	# Iterative retry: on load failure, walk forward up to N times before giving up.
	N = len(self)
	for attempt in range(32):
	i = (idx + attempt) % N
	item = self.dataset[i]
	file_path = item["file_path"]
	try:
	wav, sr = torchaudio.load(file_path)
	break
	except Exception as e:
	if attempt == 0:
	print(f"[TTSDataset] load failed at idx={i}: {file_path} ({type(e).__name__}: {e}); skipping")
	continue
	else:
	raise RuntimeError(f"TTSDataset: 32 consecutive load failures starting at idx={idx}")

	wav_mono = wav.mean(0) # (T,)

	# ── 16 kHz waveform for speaker extraction (truncated to 10 s) ───────
	wav_16k = torchaudio.functional.resample(wav_mono, sr, 16000) \
	if sr != 16000 else wav_mono
	wav_16k = wav_16k[: self.max_wav_16k]

	# ── Mel spectrogram at target sample rate ────────────────────────────
	wav_target = torchaudio.functional.resample(wav_mono, sr, self.sample_rate) \
	if sr != self.sample_rate else wav_mono
	mel = self.mel_transform(wav_target) # (n_mels, T_mel)
	mel = torch.log(mel.clamp(min=1e-7)) # Vocos safe_log clip_val
	mel = mel[:, : self.max_frames].T.contiguous() # (T_mel, n_mels)

	# ── VQ codes ─────────────────────────────────────────────────────────
	codes = np.array(item["codes"], dtype=np.int64)
	if codes.ndim == 1:
	codes = codes[np.newaxis, :] # (1, T_codes)

	return {
	"mel": mel, # (T_mel, n_mels)
	"codes": torch.from_numpy(codes), # (num_q, T_codes)
	"wav_16k": wav_16k, # (T_wav,) for speaker encoder
	}


	# ── Collate ───────────────────────────────────────────────────────────────────

	def collate_fn(batch: List[Dict]) -> Dict[str, torch.Tensor]:
	"""Pad variable-length sequences within a batch."""
	mels = [item["mel"] for item in batch]
	codes = [item["codes"] for item in batch]
	wavs = [item["wav_16k"] for item in batch]

	# Mel (T_mel, n_mels)
	mel_lens = torch.tensor([m.shape[0] for m in mels])
	max_mel = int(mel_lens.max())
	n_mels = mels[0].shape[1]
	padded_mels = torch.zeros(len(batch), max_mel, n_mels)
	for i, m in enumerate(mels):
	padded_mels[i, : m.shape[0]] = m

	# Codes (num_q, T_codes)
	num_q = codes[0].shape[0]
	code_lens = torch.tensor([c.shape[1] for c in codes])
	max_codes = int(code_lens.max())
	padded_codes = torch.zeros(len(batch), num_q, max_codes, dtype=torch.long)
	for i, c in enumerate(codes):
	padded_codes[i, :, : c.shape[1]] = c

	# wav_16k (T_wav,) — padded for batched fbank extraction
	wav_lens = torch.tensor([w.shape[0] for w in wavs])
	max_wav = int(wav_lens.max())
	padded_wavs = torch.zeros(len(batch), max_wav)
	for i, w in enumerate(wavs):
	padded_wavs[i, : w.shape[0]] = w

	return {
	"mel": padded_mels, # (B, T_mel, n_mels)
	"mel_lens": mel_lens, # (B,)
	"codes": padded_codes, # (B, num_q, T_codes)
	"code_lens": code_lens, # (B,)
	"wav_16k": padded_wavs, # (B, T_wav)
	"wav_lens": wav_lens, # (B,) valid samples in wav_16k
	}