# cfm_svc/prepare/preprocess_pack.py
# Author: Hector Li
# Initial commit for Hugging Face (commit df93d13)
"""
Pack per-sample feature files into single .pt bundles for NFS-efficient training.
For each clip, combines:
<wav_dir>/<spk>/<id>.wav (audio, resampled + mel computed here)
<ppg_dir>/<spk>/<id>.ppg.npy
<hubert_dir>/<spk>/<id>.vec.npy
<f0_dir>/<spk>/<id>.pit.npy
<spk_dir>/<spk>/<id>.spk.npy
Into a single file:
<out_dir>/<spk>/<id>.pt (1 NFS read per sample at training time)
Reduces training I/O from 5 NFS reads/sample to 1.
Mel is precomputed here so the DataLoader skips STFT entirely.
Usage:
python prepare/preprocess_pack.py \
-w data_svc/waves-32k \
-o data_svc/packed \
-t 8
"""
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import argparse
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T
from tqdm import tqdm
# Mel parameters matching F5-TTS / Vocos
_SAMPLE_RATE = 24_000  # target sample rate (Hz); audio is resampled to this before the STFT
_HOP_LENGTH = 256      # STFT hop in samples -> one mel frame per 256 samples
_N_FFT = 1_024         # FFT size
_WIN_LENGTH = 1_024    # analysis window length (equal to n_fft here)
_N_MELS = 100          # number of mel filterbank channels
_F_MIN = 0             # lowest mel filter frequency (Hz)
_F_MAX = 8_000         # highest mel filter frequency (Hz)
_tls = threading.local()  # per-thread storage: each worker thread owns one transform


def _get_mel_tf():
    """Return this thread's cached ``MelSpectrogram`` transform, building it on first use.

    The previous implementation cached a single module-level instance while
    claiming to be "per-thread"; with a ThreadPoolExecutor that lazy global is
    both racy on first use and shared across workers. Storing the transform in
    ``threading.local`` gives every worker its own instance with no locking.
    """
    mel_tf = getattr(_tls, "mel_tf", None)
    if mel_tf is None:
        mel_tf = T.MelSpectrogram(
            sample_rate=_SAMPLE_RATE,
            n_fft=_N_FFT,
            hop_length=_HOP_LENGTH,
            n_mels=_N_MELS,
            win_length=_WIN_LENGTH,
            f_min=_F_MIN,
            f_max=_F_MAX,
            power=1.0,  # magnitude (not power) spectrogram, as F5-TTS/Vocos expect
            norm="slaney",
            mel_scale="slaney",
        )
        _tls.mel_tf = mel_tf
    return mel_tf
def pack_file(file_id, spk, wav_dir, ppg_dir, hubert_dir, f0_dir, spk_dir, out_dir):
    """Pack one clip's audio and features into a single ``<out_dir>/<spk>/<id>.pt``.

    Returns:
        (True, file_id)        on a fresh pack,
        (True, "skip:<id>")    when the output already exists (marker is distinct
                               from ``file_id`` so the caller's skip counter works),
        (False, reason)        on missing inputs or any packing error.
    """
    wav_path = os.path.join(wav_dir, spk, f"{file_id}.wav")
    ppg_path = os.path.join(ppg_dir, spk, f"{file_id}.ppg.npy")
    hubert_path = os.path.join(hubert_dir, spk, f"{file_id}.vec.npy")
    f0_path = os.path.join(f0_dir, spk, f"{file_id}.pit.npy")
    spk_path = os.path.join(spk_dir, spk, f"{file_id}.spk.npy")
    out_path = os.path.join(out_dir, spk, f"{file_id}.pt")
    # Skip if already packed. BUG FIX: previously this returned file_id, which
    # the caller compared against file_id to detect skips -- so skips were
    # always counted as fresh packs. Return a distinct marker instead.
    if os.path.isfile(out_path):
        return True, f"skip:{file_id}"
    missing = [p for p in [wav_path, ppg_path, hubert_path, f0_path, spk_path]
               if not os.path.isfile(p)]
    if missing:
        return False, f"{file_id}: missing {missing}"
    try:
        # --- Audio → log-mel at 24 kHz ---
        wav, sr = torchaudio.load(wav_path)
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)  # downmix multi-channel to mono
        if sr != _SAMPLE_RATE:
            wav = torchaudio.functional.resample(wav, sr, _SAMPLE_RATE)
        mel = _get_mel_tf()(wav).squeeze(0).T       # (T_mel, N_MELS)
        mel = torch.log(mel.clamp(min=1e-5))        # log-compress; floor avoids log(0)
        # --- Features (kept as float32 tensors) ---
        ppg = torch.tensor(np.load(ppg_path)).float()        # (T_feat, 1280)
        hubert = torch.tensor(np.load(hubert_path)).float()  # (T_feat, 256)
        f0_raw = torch.tensor(np.load(f0_path)).float()      # (T_feat,)
        f0 = torch.where(f0_raw > 0,
                         torch.log(f0_raw.clamp(min=1.0)),
                         torch.zeros_like(f0_raw))           # log-F0, 0 = unvoiced
        spk_emb = torch.tensor(np.load(spk_path)).float()    # (256,)
        # Write atomically: save to a temp file and rename, so a crash mid-save
        # cannot leave a truncated .pt that the skip-if-exists check would
        # later accept as valid.
        tmp_path = out_path + ".tmp"
        torch.save({"mel": mel, "ppg": ppg, "hubert": hubert,
                    "f0": f0, "spk": spk_emb}, tmp_path)
        os.replace(tmp_path, out_path)
        return True, file_id
    except Exception as e:
        return False, f"{file_id}: {e}"
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Pack SVC features into single .pt bundles (NFS-efficient training)."
)
parser.add_argument("-w", "--wav", dest="wav", required=True,
help="Input wav directory (e.g. data_svc/waves-32k)")
parser.add_argument("-o", "--out", dest="out", required=True,
help="Output packed directory (e.g. data_svc/packed)")
parser.add_argument("--ppg", dest="ppg", default="data_svc/whisper",
help="PPG directory")
parser.add_argument("--hubert", dest="hubert", default="data_svc/hubert",
help="HuBERT directory")
parser.add_argument("--f0", dest="f0", default="data_svc/pitch",
help="F0 directory")
parser.add_argument("--spk", dest="spk", default="data_svc/speaker",
help="Speaker embedding directory")
parser.add_argument("-t", "--thread_count", dest="thread_count", type=int, default=4,
help="Worker threads (0 = all CPU cores)")
args = parser.parse_args()
os.makedirs(args.out, exist_ok=True)
n_workers = os.cpu_count() if args.thread_count == 0 else args.thread_count
for spk in sorted(os.listdir(args.wav)):
spk_wav_dir = os.path.join(args.wav, spk)
if not os.path.isdir(spk_wav_dir):
continue
spk_out_dir = os.path.join(args.out, spk)
os.makedirs(spk_out_dir, exist_ok=True)
files = [f[:-4] for f in os.listdir(spk_wav_dir) if f.endswith(".wav")]
print(f">>>>>>>>>>{spk}<<<<<<<<<< ({len(files)} files)")
ok = fail = skip = 0
with ThreadPoolExecutor(max_workers=n_workers) as executor:
futures = {
executor.submit(
pack_file, fid, spk,
args.wav, args.ppg, args.hubert, args.f0, args.spk, args.out
): fid for fid in files
}
for future in tqdm(as_completed(futures), total=len(futures)):
success, info = future.result()
if success:
if info == futures[future]: # not skipped
ok += 1
else:
skip += 1
else:
fail += 1
tqdm.write(f" WARN: {info}")
print(f" packed={ok} skipped(existed)={skip} failed={fail}")
print("Done. Update training commands to use --packed_dir instead of separate feature dirs.")