| """ |
| Pack per-sample feature files into single .pt bundles for NFS-efficient training. |
| |
| For each clip, combines: |
| <wav_dir>/<spk>/<id>.wav (audio, resampled + mel computed here) |
| <ppg_dir>/<spk>/<id>.ppg.npy |
| <hubert_dir>/<spk>/<id>.vec.npy |
| <f0_dir>/<spk>/<id>.pit.npy |
| <spk_dir>/<spk>/<id>.spk.npy |
| |
| Into a single file: |
| <out_dir>/<spk>/<id>.pt (1 NFS read per sample at training time) |
| |
| Reduces training I/O from 5 NFS reads/sample to 1. |
| Mel is precomputed here so the DataLoader skips STFT entirely. |
| |
| Usage: |
| python prepare/preprocess_pack.py \ |
| -w data_svc/waves-32k \ |
| -o data_svc/packed \ |
| -t 8 |
| """ |
|
|
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


import argparse
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T
from tqdm import tqdm
|
|
| |
# ---- Mel-spectrogram parameters (fixed for every clip) ----
_SAMPLE_RATE = 24_000   # target rate; input wavs are resampled to this
_HOP_LENGTH = 256
_N_FFT = 1_024
_WIN_LENGTH = 1_024
_N_MELS = 100
_F_MIN = 0
_F_MAX = 8_000


# Lazily-built MelSpectrogram transform shared by all worker threads.
_mel_tf = None
_mel_tf_lock = threading.Lock()


def _get_mel_tf():
    """Return the shared MelSpectrogram transform, building it on first use.

    pack_file() is called from a ThreadPoolExecutor, so the lazy
    initialization uses double-checked locking: without the lock, two
    workers could both observe ``_mel_tf is None`` and each construct a
    transform, wasting work and leaving the module in a racy state.
    """
    global _mel_tf
    if _mel_tf is None:  # fast path: already built, no lock needed
        with _mel_tf_lock:
            if _mel_tf is None:  # re-check under the lock
                _mel_tf = T.MelSpectrogram(
                    sample_rate=_SAMPLE_RATE,
                    n_fft=_N_FFT,
                    hop_length=_HOP_LENGTH,
                    n_mels=_N_MELS,
                    win_length=_WIN_LENGTH,
                    f_min=_F_MIN,
                    f_max=_F_MAX,
                    power=1.0,           # magnitude (not power) spectrogram
                    norm="slaney",
                    mel_scale="slaney",
                )
    return _mel_tf
|
|
|
|
def pack_file(file_id, spk, wav_dir, ppg_dir, hubert_dir, f0_dir, spk_dir, out_dir):
    """Pack one clip's features into a single ``<out_dir>/<spk>/<id>.pt`` bundle.

    Loads the wav (downmixed to mono, resampled to ``_SAMPLE_RATE``),
    computes a log-mel spectrogram, loads the precomputed ppg / hubert /
    f0 / speaker features, and stores everything in one ``torch.save`` file.

    Returns:
        (True, file_id)   on a fresh pack
        (True, None)      when the output already exists (skipped) -- the
                          caller distinguishes skips by ``info != file_id``
        (False, message)  on missing inputs or any processing error
    """
    wav_path = os.path.join(wav_dir, spk, f"{file_id}.wav")
    ppg_path = os.path.join(ppg_dir, spk, f"{file_id}.ppg.npy")
    hubert_path = os.path.join(hubert_dir, spk, f"{file_id}.vec.npy")
    f0_path = os.path.join(f0_dir, spk, f"{file_id}.pit.npy")
    spk_path = os.path.join(spk_dir, spk, f"{file_id}.spk.npy")
    out_path = os.path.join(out_dir, spk, f"{file_id}.pt")

    # Already packed. Return None (not file_id) so the caller's
    # ``info == file_id`` check counts this as a skip, not a fresh pack.
    if os.path.isfile(out_path):
        return True, None

    missing = [p for p in (wav_path, ppg_path, hubert_path, f0_path, spk_path)
               if not os.path.isfile(p)]
    if missing:
        return False, f"{file_id}: missing {missing}"

    try:
        # Audio -> mono -> target rate -> log-mel with shape (frames, n_mels).
        wav, sr = torchaudio.load(wav_path)
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        if sr != _SAMPLE_RATE:
            wav = torchaudio.functional.resample(wav, sr, _SAMPLE_RATE)
        mel = _get_mel_tf()(wav).squeeze(0).T
        mel = torch.log(mel.clamp(min=1e-5))  # log-compress; floor avoids log(0)

        # Precomputed features; from_numpy avoids the extra copy torch.tensor makes.
        ppg = torch.from_numpy(np.load(ppg_path)).float()
        hubert = torch.from_numpy(np.load(hubert_path)).float()
        f0_raw = torch.from_numpy(np.load(f0_path)).float()
        # log-F0 on voiced frames, 0 on unvoiced (f0 <= 0).
        f0 = torch.where(f0_raw > 0,
                         torch.log(f0_raw.clamp(min=1.0)),
                         torch.zeros_like(f0_raw))
        spk_emb = torch.from_numpy(np.load(spk_path)).float()

        # Atomic write: save to a temp file, then rename. A crash mid-save
        # must never leave a truncated .pt that the skip check above would
        # treat as complete on the next run.
        tmp_path = out_path + ".tmp"
        torch.save({"mel": mel, "ppg": ppg, "hubert": hubert,
                    "f0": f0, "spk": spk_emb}, tmp_path)
        os.replace(tmp_path, out_path)
        return True, file_id

    except Exception as e:  # best-effort batch tool: report, don't abort the run
        return False, f"{file_id}: {e}"
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: walk <wav>/<speaker>/, pack every clip in parallel.
    cli = argparse.ArgumentParser(
        description="Pack SVC features into single .pt bundles (NFS-efficient training)."
    )
    cli.add_argument("-w", "--wav", dest="wav", required=True,
                     help="Input wav directory (e.g. data_svc/waves-32k)")
    cli.add_argument("-o", "--out", dest="out", required=True,
                     help="Output packed directory (e.g. data_svc/packed)")
    cli.add_argument("--ppg", dest="ppg", default="data_svc/whisper",
                     help="PPG directory")
    cli.add_argument("--hubert", dest="hubert", default="data_svc/hubert",
                     help="HuBERT directory")
    cli.add_argument("--f0", dest="f0", default="data_svc/pitch",
                     help="F0 directory")
    cli.add_argument("--spk", dest="spk", default="data_svc/speaker",
                     help="Speaker embedding directory")
    cli.add_argument("-t", "--thread_count", dest="thread_count", type=int, default=4,
                     help="Worker threads (0 = all CPU cores)")
    opts = cli.parse_args()

    os.makedirs(opts.out, exist_ok=True)
    # thread_count == 0 means "use every core".
    workers = opts.thread_count or os.cpu_count()

    for speaker in sorted(os.listdir(opts.wav)):
        speaker_src = os.path.join(opts.wav, speaker)
        if not os.path.isdir(speaker_src):
            continue

        os.makedirs(os.path.join(opts.out, speaker), exist_ok=True)

        suffix = ".wav"
        clip_ids = [name[:-len(suffix)]
                    for name in os.listdir(speaker_src)
                    if name.endswith(suffix)]
        print(f">>>>>>>>>>{speaker}<<<<<<<<<< ({len(clip_ids)} files)")

        n_ok = n_fail = n_skip = 0
        with ThreadPoolExecutor(max_workers=workers) as pool:
            pending = {
                pool.submit(
                    pack_file, cid, speaker,
                    opts.wav, opts.ppg, opts.hubert, opts.f0, opts.spk, opts.out
                ): cid for cid in clip_ids
            }
            for fut in tqdm(as_completed(pending), total=len(pending)):
                success, info = fut.result()
                if not success:
                    n_fail += 1
                    tqdm.write(f" WARN: {info}")
                elif info == pending[fut]:
                    n_ok += 1       # freshly packed
                else:
                    n_skip += 1     # output already existed

        print(f" packed={n_ok} skipped(existed)={n_skip} failed={n_fail}")

    print("Done. Update training commands to use --packed_dir instead of separate feature dirs.")
|
|