| | import os |
| | import torch |
| | import librosa |
| | from tqdm import tqdm |
| | from openvoice.api import ToneColorConverter |
| | from openvoice.mel_processing import spectrogram_torch |
| | from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments |
| |
|
| |
|
@torch.no_grad()
def se_extractor(audio_path, vc):
    """Extract a speaker (tone-color) embedding from one audio file.

    Runs silero VAD (at 16 kHz) to locate speech segments, computes a
    spectrogram for each segment at the model's own sampling rate, feeds it
    through the converter's reference encoder, and returns the mean embedding
    over all segments.

    Args:
        audio_path: Path to the input audio file.
        vc: Converter object exposing ``.hps``, ``.device`` and ``.model``
            (e.g. an openvoice ``ToneColorConverter``).

    Returns:
        torch.Tensor: Averaged reference-encoder embedding, moved to CPU.
    """
    # whisper_timestamped's VAD front-end works on 16 kHz audio; segment
    # boundaries come back as sample indices at this rate.
    SAMPLE_RATE = 16000
    audio_vad = get_audio_tensor(audio_path)
    segments = get_vad_segments(
        audio_vad,
        output_sample=True,
        min_speech_duration=0.1,
        min_silence_duration=1,
        method="silero",
    )
    # Convert sample indices to seconds in a single pass.
    segments = [
        (float(seg["start"]) / SAMPLE_RATE, float(seg["end"]) / SAMPLE_RATE)
        for seg in segments
    ]

    # If VAD found no speech, fall back to using the entire file.
    if not segments:
        segments = [(0, len(audio_vad) / SAMPLE_RATE)]

    hps = vc.hps
    device = vc.device
    model = vc.model
    gs = []

    # Reload the audio at the converter's sampling rate for spectrogram
    # extraction (librosa resamples when sr is given).
    audio, sr = librosa.load(audio_path, sr=hps.data.sampling_rate)
    audio = torch.tensor(audio).float().to(device)

    for s, e in segments:
        # Slice the segment out in samples; `audio` already lives on `device`.
        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
        y = y.unsqueeze(0)
        y = spectrogram_torch(
            y,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        ).to(device)
        # Reference encoder expects (batch, time, freq); add trailing dim to
        # match the converter's expected embedding shape.
        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        gs.append(g)

    # Average embeddings over all segments.
    gs = torch.stack(gs).mean(0)
    return gs.cpu()
| |
|
| |
|
def process_audio_folder(input_folder, output_folder, model, device):
    """Extract speaker embeddings for every audio file under a folder tree.

    Walks ``input_folder`` recursively, runs ``se_extractor`` on each audio
    file, and saves the resulting embedding as a ``.pt`` file under
    ``output_folder`` mirroring the input directory structure. Files whose
    output already exists are skipped, so interrupted runs can be resumed.

    Args:
        input_folder (str): Path to the input folder containing audio files.
        output_folder (str): Path to the output folder to save .pt files.
        model: Pre-trained converter passed through to ``se_extractor``.
        device: Torch device (e.g. 'cpu' or 'cuda') the embedding is moved to
            before saving.
    """
    AUDIO_EXTS = ('.wav', '.mp3', '.flac')

    audio_files = []
    for root, _, files in os.walk(input_folder):
        for file in files:
            # Case-insensitive match so files like "A.WAV" are not skipped.
            if file.lower().endswith(AUDIO_EXTS):
                audio_files.append(os.path.join(root, file))
    # Deterministic processing order across runs and filesystems.
    audio_files.sort()

    for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
        # Mirror the input directory layout under output_folder.
        relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
        output_dir = os.path.join(output_folder, relative_path)
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(
            output_dir,
            os.path.splitext(os.path.basename(audio_path))[0] + '.pt',
        )

        # Skip already-processed files so the run is resumable.
        if os.path.exists(output_path):
            continue

        target_se = se_extractor(audio_path, model).to(device)
        torch.save(target_se, output_path)
| | |
| |
|
| |
|
if __name__ == '__main__':
    # Directory holding the tone-color converter config and weights.
    ckpt_converter = 'checkpoints_v2/converter'
    # Prefer the first CUDA device when one is available.
    run_device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Build the converter and load its pretrained checkpoint.
    converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=run_device)
    converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

    input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
    output_folder = 'spk/LibriTTS-R/train-clean-360/'
    process_audio_folder(input_folder, output_folder, converter, run_device)