| import os | |
| import sys | |
| import time | |
| import torch | |
| import shutil | |
| import warnings | |
| import argparse | |
| import numpy as np | |
| from tqdm import tqdm | |
| from distutils.util import strtobool | |
| sys.path.append(os.getcwd()) | |
| from main.app.variables import config, logger, translations, configs | |
| from main.library.utils import load_audio, load_embedders_model, extract_features | |
| warnings.filterwarnings("ignore") | |
| F0_MIN, F0_MAX, HOP_SIZE, SAMPLE_RATE, FRAME_LENGTH = 50, 1100, 160, 16000, 2048 | |
| def parse_arguments(): | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("--create_reference", action='store_true') | |
| parser.add_argument("--audio_path", type=str, required=True) | |
| parser.add_argument("--reference_name", type=str, default="reference") | |
| parser.add_argument("--pitch_guidance", type=lambda x: bool(strtobool(x)), default=True) | |
| parser.add_argument("--use_energy", type=lambda x: bool(strtobool(x)), default=False) | |
| parser.add_argument("--version", type=str, default="v2") | |
| parser.add_argument("--embedder_model", type=str, default="hubert_base") | |
| parser.add_argument("--embedders_mode", type=str, default="fairseq") | |
| parser.add_argument("--f0_method", type=str, default="rmvpe") | |
| parser.add_argument("--f0_onnx", type=lambda x: bool(strtobool(x)), default=False) | |
| parser.add_argument("--f0_up_key", type=int, default=0) | |
| parser.add_argument("--filter_radius", type=int, default=3) | |
| parser.add_argument("--f0_autotune", type=lambda x: bool(strtobool(x)), default=False) | |
| parser.add_argument("--f0_autotune_strength", type=float, default=1) | |
| parser.add_argument("--proposal_pitch", type=lambda x: bool(strtobool(x)), default=False) | |
| parser.add_argument("--proposal_pitch_threshold", type=float, default=255.0) | |
| parser.add_argument("--alpha", type=float, default=0.5) | |
| return parser.parse_args() | |
| def main(): | |
| args = parse_arguments() | |
| audio_path, reference_name, pitch_guidance, use_energy, version, embedder_model, embedders_mode, f0_method, f0_onnx, f0_up_key, filter_radius, f0_autotune, f0_autotune_strength, proposal_pitch, proposal_pitch_threshold, alpha = args.audio_path, args.reference_name, args.pitch_guidance, args.use_energy, args.version, args.embedder_model, args.embedders_mode, args.f0_method, args.f0_onnx, args.f0_up_key, args.filter_radius, args.f0_autotune, args.f0_autotune_strength, args.proposal_pitch, args.proposal_pitch_threshold, args.alpha | |
| create_reference( | |
| audio_path, | |
| reference_name, | |
| pitch_guidance, | |
| use_energy, | |
| version, | |
| embedder_model, | |
| embedders_mode, | |
| f0_method, | |
| f0_onnx, | |
| f0_up_key, | |
| filter_radius, | |
| f0_autotune, | |
| f0_autotune_strength, | |
| proposal_pitch, | |
| proposal_pitch_threshold, | |
| alpha | |
| ) | |
| def create_reference( | |
| audio_path, | |
| reference_name, | |
| pitch_guidance = True, | |
| use_energy = False, | |
| version = "v2", | |
| embedder_model = "hubert_base", | |
| embedders_mode = "fairseq", | |
| f0_method = "rmvpe", | |
| f0_onnx = False, | |
| f0_up_key = 0, | |
| filter_radius = 3, | |
| f0_autotune = False, | |
| f0_autotune_strength = 1, | |
| proposal_pitch = False, | |
| proposal_pitch_threshold = 255.0, | |
| alpha = 0.5 | |
| ): | |
| device = config.device | |
| is_half = config.is_half | |
| if not audio_path: | |
| logger.warning(translations["not_found_audio"]) | |
| sys.exit(1) | |
| output_reference = os.path.join(configs["reference_path"], f"{reference_name}_{version}_{embedder_model}_{pitch_guidance}_{use_energy}") | |
| if os.path.exists(output_reference): shutil.rmtree(reference_name, ignore_errors=True) | |
| os.makedirs(output_reference) | |
| logger.info(translations["start_create_reference"]) | |
| start_time = time.time() | |
| with tqdm(total=5, desc=translations["create_reference"], ncols=100, unit="a") as pbar: | |
| audio = load_audio(audio_path, sample_rate=SAMPLE_RATE) | |
| pbar.update(1) | |
| audio_max = np.abs(audio).max() / 0.95 | |
| if audio_max > 1: audio /= audio_max | |
| trimmed_len = (len(audio) // 320) * 320 | |
| audio = audio[:trimmed_len] | |
| audio_pad = torch.nn.functional.pad( | |
| torch.from_numpy(audio).to( | |
| torch.float16 if is_half else torch.float32 | |
| ).to(device).unsqueeze(0), | |
| (40, 40), | |
| mode="reflect" | |
| ) | |
| pbar.update(1) | |
| embedder = load_embedders_model(embedder_model, embedders_mode) | |
| if isinstance(embedder, torch.nn.Module): embedder = embedder.to(torch.float16 if is_half else torch.float32).eval().to(device) | |
| with torch.no_grad(): | |
| feats = extract_features(embedder, audio_pad.view(1, -1), version, device=device) | |
| np.save(os.path.join(output_reference, "feats.npy"), feats.squeeze(0).float().cpu().numpy(), allow_pickle=False) | |
| pbar.update(1) | |
| if pitch_guidance: | |
| from main.library.predictors.Generator import Generator | |
| generator = Generator( | |
| sample_rate=SAMPLE_RATE, | |
| hop_length=HOP_SIZE, | |
| f0_min=F0_MIN, | |
| f0_max=F0_MAX, | |
| alpha=alpha, | |
| is_half=is_half, | |
| device=device, | |
| f0_onnx_mode=f0_onnx, | |
| del_onnx_model=True | |
| ) | |
| pitch, pitchf = generator.calculator( | |
| x_pad=config.x_pad, | |
| f0_method=f0_method, | |
| x=audio, | |
| f0_up_key=f0_up_key, | |
| p_len=audio.shape[0] // 160 + 1, | |
| filter_radius=filter_radius, | |
| f0_autotune=f0_autotune, | |
| f0_autotune_strength=f0_autotune_strength, | |
| manual_f0=None, | |
| proposal_pitch=proposal_pitch, | |
| proposal_pitch_threshold=proposal_pitch_threshold | |
| ) | |
| np.save(os.path.join(output_reference, "pitch_coarse.npy"), pitch, allow_pickle=False) | |
| np.save(os.path.join(output_reference, "pitch_fine.npy"), pitchf, allow_pickle=False) | |
| pbar.update(1) | |
| if use_energy: | |
| from main.inference.extracting.rms import RMSEnergyExtractor | |
| rms = RMSEnergyExtractor(frame_length=FRAME_LENGTH, hop_length=HOP_SIZE, center=True, pad_mode="reflect").to(device).eval() | |
| with torch.no_grad(): | |
| energy = rms(audio_pad) | |
| np.save(os.path.join(output_reference, "energy.npy"), energy.float().cpu().numpy(), allow_pickle=False) | |
| pbar.update(1) | |
| logger.info(translations["create_reference_success"].format(elapsed_time=f"{(time.time() - start_time):.2f}")) | |
| if __name__ == "__main__": main() |