File size: 3,284 Bytes
652c91e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import json
import logging
from pathlib import Path
import random
import soundfile as sf
import torch
from tqdm import tqdm
from fairseq import utils
from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder
logging.basicConfig()
logging.root.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def dump_result(args, sample_id, pred_wav, suffix=""):
sf.write(
f"{args.results_path}/{sample_id}{suffix}_pred.wav",
pred_wav.detach().cpu().numpy(),
16000,
)
def load_code(in_file):
with open(in_file) as f:
out = [list(map(int, line.strip().split())) for line in f]
return out
def main(args):
logger.info(args)
use_cuda = torch.cuda.is_available() and not args.cpu
with open(args.vocoder_cfg) as f:
vocoder_cfg = json.load(f)
vocoder = CodeHiFiGANVocoder(args.vocoder, vocoder_cfg)
if use_cuda:
vocoder = vocoder.cuda()
multispkr = vocoder.model.multispkr
if multispkr:
logger.info("multi-speaker vocoder")
num_speakers = vocoder_cfg.get(
"num_speakers", 200
) # following the default in codehifigan to set to 200
assert (
args.speaker_id < num_speakers
), f"invalid --speaker-id ({args.speaker_id}) with total #speakers = {num_speakers}"
data = load_code(args.in_code_file)
Path(args.results_path).mkdir(exist_ok=True, parents=True)
for i, d in tqdm(enumerate(data), total=len(data)):
x = {
"code": torch.LongTensor(d).view(1, -1),
}
suffix = ""
if multispkr:
spk = (
random.randint(0, num_speakers - 1)
if args.speaker_id == -1
else args.speaker_id
)
suffix = f"_spk{spk}"
x["spkr"] = torch.LongTensor([spk]).view(1, 1)
x = utils.move_to_cuda(x) if use_cuda else x
wav = vocoder(x, args.dur_prediction)
dump_result(args, i, wav, suffix=suffix)
def cli_main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--in-code-file", type=str, required=True, help="one unit sequence per line"
)
parser.add_argument(
"--vocoder", type=str, required=True, help="path to the CodeHiFiGAN vocoder"
)
parser.add_argument(
"--vocoder-cfg",
type=str,
required=True,
help="path to the CodeHiFiGAN vocoder config",
)
parser.add_argument("--results-path", type=str, required=True)
parser.add_argument(
"--dur-prediction",
action="store_true",
help="enable duration prediction (for reduced/unique code sequences)",
)
parser.add_argument(
"--speaker-id",
type=int,
default=-1,
help="Speaker id (for vocoder that supports multispeaker). Set to -1 to randomly sample speakers.",
)
parser.add_argument("--cpu", action="store_true", help="run on CPU")
args = parser.parse_args()
main(args)
if __name__ == "__main__":
cli_main() |