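"""Inference utilities for FAcodec: reconstructing a waveform through the
codec and performing zero-shot voice conversion to a reference speaker."""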
import os
import argparse
import warnings
from collections import OrderedDict

warnings.simplefilter("ignore")

import yaml
import torch
import torchaudio
import librosa

from .modules.commons import *  # expected to provide build_model()

class FAcodecInference(object):
    def __init__(self, args=None, cfg=None):
        self.args = args
        self.cfg = cfg
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self._build_model()
        self._load_checkpoint()

    def _build_model(self):
        # build_model() returns a collection of sub-modules (encoder,
        # quantizer, decoder, ...) addressable both by key and by attribute;
        # move each sub-module to the target device.
        model = build_model(self.cfg.model_params)
        _ = [model[key].to(self.device) for key in model]
        return model

    def _load_checkpoint(self):
        sd = torch.load(self.args.checkpoint_path, map_location="cpu")
        sd = sd["net"] if "net" in sd else sd
        # Strip the "module." prefix that (Distributed)DataParallel training
        # prepends to parameter names, so the keys match the bare sub-modules.
        new_params = dict()
        for key, state_dict in sd.items():
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                if k.startswith("module."):
                    k = k[len("module."):]
                new_state_dict[k] = v
            new_params[key] = new_state_dict
        for key in new_params:
            if key in self.model:
                self.model[key].load_state_dict(new_params[key])
        # Inference only: put every sub-module in eval mode.
        _ = [self.model[key].eval() for key in self.model]

    @torch.no_grad()
    def inference(self, source, output_dir):
        # Load audio at the configured sample rate; shape (T,) -> (1, T).
        source_audio = librosa.load(source, sr=self.cfg.preprocess_params.sr)[0]
        source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(self.device)

        # Encode to latents ([None, ...] adds the channel dimension), then
        # quantize into the factorized codebooks.
        z = self.model.encoder(source_audio[None, ...])
        (
            z,
            quantized,
            commitment_loss,
            codebook_loss,
            timbre,
            codes,
        ) = self.model.quantizer(
            z,
            source_audio[None, ...],
            n_c=self.cfg.model_params.n_c_codebooks,
            return_codes=True,
        )

        # Decode the quantized latents back to a waveform.
        full_pred_wave = self.model.decoder(z)

        os.makedirs(output_dir, exist_ok=True)
        source_name = os.path.splitext(os.path.basename(source))[0]
        output_path = os.path.join(output_dir, f"reconstructed_{source_name}.wav")
        torchaudio.save(output_path, full_pred_wave[0].cpu(), self.cfg.preprocess_params.sr)

        print("Reconstructed audio saved as:", output_path)

        return quantized, codes

    @torch.no_grad()
    def voice_conversion(self, source, reference, output_dir):
        # Load both utterances at the configured sample rate; shape (1, T) each.
        source_audio = librosa.load(source, sr=self.cfg.preprocess_params.sr)[0]
        source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(self.device)

        reference_audio = librosa.load(reference, sr=self.cfg.preprocess_params.sr)[0]
        reference_audio = (
            torch.tensor(reference_audio).unsqueeze(0).float().to(self.device)
        )

        # Encode and quantize the source utterance.
        z = self.model.encoder(source_audio[None, ...])
        z, quantized, commitment_loss, codebook_loss, timbre = self.model.quantizer(
            z,
            source_audio[None, ...],
            n_c=self.cfg.model_params.n_c_codebooks,
        )

        # Encode and quantize the reference utterance. Its outputs are not
        # used directly below; the raw reference audio is what gets passed to
        # voice_conversion().
        z_ref = self.model.encoder(reference_audio[None, ...])
        (
            z_ref,
            quantized_ref,
            commitment_loss_ref,
            codebook_loss_ref,
            timbre_ref,
        ) = self.model.quantizer(
            z_ref,
            reference_audio[None, ...],
            n_c=self.cfg.model_params.n_c_codebooks,
        )

        # Combine the first two quantized streams of the source and let the
        # quantizer re-style them with timbre derived from the reference
        # audio, then decode the converted latents to a waveform.
        z_conv = self.model.quantizer.voice_conversion(
            quantized[0] + quantized[1],
            reference_audio[None, ...],
        )
        full_pred_wave = self.model.decoder(z_conv)

        os.makedirs(output_dir, exist_ok=True)
        source_name = os.path.splitext(os.path.basename(source))[0]
        reference_name = os.path.splitext(os.path.basename(reference))[0]
        output_path = os.path.join(
            output_dir, f"converted_{source_name}_to_{reference_name}.wav"
        )
        torchaudio.save(output_path, full_pred_wave[0].cpu(), self.cfg.preprocess_params.sr)

        print("Voice conversion results saved as:", output_path)
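

# ---------------------------------------------------------------------------
# Minimal usage sketch of how FAcodecInference might be driven from the
# command line. The flag names, the YAML config layout, and the
# SimpleNamespace wrapper are illustrative assumptions; adapt them to the
# project's real launcher and config loader. Because of the relative import
# above, run this as a module, e.g.:
#   python -m <package>.inference --config config.yaml \
#       --checkpoint_path ckpt.pth --source in.wav
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    parser = argparse.ArgumentParser(description="FAcodec inference (sketch)")
    parser.add_argument("--config", required=True, help="path to a YAML config file")
    parser.add_argument("--checkpoint_path", required=True, help="path to model weights")
    parser.add_argument("--source", required=True, help="source .wav file")
    parser.add_argument(
        "--reference", default=None, help="reference .wav; enables voice conversion"
    )
    parser.add_argument("--output_dir", default="outputs")
    args = parser.parse_args()

    def to_namespace(obj):
        # Recursively wrap dicts so YAML keys are reachable as attributes,
        # matching how the class reads cfg.model_params and
        # cfg.preprocess_params.sr.
        if isinstance(obj, dict):
            return SimpleNamespace(**{k: to_namespace(v) for k, v in obj.items()})
        return obj

    with open(args.config) as f:
        cfg = to_namespace(yaml.safe_load(f))

    engine = FAcodecInference(args=args, cfg=cfg)
    if args.reference is None:
        engine.inference(args.source, args.output_dir)
    else:
        engine.voice_conversion(args.source, args.reference, args.output_dir)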