Spaces:

LTTEAM
/

Clone

Runtime error

App Files Files Community

Clone / src /chatterbox /vc.py

LTTEAM

Upload 129 files

6006b63 verified 5 months ago

raw

history blame contribute delete

4.54 kB

	# vc.py

	from pathlib import Path
	from typing import Optional
	import librosa
	import torch
	import perth
	from huggingface_hub import hf_hub_download
	from safetensors.torch import load_file
	from omegaconf import DictConfig
	import soundfile as sf  # used by ChatterboxVC.save_wav to write WAV files

	from .models.s3tokenizer import S3_SR
	from .models.s3gen import S3GEN_SR, S3Gen


	REPO_ID = "ResembleAI/chatterbox"


	class ChatterboxVC:
	    """Voice-conversion pipeline built around an ``S3Gen`` model.

	    Source audio is tokenized with the model's tokenizer, re-synthesized
	    conditioned on a reference-voice dictionary (``ref_dict``), and the
	    result is passed through a Perth implicit watermarker.
	    """

	    # Six seconds' worth of samples at the S3 tokenizer sample rate.
	    ENC_COND_LEN = 6 * S3_SR
	    # Ten seconds' worth of samples at the S3Gen sample rate; reference clips
	    # are truncated to this length in `set_target_voice`.
	    DEC_COND_LEN = 10 * S3GEN_SR

	    def __init__(
	        self,
	        s3gen: S3Gen,
	        device: str,
	        ref_dict: Optional[dict] = None,
	    ):
	        """Wrap an already-loaded ``S3Gen`` model.

	        Args:
	            s3gen: The generator model used for tokenizing and synthesis.
	            device: Device on which inputs and reference tensors are placed.
	            ref_dict: Optional pre-computed reference-voice conditioning.
	                Tensor values are moved to ``device``; other values are
	                stored unchanged. When ``None``, a target voice must be set
	                before (or while) calling `generate`.
	        """
	        self.sr = S3GEN_SR
	        self.s3gen = s3gen
	        self.device = device
	        self.watermarker = perth.PerthImplicitWatermarker()
	        if ref_dict is None:
	            self.ref_dict = None
	        else:
	            self.ref_dict = {
	                k: v.to(device) if torch.is_tensor(v) else v
	                for k, v in ref_dict.items()
	            }

	    @classmethod
	    def from_local(cls, ckpt_dir, device, s3gen_cfg: Optional[DictConfig] = None) -> 'ChatterboxVC':
	        """Build an instance from a local checkpoint directory.

	        Args:
	            ckpt_dir: Directory containing ``s3gen.safetensors`` and,
	                optionally, ``conds.pt`` (a built-in reference voice).
	            device: Target device, as a string or ``torch.device``.
	            s3gen_cfg: Optional configuration forwarded to ``S3Gen``.

	        Returns:
	            A ``ChatterboxVC`` whose model is on ``device`` in eval mode.
	        """
	        ckpt_dir = Path(ckpt_dir)

	        # Always load to CPU first for non-CUDA devices to handle CUDA-saved
	        # models. Normalizing through torch.device makes the check also work
	        # for torch.device objects and indexed strings such as "mps:0".
	        if torch.device(device).type in ("cpu", "mps"):
	            map_location = torch.device('cpu')
	        else:
	            map_location = None

	        ref_dict = None
	        if (builtin_voice := ckpt_dir / "conds.pt").exists():
	            # NOTE(security): torch.load may unpickle arbitrary objects
	            # depending on the installed torch version's defaults; only point
	            # this at checkpoint directories you trust.
	            states = torch.load(builtin_voice, map_location=map_location)
	            ref_dict = states['gen']

	        # Pass the s3gen_cfg to S3Gen constructor
	        s3gen = S3Gen(cfg=s3gen_cfg)
	        # strict=False: key mismatches between the checkpoint and the model
	        # are tolerated rather than raising.
	        s3gen.load_state_dict(
	            load_file(ckpt_dir / "s3gen.safetensors"), strict=False
	        )
	        s3gen.to(device).eval()

	        return cls(s3gen, device, ref_dict=ref_dict)

	    @classmethod
	    def from_pretrained(cls, device, s3gen_cfg: Optional[DictConfig] = None) -> 'ChatterboxVC':
	        """Download the checkpoint files from the Hub and build an instance.

	        Falls back to ``"cpu"`` (with a printed explanation) when ``"mps"``
	        is requested but not available.

	        Args:
	            device: Requested device string.
	            s3gen_cfg: Optional configuration forwarded to `from_local`.

	        Returns:
	            A ``ChatterboxVC`` loaded from the downloaded files.
	        """
	        # Check if MPS is available on macOS
	        if device == "mps" and not torch.backends.mps.is_available():
	            if not torch.backends.mps.is_built():
	                print("MPS not available because the current PyTorch install was not built with MPS enabled.")
	            else:
	                print("MPS not available because the current MacOS version is not 12.3+ and/or you do not have an MPS-enabled device on this machine.")
	            device = "cpu"

	        # Fetch every required file; the directory of the last download is
	        # used as the checkpoint directory (presumably all files share one
	        # snapshot directory -- verify against huggingface_hub's cache layout).
	        for fpath in ["s3gen.safetensors", "conds.pt"]:
	            local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath)

	        # Pass the s3gen_cfg to from_local
	        return cls.from_local(Path(local_path).parent, device, s3gen_cfg=s3gen_cfg)

	    def set_target_voice(self, wav_fpath):
	        """Compute and store reference conditioning from a voice sample.

	        Args:
	            wav_fpath: Audio input accepted by ``librosa.load``; it is
	                resampled to ``S3GEN_SR`` and truncated to ``DEC_COND_LEN``
	                samples before embedding.
	        """
	        # Load reference wav
	        s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)

	        s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
	        self.ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)

	    def generate(
	        self,
	        audio,
	        target_voice_path=None,
	        # New inference parameters for S3Gen
	        inference_cfg_rate: Optional[float] = None,
	        sigma_min: Optional[float] = None,
	    ):
	        """Convert ``audio`` to the target voice and watermark the result.

	        Args:
	            audio: Source audio input accepted by ``librosa.load``; it is
	                resampled to ``S3_SR`` before tokenization.
	            target_voice_path: Optional reference clip. When given, it
	                replaces the stored reference via `set_target_voice`.
	            inference_cfg_rate: Forwarded to ``S3Gen.set_inference_params``.
	            sigma_min: Forwarded to ``S3Gen.set_inference_params``.

	        Returns:
	            A CPU tensor holding the watermarked waveform with a leading
	            dimension of size 1 added.

	        Raises:
	            AssertionError: If no reference voice is stored and
	                ``target_voice_path`` is not provided.
	        """
	        # Apply inference parameters to the S3Gen model before running inference
	        self.s3gen.set_inference_params(
	            inference_cfg_rate=inference_cfg_rate,
	            sigma_min=sigma_min,
	        )

	        if target_voice_path:
	            self.set_target_voice(target_voice_path)
	        elif self.ref_dict is None:
	            # Raised explicitly (not via `assert`) so the check survives
	            # `python -O`; the exception type is kept for compatibility.
	            raise AssertionError(
	                "Please call `set_target_voice` first or specify `target_voice_path`"
	            )

	        with torch.inference_mode():
	            audio_16, _ = librosa.load(audio, sr=S3_SR)
	            # Add a leading batch dimension for the tokenizer.
	            audio_16 = torch.from_numpy(audio_16).float().to(self.device)[None, ]

	            s3_tokens, _ = self.s3gen.tokenizer(audio_16)
	            wav, _ = self.s3gen.inference(
	                speech_tokens=s3_tokens,
	                ref_dict=self.ref_dict,
	            )
	            # The watermarker operates on a NumPy array, so move off-device.
	            watermarked_wav = self.watermarker.apply_watermark(
	                wav.squeeze(0).detach().cpu().numpy(), sample_rate=self.sr
	            )
	        return torch.from_numpy(watermarked_wav).unsqueeze(0)

	    def save_wav(self, wav_tensor: torch.Tensor, output_path: str) -> None:
	        """Save a waveform tensor to an audio file at ``self.sr``.

	        Args:
	            wav_tensor: Waveform tensor; a leading dimension of size 1 is
	                dropped before writing.
	            output_path: Destination path passed to ``soundfile.write``.
	        """
	        # Ensure it's on CPU and numpy format for soundfile
	        wav_numpy = wav_tensor.squeeze(0).detach().cpu().numpy()
	        sf.write(output_path, wav_numpy, self.sr)