|
|
from pathlib import Path |
|
|
|
|
|
import librosa |
|
|
import torch |
|
|
import perth |
|
|
from huggingface_hub import hf_hub_download |
|
|
from safetensors.torch import load_file |
|
|
|
|
|
from .models.s3tokenizer import S3_SR |
|
|
from .models.s3gen import S3GEN_SR, S3Gen |
|
|
|
|
|
|
|
|
# Hugging Face Hub repository id passed to `hf_hub_download` when fetching the
# pretrained checkpoint files in `ChatterboxVC.from_pretrained`.
REPO_ID = "ResembleAI/chatterbox"
|
|
|
|
|
|
|
|
class ChatterboxVC:
    """Voice-conversion front end built on an ``S3Gen`` model.

    The source audio is tokenized with ``s3gen.tokenizer`` and re-synthesized
    with ``s3gen.inference`` using a reference ("target voice") conditioning
    dict. The generated waveform is passed through a Perth implicit
    watermarker before being returned.
    """

    # Conditioning lengths expressed as a multiple of the respective sample
    # rate constants (presumably 6 s and 10 s worth of samples — confirm
    # against the S3 tokenizer / S3Gen modules).
    ENC_COND_LEN = 6 * S3_SR
    DEC_COND_LEN = 10 * S3GEN_SR

    def __init__(
        self,
        s3gen: S3Gen,
        device: str,
        ref_dict: dict = None,
    ):
        """Wrap an already-loaded ``S3Gen`` model.

        Args:
            s3gen: The generator model used for tokenization and synthesis.
            device: Device the reference tensors (and later the input audio)
                are moved to.
            ref_dict: Optional reference conditioning for the target voice.
                Tensor values are moved to ``device``; all other values are
                kept unchanged. When ``None``, a target voice must be set
                before (or while) calling :meth:`generate`.
        """
        self.sr = S3GEN_SR
        self.s3gen = s3gen
        self.device = device
        self.watermarker = perth.PerthImplicitWatermarker()
        if ref_dict is None:
            self.ref_dict = None
        else:
            self.ref_dict = {
                k: v.to(device) if torch.is_tensor(v) else v
                for k, v in ref_dict.items()
            }

    @classmethod
    def from_local(cls, ckpt_dir, device) -> 'ChatterboxVC':
        """Build an instance from a local checkpoint directory.

        Args:
            ckpt_dir: Directory containing ``s3gen.safetensors`` and,
                optionally, ``conds.pt`` holding a built-in voice.
            device: Target device, as a string or ``torch.device``.

        Returns:
            A ``ChatterboxVC`` whose model is on ``device`` in eval mode. If
            ``conds.pt`` exists, its ``'gen'`` entry is used as the initial
            reference conditioning.
        """
        ckpt_dir = Path(ckpt_dir)

        # For CPU/MPS targets, deserialize onto the CPU first so that tensors
        # saved from another device type (e.g. CUDA) can still be loaded.
        # Comparing the normalized device type also covers ``torch.device``
        # objects and indexed strings such as "mps:0".
        if torch.device(device).type in ("cpu", "mps"):
            map_location = torch.device('cpu')
        else:
            map_location = None

        ref_dict = None
        if (builtin_voice := ckpt_dir / "conds.pt").exists():
            # NOTE(review): torch.load unpickles the file — only use
            # checkpoints from a trusted source (consider weights_only=True
            # if the file is known to hold plain tensors/containers).
            states = torch.load(builtin_voice, map_location=map_location)
            ref_dict = states['gen']

        s3gen = S3Gen()
        # strict=False: tolerate missing/unexpected keys in the checkpoint.
        s3gen.load_state_dict(
            load_file(ckpt_dir / "s3gen.safetensors"), strict=False
        )
        s3gen.to(device).eval()

        return cls(s3gen, device, ref_dict=ref_dict)

    @classmethod
    def from_pretrained(cls, device) -> 'ChatterboxVC':
        """Download the checkpoint files from the Hub and build an instance.

        If an MPS device is requested but unavailable, a diagnostic message is
        printed and the CPU is used instead.

        Args:
            device: Target device, as a string or ``torch.device``.

        Returns:
            A ``ChatterboxVC`` loaded via :meth:`from_local`.
        """
        # Fall back to CPU when MPS was requested but cannot be used.
        if torch.device(device).type == "mps" and not torch.backends.mps.is_available():
            if not torch.backends.mps.is_built():
                print("MPS not available because the current PyTorch install was not built with MPS enabled.")
            else:
                print("MPS not available because the current MacOS version is not 12.3+ and/or you do not have an MPS-enabled device on this machine.")
            device = "cpu"

        downloaded = [
            hf_hub_download(repo_id=REPO_ID, filename=fname)
            for fname in ("s3gen.safetensors", "conds.pt")
        ]

        # The directory of the last downloaded file is used as the checkpoint
        # directory (both files are expected to land in the same directory).
        return cls.from_local(Path(downloaded[-1]).parent, device)

    def set_target_voice(self, wav_fpath):
        """Compute and store the reference conditioning for a target voice.

        Args:
            wav_fpath: Audio to load with ``librosa.load`` (resampled to
                ``S3GEN_SR``). Only the first ``DEC_COND_LEN`` samples are
                used.
        """
        s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)

        # Cap the reference length before embedding.
        s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
        self.ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)

    def generate(
        self,
        audio,
        target_voice_path=None,
    ):
        """Convert ``audio`` to the target voice.

        Args:
            audio: Source audio, loaded with ``librosa.load`` at ``S3_SR``.
            target_voice_path: Optional reference audio. When given, it
                replaces the stored target voice (see
                :meth:`set_target_voice`) before conversion.

        Returns:
            A CPU ``torch.Tensor`` holding the watermarked waveform with a
            leading dimension of size 1 added.

        Raises:
            AssertionError: If no target voice is available, i.e.
                ``target_voice_path`` is not given and no reference
                conditioning has been set.
        """
        if target_voice_path:
            self.set_target_voice(target_voice_path)
        elif self.ref_dict is None:
            # Raised explicitly (rather than via `assert`) so the check is
            # not stripped under `python -O`; the exception type is kept for
            # backward compatibility.
            raise AssertionError(
                "Please `set_target_voice` first or specify `target_voice_path`"
            )

        with torch.inference_mode():
            audio_16, _ = librosa.load(audio, sr=S3_SR)
            # Add a leading batch dimension for the tokenizer.
            audio_16 = torch.from_numpy(audio_16).float().to(self.device)[None, ]

            s3_tokens, _ = self.s3gen.tokenizer(audio_16)
            wav, _ = self.s3gen.inference(
                speech_tokens=s3_tokens,
                ref_dict=self.ref_dict,
            )
            # The watermarker is fed a NumPy array on the host.
            wav = wav.squeeze(0).detach().cpu().numpy()
            watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=self.sr)
            return torch.from_numpy(watermarked_wav).unsqueeze(0)