Upload folder using huggingface_hub

99341ef verified 2 months ago

6.07 kB

	"""Voice cloning utilities."""

	import datetime
	import importlib.metadata
	import logging
	import os
	from dataclasses import asdict, dataclass
	from pathlib import Path
	from typing import Any

	import torch

	from TTS.utils.generic_utils import is_pytorch_at_least_2_4, slugify

	logger = logging.getLogger(__name__)


	@dataclass
	class VoiceMetadata:
	"""Holds metadata on a voice.

	Args:
	model:
	Model name, same as used in configs.
	speaker_id:
	User-defined speaker name.
	source_files:
	Paths to audio files used to generate the voice.
	created_at:
	ISO 8601 timestamp at which the voice was generated.
	coqui_version:
	Coqui TTS version that generated the voice.
	"""

	model: dict[str, str \| float \| bool]
	speaker_id: str
	source_files: list[str] \| None = None
	created_at: str \| None = None
	coqui_version: str \| None = None

	def to_dict(self) -> dict[str, Any]:
	return asdict(self)

	@classmethod
	def from_dict(cls, data: dict[str, Any]) -> "VoiceMetadata":
	return cls(**data)


	class CloningMixin:
	"""Add voice cloning with caching support.

	To be used as a mixin in :py:class:`~TTS.tts.models.base_tts.BaseTTS` and
	:py:class:`~TTS.vc.models.base_vc.BaseVC`-derived models.
	"""

	def _create_voice_metadata(
	self, model: dict[str, str \| float \| bool], speaker_id: str, source_files: list[str]
	) -> VoiceMetadata:
	return VoiceMetadata(
	model=model,
	speaker_id=speaker_id,
	source_files=source_files,
	created_at=datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="minutes"),
	coqui_version=importlib.metadata.version("coqui-tts"),
	)

	def clone_voice(
	self,
	speaker_wav: str \| os.PathLike[Any] \| list[str \| os.PathLike[Any]] \| None,
	speaker_id: str \| None = None,
	voice_dir: str \| os.PathLike[Any] \| None = None,
	**generate_kwargs: Any,
	) -> dict[str, Any]:
	"""Load a pregenerated voice or generate one from the given reference audio.

	If ``speaker_wav`` is not specified, loads the voice from ``<voice_dir>/<speaker_id>.pth``.

	The voice will be cached in ``<voice_dir>/<speaker_id>.pth`` if those parameters are
	specified. If there already is a voice for ``speaker_id``, it will be overwritten with
	the newly generated one.

	Args:
	speaker_wav:
	Path(s) to the reference audio files.
	speaker_id:
	Speaker ID to be assigned when saving the voice (if not ``None``).
	voice_dir:
	Directory to save the voice (if not ``None``).
	**generate_kwargs:
	Arguments passed on to the model-specific ``_clone_voice()``.

	"""
	if speaker_wav is None or (isinstance(speaker_wav, list) and len(speaker_wav) == 0):
	if speaker_id is None:
	msg = "Neither `speaker_wav` nor `speaker_id` was specified"
	raise RuntimeError(msg)
	if voice_dir is None:
	msg = "Specified only `speaker_id`, but no `voice_dir` to load the voice from"
	raise RuntimeError(msg)
	return self.load_voice_file(speaker_id, voice_dir)
	voice, model_metadata = self._clone_voice(speaker_wav, **generate_kwargs)
	logger.info("Generated voice from reference audio")
	if speaker_id is not None and voice_dir is not None:
	speaker_id = slugify(speaker_id)
	voice_fn = Path(voice_dir) / f"{speaker_id}.pth"
	voice_fn.parent.mkdir(exist_ok=True, parents=True)
	speaker_wav = speaker_wav if isinstance(speaker_wav, list) else [speaker_wav]
	metadata = self._create_voice_metadata(model_metadata, speaker_id, [str(p) for p in speaker_wav])
	voices = self.get_voices(voice_dir)
	if speaker_id in voices:
	logger.info("Voice `%s` already exists in `%s`, overwriting it", speaker_id, voice_fn)
	voice_dict = {**voice, "metadata": metadata.to_dict()}
	torch.save(voice_dict, voice_fn)
	logger.info("Voice `%s` saved to: %s", speaker_id, voice_fn)
	return voice

	def _clone_voice(
	self,
	speaker_wav: str \| os.PathLike[Any] \| list[str \| os.PathLike[Any]],
	**generate_kwargs: Any,
	) -> tuple[dict[str, Any], dict[str, Any]]:
	"""Generate a voice from the given reference audio.

	To be implemented in model subclasses.

	Args:
	speaker_wav: Path(s) to the reference audio files.

	Returns:
	- dictionary with the embedding(s)
	- dictionary with any model-specific metadata, e.g. model name
	"""
	raise NotImplementedError

	def load_voice_file(
	self,
	speaker_id: str,
	voice_dir: str \| os.PathLike[Any],
	) -> dict[str, Any]:
	"""Load the voice for the given speaker.

	Args:
	speaker_id:
	Speaker ID to load.
	voice_dir:
	Directory where to look for the voice.
	"""
	voices = self.get_voices(voice_dir)
	if speaker_id not in voices:
	msg = f"Voice file `{slugify(speaker_id)}.pth` for speaker `{speaker_id}` not found in: {voice_dir}"
	raise FileNotFoundError(msg)
	voice = torch.load(voices[speaker_id], map_location="cpu", weights_only=is_pytorch_at_least_2_4())
	logger.info("Loaded voice `%s` from: %s", speaker_id, voices[speaker_id])
	return voice

	def get_voices(self, voice_dir: str \| os.PathLike[Any]) -> dict[str, Path]:
	"""Return all available voices in the given directory.

	Args:
	voice_dir: Directory to search for voices.

	Returns:
	Dictionary mapping a speaker ID to its voice file.
	"""
	return {path.stem: path for path in Path(voice_dir).glob("*.pth")}