Chatterbox-Multilingual-TTS-API

Sleeping

App Files Files Community

rahul7star committed on Jan 6

Commit

d610107

verified ·

1 Parent(s): ef3441d

Update src/chatterbox/mtl_tts.py

Browse files

Files changed (1) hide show

src/chatterbox/mtl_tts.py +58 -69

src/chatterbox/mtl_tts.py CHANGED Viewed

@@ -2,12 +2,11 @@ from dataclasses import dataclass
 from pathlib import Path
 import os
 import torch
-from huggingface_hub import snapshot_download
 import librosa
 import perth
 import torch.nn.functional as F
 from safetensors.torch import load_file as load_safetensors
 from .models.t3 import T3
 from .models.t3.modules.t3_config import T3Config
@@ -21,29 +20,28 @@ REPO_ID = "ResembleAI/chatterbox"
 # Supported languages for the multilingual model
 SUPPORTED_LANGUAGES = {
-    "ar": "Arabic", "da": "Danish", "de": "German", "el": "Greek",
-    "en": "English", "es": "Spanish", "fi": "Finnish", "fr": "French",
-    "he": "Hebrew", "hi": "Hindi", "it": "Italian", "ja": "Japanese",
-    "ko": "Korean", "ms": "Malay", "nl": "Dutch", "no": "Norwegian",
-    "pl": "Polish", "pt": "Portuguese", "ru": "Russian", "sv": "Swedish",
     "sw": "Swahili", "tr": "Turkish", "zh": "Chinese",
 }
 def punc_norm(text: str) -> str:
-    """Normalize punctuation for TTS text."""
-    if not text:
         return "You need to add some text for me to talk."
-    text = text[0].upper() + text[1:]
     text = " ".join(text.split())
     replacements = [
-        ("...", ", "), ("…", ", "), (":", ","), (" - ", ","),
-        (";", ","), ("—", "-"), ("–", "-"), (" ,", ","),
-        ("“", "\""), ("”", "\""), ("‘", "'"), ("’", "'")
     ]
     for old, new in replacements:
         text = text.replace(old, new)
-    if not any(text.endswith(p) for p in {".", "!", "?", "-", ",","、","，","。","？","！"}):
         text += "."
     return text
@@ -53,35 +51,22 @@ class Conditionals:
     t3: T3Cond
     gen: dict
-    def to(self, device):
-        """Move only tensors in `.gen` to device. T3Cond stays as-is."""
-        for k, v in self.gen.items():
-            if torch.is_tensor(v):
-                self.gen[k] = v.to(device)
-        return self
     def save(self, fpath: Path):
         torch.save({"t3": self.t3.__dict__, "gen": self.gen}, fpath)
     @classmethod
-    def load(cls, fpath: Path, map_location="cpu"):
-        data = torch.load(fpath, map_location=map_location, weights_only=True)
-        return cls(T3Cond(**data["t3"]), data["gen"])
 class ChatterboxMultilingualTTS:
     ENC_COND_LEN = 6 * S3_SR
     DEC_COND_LEN = 10 * S3GEN_SR
-    def __init__(
-        self,
-        t3: T3,
-        s3gen: S3Gen,
-        ve: VoiceEncoder,
-        tokenizer: MTLTokenizer,
-        device: str,
-        conds: Conditionals = None,
-    ):
         self.sr = S3GEN_SR
         self.t3 = t3
         self.s3gen = s3gen
@@ -91,55 +76,43 @@ class ChatterboxMultilingualTTS:
         self.conds = conds
         self.watermarker = perth.PerthImplicitWatermarker()
-        # Disable gradients for safety
-        for p in self.parameters():
-            p.requires_grad = False
     @classmethod
     def get_supported_languages(cls):
         return SUPPORTED_LANGUAGES.copy()
     @classmethod
-    def from_local(cls, ckpt_dir, device) -> "ChatterboxMultilingualTTS":
         ckpt_dir = Path(ckpt_dir)
-        # Voice Encoder
         ve = VoiceEncoder()
-        ve.load_state_dict(torch.load(ckpt_dir / "ve.pt", map_location="cpu", weights_only=True))
-        ve.to(device)
-        # T3
         t3 = T3(T3Config.multilingual())
         t3_state = load_safetensors(ckpt_dir / "t3_mtl23ls_v2.safetensors")
-        if "model" in t3_state:
             t3_state = t3_state["model"][0]
         t3.load_state_dict(t3_state)
-        t3.to(device)
-        # S3Gen
         s3gen = S3Gen()
-        s3gen.load_state_dict(torch.load(ckpt_dir / "s3gen.pt", map_location="cpu", weights_only=True))
-        s3gen.to(device)
-        # Tokenizer
         tokenizer = MTLTokenizer(str(ckpt_dir / "grapheme_mtl_merged_expanded_v1.json"))
-        # Conditionals
         conds = None
-        if (ckpt_dir / "conds.pt").exists():
-            conds = Conditionals.load(ckpt_dir / "conds.pt").to(device)
         return cls(t3, s3gen, ve, tokenizer, device, conds=conds)
     @classmethod
     def from_pretrained(cls, device: str | torch.device | None = None) -> "ChatterboxMultilingualTTS":
-        """Load model fully on CPU, never use CUDA."""
         if device is None:
             device = torch.device("cpu")
         elif isinstance(device, str):
             device = torch.device(device)
-        # Force CPU
         if device.type != "cpu":
             device = torch.device("cpu")
@@ -148,24 +121,40 @@ class ChatterboxMultilingualTTS:
             repo_type="model",
             revision="main",
             allow_patterns=[
-                "ve.pt",
-                "t3_mtl23ls_v2.safetensors",
-                "s3gen.pt",
-                "grapheme_mtl_merged_expanded_v1.json",
-                "conds.pt",
-                "Cangjie5_TC.json",
             ],
-            token=os.getenv("HF_TOKEN")
         ))
         model = cls.from_local(ckpt_dir, device)
         return model
-    def parameters(self):
-        """Iterate over all parameters in T3, S3Gen, and VE for disabling gradients."""
-        for p in self.t3.parameters():
-            yield p
-        for p in self.s3gen.parameters():
-            yield p
-        for p in self.ve.parameters():
-            yield p

 from pathlib import Path
 import os
 import torch
 import librosa
 import perth
 import torch.nn.functional as F
 from safetensors.torch import load_file as load_safetensors
+from huggingface_hub import snapshot_download
 from .models.t3 import T3
 from .models.t3.modules.t3_config import T3Config
 # Supported languages for the multilingual model
 SUPPORTED_LANGUAGES = {
+    "ar": "Arabic", "da": "Danish", "de": "German", "el": "Greek", "en": "English",
+    "es": "Spanish", "fi": "Finnish", "fr": "French", "he": "Hebrew", "hi": "Hindi",
+    "it": "Italian", "ja": "Japanese", "ko": "Korean", "ms": "Malay", "nl": "Dutch",
+    "no": "Norwegian", "pl": "Polish", "pt": "Portuguese", "ru": "Russian", "sv": "Swedish",
     "sw": "Swahili", "tr": "Turkish", "zh": "Chinese",
 }
 def punc_norm(text: str) -> str:
+    if len(text) == 0:
         return "You need to add some text for me to talk."
+    if text[0].islower():
+        text = text[0].upper() + text[1:]
     text = " ".join(text.split())
     replacements = [
+        ("...", ", "), ("…", ", "), (":", ","), (" - ", ","), (";", ","),
+        ("—", "-"), ("–", "-"), (" ,", ","), ("“", "\""), ("”", "\""),
+        ("‘", "'"), ("’", "'"),
     ]
     for old, new in replacements:
         text = text.replace(old, new)
+    if not text[-1] in {".", "!", "?", "-", ",","、","，","。","？","！"}:
         text += "."
     return text
     t3: T3Cond
     gen: dict
     def save(self, fpath: Path):
         torch.save({"t3": self.t3.__dict__, "gen": self.gen}, fpath)
     @classmethod
+    def load(cls, fpath: Path):
+        kwargs = torch.load(fpath, map_location="cpu", weights_only=True)
+        return cls(T3Cond(**kwargs['t3']), kwargs['gen'])
 class ChatterboxMultilingualTTS:
     ENC_COND_LEN = 6 * S3_SR
     DEC_COND_LEN = 10 * S3GEN_SR
+    def __init__(self, t3: T3, s3gen: S3Gen, ve: VoiceEncoder,
+                 tokenizer: MTLTokenizer, device: torch.device,
+                 conds: Conditionals = None):
         self.sr = S3GEN_SR
         self.t3 = t3
         self.s3gen = s3gen
         self.conds = conds
         self.watermarker = perth.PerthImplicitWatermarker()
     @classmethod
     def get_supported_languages(cls):
         return SUPPORTED_LANGUAGES.copy()
     @classmethod
+    def from_local(cls, ckpt_dir: Path, device: torch.device) -> "ChatterboxMultilingualTTS":
         ckpt_dir = Path(ckpt_dir)
         ve = VoiceEncoder()
+        ve.load_state_dict(torch.load(ckpt_dir / "ve.pt", weights_only=True))
+        ve.to(device).eval()
         t3 = T3(T3Config.multilingual())
         t3_state = load_safetensors(ckpt_dir / "t3_mtl23ls_v2.safetensors")
+        if "model" in t3_state.keys():
             t3_state = t3_state["model"][0]
         t3.load_state_dict(t3_state)
+        t3.to(device).eval()
         s3gen = S3Gen()
+        s3gen.load_state_dict(torch.load(ckpt_dir / "s3gen.pt", weights_only=True))
+        s3gen.to(device).eval()
         tokenizer = MTLTokenizer(str(ckpt_dir / "grapheme_mtl_merged_expanded_v1.json"))
         conds = None
+        if (builtin_voice := ckpt_dir / "conds.pt").exists():
+            conds = Conditionals.load(builtin_voice)
         return cls(t3, s3gen, ve, tokenizer, device, conds=conds)
     @classmethod
     def from_pretrained(cls, device: str | torch.device | None = None) -> "ChatterboxMultilingualTTS":
         if device is None:
             device = torch.device("cpu")
         elif isinstance(device, str):
             device = torch.device(device)
         if device.type != "cpu":
             device = torch.device("cpu")
             repo_type="model",
             revision="main",
             allow_patterns=[
+                "ve.pt", "t3_mtl23ls_v2.safetensors", "s3gen.pt",
+                "grapheme_mtl_merged_expanded_v1.json", "conds.pt",
+                "Cangjie5_TC.json"
             ],
+            token=os.getenv("HF_TOKEN"),
         ))
         model = cls.from_local(ckpt_dir, device)
+        # Ensure all params on CPU and eval
+        model.t3.to(device).eval()
+        model.s3gen.to(device).eval()
+        model.ve.to(device).eval()
+        if model.conds:
+            for k, v in model.conds.gen.items():
+                if torch.is_tensor(v):
+                    model.conds.gen[k] = v.to(device)
         return model
+    @torch.no_grad()
+    def generate(self, text: str, speaker_embedding: torch.Tensor = None) -> torch.Tensor:
+        """
+        Generate audio waveform (numpy array) from text.
+        CPU-compatible.
+        """
+        text = punc_norm(text)
+        token_ids = self.tokenizer.encode(text)
+        token_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(self.device)
+        conds = self.conds.gen if self.conds else {}
+        t3_out = self.t3(token_ids, **conds)
+        audio = self.s3gen(t3_out, **conds)
+        if isinstance(audio, torch.Tensor):
+            audio = audio.squeeze(0).cpu().numpy()
+        return audio