rahul7star committed on
Commit
ef3441d
·
verified ·
1 Parent(s): c5d087d

Update src/chatterbox/mtl_tts.py

Browse files
Files changed (1) hide show
  1. src/chatterbox/mtl_tts.py +69 -86
src/chatterbox/mtl_tts.py CHANGED
@@ -2,12 +2,12 @@ from dataclasses import dataclass
2
  from pathlib import Path
3
  import os
4
  import torch
 
 
5
  import librosa
6
  import perth
7
  import torch.nn.functional as F
8
-
9
  from safetensors.torch import load_file as load_safetensors
10
- from huggingface_hub import snapshot_download
11
 
12
  from .models.t3 import T3
13
  from .models.t3.modules.t3_config import T3Config
@@ -17,64 +17,34 @@ from .models.tokenizers import MTLTokenizer
17
  from .models.voice_encoder import VoiceEncoder
18
  from .models.t3.modules.cond_enc import T3Cond
19
 
20
-
21
  REPO_ID = "ResembleAI/chatterbox"
22
 
 
23
  SUPPORTED_LANGUAGES = {
24
- "ar": "Arabic",
25
- "da": "Danish",
26
- "de": "German",
27
- "el": "Greek",
28
- "en": "English",
29
- "es": "Spanish",
30
- "fi": "Finnish",
31
- "fr": "French",
32
- "he": "Hebrew",
33
- "hi": "Hindi",
34
- "it": "Italian",
35
- "ja": "Japanese",
36
- "ko": "Korean",
37
- "ms": "Malay",
38
- "nl": "Dutch",
39
- "no": "Norwegian",
40
- "pl": "Polish",
41
- "pt": "Portuguese",
42
- "ru": "Russian",
43
- "sv": "Swedish",
44
- "sw": "Swahili",
45
- "tr": "Turkish",
46
- "zh": "Chinese",
47
  }
48
 
49
 
50
  def punc_norm(text: str) -> str:
 
51
  if not text:
52
  return "You need to add some text for me to talk."
53
-
54
- text = text.strip()
55
- text = text[0].upper() + text[1:] if text[0].islower() else text
56
  text = " ".join(text.split())
57
-
58
  replacements = [
59
- ("...", ", "),
60
- ("", ", "),
61
- (":", ","),
62
- (" - ", ", "),
63
- (";", ", "),
64
- ("—", "-"),
65
- ("–", "-"),
66
- (" ,", ","),
67
- ("“", "\""),
68
- ("”", "\""),
69
- ("‘", "'"),
70
- ("’", "'"),
71
  ]
72
- for a, b in replacements:
73
- text = text.replace(a, b)
74
-
75
- if not text.endswith((".", "!", "?", ",", "-", "。", "?", "!")):
76
  text += "."
77
-
78
  return text
79
 
80
 
@@ -84,18 +54,18 @@ class Conditionals:
84
  gen: dict
85
 
86
  def to(self, device):
87
- self.t3 = self.t3.to(device)
88
  for k, v in self.gen.items():
89
  if torch.is_tensor(v):
90
  self.gen[k] = v.to(device)
91
  return self
92
 
93
- def save(self, path: Path):
94
- torch.save({"t3": self.t3.__dict__, "gen": self.gen}, path)
95
 
96
  @classmethod
97
- def load(cls, path, map_location="cpu"):
98
- data = torch.load(path, map_location=map_location, weights_only=True)
99
  return cls(T3Cond(**data["t3"]), data["gen"])
100
 
101
 
@@ -103,7 +73,15 @@ class ChatterboxMultilingualTTS:
103
  ENC_COND_LEN = 6 * S3_SR
104
  DEC_COND_LEN = 10 * S3GEN_SR
105
 
106
- def __init__(self, t3, s3gen, ve, tokenizer, device, conds=None):
 
 
 
 
 
 
 
 
107
  self.sr = S3GEN_SR
108
  self.t3 = t3
109
  self.s3gen = s3gen
@@ -113,61 +91,62 @@ class ChatterboxMultilingualTTS:
113
  self.conds = conds
114
  self.watermarker = perth.PerthImplicitWatermarker()
115
 
116
- # Forward torch behavior
117
- def eval(self):
118
- for m in (self.t3, self.s3gen, self.ve):
119
- m.eval()
120
- return self
121
-
122
- def to(self, device):
123
- self.device = device
124
- for m in (self.t3, self.s3gen, self.ve):
125
- m.to(device)
126
- if self.conds:
127
- self.conds.to(device)
128
- return self
129
-
130
- def parameters(self):
131
- for m in (self.t3, self.s3gen, self.ve):
132
- yield from m.parameters()
133
 
134
  @classmethod
135
  def get_supported_languages(cls):
136
  return SUPPORTED_LANGUAGES.copy()
137
 
138
  @classmethod
139
- def from_local(cls, ckpt_dir, device):
140
  ckpt_dir = Path(ckpt_dir)
141
 
 
142
  ve = VoiceEncoder()
143
  ve.load_state_dict(torch.load(ckpt_dir / "ve.pt", map_location="cpu", weights_only=True))
144
- ve.to(device).eval()
145
 
 
146
  t3 = T3(T3Config.multilingual())
147
  t3_state = load_safetensors(ckpt_dir / "t3_mtl23ls_v2.safetensors")
148
  if "model" in t3_state:
149
  t3_state = t3_state["model"][0]
150
  t3.load_state_dict(t3_state)
151
- t3.to(device).eval()
152
 
 
153
  s3gen = S3Gen()
154
  s3gen.load_state_dict(torch.load(ckpt_dir / "s3gen.pt", map_location="cpu", weights_only=True))
155
- s3gen.to(device).eval()
156
 
 
157
  tokenizer = MTLTokenizer(str(ckpt_dir / "grapheme_mtl_merged_expanded_v1.json"))
158
 
 
159
  conds = None
160
  if (ckpt_dir / "conds.pt").exists():
161
  conds = Conditionals.load(ckpt_dir / "conds.pt").to(device)
162
 
163
- return cls(t3, s3gen, ve, tokenizer, device, conds)
164
 
165
  @classmethod
166
- def from_pretrained(cls, device=None):
167
- device = torch.device("cpu")
168
-
169
- ckpt_dir = snapshot_download(
 
 
 
 
 
 
 
 
170
  repo_id=REPO_ID,
 
 
171
  allow_patterns=[
172
  "ve.pt",
173
  "t3_mtl23ls_v2.safetensors",
@@ -176,13 +155,17 @@ class ChatterboxMultilingualTTS:
176
  "conds.pt",
177
  "Cangjie5_TC.json",
178
  ],
179
- token=os.getenv("HF_TOKEN"),
180
- )
181
 
182
  model = cls.from_local(ckpt_dir, device)
183
- model.eval()
184
-
185
- for p in model.parameters():
186
- p.requires_grad_(False)
187
-
188
  return model
 
 
 
 
 
 
 
 
 
 
2
  from pathlib import Path
3
  import os
4
  import torch
5
+ from huggingface_hub import snapshot_download
6
+
7
  import librosa
8
  import perth
9
  import torch.nn.functional as F
 
10
  from safetensors.torch import load_file as load_safetensors
 
11
 
12
  from .models.t3 import T3
13
  from .models.t3.modules.t3_config import T3Config
 
17
  from .models.voice_encoder import VoiceEncoder
18
  from .models.t3.modules.cond_enc import T3Cond
19
 
 
20
REPO_ID = "ResembleAI/chatterbox"

# Supported languages for the multilingual model (ISO 639-1 code -> English name).
SUPPORTED_LANGUAGES = {
    "ar": "Arabic",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "es": "Spanish",
    "fi": "Finnish",
    "fr": "French",
    "he": "Hebrew",
    "hi": "Hindi",
    "it": "Italian",
    "ja": "Japanese",
    "ko": "Korean",
    "ms": "Malay",
    "nl": "Dutch",
    "no": "Norwegian",
    "pl": "Polish",
    "pt": "Portuguese",
    "ru": "Russian",
    "sv": "Swedish",
    "sw": "Swahili",
    "tr": "Turkish",
    "zh": "Chinese",
}
31
 
32
 
33
  def punc_norm(text: str) -> str:
34
+ """Normalize punctuation for TTS text."""
35
  if not text:
36
  return "You need to add some text for me to talk."
37
+ text = text[0].upper() + text[1:]
 
 
38
  text = " ".join(text.split())
 
39
  replacements = [
40
+ ("...", ", "), ("…", ", "), (":", ","), (" - ", ","),
41
+ (";", ","), ("—", "-"), ("–", "-"), (" ,", ","),
42
+ ("", "\""), ("”", "\""), ("‘", "'"), ("’", "'")
 
 
 
 
 
 
 
 
 
43
  ]
44
+ for old, new in replacements:
45
+ text = text.replace(old, new)
46
+ if not any(text.endswith(p) for p in {".", "!", "?", "-", ",","、",",","。","?","!"}):
 
47
  text += "."
 
48
  return text
49
 
50
 
 
54
  gen: dict
55
 
56
  def to(self, device):
57
+ """Move only tensors in `.gen` to device. T3Cond stays as-is."""
58
  for k, v in self.gen.items():
59
  if torch.is_tensor(v):
60
  self.gen[k] = v.to(device)
61
  return self
62
 
63
+ def save(self, fpath: Path):
64
+ torch.save({"t3": self.t3.__dict__, "gen": self.gen}, fpath)
65
 
66
  @classmethod
67
+ def load(cls, fpath: Path, map_location="cpu"):
68
+ data = torch.load(fpath, map_location=map_location, weights_only=True)
69
  return cls(T3Cond(**data["t3"]), data["gen"])
70
 
71
 
 
73
  ENC_COND_LEN = 6 * S3_SR
74
  DEC_COND_LEN = 10 * S3GEN_SR
75
 
76
+ def __init__(
77
+ self,
78
+ t3: T3,
79
+ s3gen: S3Gen,
80
+ ve: VoiceEncoder,
81
+ tokenizer: MTLTokenizer,
82
+ device: str,
83
+ conds: Conditionals = None,
84
+ ):
85
  self.sr = S3GEN_SR
86
  self.t3 = t3
87
  self.s3gen = s3gen
 
91
  self.conds = conds
92
  self.watermarker = perth.PerthImplicitWatermarker()
93
 
94
+ # Disable gradients for safety
95
+ for p in self.parameters():
96
+ p.requires_grad = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  @classmethod
99
  def get_supported_languages(cls):
100
  return SUPPORTED_LANGUAGES.copy()
101
 
102
  @classmethod
103
+ def from_local(cls, ckpt_dir, device) -> "ChatterboxMultilingualTTS":
104
  ckpt_dir = Path(ckpt_dir)
105
 
106
+ # Voice Encoder
107
  ve = VoiceEncoder()
108
  ve.load_state_dict(torch.load(ckpt_dir / "ve.pt", map_location="cpu", weights_only=True))
109
+ ve.to(device)
110
 
111
+ # T3
112
  t3 = T3(T3Config.multilingual())
113
  t3_state = load_safetensors(ckpt_dir / "t3_mtl23ls_v2.safetensors")
114
  if "model" in t3_state:
115
  t3_state = t3_state["model"][0]
116
  t3.load_state_dict(t3_state)
117
+ t3.to(device)
118
 
119
+ # S3Gen
120
  s3gen = S3Gen()
121
  s3gen.load_state_dict(torch.load(ckpt_dir / "s3gen.pt", map_location="cpu", weights_only=True))
122
+ s3gen.to(device)
123
 
124
+ # Tokenizer
125
  tokenizer = MTLTokenizer(str(ckpt_dir / "grapheme_mtl_merged_expanded_v1.json"))
126
 
127
+ # Conditionals
128
  conds = None
129
  if (ckpt_dir / "conds.pt").exists():
130
  conds = Conditionals.load(ckpt_dir / "conds.pt").to(device)
131
 
132
+ return cls(t3, s3gen, ve, tokenizer, device, conds=conds)
133
 
134
  @classmethod
135
+ def from_pretrained(cls, device: str | torch.device | None = None) -> "ChatterboxMultilingualTTS":
136
+ """Load model fully on CPU, never use CUDA."""
137
+ if device is None:
138
+ device = torch.device("cpu")
139
+ elif isinstance(device, str):
140
+ device = torch.device(device)
141
+
142
+ # Force CPU
143
+ if device.type != "cpu":
144
+ device = torch.device("cpu")
145
+
146
+ ckpt_dir = Path(snapshot_download(
147
  repo_id=REPO_ID,
148
+ repo_type="model",
149
+ revision="main",
150
  allow_patterns=[
151
  "ve.pt",
152
  "t3_mtl23ls_v2.safetensors",
 
155
  "conds.pt",
156
  "Cangjie5_TC.json",
157
  ],
158
+ token=os.getenv("HF_TOKEN")
159
+ ))
160
 
161
  model = cls.from_local(ckpt_dir, device)
 
 
 
 
 
162
  return model
163
+
164
+ def parameters(self):
165
+ """Iterate over all parameters in T3, S3Gen, and VE for disabling gradients."""
166
+ for p in self.t3.parameters():
167
+ yield p
168
+ for p in self.s3gen.parameters():
169
+ yield p
170
+ for p in self.ve.parameters():
171
+ yield p