rahul7star committed on
Commit
c5d087d
·
verified ·
1 Parent(s): 036ffff

Update src/chatterbox/mtl_tts.py

Browse files
Files changed (1) hide show
  1. src/chatterbox/mtl_tts.py +76 -135
src/chatterbox/mtl_tts.py CHANGED
@@ -1,15 +1,11 @@
1
  from dataclasses import dataclass
2
  from pathlib import Path
3
  import os
4
- from pathlib import Path
5
  import torch
6
- import os
7
- from huggingface_hub import snapshot_download
8
-
9
  import librosa
10
- import torch
11
  import perth
12
  import torch.nn.functional as F
 
13
  from safetensors.torch import load_file as load_safetensors
14
  from huggingface_hub import snapshot_download
15
 
@@ -24,51 +20,42 @@ from .models.t3.modules.cond_enc import T3Cond
24
 
25
  REPO_ID = "ResembleAI/chatterbox"
26
 
27
- # Supported languages for the multilingual model
28
  SUPPORTED_LANGUAGES = {
29
- "ar": "Arabic",
30
- "da": "Danish",
31
- "de": "German",
32
- "el": "Greek",
33
- "en": "English",
34
- "es": "Spanish",
35
- "fi": "Finnish",
36
- "fr": "French",
37
- "he": "Hebrew",
38
- "hi": "Hindi",
39
- "it": "Italian",
40
- "ja": "Japanese",
41
- "ko": "Korean",
42
- "ms": "Malay",
43
- "nl": "Dutch",
44
- "no": "Norwegian",
45
- "pl": "Polish",
46
- "pt": "Portuguese",
47
- "ru": "Russian",
48
- "sv": "Swedish",
49
- "sw": "Swahili",
50
- "tr": "Turkish",
51
- "zh": "Chinese",
52
  }
53
 
54
 
55
  def punc_norm(text: str) -> str:
56
- """
57
- Quick cleanup func for punctuation from LLMs or
58
- containing chars not seen often in the dataset
59
- """
60
- if len(text) == 0:
61
  return "You need to add some text for me to talk."
62
 
63
- # Capitalise first letter
64
- if text[0].islower():
65
- text = text[0].upper() + text[1:]
66
-
67
- # Remove multiple space chars
68
  text = " ".join(text.split())
69
 
70
- # Replace uncommon/llm punc
71
- punc_to_replace = [
72
  ("...", ", "),
73
  ("…", ", "),
74
  (":", ","),
@@ -82,13 +69,10 @@ def punc_norm(text: str) -> str:
82
  ("‘", "'"),
83
  ("’", "'"),
84
  ]
85
- for old_char_sequence, new_char in punc_to_replace:
86
- text = text.replace(old_char_sequence, new_char)
87
 
88
- # Add full stop if no ending punc
89
- text = text.rstrip(" ")
90
- sentence_enders = {".", "!", "?", "-", ",","、",",","。","?","!"}
91
- if not any(text.endswith(p) for p in sentence_enders):
92
  text += "."
93
 
94
  return text
@@ -96,58 +80,31 @@ def punc_norm(text: str) -> str:
96
 
97
  @dataclass
98
  class Conditionals:
99
- """
100
- Conditionals for T3 and S3Gen
101
- - T3 conditionals:
102
- - speaker_emb
103
- - clap_emb
104
- - cond_prompt_speech_tokens
105
- - cond_prompt_speech_emb
106
- - emotion_adv
107
- - S3Gen conditionals:
108
- - prompt_token
109
- - prompt_token_len
110
- - prompt_feat
111
- - prompt_feat_len
112
- - embedding
113
- """
114
  t3: T3Cond
115
  gen: dict
116
 
117
  def to(self, device):
118
- self.t3 = self.t3.to(device=device)
119
  for k, v in self.gen.items():
120
  if torch.is_tensor(v):
121
- self.gen[k] = v.to(device=device)
122
  return self
123
 
124
- def save(self, fpath: Path):
125
- arg_dict = dict(
126
- t3=self.t3.__dict__,
127
- gen=self.gen
128
- )
129
- torch.save(arg_dict, fpath)
130
 
131
  @classmethod
132
- def load(cls, fpath, map_location="cpu"):
133
- kwargs = torch.load(fpath, map_location="cpu", weights_only=True)
134
- return cls(T3Cond(**kwargs['t3']), kwargs['gen'])
135
 
136
 
137
  class ChatterboxMultilingualTTS:
138
  ENC_COND_LEN = 6 * S3_SR
139
  DEC_COND_LEN = 10 * S3GEN_SR
140
 
141
- def __init__(
142
- self,
143
- t3: T3,
144
- s3gen: S3Gen,
145
- ve: VoiceEncoder,
146
- tokenizer: MTLTokenizer,
147
- device: str,
148
- conds: Conditionals = None,
149
- ):
150
- self.sr = S3GEN_SR # sample rate of synthesized audio
151
  self.t3 = t3
152
  self.s3gen = s3gen
153
  self.ve = ve
@@ -156,72 +113,61 @@ class ChatterboxMultilingualTTS:
156
  self.conds = conds
157
  self.watermarker = perth.PerthImplicitWatermarker()
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  @classmethod
160
  def get_supported_languages(cls):
161
- """Return dictionary of supported language codes and names."""
162
  return SUPPORTED_LANGUAGES.copy()
163
 
164
  @classmethod
165
- def from_local(cls, ckpt_dir, device) -> 'ChatterboxMultilingualTTS':
166
  ckpt_dir = Path(ckpt_dir)
167
 
168
  ve = VoiceEncoder()
169
- ve.load_state_dict(
170
- torch.load(ckpt_dir / "ve.pt", weights_only=True)
171
- )
172
  ve.to(device).eval()
173
 
174
  t3 = T3(T3Config.multilingual())
175
  t3_state = load_safetensors(ckpt_dir / "t3_mtl23ls_v2.safetensors")
176
- if "model" in t3_state.keys():
177
  t3_state = t3_state["model"][0]
178
  t3.load_state_dict(t3_state)
179
  t3.to(device).eval()
180
 
181
  s3gen = S3Gen()
182
- s3gen.load_state_dict(
183
- torch.load(ckpt_dir / "s3gen.pt", weights_only=True)
184
- )
185
  s3gen.to(device).eval()
186
 
187
- tokenizer = MTLTokenizer(
188
- str(ckpt_dir / "grapheme_mtl_merged_expanded_v1.json")
189
- )
190
 
191
  conds = None
192
- if (builtin_voice := ckpt_dir / "conds.pt").exists():
193
- conds = Conditionals.load(builtin_voice).to(device)
194
-
195
- return cls(t3, s3gen, ve, tokenizer, device, conds=conds)
196
 
197
-
198
-
199
 
200
  @classmethod
201
- def from_pretrained(
202
- cls,
203
- device: str | torch.device | None = None,
204
- ) -> "ChatterboxMultilingualTTS":
205
- """
206
- Load ChatterboxMultilingualTTS safely.
207
- Defaults to CPU and never forces CUDA.
208
- """
209
-
210
- # 🔒 Normalize + force CPU
211
- if device is None:
212
- device = torch.device("cpu")
213
- elif isinstance(device, str):
214
- device = torch.device(device)
215
-
216
- # Absolute safety: never allow CUDA
217
- if device.type != "cpu":
218
- device = torch.device("cpu")
219
-
220
- ckpt_dir = Path(
221
- snapshot_download(
222
  repo_id=REPO_ID,
223
- repo_type="model",
224
- revision="main",
225
  allow_patterns=[
226
  "ve.pt",
227
  "t3_mtl23ls_v2.safetensors",
@@ -231,17 +177,12 @@ class ChatterboxMultilingualTTS:
231
  "Cangjie5_TC.json",
232
  ],
233
  token=os.getenv("HF_TOKEN"),
234
- )
235
- )
236
-
237
- model = cls.from_local(ckpt_dir, device)
238
 
239
- # Extra safety: force model tensors to CPU
240
- if hasattr(model, "to"):
241
- model = model.to("cpu")
242
 
243
- model.eval()
244
- for p in model.parameters():
245
- p.requires_grad = False
246
 
247
- return model
 
1
  from dataclasses import dataclass
2
  from pathlib import Path
3
  import os
 
4
  import torch
 
 
 
5
  import librosa
 
6
  import perth
7
  import torch.nn.functional as F
8
+
9
  from safetensors.torch import load_file as load_safetensors
10
  from huggingface_hub import snapshot_download
11
 
 
20
 
21
# Hugging Face Hub repository the pretrained checkpoints are fetched from.
REPO_ID = "ResembleAI/chatterbox"

# Language codes the multilingual model accepts, mapped to display names.
SUPPORTED_LANGUAGES = dict(
    ar="Arabic",
    da="Danish",
    de="German",
    el="Greek",
    en="English",
    es="Spanish",
    fi="Finnish",
    fr="French",
    he="Hebrew",
    hi="Hindi",
    it="Italian",
    ja="Japanese",
    ko="Korean",
    ms="Malay",
    nl="Dutch",
    no="Norwegian",
    pl="Polish",
    pt="Portuguese",
    ru="Russian",
    sv="Swedish",
    sw="Swahili",
    tr="Turkish",
    zh="Chinese",
)
48
 
49
 
50
  def punc_norm(text: str) -> str:
51
+ if not text:
 
 
 
 
52
  return "You need to add some text for me to talk."
53
 
54
+ text = text.strip()
55
+ text = text[0].upper() + text[1:] if text[0].islower() else text
 
 
 
56
  text = " ".join(text.split())
57
 
58
+ replacements = [
 
59
  ("...", ", "),
60
  ("…", ", "),
61
  (":", ","),
 
69
  ("‘", "'"),
70
  ("’", "'"),
71
  ]
72
+ for a, b in replacements:
73
+ text = text.replace(a, b)
74
 
75
+ if not text.endswith((".", "!", "?", ",", "-", "。", "?", "!")):
 
 
 
76
  text += "."
77
 
78
  return text
 
80
 
81
@dataclass
class Conditionals:
    """Conditioning inputs shared by the T3 and S3Gen models.

    Attributes:
        t3: conditioning object consumed by the T3 model.
        gen: mapping of conditioning names to values; tensor values are
            moved along with the object in ``to``.
    """

    t3: T3Cond
    gen: dict

    def to(self, device):
        """Move all held tensors to ``device``; returns self for chaining."""
        # Mutate the gen dict in place so external references stay valid.
        for key, val in self.gen.items():
            if torch.is_tensor(val):
                self.gen[key] = val.to(device)
        self.t3 = self.t3.to(device)
        return self

    def save(self, path: Path):
        """Serialize both conditioning bundles to ``path`` via torch.save."""
        payload = {"t3": self.t3.__dict__, "gen": self.gen}
        torch.save(payload, path)

    @classmethod
    def load(cls, path, map_location="cpu"):
        """Deserialize a Conditionals previously written by ``save``."""
        # weights_only=True restricts unpickling to tensor/containers — safe
        # for checkpoints from untrusted sources.
        payload = torch.load(path, map_location=map_location, weights_only=True)
        return cls(T3Cond(**payload["t3"]), payload["gen"])
100
 
101
 
102
  class ChatterboxMultilingualTTS:
103
  ENC_COND_LEN = 6 * S3_SR
104
  DEC_COND_LEN = 10 * S3GEN_SR
105
 
106
+ def __init__(self, t3, s3gen, ve, tokenizer, device, conds=None):
107
+ self.sr = S3GEN_SR
 
 
 
 
 
 
 
 
108
  self.t3 = t3
109
  self.s3gen = s3gen
110
  self.ve = ve
 
113
  self.conds = conds
114
  self.watermarker = perth.PerthImplicitWatermarker()
115
 
116
+ # ✅ Forward torch behavior
117
+ def eval(self):
118
+ for m in (self.t3, self.s3gen, self.ve):
119
+ m.eval()
120
+ return self
121
+
122
+ def to(self, device):
123
+ self.device = device
124
+ for m in (self.t3, self.s3gen, self.ve):
125
+ m.to(device)
126
+ if self.conds:
127
+ self.conds.to(device)
128
+ return self
129
+
130
+ def parameters(self):
131
+ for m in (self.t3, self.s3gen, self.ve):
132
+ yield from m.parameters()
133
+
134
  @classmethod
135
  def get_supported_languages(cls):
 
136
  return SUPPORTED_LANGUAGES.copy()
137
 
138
  @classmethod
139
+ def from_local(cls, ckpt_dir, device):
140
  ckpt_dir = Path(ckpt_dir)
141
 
142
  ve = VoiceEncoder()
143
+ ve.load_state_dict(torch.load(ckpt_dir / "ve.pt", map_location="cpu", weights_only=True))
 
 
144
  ve.to(device).eval()
145
 
146
  t3 = T3(T3Config.multilingual())
147
  t3_state = load_safetensors(ckpt_dir / "t3_mtl23ls_v2.safetensors")
148
+ if "model" in t3_state:
149
  t3_state = t3_state["model"][0]
150
  t3.load_state_dict(t3_state)
151
  t3.to(device).eval()
152
 
153
  s3gen = S3Gen()
154
+ s3gen.load_state_dict(torch.load(ckpt_dir / "s3gen.pt", map_location="cpu", weights_only=True))
 
 
155
  s3gen.to(device).eval()
156
 
157
+ tokenizer = MTLTokenizer(str(ckpt_dir / "grapheme_mtl_merged_expanded_v1.json"))
 
 
158
 
159
  conds = None
160
+ if (ckpt_dir / "conds.pt").exists():
161
+ conds = Conditionals.load(ckpt_dir / "conds.pt").to(device)
 
 
162
 
163
+ return cls(t3, s3gen, ve, tokenizer, device, conds)
 
164
 
165
  @classmethod
166
+ def from_pretrained(cls, device=None):
167
+ device = torch.device("cpu")
168
+
169
+ ckpt_dir = snapshot_download(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  repo_id=REPO_ID,
 
 
171
  allow_patterns=[
172
  "ve.pt",
173
  "t3_mtl23ls_v2.safetensors",
 
177
  "Cangjie5_TC.json",
178
  ],
179
  token=os.getenv("HF_TOKEN"),
180
+ )
 
 
 
181
 
182
+ model = cls.from_local(ckpt_dir, device)
183
+ model.eval()
 
184
 
185
+ for p in model.parameters():
186
+ p.requires_grad_(False)
 
187
 
188
+ return model