Spaces:

Alstears
/

chatterbox-id-clone-api

Running

App Files Files Community

Alstears committed 6 days ago

Commit

7af2a5a

verified ·

1 Parent(s): 5407e8e

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -29

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-os.environ["CUDA_VISIBLE_DEVICES"] = ""  # paksa CPU-only
 import re
 import inspect
@@ -12,10 +12,24 @@ import torch
 import torchaudio as ta
 import gradio as gr
 # =========================
 # HARD PATCH CPU DESERIALIZE
 # =========================
-torch.cuda.is_available = lambda: False
 _original_torch_load = torch.load
 def _torch_load_cpu(*args, **kwargs):
@@ -37,10 +51,6 @@ from chatterbox.tts import ChatterboxTTS
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
-MODEL_REPO = "grandhigh/Chatterbox-TTS-Indonesian"
-CHECKPOINT_FILENAME = "t3_cfg.safetensors"
-DEVICE = "cpu"
 _model = None
 _model_lock = Lock()
@@ -60,7 +70,6 @@ def get_model():
                 t3_state = load_file(ckpt_path, device="cpu")
                 m.t3.load_state_dict(t3_state)
-                # ChatterboxTTS tidak punya .to(), jadi jangan pakai m.to("cpu")
                 if hasattr(m, "eval"):
                     m.eval()
@@ -70,8 +79,9 @@ def get_model():
 def _download_wav(url: str) -> str:
-    r = requests.get(url, timeout=90)
     r.raise_for_status()
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
     tmp.write(r.content)
     tmp.close()
@@ -79,16 +89,17 @@ def _download_wav(url: str) -> str:
 def _resolve_audio_input(audio_file, audio_url: str):
-    # gr.Audio(type="filepath") biasanya return string path
     if isinstance(audio_file, str) and audio_file.strip():
         return audio_file
-    # fallback kalau format dict
     if isinstance(audio_file, dict):
         p = audio_file.get("path")
         if p:
             return p
     if audio_url and audio_url.strip():
         return _download_wav(audio_url.strip())
@@ -96,24 +107,78 @@ def _resolve_audio_input(audio_file, audio_url: str):
 def _prepare_text_exact(text: str) -> str:
-    t = (text or "").strip()
     if not t:
         raise gr.Error("Text prompt tidak boleh kosong.")
-    # append a terminal mark so the model does not keep rambling past the text
     if not re.search(r"[.!?…]$", t):
         t += "."
     return t
 def _generate_with_safe_kwargs(model, text: str, prompt_path: str):
     sig = inspect.signature(model.generate)
     params = sig.parameters
     kwargs = {}
     if "audio_prompt_path" in params:
         kwargs["audio_prompt_path"] = prompt_path
-    # Set parameter jika didukung versi chatterbox yang terpasang
     if "temperature" in params:
         kwargs["temperature"] = 0.05
     if "top_p" in params:
@@ -122,41 +187,77 @@ def _generate_with_safe_kwargs(model, text: str, prompt_path: str):
         kwargs["exaggeration"] = 0.25
     if "cfg_weight" in params:
         kwargs["cfg_weight"] = 0.3
-    # Coba gaya pemanggilan paling umum
     try:
         return model.generate(text, **kwargs)
     except TypeError:
-        # fallback: beberapa versi pakai named argument
         if "text" in params:
             kwargs["text"] = text
             return model.generate(**kwargs)
-        # fallback paling basic
         return model.generate(text)
-def clone_voice(text: str, audio_file, audio_url: str):
     try:
-        text = _prepare_text_exact(text)
-        prompt_path = _resolve_audio_input(audio_file, audio_url)
         if not prompt_path:
             raise gr.Error("Upload WAV atau isi Audio URL WAV.")
         model = get_model()
-        # bikin output lebih konsisten
         torch.manual_seed(42)
         with torch.no_grad():
-            wav = _generate_with_safe_kwargs(model, text, prompt_path)
-        if wav.dim() == 1:
-            wav = wav.unsqueeze(0)
-        sr = getattr(model, "sr", 24000)
         out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-        ta.save(out_path, wav.cpu(), sr)
         return out_path
     except Exception as e:
@@ -167,17 +268,28 @@ def clone_voice(text: str, audio_file, audio_url: str):
 with gr.Blocks(title="Chatterbox Indonesian Voice Cloning (CPU)") as demo:
     gr.Markdown("## Chatterbox-TTS Indonesian (CPU)")
-    gr.Markdown("Masukkan teks + upload WAV (atau URL WAV)")
     text_in = gr.Textbox(
         label="Text Prompt",
-        lines=4,
-        placeholder="Contoh: Apa kabar."
     )
     wav_in = gr.Audio(
         label="Upload WAV Prompt",
         type="filepath"
     )
     url_in = gr.Textbox(
         label="Audio URL WAV (opsional)",
         placeholder="https://example.com/input.wav"

 import os
+os.environ["CUDA_VISIBLE_DEVICES"] = ""  # force CPU-only
 import re
 import inspect
 import torchaudio as ta
 import gradio as gr
+# =========================
+# CONFIG (ANTI-STALL LIMITS)
+# =========================
+MODEL_REPO = "grandhigh/Chatterbox-TTS-Indonesian"
+CHECKPOINT_FILENAME = "t3_cfg.safetensors"
+DEVICE = "cpu"
+# Limit CPU load; each value below can be overridden via the environment variable of the same name
+MAX_TOTAL_CHARS = int(os.getenv("MAX_TOTAL_CHARS", "2400"))       # total characters per request
+MAX_CHARS_PER_CHUNK = int(os.getenv("MAX_CHARS_PER_CHUNK", "220"))# characters per chunk
+MAX_CHUNKS = int(os.getenv("MAX_CHUNKS", "12"))                   # maximum number of chunks
+PAUSE_SECONDS = float(os.getenv("PAUSE_SECONDS", "0.15"))         # pause between chunks
+DOWNLOAD_TIMEOUT = int(os.getenv("DOWNLOAD_TIMEOUT", "90"))
 # =========================
 # HARD PATCH CPU DESERIALIZE
 # =========================
+torch.cuda.is_available = lambda: False  # noqa: E731
 _original_torch_load = torch.load
 def _torch_load_cpu(*args, **kwargs):
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 _model = None
 _model_lock = Lock()
                 t3_state = load_file(ckpt_path, device="cpu")
                 m.t3.load_state_dict(t3_state)
                 if hasattr(m, "eval"):
                     m.eval()
 def _download_wav(url: str) -> str:
+    r = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
     r.raise_for_status()
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
     tmp.write(r.content)
     tmp.close()
 def _resolve_audio_input(audio_file, audio_url: str):
+    # gr.Audio(type="filepath") -> string path
     if isinstance(audio_file, str) and audio_file.strip():
         return audio_file
+    # fallback dict
     if isinstance(audio_file, dict):
         p = audio_file.get("path")
         if p:
             return p
+    # URL fallback
     if audio_url and audio_url.strip():
         return _download_wav(audio_url.strip())
 def _prepare_text_exact(text: str) -> str:
     """Normalize a piece of text and make sure it ends with terminal punctuation.

     Runs of whitespace are collapsed to single spaces and the ends are
     trimmed. If the result does not already end with ``.``, ``!``, ``?`` or
     ``…``, a period is appended so the text has an explicit end.

     Args:
         text: Raw text (``None`` is treated as empty).

     Returns:
         The normalized text, always ending in terminal punctuation.

     Raises:
         gr.Error: If the text is empty after normalization.
     """
     t = re.sub(r"\s+", " ", (text or "").strip())
     if not t:
         raise gr.Error("Text prompt tidak boleh kosong.")
     if not re.search(r"[.!?…]$", t):
         # A chunk cut at a clause boundary ends with ",", ";" or ":".
         # Appending "." to that would yield ",." — drop the dangling soft
         # punctuation first. Fall back to the untouched text when nothing
         # else would remain.
         t = (t.rstrip(" ,;:") or t) + "."
     return t
def _split_text_safely(text: str, max_chars: int = MAX_CHARS_PER_CHUNK):
    """Split text into ordered chunks of at most ``max_chars`` characters.

    The text is whitespace-normalized, then broken into the largest units
    that fit: whole sentences first, then clauses (split after ``,``, ``;``
    or ``:``) for sentences that are too long, then single words for clauses
    that are still too long. Units are packed greedily, in their original
    order, into chunks joined by single spaces.

    Args:
        text: Raw input text (``None`` is treated as empty).
        max_chars: Upper bound on the length of each chunk.

    Returns:
        A list of chunk strings in reading order; empty if the text is empty.
        A single word longer than ``max_chars`` is kept whole as its own
        chunk rather than being cut mid-word.
    """
    text = re.sub(r"\s+", " ", (text or "").strip())
    if not text:
        return []

    def _units():
        # Yield the text as the coarsest pieces that individually fit.
        for sentence in re.split(r"(?<=[.!?…])\s+", text):
            if len(sentence) <= max_chars:
                yield sentence
                continue
            for clause in re.split(r"(?<=[,;:])\s+", sentence):
                if len(clause) <= max_chars:
                    yield clause
                else:
                    # Last resort: fall back to word granularity.
                    yield from clause.split()

    chunks = []
    current = ""
    # Every unit, including words from an over-long clause, goes through the
    # same packer. Pending text is therefore always flushed before later text
    # is emitted, which keeps the chunks in reading order.
    for unit in _units():
        if not unit:
            continue
        candidate = f"{current} {unit}" if current else unit
        if len(candidate) <= max_chars:
            current = candidate
            continue
        if current:
            chunks.append(current)
        current = unit
    if current:
        chunks.append(current)
    return chunks
 def _generate_with_safe_kwargs(model, text: str, prompt_path: str):
     sig = inspect.signature(model.generate)
     params = sig.parameters
     kwargs = {}
+    # prompt audio
     if "audio_prompt_path" in params:
         kwargs["audio_prompt_path"] = prompt_path
+    # Stabilitas & kecepatan (kalau param tersedia)
     if "temperature" in params:
         kwargs["temperature"] = 0.05
     if "top_p" in params:
         kwargs["exaggeration"] = 0.25
     if "cfg_weight" in params:
         kwargs["cfg_weight"] = 0.3
+    if "max_new_tokens" in params:
+        kwargs["max_new_tokens"] = 260  # cegah runaway generation
+    # Coba gaya call paling umum
     try:
         return model.generate(text, **kwargs)
     except TypeError:
         if "text" in params:
             kwargs["text"] = text
             return model.generate(**kwargs)
         return model.generate(text)
+def clone_voice(text: str, audio_file, audio_url: str, progress=gr.Progress(track_tqdm=False)):
     try:
+        raw_text = (text or "").strip()
+        if not raw_text:
+            raise gr.Error("Text prompt tidak boleh kosong.")
+        if len(raw_text) > MAX_TOTAL_CHARS:
+            raise gr.Error(
+                f"Teks terlalu panjang ({len(raw_text)} karakter). "
+                f"Maksimal {MAX_TOTAL_CHARS} karakter per request."
+            )
+        prompt_path = _resolve_audio_input(audio_file, audio_url)
         if not prompt_path:
             raise gr.Error("Upload WAV atau isi Audio URL WAV.")
+        chunks = _split_text_safely(raw_text, max_chars=MAX_CHARS_PER_CHUNK)
+        if not chunks:
+            raise gr.Error("Gagal memproses teks (chunk kosong).")
+        if len(chunks) > MAX_CHUNKS:
+            raise gr.Error(
+                f"Teks terlalu panjang ({len(chunks)} chunk). "
+                f"Maksimal {MAX_CHUNKS} chunk per request. "
+                "Silakan pecah teks jadi beberapa bagian."
+            )
         model = get_model()
+        sr = getattr(model, "sr", 24000)
         torch.manual_seed(42)
+        wav_parts = []
+        pause = torch.zeros(1, int(sr * PAUSE_SECONDS))
+        total = len(chunks)
         with torch.no_grad():
+            for i, ch in enumerate(chunks, start=1):
+                progress((i - 1) / total, desc=f"Processing chunk {i}/{total}...")
+                ch = _prepare_text_exact(ch)
+                wav = _generate_with_safe_kwargs(model, ch, prompt_path)
+                if wav.dim() == 1:
+                    wav = wav.unsqueeze(0)
+                wav_parts.append(wav.cpu())
+                wav_parts.append(pause)
+        # buang pause terakhir
+        if wav_parts:
+            wav_parts = wav_parts[:-1]
+        full_wav = torch.cat(wav_parts, dim=1)
         out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
+        ta.save(out_path, full_wav, sr)
+        progress(1.0, desc="Selesai ✅")
         return out_path
     except Exception as e:
 with gr.Blocks(title="Chatterbox Indonesian Voice Cloning (CPU)") as demo:
     gr.Markdown("## Chatterbox-TTS Indonesian (CPU)")
+    gr.Markdown(
+        f"""
+Masukkan teks + upload WAV (atau URL WAV).
+**Batas anti-ngaret saat ini:**
+- Maks total teks: **{MAX_TOTAL_CHARS}** karakter
+- Maks per chunk: **{MAX_CHARS_PER_CHUNK}** karakter
+- Maks chunk: **{MAX_CHUNKS}**
+"""
+    )
     text_in = gr.Textbox(
         label="Text Prompt",
+        lines=8,
+        placeholder="Contoh: Materi ini membahas data mining..."
     )
     wav_in = gr.Audio(
         label="Upload WAV Prompt",
         type="filepath"
     )
     url_in = gr.Textbox(
         label="Audio URL WAV (opsional)",
         placeholder="https://example.com/input.wav"