Spaces:

Alstears
/

chatterbox-id-clone-api

Running

App Files Files Community

Alstears committed about 23 hours ago

Commit

c8cf72b

verified ·

1 Parent(s): b78b518

Update app.py

Browse files

Files changed (1) hide show

app.py +183 -58

app.py CHANGED Viewed

@@ -1,38 +1,75 @@
-# =========================
-# PATCH: long-text batching
-# =========================
 import os
 import re
 import gc
 import math
 import tempfile
 import traceback
 import torch
 import torchaudio as ta
 import gradio as gr
-# ---- CONFIG ----
-MAX_TOTAL_CHARS = 50000          # batas aman total teks
-MAX_CHARS_PER_CHUNK = 220        # default chunk size
-BATCH_SIZE = 8                   # jumlah chunk diproses per batch
-PAUSE_SECONDS = 0.12             # jeda antar chunk (detik)
-MAX_CHUNKS_HARD = 300            # guardrail biar nggak abuse
 def _split_text_safely(text: str, max_chars: int = 220):
-    """
-    Split teks berdasarkan kalimat dulu, lalu fallback per kata
-    agar setiap chunk <= max_chars.
-    """
     text = (text or "").strip()
     if not text:
         return []
-    # rapikan whitespace
     text = re.sub(r"\s+", " ", text).strip()
-    # pecah per kalimat (cukup robust utk id/en)
     sentences = re.split(r"(?<=[\.\!\?。！？])\s+", text)
     sentences = [s.strip() for s in sentences if s.strip()]
@@ -55,7 +92,6 @@ def _split_text_safely(text: str, max_chars: int = 220):
                 push_cur()
                 cur = sent
         else:
-            # kalimat kepanjangan -> pecah per kata
             words = sent.split()
             temp = ""
             for w in words:
@@ -85,12 +121,12 @@ def _prepare_text_exact(s: str) -> str:
 def _resolve_audio_input(audio_file, audio_url: str):
     """
-    Prioritas:
-    1) upload file
-    2) URL audio (download ke tmp)
-    Return local path WAV/Audio.
     """
-    # upload file dari gradio biasanya punya .name
     if audio_file is not None:
         p = getattr(audio_file, "name", None)
         if p and os.path.exists(p):
@@ -99,9 +135,9 @@ def _resolve_audio_input(audio_file, audio_url: str):
     url = (audio_url or "").strip()
     if url:
         try:
-            import requests
             r = requests.get(url, timeout=30)
             r.raise_for_status()
             suffix = ".wav"
             ct = (r.headers.get("content-type") or "").lower()
             if "mpeg" in ct or url.lower().endswith(".mp3"):
@@ -121,26 +157,16 @@ def _resolve_audio_input(audio_file, audio_url: str):
 def _auto_clean_prompt(prompt_path: str, target_sr: int = 24000):
-    """
-    Clean ringan untuk audio referensi user umum:
-    - convert mono
-    - resample ke target_sr
-    - trim silence depan/belakang
-    - normalize peak
-    """
     wav, sr = ta.load(prompt_path)  # [C, T]
-    # mono
     if wav.size(0) > 1:
         wav = wav.mean(dim=0, keepdim=True)
-    # resample
     if sr != target_sr:
         wav = ta.functional.resample(wav, sr, target_sr)
         sr = target_sr
-    # trim silence sederhana
-    # threshold linear: semakin kecil => trim lebih agresif
     thr = 0.01
     x = wav.abs().squeeze(0)
     idx = torch.where(x > thr)[0]
@@ -159,13 +185,76 @@ def _auto_clean_prompt(prompt_path: str, target_sr: int = 24000):
     return out
-def clone_voice(text: str, audio_file, audio_url: str, progress=gr.Progress(track_tqdm=False)):
     """
-    LONG-TEXT READY:
-    - auto split
-    - auto batch
-    - concat jadi 1 final wav
     """
     try:
         raw_text = (text or "").strip()
         if not raw_text:
@@ -181,10 +270,9 @@ def clone_voice(text: str, audio_file, audio_url: str, progress=gr.Progress(trac
         if not prompt_path:
             raise gr.Error("Upload file audio atau isi Audio URL yang valid.")
-        # split normal
         chunks = _split_text_safely(raw_text, max_chars=MAX_CHARS_PER_CHUNK)
-        # auto-relax 1x kalau chunk terlalu banyak
         if len(chunks) > 120:
             chunks = _split_text_safely(raw_text, max_chars=min(300, MAX_CHARS_PER_CHUNK + 60))
@@ -197,22 +285,20 @@ def clone_voice(text: str, audio_file, audio_url: str, progress=gr.Progress(trac
                 f"Maksimal {MAX_CHUNKS_HARD} chunk per request."
             )
-        # model singleton dari kode kamu yang sudah ada
         model = get_model()
         sr = int(getattr(model, "sr", 24000))
-        # clean prompt otomatis (tetap support input umum/noisy)
         prompt_clean = _auto_clean_prompt(prompt_path, target_sr=sr)
-        # seed optional biar stabil
-        torch.manual_seed(42)
         total_chunks = len(chunks)
         total_batches = math.ceil(total_chunks / BATCH_SIZE)
         all_wavs = []
         pause = torch.zeros(1, int(sr * PAUSE_SECONDS))
-        progress(0.0, desc=f"Mulai proses {total_chunks} chunk ({total_batches} batch)...")
         with torch.no_grad():
             for b in range(total_batches):
@@ -220,27 +306,18 @@ def clone_voice(text: str, audio_file, audio_url: str, progress=gr.Progress(trac
                 end = min((b + 1) * BATCH_SIZE, total_chunks)
                 batch = chunks[start:end]
-                progress(start / total_chunks, desc=f"Batch {b+1}/{total_batches}...")
                 for i, ch in enumerate(batch, start=start + 1):
                     ch = _prepare_text_exact(ch)
-                    # pakai helper lama kamu (yang sudah safe kwargs)
-                    wav = _generate_with_safe_kwargs(model, ch, prompt_clean)
-                    if wav.dim() == 1:
-                        wav = wav.unsqueeze(0)
-                    wav = wav.cpu()
                     all_wavs.append(wav)
-                    # kasih pause kalau bukan chunk terakhir
                     if i < total_chunks:
                         all_wavs.append(pause)
                     progress(i / total_chunks, desc=f"Chunk {i}/{total_chunks}")
-                # cleanup ringan antar batch
                 gc.collect()
         if not all_wavs:
@@ -254,9 +331,57 @@ def clone_voice(text: str, audio_file, audio_url: str, progress=gr.Progress(trac
         return out_path
     except gr.Error:
-        # penting: jangan dibungkus lagi, biar pesan asli tampil bersih
         raise
     except Exception as e:
         print("[ERROR]", repr(e))
         print(traceback.format_exc())
         raise gr.Error(f"Gagal generate audio: {e}")

 import os
 import re
 import gc
 import math
 import tempfile
 import traceback
+import warnings
+import inspect
+import threading
+import requests
 import torch
 import torchaudio as ta
 import gradio as gr
+# Optional: silence a deprecation warning that is not an actual error
+warnings.filterwarnings(
+    "ignore",
+    message=r".*torch\.backends\.cuda\.sdp_kernel\(\).*deprecated.*",
+    category=FutureWarning,
+)
+# =========================================================
+# === MODEL IMPORT ===
+# Adjust this if your model's import path is different
+# =========================================================
+# Typical import for Chatterbox:
+from chatterbox.tts import ChatterboxTTS
+# =========================
+# CONFIG
+# =========================
+MAX_TOTAL_CHARS = 50000
+MAX_CHARS_PER_CHUNK = 220
+BATCH_SIZE = 8
+PAUSE_SECONDS = 0.12
+MAX_CHUNKS_HARD = 300
+# lightweight inference settings (CPU-friendly)
+SEED = 42
+EXAGGERATION = 0.5
+CFG_WEIGHT = 0.5
+TEMPERATURE = 0.8
+# =========================
+# MODEL SINGLETON
+# =========================
+_MODEL = None
+_MODEL_LOCK = threading.Lock()
+def get_model():
+    """Return the shared ChatterboxTTS instance, loading it on first use.
+
+    The model is built once on CPU and cached in the module-level _MODEL.
+    _MODEL_LOCK serialises the first load so concurrent callers do not each
+    build their own copy.
+
+    Returns:
+        The cached object produced by ChatterboxTTS.from_pretrained().
+    """
+    global _MODEL
+    if _MODEL is not None:
+        return _MODEL
+    with _MODEL_LOCK:
+        # Re-check under the lock: another thread may have finished loading
+        # while this one was waiting.
+        if _MODEL is None:
+            model = ChatterboxTTS.from_pretrained(device="cpu")
+            # Not every model wrapper exposes eval(); call it only when present
+            # so a missing method cannot abort the load.
+            switch_to_eval = getattr(model, "eval", None)
+            if callable(switch_to_eval):
+                switch_to_eval()
+            # Publish only after setup is complete so no caller can observe a
+            # partially initialised model.
+            _MODEL = model
+        return _MODEL
+# =========================
+# HELPERS
+# =========================
 def _split_text_safely(text: str, max_chars: int = 220):
     text = (text or "").strip()
     if not text:
         return []
     text = re.sub(r"\s+", " ", text).strip()
     sentences = re.split(r"(?<=[\.\!\?。！？])\s+", text)
     sentences = [s.strip() for s in sentences if s.strip()]
                 push_cur()
                 cur = sent
         else:
             words = sent.split()
             temp = ""
             for w in words:
 def _resolve_audio_input(audio_file, audio_url: str):
     """
+    audio_file dari gr.Audio(type="filepath") biasanya string path.
+    fallback support object .name.
     """
+    if isinstance(audio_file, str) and audio_file.strip() and os.path.exists(audio_file):
+        return audio_file
     if audio_file is not None:
         p = getattr(audio_file, "name", None)
         if p and os.path.exists(p):
     url = (audio_url or "").strip()
     if url:
         try:
             r = requests.get(url, timeout=30)
             r.raise_for_status()
             suffix = ".wav"
             ct = (r.headers.get("content-type") or "").lower()
             if "mpeg" in ct or url.lower().endswith(".mp3"):
 def _auto_clean_prompt(prompt_path: str, target_sr: int = 24000):
     wav, sr = ta.load(prompt_path)  # [C, T]
     if wav.size(0) > 1:
         wav = wav.mean(dim=0, keepdim=True)
     if sr != target_sr:
         wav = ta.functional.resample(wav, sr, target_sr)
         sr = target_sr
+    # trim silence ringan
     thr = 0.01
     x = wav.abs().squeeze(0)
     idx = torch.where(x > thr)[0]
     return out
+def _normalize_wav_output(out):
+    """Coerce a model output into a float32 tensor laid out as [1, T].
+
+    A tuple or list is unwrapped to its first element, and anything that is
+    not already a tensor goes through torch.tensor(). A 1-D input gains a
+    leading axis; a 2-D input with more rows than columns is transposed.
+    Inputs of any other rank keep their shape and are only cast to float.
+    """
+    candidate = out[0] if isinstance(out, (tuple, list)) else out
+    tensor = candidate if isinstance(candidate, torch.Tensor) else torch.tensor(candidate)
+    rank = tensor.dim()
+    if rank == 1:
+        tensor = tensor.unsqueeze(0)
+    elif rank == 2:
+        rows, cols = tensor.shape
+        if rows > cols:
+            # More rows than columns: treat the layout as flipped and swap axes.
+            tensor = tensor.transpose(0, 1)
+    return tensor.float()
+def _generate_with_safe_kwargs(model, text, prompt_path):
     """
+    Call model.generate() using the keyword names this library version accepts.
+
+    The signature of model.generate is inspected once. The text and the tuning
+    options (EXAGGERATION, CFG_WEIGHT, TEMPERATURE) are forwarded only when
+    generate() names them as parameters. The reference-audio path is offered
+    under each known keyword name in turn, but only under names generate() can
+    actually receive (named explicitly, or absorbed by **kwargs). A candidate
+    the signature rejects is skipped instead of being silently dropped, so
+    inference is never run without the reference audio. If every keyword
+    attempt fails, a positional call generate(text, prompt_path) is tried.
+
+    Args:
+        model: object exposing a generate() method.
+        text: text to synthesise.
+        prompt_path: reference-audio value passed through to generate().
+
+    Returns:
+        The result of _normalize_wav_output() applied to generate()'s output.
+
+    Raises:
+        RuntimeError: when every attempted call form fails; the message lists
+            each attempt with its error.
     """
+    params = inspect.signature(model.generate).parameters
+    takes_var_kwargs = any(
+        p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
+    )
+    options = {
+        "text": text,
+        "exaggeration": EXAGGERATION,
+        "cfg_weight": CFG_WEIGHT,
+        "temperature": TEMPERATURE,
+    }
+    # Forward only the options generate() explicitly names.
+    call_kwargs = {k: v for k, v in options.items() if k in params}
+    tried = []
+    # candidate argument names for the prompt path
+    for pk in ("audio_prompt_path", "prompt_path", "speaker_wav", "audio_path"):
+        # Calling without the prompt would repeat an expensive inference and
+        # could return audio that ignores the reference voice, so skip any
+        # name generate() cannot receive.
+        if pk not in params and not takes_var_kwargs:
+            continue
+        try:
+            out = model.generate(**call_kwargs, **{pk: prompt_path})
+            return _normalize_wav_output(out)
+        except Exception as e:
+            tried.append(f"{pk}: {e}")
+    # positional fallback
+    try:
+        out = model.generate(text, prompt_path)
+        return _normalize_wav_output(out)
+    except Exception as e:
+        tried.append(f"positional(text, prompt): {e}")
+    raise RuntimeError("generate() gagal di semua signature percobaan:\n- " + "\n- ".join(tried))
+# =========================
+# MAIN INFERENCE
+# =========================
+def clone_voice(text: str, audio_file, audio_url: str, progress=gr.Progress(track_tqdm=False)):
     try:
         raw_text = (text or "").strip()
         if not raw_text:
         if not prompt_path:
             raise gr.Error("Upload file audio atau isi Audio URL yang valid.")
         chunks = _split_text_safely(raw_text, max_chars=MAX_CHARS_PER_CHUNK)
+        # auto-relax sekali kalau chunk terlalu banyak
         if len(chunks) > 120:
             chunks = _split_text_safely(raw_text, max_chars=min(300, MAX_CHARS_PER_CHUNK + 60))
                 f"Maksimal {MAX_CHUNKS_HARD} chunk per request."
             )
         model = get_model()
         sr = int(getattr(model, "sr", 24000))
         prompt_clean = _auto_clean_prompt(prompt_path, target_sr=sr)
+        torch.manual_seed(SEED)
         total_chunks = len(chunks)
         total_batches = math.ceil(total_chunks / BATCH_SIZE)
         all_wavs = []
         pause = torch.zeros(1, int(sr * PAUSE_SECONDS))
+        progress(0.0, desc=f"Mulai {total_chunks} chunk ({total_batches} batch)...")
         with torch.no_grad():
             for b in range(total_batches):
                 end = min((b + 1) * BATCH_SIZE, total_chunks)
                 batch = chunks[start:end]
+                progress(start / total_chunks, desc=f"Batch {b+1}/{total_batches}")
                 for i, ch in enumerate(batch, start=start + 1):
                     ch = _prepare_text_exact(ch)
+                    wav = _generate_with_safe_kwargs(model, ch, prompt_clean).cpu()
                     all_wavs.append(wav)
                     if i < total_chunks:
                         all_wavs.append(pause)
                     progress(i / total_chunks, desc=f"Chunk {i}/{total_chunks}")
                 gc.collect()
         if not all_wavs:
         return out_path
     except gr.Error:
         raise
     except Exception as e:
         print("[ERROR]", repr(e))
         print(traceback.format_exc())
         raise gr.Error(f"Gagal generate audio: {e}")
+# =========================
+# UI
+# =========================
+# Build the Gradio interface: a text box, a reference-audio input (upload or
+# microphone) with an alternative URL field, a generate button, and an audio
+# player for the result.
+with gr.Blocks(title="Chatterbox Indonesian Voice Cloning (CPU)") as demo:
+    gr.Markdown("## Chatterbox Indonesian Voice Cloning (CPU)")
+    gr.Markdown(
+        "Masukkan teks panjang + upload audio referensi (atau URL audio). "
+        "Sistem akan auto-batch lalu gabung jadi 1 file WAV."
+    )
+    text_in = gr.Textbox(
+        label="Teks",
+        lines=10,
+        placeholder="Masukkan teks panjang di sini..."
+    )
+    # type="filepath": the handler receives a path string for the audio
+    audio_file_in = gr.Audio(
+        label="Upload Audio Referensi",
+        type="filepath",
+        sources=["upload", "microphone"]
+    )
+    audio_url_in = gr.Textbox(
+        label="Atau Audio URL",
+        placeholder="https://.../sample.wav"
+    )
+    run_btn = gr.Button("Generate Audio", variant="primary")
+    out_audio = gr.Audio(label="Hasil Audio", type="filepath")
+    # Wire the button to clone_voice; inputs are passed in the listed order
+    # (text, audio file, audio URL) and the returned path feeds out_audio.
+    run_btn.click(
+        fn=clone_voice,
+        inputs=[text_in, audio_file_in, audio_url_in],
+        outputs=[out_audio],
+        api_name="clone_voice"
+    )
+# =========================
+# LAUNCH
+# =========================
+# Start the app only when this file is run as a script: enable the request
+# queue (max_size=20) and serve on 0.0.0.0:7860 with errors shown to the user.
+if __name__ == "__main__":
+    demo.queue(max_size=20).launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )