Spaces:

Alstears
/

chatterbox-id-clone-api

Running

App Files Files Community

Alstears committed on 20 days ago

Commit

5407e8e

verified ·

1 Parent(s): 02b7b3a

Update app.py

Browse files

Files changed (1) hide show

app.py +127 -312

app.py CHANGED Viewed

@@ -1,384 +1,199 @@
 import os
 import re
-import gc
-import math
 import tempfile
 import traceback
-import warnings
-import inspect
-import threading
 import requests
 import torch
 import torchaudio as ta
 import gradio as gr
-# Optional: redam warning deprecate yang bukan error
-warnings.filterwarnings(
-    "ignore",
-    message=r".*torch\.backends\.cuda\.sdp_kernel\(\).*deprecated.*",
-    category=FutureWarning,
-)
-# =========================================================
-# === MODEL IMPORT ===
-# Sesuaikan jika path import model kamu berbeda
-# =========================================================
-# Contoh umum untuk Chatterbox:
-from chatterbox.tts import ChatterboxTTS
 # =========================
-# CONFIG
 # =========================
-MAX_TOTAL_CHARS = 50000
-MAX_CHARS_PER_CHUNK = 220
-BATCH_SIZE = 8
-PAUSE_SECONDS = 0.12
-MAX_CHUNKS_HARD = 300
-# inferensi config ringan (CPU-friendly)
-SEED = 42
-EXAGGERATION = 0.5
-CFG_WEIGHT = 0.5
-TEMPERATURE = 0.8
 # =========================
-# MODEL SINGLETON
 # =========================
-_MODEL = None
-_MODEL_LOCK = threading.Lock()
-def get_model():
-    global _MODEL
-    if _MODEL is None:
-        with _MODEL_LOCK:
-            if _MODEL is None:
-                _MODEL = ChatterboxTTS.from_pretrained(device="cpu")
-                _MODEL.eval()
-    return _MODEL
-# =========================
-# HELPERS
-# =========================
-def _split_text_safely(text: str, max_chars: int = 220):
-    text = (text or "").strip()
-    if not text:
-        return []
-    text = re.sub(r"\s+", " ", text).strip()
-    sentences = re.split(r"(?<=[\.\!\?。！？])\s+", text)
-    sentences = [s.strip() for s in sentences if s.strip()]
-    chunks = []
-    cur = ""
-    def push_cur():
-        nonlocal cur
-        if cur.strip():
-            chunks.append(cur.strip())
-        cur = ""
-    for sent in sentences:
-        if len(sent) <= max_chars:
-            if not cur:
-                cur = sent
-            elif len(cur) + 1 + len(sent) <= max_chars:
-                cur = f"{cur} {sent}"
-            else:
-                push_cur()
-                cur = sent
-        else:
-            words = sent.split()
-            temp = ""
-            for w in words:
-                if not temp:
-                    temp = w
-                elif len(temp) + 1 + len(w) <= max_chars:
-                    temp = f"{temp} {w}"
-                else:
-                    chunks.append(temp.strip())
-                    temp = w
-            if temp.strip():
-                if not cur:
-                    cur = temp.strip()
-                elif len(cur) + 1 + len(temp) <= max_chars:
-                    cur = f"{cur} {temp}".strip()
-                else:
-                    push_cur()
-                    cur = temp.strip()
-    push_cur()
-    return [c for c in chunks if c.strip()]
-def _prepare_text_exact(s: str) -> str:
-    return re.sub(r"\s+", " ", (s or "")).strip()
 def _resolve_audio_input(audio_file, audio_url: str):
-    """
-    audio_file dari gr.Audio(type="filepath") biasanya string path.
-    fallback support object .name.
-    """
-    if isinstance(audio_file, str) and audio_file.strip() and os.path.exists(audio_file):
         return audio_file
-    if audio_file is not None:
-        p = getattr(audio_file, "name", None)
-        if p and os.path.exists(p):
             return p
-    url = (audio_url or "").strip()
-    if url:
-        try:
-            r = requests.get(url, timeout=30)
-            r.raise_for_status()
-            suffix = ".wav"
-            ct = (r.headers.get("content-type") or "").lower()
-            if "mpeg" in ct or url.lower().endswith(".mp3"):
-                suffix = ".mp3"
-            elif "ogg" in ct or url.lower().endswith(".ogg"):
-                suffix = ".ogg"
-            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
-            tmp.write(r.content)
-            tmp.flush()
-            tmp.close()
-            return tmp.name
-        except Exception:
-            return None
     return None
-def _auto_clean_prompt(prompt_path: str, target_sr: int = 24000):
-    wav, sr = ta.load(prompt_path)  # [C, T]
-    if wav.size(0) > 1:
-        wav = wav.mean(dim=0, keepdim=True)
-    if sr != target_sr:
-        wav = ta.functional.resample(wav, sr, target_sr)
-        sr = target_sr
-    # trim silence ringan
-    thr = 0.01
-    x = wav.abs().squeeze(0)
-    idx = torch.where(x > thr)[0]
-    if idx.numel() > 0:
-        start = int(idx[0].item())
-        end = int(idx[-1].item()) + 1
-        wav = wav[:, start:end]
-    # normalize peak
-    peak = wav.abs().max().item() if wav.numel() else 0.0
-    if peak > 1e-6:
-        wav = (wav / peak) * 0.95
-    out = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-    ta.save(out, wav, sr)
-    return out
-def _normalize_wav_output(out):
-    """
-    Normalisasi output model ke tensor [1, T].
-    """
-    if isinstance(out, tuple) or isinstance(out, list):
-        out = out[0]
-    if isinstance(out, torch.Tensor):
-        wav = out
-    else:
-        wav = torch.tensor(out)
-    if wav.dim() == 1:
-        wav = wav.unsqueeze(0)
-    elif wav.dim() == 2 and wav.shape[0] > wav.shape[1]:
-        # jaga-jaga shape kebalik
-        wav = wav.transpose(0, 1)
-    return wav.float()
-def _generate_with_safe_kwargs(model, text, prompt_path):
-    """
-    Coba beberapa signature generate() karena tiap versi library kadang beda.
-    """
     sig = inspect.signature(model.generate)
-    accepted = set(sig.parameters.keys())
-    base = {
-        "text": text,
-        "audio_prompt_path": prompt_path,
-        "exaggeration": EXAGGERATION,
-        "cfg_weight": CFG_WEIGHT,
-        "temperature": TEMPERATURE,
-    }
-    # kandidat nama arg untuk prompt path
-    prompt_keys = ["audio_prompt_path", "prompt_path", "speaker_wav", "audio_path"]
-    tried = []
-    for pk in prompt_keys:
-        kwargs = base.copy()
-        kwargs.pop("audio_prompt_path", None)
-        kwargs[pk] = prompt_path
-        # filter param yang didukung signature
-        filtered = {k: v for k, v in kwargs.items() if k in accepted}
-        if "text" not in filtered and "text" in accepted:
-            filtered["text"] = text
-        try:
-            out = model.generate(**filtered)
-            return _normalize_wav_output(out)
-        except Exception as e:
-            tried.append(f"{pk}: {e}")
-    # fallback positional
     try:
-        out = model.generate(text, prompt_path)
-        return _normalize_wav_output(out)
-    except Exception as e:
-        tried.append(f"positional(text, prompt): {e}")
-    raise RuntimeError("generate() gagal di semua signature percobaan:\n- " + "\n- ".join(tried))
-# =========================
-# MAIN INFERENCE
-# =========================
-def clone_voice(text: str, audio_file, audio_url: str, progress=gr.Progress(track_tqdm=False)):
     try:
-        raw_text = (text or "").strip()
-        if not raw_text:
-            raise gr.Error("Text prompt tidak boleh kosong.")
-        if len(raw_text) > MAX_TOTAL_CHARS:
-            raise gr.Error(
-                f"Teks terlalu panjang ({len(raw_text)} karakter). "
-                f"Maksimal {MAX_TOTAL_CHARS} karakter per request."
-            )
         prompt_path = _resolve_audio_input(audio_file, audio_url)
-        if not prompt_path:
-            raise gr.Error("Upload file audio atau isi Audio URL yang valid.")
-        chunks = _split_text_safely(raw_text, max_chars=MAX_CHARS_PER_CHUNK)
-        # auto-relax sekali kalau chunk terlalu banyak
-        if len(chunks) > 120:
-            chunks = _split_text_safely(raw_text, max_chars=min(300, MAX_CHARS_PER_CHUNK + 60))
-        if not chunks:
-            raise gr.Error("Gagal memproses teks (chunk kosong).")
-        if len(chunks) > MAX_CHUNKS_HARD:
-            raise gr.Error(
-                f"Teks terlalu panjang ({len(chunks)} chunk). "
-                f"Maksimal {MAX_CHUNKS_HARD} chunk per request."
-            )
         model = get_model()
-        sr = int(getattr(model, "sr", 24000))
-        prompt_clean = _auto_clean_prompt(prompt_path, target_sr=sr)
-        torch.manual_seed(SEED)
-        total_chunks = len(chunks)
-        total_batches = math.ceil(total_chunks / BATCH_SIZE)
-        all_wavs = []
-        pause = torch.zeros(1, int(sr * PAUSE_SECONDS))
-        progress(0.0, desc=f"Mulai {total_chunks} chunk ({total_batches} batch)...")
         with torch.no_grad():
-            for b in range(total_batches):
-                start = b * BATCH_SIZE
-                end = min((b + 1) * BATCH_SIZE, total_chunks)
-                batch = chunks[start:end]
-                progress(start / total_chunks, desc=f"Batch {b+1}/{total_batches}")
-                for i, ch in enumerate(batch, start=start + 1):
-                    ch = _prepare_text_exact(ch)
-                    wav = _generate_with_safe_kwargs(model, ch, prompt_clean).cpu()
-                    all_wavs.append(wav)
-                    if i < total_chunks:
-                        all_wavs.append(pause)
-                    progress(i / total_chunks, desc=f"Chunk {i}/{total_chunks}")
-                gc.collect()
-        if not all_wavs:
-            raise gr.Error("Tidak ada audio yang berhasil digenerate.")
-        full_wav = torch.cat(all_wavs, dim=1)
         out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
-        ta.save(out_path, full_wav, sr)
-        progress(1.0, desc="Selesai ✅")
         return out_path
-    except gr.Error:
-        raise
     except Exception as e:
         print("[ERROR]", repr(e))
         print(traceback.format_exc())
         raise gr.Error(f"Gagal generate audio: {e}")
-# =========================
-# UI
-# =========================
 with gr.Blocks(title="Chatterbox Indonesian Voice Cloning (CPU)") as demo:
-    gr.Markdown("## Chatterbox Indonesian Voice Cloning (CPU)")
-    gr.Markdown(
-        "Masukkan teks panjang + upload audio referensi (atau URL audio). "
-        "Sistem akan auto-batch lalu gabung jadi 1 file WAV."
-    )
     text_in = gr.Textbox(
-        label="Teks",
-        lines=10,
-        placeholder="Masukkan teks panjang di sini..."
     )
-    audio_file_in = gr.Audio(
-        label="Upload Audio Referensi",
-        type="filepath",
-        sources=["upload", "microphone"]
     )
-    audio_url_in = gr.Textbox(
-        label="Atau Audio URL",
-        placeholder="https://.../sample.wav"
     )
-    run_btn = gr.Button("Generate Audio", variant="primary")
     out_audio = gr.Audio(label="Hasil Audio", type="filepath")
-    run_btn.click(
         fn=clone_voice,
-        inputs=[text_in, audio_file_in, audio_url_in],
         outputs=[out_audio],
         api_name="clone_voice"
     )
-# =========================
-# LAUNCH
-# =========================
 if __name__ == "__main__":
-    print("Launching Gradio...")
-    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)

 import os
+os.environ["CUDA_VISIBLE_DEVICES"] = ""  # paksa CPU-only
 import re
+import inspect
 import tempfile
 import traceback
+from threading import Lock
 import requests
 import torch
 import torchaudio as ta
 import gradio as gr
 # =========================
+# HARD PATCH CPU DESERIALIZE
 # =========================
# Make every later caller believe CUDA is absent.
torch.cuda.is_available = lambda: False


def _with_cpu_map_location(args, kwargs):
    """Return ``(args, kwargs)`` with ``map_location`` forced to the CPU device.

    ``map_location`` is the second positional parameter of both ``torch.load``
    and ``torch.jit.load``.  If the caller already supplied it positionally,
    adding it again as a keyword would raise ``TypeError: got multiple values
    for argument 'map_location'``, so the positional slot is overwritten
    instead of adding a keyword.
    """
    cpu = torch.device("cpu")
    if len(args) >= 2:
        return (args[0], cpu, *args[2:]), kwargs
    return args, {**kwargs, "map_location": cpu}


_original_torch_load = torch.load


def _torch_load_cpu(*args, **kwargs):
    """Replacement for ``torch.load`` that always deserializes onto the CPU."""
    args, kwargs = _with_cpu_map_location(args, kwargs)
    return _original_torch_load(*args, **kwargs)


torch.load = _torch_load_cpu

if hasattr(torch.jit, "load"):
    _original_jit_load = torch.jit.load

    def _jit_load_cpu(*args, **kwargs):
        """Replacement for ``torch.jit.load`` that always maps onto the CPU."""
        args, kwargs = _with_cpu_map_location(args, kwargs)
        return _original_jit_load(*args, **kwargs)

    torch.jit.load = _jit_load_cpu
 # =========================
+# MODEL IMPORT
 # =========================
+from chatterbox.tts import ChatterboxTTS
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
# Hub repository and file holding the fine-tuned T3 weights.
MODEL_REPO = "grandhigh/Chatterbox-TTS-Indonesian"
CHECKPOINT_FILENAME = "t3_cfg.safetensors"
DEVICE = "cpu"

# Lazily created model instance and the lock guarding its creation.
_model = None
_model_lock = Lock()


def get_model():
    """Return the shared model, loading it on the first call.

    The first caller builds ``ChatterboxTTS`` on ``DEVICE``, downloads
    ``CHECKPOINT_FILENAME`` from ``MODEL_REPO`` and loads it into the model's
    ``t3`` sub-module.  ``_model`` is checked once without the lock and once
    more while holding ``_model_lock``, so only one thread does the load.
    """
    global _model
    if _model is not None:
        return _model

    with _model_lock:
        # Another thread may have finished loading while we waited.
        if _model is not None:
            return _model

        print("[INIT] Loading model on CPU...")
        tts = ChatterboxTTS.from_pretrained(device=DEVICE)
        weights_file = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=CHECKPOINT_FILENAME,
        )
        tts.t3.load_state_dict(load_file(weights_file, device="cpu"))
        # ChatterboxTTS has no .to(), so do not call tts.to("cpu").
        if hasattr(tts, "eval"):
            tts.eval()
        _model = tts
        print("[INIT] Model ready.")

    return _model
+def _download_wav(url: str) -> str:
+    r = requests.get(url, timeout=90)
+    r.raise_for_status()
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    tmp.write(r.content)
+    tmp.close()
+    return tmp.name
 def _resolve_audio_input(audio_file, audio_url: str):
+    # gr.Audio(type="filepath") biasanya return string path
+    if isinstance(audio_file, str) and audio_file.strip():
         return audio_file
+    # fallback kalau format dict
+    if isinstance(audio_file, dict):
+        p = audio_file.get("path")
+        if p:
             return p
+    if audio_url and audio_url.strip():
+        return _download_wav(audio_url.strip())
     return None
+def _prepare_text_exact(text: str) -> str:
+    t = (text or "").strip()
+    if not t:
+        raise gr.Error("Text prompt tidak boleh kosong.")
+    # tambah tanda akhir agar model tidak lanjut ngawur
+    if not re.search(r"[.!?…]$", t):
+        t += "."
+    return t
+def _generate_with_safe_kwargs(model, text: str, prompt_path: str):
     sig = inspect.signature(model.generate)
+    params = sig.parameters
+    kwargs = {}
+    if "audio_prompt_path" in params:
+        kwargs["audio_prompt_path"] = prompt_path
+    # Set parameter jika didukung versi chatterbox yang terpasang
+    if "temperature" in params:
+        kwargs["temperature"] = 0.05
+    if "top_p" in params:
+        kwargs["top_p"] = 0.7
+    if "exaggeration" in params:
+        kwargs["exaggeration"] = 0.25
+    if "cfg_weight" in params:
+        kwargs["cfg_weight"] = 0.3
+    # Coba gaya pemanggilan paling umum
     try:
+        return model.generate(text, **kwargs)
+    except TypeError:
+        # fallback: beberapa versi pakai named argument
+        if "text" in params:
+            kwargs["text"] = text
+            return model.generate(**kwargs)
+        # fallback paling basic
+        return model.generate(text)
def clone_voice(text: str, audio_file, audio_url: str):
    """Synthesize ``text`` in the voice of the reference audio; return a WAV path.

    Args:
        text: text prompt; normalized by ``_prepare_text_exact``.
        audio_file: uploaded reference audio as delivered by the Gradio audio
            input.
        audio_url: optional URL used when no file was uploaded.

    Returns:
        Path of a temporary ``.wav`` file holding the generated audio.

    Raises:
        gr.Error: for invalid input (message passed through unchanged) or for
            any other failure (message prefixed with "Gagal generate audio").
    """
    try:
        text = _prepare_text_exact(text)
        prompt_path = _resolve_audio_input(audio_file, audio_url)
        if not prompt_path:
            raise gr.Error("Upload WAV atau isi Audio URL WAV.")

        model = get_model()
        # Fixed seed to make the output more consistent between runs.
        torch.manual_seed(42)
        with torch.no_grad():
            wav = _generate_with_safe_kwargs(model, text, prompt_path)

        # A 1-D result gets a leading dimension before it is saved.
        if wav.dim() == 1:
            wav = wav.unsqueeze(0)
        sr = getattr(model, "sr", 24000)

        # Reserve a file name and close the handle before torchaudio writes.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as out_file:
            out_path = out_file.name
        ta.save(out_path, wav.cpu(), sr)
        return out_path
    except gr.Error:
        # User-facing validation errors are already well-formed; re-wrapping
        # them would prefix the message and log a misleading traceback.
        raise
    except Exception as e:
        print("[ERROR]", repr(e))
        print(traceback.format_exc())
        raise gr.Error(f"Gagal generate audio: {e}") from e
 with gr.Blocks(title="Chatterbox Indonesian Voice Cloning (CPU)") as demo:
+    gr.Markdown("## Chatterbox-TTS Indonesian (CPU)")
+    gr.Markdown("Masukkan teks + upload WAV (atau URL WAV)")
     text_in = gr.Textbox(
+        label="Text Prompt",
+        lines=4,
+        placeholder="Contoh: Apa kabar."
     )
+    wav_in = gr.Audio(
+        label="Upload WAV Prompt",
+        type="filepath"
     )
+    url_in = gr.Textbox(
+        label="Audio URL WAV (opsional)",
+        placeholder="https://example.com/input.wav"
     )
+    btn = gr.Button("Generate")
     out_audio = gr.Audio(label="Hasil Audio", type="filepath")
+    btn.click(
         fn=clone_voice,
+        inputs=[text_in, wav_in, url_in],
         outputs=[out_audio],
         api_name="clone_voice"
     )
 if __name__ == "__main__":
+    port = int(os.getenv("PORT", "7860"))
+    demo.queue(default_concurrency_limit=1)
+    demo.launch(server_name="0.0.0.0", server_port=port, show_error=True)