Spaces:

owlninjam
/

chatterbox-tts

Sleeping

App Files Files Community

owlninjam committed on Mar 10

Commit

21b59b8

verified ·

1 Parent(s): 46c731b

Upload 4 files

Browse files

Files changed (4) hide show

Dockerfile +42 -0
README.md +45 -10
app.py +492 -0
requirements.txt +8 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,42 @@

+# Image for the Chatterbox TTS Space: Python 3.11 plus the audio/ML stack, running app.py as a non-root user.
+# Note: dependencies are installed explicitly below; requirements.txt is not copied into or used by this image.
+FROM python:3.11-slim
+# System dependencies
+# (ffmpeg is presumably needed by pydub for mp3/opus/flac export — confirm; build-essential for compiling pkuseg)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ffmpeg \
+        git \
+        build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Install numpy FIRST (pkuseg needs it at build time)
+RUN pip install --no-cache-dir numpy==1.25.2
+# Install chatterbox-tts (now pkuseg can build because numpy is available)
+# Using --no-build-isolation so pkuseg's setup.py can see the installed numpy
+RUN pip install --no-cache-dir --no-build-isolation chatterbox-tts
+# Install remaining dependencies
+# (only gradio is version-pinned here; the other packages resolve to whatever pip selects at build time)
+RUN pip install --no-cache-dir \
+    torch \
+    torchaudio \
+    soundfile \
+    pydub \
+    fastapi \
+    uvicorn \
+    gradio==5.31.0
+# Create non-root user (required by HF Spaces)
+RUN useradd -m -u 1000 user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR /home/user/app
+# Copy application
+COPY --chown=user app.py .
+USER user
+# Must match app_port in README.md and the port app.py passes to uvicorn
+EXPOSE 7860
+# app.py starts uvicorn itself from its __main__ block
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,10 +1,45 @@
----
-title: Chatterbox Tts
-emoji: 🚀
-colorFrom: blue
-colorTo: indigo
-sdk: docker
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Chatterbox TTS API
+emoji: 🎙️
+colorFrom: purple
+colorTo: blue
+sdk: docker
+app_port: 7860
+pinned: false
+license: mit
+---
+# Chatterbox TTS API
+A free, CPU-powered TTS service with **voice cloning** and an **OpenAI-compatible API**.
+## Features
+- 🎤 **Voice Cloning** — clone any voice from a ~10s reference clip
+- 🔌 **OpenAI-Compatible API** — drop-in replacement at `/v1/audio/speech`
+- 🌊 **Streaming** — chunked audio streaming for faster time-to-first-byte
+- 🆓 **Free** — runs on HF Spaces CPU tier
+## API Usage
+```bash
+# Basic TTS
+curl -X POST https://YOUR-SPACE.hf.space/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -d '{"model":"chatterbox","input":"Hello world!","voice":"default"}' \
+  --output speech.wav
+# Voice cloning (multipart)
+curl -X POST https://YOUR-SPACE.hf.space/v1/audio/speech \
+  -F 'request={"model":"chatterbox","input":"Hello!","voice":"clone"};type=application/json' \
+  -F "file=@reference.wav" \
+  --output cloned.wav
+```
+## OpenAI SDK
+```python
+from openai import OpenAI
+client = OpenAI(base_url="https://YOUR-SPACE.hf.space/v1", api_key="not-needed")
+response = client.audio.speech.create(model="chatterbox", voice="default", input="Hello!")
+response.stream_to_file("output.wav")
+```

app.py ADDED Viewed

	@@ -0,0 +1,492 @@

+"""
+Chatterbox TTS — HF Space with Gradio UI + OpenAI-Compatible API
+Supports voice cloning and chunked streaming on CPU.
+"""
+import io
+import os
+import re
+import json
+import tempfile
+import logging
+from typing import Optional
+import torch
+import torchaudio as ta
+import soundfile as sf
+import numpy as np
+import gradio as gr
+from fastapi import FastAPI, UploadFile, File, Form, Request, HTTPException
+from fastapi.responses import StreamingResponse, Response
+from pydub import AudioSegment
+# ---------------------------------------------------------------------------
+# Logging
+# ---------------------------------------------------------------------------
+# Configure the root logger at INFO and use one named logger for this module.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("chatterbox-tts")
+# ---------------------------------------------------------------------------
+# Global model (loaded lazily on the first get_model() call, then reused)
+# ---------------------------------------------------------------------------
+# Cached model instance; stays None until get_model() has loaded it.
+MODEL = None
+# Model identifier; not referenced by the rest of the code shown in this file.
+MODEL_NAME = "chatterbox"
+# Device string passed to from_pretrained() when the model is loaded.
+DEVICE = "cpu"
+def get_model():
+    """Return the shared Chatterbox model, loading it on the first call.
+    The Turbo variant is attempted first. If importing or loading it raises,
+    the standard ChatterboxTTS model is loaded instead. The loaded instance is
+    cached in the module-level MODEL, so later calls return it immediately.
+    """
+    global MODEL
+    # Fast path: a previous call already populated the cache.
+    if MODEL is not None:
+        return MODEL
+    logger.info("Loading Chatterbox TTS model on CPU — this may take 30-60s on first run...")
+    try:
+        # Imported here so a missing Turbo module only triggers the fallback.
+        from chatterbox.tts_turbo import ChatterboxTurboTTS
+        MODEL = ChatterboxTurboTTS.from_pretrained(device=DEVICE)
+        logger.info("Loaded ChatterboxTurboTTS (350M) successfully.")
+    except Exception as e:
+        logger.warning(f"Turbo model failed ({e}), falling back to standard ChatterboxTTS...")
+        from chatterbox.tts import ChatterboxTTS
+        MODEL = ChatterboxTTS.from_pretrained(device=DEVICE)
+        logger.info("Loaded ChatterboxTTS (standard) successfully.")
+    return MODEL
+# ---------------------------------------------------------------------------
+# Audio helpers
+# ---------------------------------------------------------------------------
+def wav_tensor_to_bytes(wav: torch.Tensor, sr: int, fmt: str = "wav") -> bytes:
+    """Encode a waveform tensor as audio bytes.
+    The tensor is always serialized to WAV in memory first. For "mp3", "opus"
+    and "flac" that WAV data is re-encoded through pydub. "wav" and any
+    unrecognized format return the WAV bytes unchanged.
+    """
+    # A 1-D tensor is promoted to 2-D (channels, samples) before saving.
+    samples = wav.unsqueeze(0) if wav.dim() == 1 else wav
+    wav_buffer = io.BytesIO()
+    ta.save(wav_buffer, samples, sr, format="wav")
+    wav_buffer.seek(0)
+    if fmt not in ("mp3", "opus", "flac"):
+        return wav_buffer.read()
+    # Transcode the in-memory WAV to the requested container/codec.
+    encoded = io.BytesIO()
+    AudioSegment.from_wav(wav_buffer).export(encoded, format=fmt)
+    encoded.seek(0)
+    return encoded.read()
+def split_into_sentences(text: str) -> list[str]:
+    """Break text into sentence-sized pieces for chunked streaming.
+    The stripped text is split at whitespace that follows '.', '!' or '?'.
+    A piece is appended to the previous one (joined by a single space)
+    whenever that previous piece is still shorter than 20 characters.
+    If nothing survives the split, the original text is returned as the
+    only element.
+    """
+    fragments = (raw.strip() for raw in re.split(r'(?<=[.!?])\s+', text.strip()))
+    chunks: list[str] = []
+    for fragment in fragments:
+        if not fragment:
+            continue
+        # Grow the previous chunk until it reaches the 20-character minimum.
+        if chunks and len(chunks[-1]) < 20:
+            chunks[-1] = chunks[-1] + " " + fragment
+            continue
+        chunks.append(fragment)
+    return chunks or [text]
+# Maps each supported response_format value to the Content-Type sent with it.
+# The API endpoint also uses membership in this dict to validate the requested
+# format, falling back to "wav" for anything not listed here.
+MIME_TYPES = {
+    "wav": "audio/wav",
+    "mp3": "audio/mpeg",
+    "opus": "audio/opus",
+    "flac": "audio/flac",
+}
+# ---------------------------------------------------------------------------
+# Core TTS generation
+# ---------------------------------------------------------------------------
+def generate_speech(
+    text: str,
+    ref_audio_path: Optional[str] = None,
+    response_format: str = "wav",
+    stream: bool = False,
+):
+    """
+    Synthesize `text` and yield the encoded audio as bytes.
+    This is a generator in both modes. With stream=False it yields exactly one
+    item holding the whole utterance. With stream=True the text is split with
+    split_into_sentences() and one item is yielded per piece; every item is a
+    complete, independently encoded file in `response_format`.
+    A truthy ref_audio_path is forwarded to the model as audio_prompt_path
+    (voice cloning); otherwise the model is called with the text alone.
+    """
+    model = get_model()
+    # Only pass the cloning prompt when one was supplied.
+    prompt_kwargs = {"audio_prompt_path": ref_audio_path} if ref_audio_path else {}
+    if not stream:
+        logger.info(f"Generating full: {text[:80]}...")
+        full_wav = model.generate(text, **prompt_kwargs)
+        yield wav_tensor_to_bytes(full_wav, model.sr, response_format)
+        return
+    for sentence in split_into_sentences(text):
+        logger.info(f"Generating chunk: {sentence[:50]}...")
+        sentence_wav = model.generate(sentence, **prompt_kwargs)
+        yield wav_tensor_to_bytes(sentence_wav, model.sr, response_format)
+# ---------------------------------------------------------------------------
+# FastAPI — OpenAI-compatible /v1/audio/speech
+# ---------------------------------------------------------------------------
+api_app = FastAPI(title="Chatterbox TTS API", version="1.0.0")
+@api_app.get("/v1/models")
+async def list_models():
+    """Return the advertised model ids in the OpenAI list-models response shape."""
+    advertised_ids = ("chatterbox", "chatterbox-turbo")
+    # Every entry shares the same static metadata; only the id differs.
+    entries = [
+        {
+            "id": model_id,
+            "object": "model",
+            "created": 1700000000,
+            "owned_by": "resemble-ai",
+        }
+        for model_id in advertised_ids
+    ]
+    return {"object": "list", "data": entries}
+def _remove_file_quietly(path: Optional[str]) -> None:
+    """Delete `path` if it is set and still exists (used for uploaded reference clips)."""
+    if path and os.path.exists(path):
+        os.unlink(path)
+def _read_speech_params(params) -> tuple:
+    """Return (input, voice, response_format) from a mapping, applying the API defaults."""
+    return (
+        params.get("input", ""),
+        params.get("voice", "default"),
+        params.get("response_format", "wav"),
+    )
+@api_app.post("/v1/audio/speech")
+async def openai_tts(request: Request):
+    """
+    OpenAI-compatible TTS endpoint.
+    Accepts either:
+      1. JSON body: {"model": "chatterbox", "input": "text", "voice": "default"}
+      2. Multipart form: input, voice, response_format fields (or a single
+         'request' field holding them as JSON) + optional 'file' for voice cloning
+    Uploading a 'file' always enables voice cloning; without a file the default
+    voice is used whatever 'voice' says. A 'model' value may be sent for
+    client compatibility but is not read: the model loaded by get_model() is
+    always used. An unknown 'response_format' falls back to "wav".
+    Returns a StreamingResponse (one encoded chunk per sentence) when the input
+    splits into more than one sentence, otherwise a single Response.
+    Raises HTTPException 400 for a malformed body or a missing/blank 'input',
+    and HTTPException 500 when generation fails before the response starts.
+    """
+    content_type = request.headers.get("content-type", "")
+    ref_audio_path = None
+    # Becomes True once a StreamingResponse owns deletion of the uploaded clip.
+    cleanup_deferred = False
+    try:
+        if "multipart/form-data" in content_type:
+            form = await request.form()
+            params = form
+            # A 'request' field carries the parameters as JSON next to the file part.
+            if "request" in form:
+                try:
+                    params = json.loads(form["request"])
+                except (json.JSONDecodeError, TypeError):
+                    params = {}
+                if not isinstance(params, dict):
+                    params = {}
+            text, voice, response_format = _read_speech_params(params)
+            # Persist the reference clip: the model takes a file path, not bytes.
+            file_field = form.get("file")
+            if file_field and hasattr(file_field, "read"):
+                content = await file_field.read()
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                    # Record the path first so cleanup still runs if the write fails.
+                    ref_audio_path = tmp_file.name
+                    tmp_file.write(content)
+                voice = "clone"
+        else:
+            # JSON is also attempted when the content type is missing or unknown.
+            declared_json = "application/json" in content_type
+            try:
+                body = await request.json()
+            except Exception:
+                body = None
+            if not isinstance(body, dict):
+                if declared_json:
+                    detail = "Request body must be a JSON object."
+                else:
+                    detail = "Unsupported content type. Use application/json or multipart/form-data."
+                raise HTTPException(status_code=400, detail=detail)
+            text, voice, response_format = _read_speech_params(body)
+        # Reject non-string and blank input up front instead of failing in the model.
+        if not isinstance(text, str) or not text.strip():
+            raise HTTPException(status_code=400, detail="'input' field is required.")
+        if not isinstance(response_format, str) or response_format not in MIME_TYPES:
+            response_format = "wav"
+        mime = MIME_TYPES[response_format]
+        # Determine if voice cloning
+        use_clone = voice == "clone" and ref_audio_path is not None
+        prompt_path = ref_audio_path if use_clone else None
+        disposition = f"attachment; filename=speech.{response_format}"
+        # Stream only when there is more than one sentence to generate.
+        if len(split_into_sentences(text)) > 1:
+            def audio_stream():
+                try:
+                    yield from generate_speech(
+                        text,
+                        ref_audio_path=prompt_path,
+                        response_format=response_format,
+                        stream=True,
+                    )
+                finally:
+                    # The clip must outlive this handler, so it is removed here.
+                    _remove_file_quietly(ref_audio_path)
+            response = StreamingResponse(
+                audio_stream(),
+                media_type=mime,
+                headers={
+                    "Content-Disposition": disposition,
+                    "Transfer-Encoding": "chunked",
+                },
+            )
+            cleanup_deferred = True
+            return response
+        # Single chunk — return directly
+        # NOTE(review): generation runs synchronously inside this async handler.
+        audio_bytes = b"".join(
+            generate_speech(
+                text,
+                ref_audio_path=prompt_path,
+                response_format=response_format,
+                stream=False,
+            )
+        )
+        return Response(
+            content=audio_bytes,
+            media_type=mime,
+            headers={"Content-Disposition": disposition},
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"TTS generation failed: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"TTS generation failed: {str(e)}") from e
+    finally:
+        # Covers success, validation errors and failures alike, unless the
+        # streaming generator has taken over responsibility for the file.
+        if not cleanup_deferred:
+            _remove_file_quietly(ref_audio_path)
+# ---------------------------------------------------------------------------
+# Gradio UI
+# ---------------------------------------------------------------------------
+def gradio_tts(text: str, ref_audio, response_format: str = "wav"):
+    """Gradio handler: synthesize `text` and return the path of the audio file.
+    Args:
+        text: Text to speak. Empty or whitespace-only text returns None.
+        ref_audio: File path of a reference clip supplied by Gradio, or None.
+            A truthy value is passed to the model for voice cloning.
+        response_format: "wav", or another format name that pydub can export
+            (the UI offers "wav" and "mp3").
+    Returns:
+        Path to a temporary file whose suffix matches `response_format`, or
+        None when there is no text. The file is not deleted by this function.
+    """
+    if not text or not text.strip():
+        return None
+    ref_path = None
+    if ref_audio is not None:
+        ref_path = ref_audio  # Gradio gives us a file path
+    model = get_model()
+    logger.info(f"Gradio TTS: text={text[:60]}..., clone={ref_path is not None}")
+    if ref_path:
+        wav = model.generate(text, audio_prompt_path=ref_path)
+    else:
+        wav = model.generate(text)
+    # A 1-D tensor is promoted to 2-D (channels, samples) before saving.
+    if wav.dim() == 1:
+        wav = wav.unsqueeze(0)
+    # Reserve a uniquely named .wav path; the handle is closed right away so
+    # torchaudio can write to the path itself.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
+        wav_path = wav_file.name
+    ta.save(wav_path, wav, model.sr, format="wav")
+    if response_format == "wav":
+        return wav_path
+    # Convert into a sibling file with the requested extension. The output
+    # path must differ from the intermediate WAV, which is deleted afterwards.
+    out_path = f"{os.path.splitext(wav_path)[0]}.{response_format}"
+    try:
+        AudioSegment.from_wav(wav_path).export(out_path, format=response_format)
+    finally:
+        os.unlink(wav_path)
+    return out_path
+# Build Gradio interface
+# Two tabs: an interactive text-to-speech form wired to gradio_tts(), and a
+# static Markdown page documenting the HTTP API.
+with gr.Blocks(
+    title="🎙️ Chatterbox TTS",
+    theme=gr.themes.Soft(
+        primary_hue="purple",
+        secondary_hue="blue",
+    ),
+) as demo:
+    gr.Markdown(
+        """
+        # 🎙️ Chatterbox TTS
+        ### Free, open-source text-to-speech with voice cloning
+        *Powered by [Resemble AI Chatterbox](https://github.com/resemble-ai/chatterbox) — MIT Licensed*
+        """
+    )
+    with gr.Tabs():
+        # ---- Tab 1: TTS ----
+        with gr.TabItem("🗣️ Text to Speech"):
+            with gr.Row():
+                # Left column: inputs (text, optional reference clip, format, button)
+                with gr.Column(scale=3):
+                    text_input = gr.Textbox(
+                        label="Text",
+                        placeholder="Type or paste your text here...",
+                        lines=5,
+                        max_lines=20,
+                    )
+                    # type="filepath": gradio_tts receives the clip as a path string
+                    ref_audio_input = gr.Audio(
+                        label="🎤 Reference Audio (optional — for voice cloning)",
+                        type="filepath",
+                        sources=["upload", "microphone"],
+                    )
+                    with gr.Row():
+                        # Only wav and mp3 are offered here, although MIME_TYPES
+                        # (used by the API endpoint) also lists opus and flac.
+                        format_dropdown = gr.Dropdown(
+                            choices=["wav", "mp3"],
+                            value="wav",
+                            label="Output Format",
+                        )
+                        generate_btn = gr.Button(
+                            "🔊 Generate Speech",
+                            variant="primary",
+                            size="lg",
+                        )
+                # Right column: player for the file path returned by gradio_tts
+                with gr.Column(scale=2):
+                    audio_output = gr.Audio(
+                        label="Generated Audio",
+                        type="filepath",
+                    )
+            # The input order must match gradio_tts(text, ref_audio, response_format)
+            generate_btn.click(
+                fn=gradio_tts,
+                inputs=[text_input, ref_audio_input, format_dropdown],
+                outputs=[audio_output],
+            )
+            # Each example row is [text, reference audio, format], matching `inputs`.
+            # NOTE(review): with cache_examples=False the outputs are presumably not
+            # pre-generated at startup — confirm against the Gradio docs.
+            gr.Examples(
+                examples=[
+                    ["Hello! This is Chatterbox TTS running on a free Hugging Face Space. Pretty cool, right?", None, "wav"],
+                    ["The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs.", None, "wav"],
+                    ["I can't believe it worked! [laugh] This is absolutely amazing.", None, "wav"],
+                ],
+                inputs=[text_input, ref_audio_input, format_dropdown],
+                outputs=[audio_output],
+                fn=gradio_tts,
+                cache_examples=False,
+            )
+        # ---- Tab 2: API Docs ----
+        # Static Markdown only. The string is not raw, so each doubled backslash
+        # below renders as a single line-continuation backslash in the curl examples.
+        with gr.TabItem("🔌 API"):
+            gr.Markdown(
+                """
+                ## OpenAI-Compatible API
+                This Space exposes an OpenAI-compatible `/v1/audio/speech` endpoint.
+                ### Base URL
+                ```
+                https://YOUR-SPACE-NAME.hf.space/v1
+                ```
+                ---
+                ### Basic TTS (JSON)
+                ```bash
+                curl -X POST https://YOUR-SPACE.hf.space/v1/audio/speech \\
+                  -H "Content-Type: application/json" \\
+                  -d '{"model":"chatterbox","input":"Hello world!","voice":"default","response_format":"wav"}' \\
+                  --output speech.wav
+                ```
+                ### Voice Cloning (Multipart)
+                ```bash
+                curl -X POST https://YOUR-SPACE.hf.space/v1/audio/speech \\
+                  -F 'request={"model":"chatterbox","input":"Hello!","voice":"clone"};type=application/json' \\
+                  -F "file=@your_reference.wav" \\
+                  --output cloned.wav
+                ```
+                ### OpenAI Python SDK
+                ```python
+                from openai import OpenAI
+                client = OpenAI(
+                    base_url="https://YOUR-SPACE.hf.space/v1",
+                    api_key="not-needed"
+                )
+                # Default voice
+                response = client.audio.speech.create(
+                    model="chatterbox",
+                    voice="default",
+                    input="Hello, this is a test!",
+                    response_format="wav"
+                )
+                response.stream_to_file("output.wav")
+                ```
+                ### Streaming
+                Multi-sentence inputs are automatically streamed sentence-by-sentence
+                for faster time-to-first-byte.
+                ### Parameters
+                | Parameter | Type | Required | Description |
+                |---|---|---|---|
+                | `model` | string | ✅ | `"chatterbox"` or `"chatterbox-turbo"` |
+                | `input` | string | ✅ | Text to synthesize |
+                | `voice` | string | ✅ | `"default"` or `"clone"` |
+                | `response_format` | string | ❌ | `"wav"` (default), `"mp3"`, `"opus"`, `"flac"` |
+                | `file` | binary | ❌ | Reference audio for cloning (multipart only) |
+                ---
+                *⚡ Running on CPU — expect 5-15s per sentence. Multi-sentence inputs stream chunks as they're ready.*
+                """
+            )
+# Mount FastAPI + Gradio together
+# `app` is the combined ASGI application: the API routes registered on api_app
+# plus the Gradio UI mounted at "/".
+app = gr.mount_gradio_app(api_app, demo, path="/")
+# Script entry point: serves `app` with uvicorn on all interfaces, port 7860.
+# This runs for local development and whenever the file is started with `python app.py`.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+numpy
+chatterbox-tts
+torch
+torchaudio
+soundfile
+pydub
+fastapi
+uvicorn