# Hugging Face Space: BosonLab/chatterbox-desi
# Space status: Runtime error | File size: 6,143 Bytes | Revision: 68268d5

"""
ChatterBox Desi TTS Space
Fine-tuned multi-language Indic TTS model for text-to-speech synthesis
Supports: Bengali, Hindi, Marathi, Gujarati, Tamil, Telugu
"""

import sys
import os
import gradio as gr
import torch
from huggingface_hub import snapshot_download
import torchaudio

# Make the "chatterbox-finetuning" directory that sits next to this file
# importable; it is put first on sys.path so it wins over other entries.
_app_dir = os.path.dirname(__file__)
chatterbox_finetuning_path = os.path.join(_app_dir, "chatterbox-finetuning")
sys.path.insert(0, chatterbox_finetuning_path)

from src.chatterbox_.tts import ChatterboxTTS

# Model configuration: repository to download, and where to run inference.
MODEL_ID = "BosonLab/chatterbox-desi"
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Language configuration
# One row per supported language: (dropdown label, two-letter code, sample text).
_LANGUAGE_ROWS = (
    ("Bengali (বাংলা)", "bn", "আমি বাংলায় কথা বলতে পারি। এটি একটি পরীক্ষামূলক বাক্য।"),
    ("Hindi (हिंदी)", "hi", "मैं हिंदी में बोल सकता हूँ। यह एक परीक्षण वाक्य है।"),
    ("Marathi (मराठी)", "mr", "मी मराठीत बोलू शकतो. हे एक चाचणी वाक्य आहे."),
    ("Gujarati (ગુજરાતી)", "gu", "હું ગુજરાતીમાં બોલી શકું છું. આ એક પ્રાયોગિક વાક્ય છે."),
    ("Tamil (தமிழ்)", "ta", "நான் தமிழில் பேச முடியும். இது ஒரு சோதனை வாக்கியம்."),
    ("Telugu (తెలుగు)", "te", "నేను తెలుగులో మాట్లాడగలను. ఇది ఒక పరీక్ష వాక్యం."),
)

# Maps each dropdown label to its code, the bracketed tag that is prepended to
# the input text, and an example sentence shown in the UI.
LANGUAGES = {
    label: {"code": code, "tag": f"[{code}]", "example": sample}
    for label, code, sample in _LANGUAGE_ROWS
}

# Load model once at import time; every request handler shares this instance.
print(f"Loading model from {MODEL_ID}...")
print(f"Using device: {DEVICE}")

try:
    model_dir = snapshot_download(MODEL_ID)
    print(f"Model downloaded to: {model_dir}")
    model = ChatterboxTTS.from_local(model_dir, device=DEVICE)
except Exception as load_error:
    # Log the failure, then let it propagate: the app cannot run without a model.
    print(f"Error loading model: {load_error}")
    raise
else:
    print("Model loaded successfully!")


def get_example_text(language):
    """Look up the sample sentence for *language*, defaulting to Bengali."""
    if language in LANGUAGES:
        entry = LANGUAGES[language]
    else:
        # Unknown dropdown value: use the same fallback as the UI default.
        entry = LANGUAGES["Bengali (বাংলা)"]
    return entry["example"]


def generate_speech(text, language, reference_audio=None):
    """
    Generate speech from text using the fine-tuned model.

    Args:
        text: Input text to convert to speech. Surrounding whitespace is
            ignored; empty, whitespace-only or missing text is rejected
            without calling the model.
        language: Selected language name (a key of LANGUAGES). Unknown names
            fall back to the Bengali entry.
        reference_audio: Optional reference audio path for voice cloning

    Returns:
        Tuple of (audio_data, metadata_str). On success audio_data is
        (sample_rate, numpy_array) and metadata_str summarizes the request.
        On failure audio_data is None and metadata_str starts with "Error:".
    """
    # Reject empty input up front: otherwise the model would be asked to
    # synthesize a bare language tag (or None would surface as an opaque
    # AttributeError message).
    cleaned_text = (text or "").strip()
    if not cleaned_text:
        return None, "Error: Please enter some text to synthesize."

    try:
        lang_info = LANGUAGES.get(language, LANGUAGES["Bengali (বাংলা)"])
        lang_tag = lang_info["tag"]

        # Prepend language tag if not already present; text that already
        # starts with "[" is assumed to carry its own tag.
        if cleaned_text.startswith("["):
            tagged_text = cleaned_text
        else:
            tagged_text = f"{lang_tag} {cleaned_text}"

        # Generate speech with optional voice cloning
        if reference_audio is not None:
            wav = model.generate(tagged_text, audio_prompt_path=reference_audio)
        else:
            wav = model.generate(tagged_text)

        metadata = (
            f"Language: {language}\n"
            f"Tag: {lang_tag}\n"
            f"Input: {text[:100]}{'...' if len(text) > 100 else ''}\n"
            f"Characters: {len(text)}"
        )
        # .numpy() only works on CPU tensors that do not require grad, so
        # detach and move to host memory first (no-op if already there).
        audio = wav.squeeze(0).detach().cpu().numpy()
        return (model.sr, audio), metadata

    except Exception as e:
        # Best-effort boundary: report the failure in the UI instead of crashing.
        print(f"Error generating speech: {e}")
        return None, f"Error: {str(e)}"


# Create Gradio interface
# Layout: an intro note, then a two-column row (inputs on the left, outputs on
# the right), then a row of per-language example sentences. Components appear
# in the order they are constructed, so statement order here is the layout.
with gr.Blocks(title="ChatterBox Desi TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # ChatterBox Desi TTS
        Multi-language Indic text-to-speech powered by [BosonLab/chatterbox-desi](https://huggingface.co/BosonLab/chatterbox-desi).
        Supports **Bengali, Hindi, Marathi, Gujarati, Tamil, and Telugu**.

        > **Note**: The model automatically adds the language tag. Just select your language and type your text.
        """
    )

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column(scale=1):
            # Choices are the LANGUAGES keys, so the selected value can be used
            # directly as a lookup key by the handlers wired up below.
            language_dropdown = gr.Dropdown(
                label="Language",
                choices=list(LANGUAGES.keys()),
                value="Bengali (বাংলা)",
            )

            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter text in your selected language...",
                lines=5,
                max_lines=10,
            )

            example_btn = gr.Button("Load Example Text", variant="secondary")

            # type="filepath" so the handler receives a path it can forward as
            # the voice-cloning prompt; stays None when nothing is provided.
            reference_audio = gr.Audio(
                label="Reference Audio (Optional — for voice cloning)",
                type="filepath",
            )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: synthesized audio plus a short text summary / error message.
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Speech")
            metadata_output = gr.Textbox(label="Info", lines=4)

    # Language examples
    gr.Markdown("### Example Texts by Language")
    with gr.Row():
        # One Markdown cell per configured language, in LANGUAGES order.
        for lang_name, lang_info in LANGUAGES.items():
            gr.Markdown(
                f"**{lang_name}**\n\n{lang_info['example']}"
            )

    # Event handlers
    # "Load Example Text" fills the text box with the sample for the selected language.
    example_btn.click(
        fn=get_example_text,
        inputs=[language_dropdown],
        outputs=[text_input],
    )

    # "Generate Speech" runs synthesis; input order matches the parameters of
    # generate_speech and output order matches its (audio, metadata) return.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, language_dropdown, reference_audio],
        outputs=[audio_output, metadata_output],
    )


if __name__ == "__main__":
    # Listen on every network interface (0.0.0.0), port 7860, with sharing disabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)