"""
ChatterBox Desi TTS Space.

Gradio front-end for a fine-tuned multi-language Indic text-to-speech model.
Supports: Bengali, Hindi, Marathi, Gujarati, Tamil, Telugu.
"""

import os
import sys

import gradio as gr
import torch
import torchaudio
from huggingface_hub import snapshot_download

# The fine-tuning checkout sits next to this file; it has to be on sys.path
# before the `src.chatterbox_` import below can resolve.
chatterbox_finetuning_path = os.path.join(
    os.path.dirname(__file__), "chatterbox-finetuning"
)
sys.path.insert(0, chatterbox_finetuning_path)

from src.chatterbox_.tts import ChatterboxTTS  # noqa: E402

# Model configuration
MODEL_ID = "BosonLab/chatterbox-desi"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Language configuration: display name -> language code, text tag that is
# prepended to the prompt, and a sample sentence for the UI.
LANGUAGES = {
    "Bengali (বাংলা)": {
        "code": "bn",
        "tag": "[bn]",
        "example": "আমি বাংলায় কথা বলতে পারি। এটি একটি পরীক্ষামূলক বাক্য।",
    },
    "Hindi (हिंदी)": {
        "code": "hi",
        "tag": "[hi]",
        "example": "मैं हिंदी में बोल सकता हूँ। यह एक परीक्षण वाक्य है।",
    },
    "Marathi (मराठी)": {
        "code": "mr",
        "tag": "[mr]",
        "example": "मी मराठीत बोलू शकतो. हे एक चाचणी वाक्य आहे.",
    },
    "Gujarati (ગુજરાતી)": {
        "code": "gu",
        "tag": "[gu]",
        "example": "હું ગુજરાતીમાં બોલી શકું છું. આ એક પ્રાયોગિક વાક્ય છે.",
    },
    "Tamil (தமிழ்)": {
        "code": "ta",
        "tag": "[ta]",
        "example": "நான் தமிழில் பேச முடியும். இது ஒரு சோதனை வாக்கியம்.",
    },
    "Telugu (తెలుగు)": {
        "code": "te",
        "tag": "[te]",
        "example": "నేను తెలుగులో మాట్లాడగలను. ఇది ఒక పరీక్ష వాక్యం.",
    },
}

# Fallback used when a language name is not a key of LANGUAGES, and the
# initial value of the language dropdown.
DEFAULT_LANGUAGE = "Bengali (বাংলা)"

# Load model once at import time so every request reuses the same instance.
print(f"Loading model from {MODEL_ID}...")
print(f"Using device: {DEVICE}")

try:
    model_dir = snapshot_download(MODEL_ID)
    print(f"Model downloaded to: {model_dir}")
    model = ChatterboxTTS.from_local(model_dir, device=DEVICE)
    print("Model loaded successfully!")
except Exception as e:
    # The app cannot work without the model: log the reason and abort startup.
    print(f"Error loading model: {e}")
    raise


def _language_info(language):
    """Return the LANGUAGES entry for *language*, or the default entry."""
    return LANGUAGES.get(language, LANGUAGES[DEFAULT_LANGUAGE])


def get_example_text(language):
    """Return example text for the selected language.

    Unknown language names fall back to the Bengali example.
    """
    return _language_info(language)["example"]


def generate_speech(text, language, reference_audio=None):
    """
    Generate speech from text using the fine-tuned model.

    Args:
        text: Input text to convert to speech.
        language: Selected language name (a key of LANGUAGES; unknown names
            fall back to Bengali).
        reference_audio: Optional reference audio path for voice cloning.

    Returns:
        Tuple of (audio_data, metadata_str). audio_data is
        (sample_rate, numpy_array) on success and None on failure, in which
        case metadata_str carries the error message.
    """
    # Reject empty input up front: otherwise the model would be asked to
    # synthesize a bare language tag, and None would fail on .strip().
    if text is None or not text.strip():
        return None, "Error: Please enter some text to synthesize."

    try:
        lang_tag = _language_info(language)["tag"]
        stripped_text = text.strip()

        # Prepend language tag if not already present
        if stripped_text.startswith("["):
            tagged_text = stripped_text
        else:
            tagged_text = f"{lang_tag} {stripped_text}"

        # Generate speech with optional voice cloning. An empty path is
        # treated as "no reference" rather than forwarded to the model.
        if reference_audio:
            wav = model.generate(tagged_text, audio_prompt_path=reference_audio)
        else:
            wav = model.generate(tagged_text)

        metadata = (
            f"Language: {language}\n"
            f"Tag: {lang_tag}\n"
            f"Input: {text[:100]}{'...' if len(text) > 100 else ''}\n"
            f"Characters: {len(text)}"
        )

        # Tensor.numpy() raises for CUDA tensors and for tensors that require
        # grad, so detach and move to the CPU before converting.
        audio = wav.squeeze(0).detach().cpu().numpy()
        return (model.sr, audio), metadata
    except Exception as e:
        # UI boundary: report the failure in the info box instead of crashing.
        print(f"Error generating speech: {e}")
        return None, f"Error: {e}"


# Create Gradio interface
with gr.Blocks(title="ChatterBox Desi TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # ChatterBox Desi TTS

        Multi-language Indic text-to-speech powered by [BosonLab/chatterbox-desi](https://huggingface.co/BosonLab/chatterbox-desi).

        Supports **Bengali, Hindi, Marathi, Gujarati, Tamil, and Telugu**.

        > **Note**: The model automatically adds the language tag. Just select your language and type your text.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            language_dropdown = gr.Dropdown(
                label="Language",
                choices=list(LANGUAGES.keys()),
                value=DEFAULT_LANGUAGE,
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter text in your selected language...",
                lines=5,
                max_lines=10,
            )
            example_btn = gr.Button("Load Example Text", variant="secondary")
            reference_audio = gr.Audio(
                label="Reference Audio (Optional — for voice cloning)",
                type="filepath",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Speech")
            metadata_output = gr.Textbox(label="Info", lines=4)

    # Language examples
    gr.Markdown("### Example Texts by Language")
    with gr.Row():
        for lang_name, lang_info in LANGUAGES.items():
            gr.Markdown(
                f"**{lang_name}**\n\n{lang_info['example']}"
            )

    # Event handlers
    example_btn.click(
        fn=get_example_text,
        inputs=[language_dropdown],
        outputs=[text_input],
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, language_dropdown, reference_audio],
        outputs=[audio_output, metadata_output],
    )


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )