Spaces:
Sleeping
Sleeping
| """ | |
| ChatterBox Bengali TTS Space | |
| Fine-tuned Bengali TTS model for text-to-speech synthesis | |
| """ | |
| import sys | |
| import os | |
| import tempfile | |
| import gradio as gr | |
| import torch | |
| from huggingface_hub import snapshot_download | |
| import torchaudio | |
| # Add chatterbox-finetuning to path | |
| chatterbox_finetuning_path = os.path.join(os.path.dirname(__file__), "chatterbox-finetuning") | |
| sys.path.insert(0, chatterbox_finetuning_path) | |
| from src.chatterbox_.tts import ChatterboxTTS | |
# Hub repository holding the fine-tuned weights, and the compute device to use.
MODEL_ID = "BosonLab/chatterbox-bangla"
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Fetch the model snapshot and instantiate the TTS model once at import time.
print(f"Loading model from {MODEL_ID}...")
print(f"Using device: {DEVICE}")
try:
    # Resolve (downloading if needed) a local copy of the Hub repository.
    model_dir = snapshot_download(MODEL_ID)
    print(f"Model downloaded to: {model_dir}")

    # Build the ChatterBox TTS model from the local snapshot.
    model = ChatterboxTTS.from_local(model_dir, device=DEVICE)
    print("Model loaded successfully!")
except Exception as exc:
    # Log for the Space's console, then propagate: the app cannot run without the model.
    print(f"Error loading model: {exc}")
    raise
def generate_speech(text, reference_audio=None):
    """
    Generate speech from text using the fine-tuned model.

    Args:
        text: Bengali text to convert to speech.
        reference_audio: Optional path to a reference audio file for voice
            cloning. When None, the model is called without an audio prompt.

    Returns:
        Tuple of (audio, metadata). On success, audio is a
        (sample_rate, samples) pair with samples as a NumPy array, and
        metadata is a short status string. On any failure, including blank
        input, audio is None and metadata is a message starting with "Error:".
    """
    try:
        # Reject blank input up front instead of handing it to the model.
        if not text or not text.strip():
            return None, "Error: Please enter some Bengali text"

        # Generate speech with voice cloning if reference audio is provided.
        if reference_audio is not None:
            wav = model.generate(text, audio_prompt_path=reference_audio)
        else:
            wav = model.generate(text)

        # Tensor.numpy() fails for CUDA tensors and tensors that require grad,
        # so detach and move to host memory first (no-ops for a plain CPU tensor).
        # NOTE(review): assumes model.generate returns a torch tensor with a
        # leading channel/batch dimension of size 1 -- confirm against ChatterboxTTS.
        samples = wav.detach().cpu().squeeze(0).numpy()
        return (model.sr, samples), f"Generated {len(text)} characters of Bengali text"
    except Exception as e:
        # UI boundary: surface the failure in the metadata box instead of crashing.
        print(f"Error generating speech: {e}")
        return None, f"Error: {str(e)}"
# Build the Gradio UI: inputs in the first column, results in the second.
with gr.Blocks(title="ChatterBox Bengali TTS", theme=gr.themes.Soft()) as demo:
    with gr.Row():
        # Input side: text prompt, optional voice reference, trigger button.
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Bengali Text",
                placeholder="Enter Bengali text here...",
                lines=5,
                max_lines=10,
            )
            reference_audio = gr.Audio(
                label="Reference Audio (Optional for Voice Cloning)",
                type="filepath",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Output side: synthesized audio and a status message.
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Speech")
            metadata_output = gr.Textbox(label="Metadata", lines=3)

    # Wire the button to the synthesis function.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, reference_audio],
        outputs=[audio_output, metadata_output],
    )
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)