Spaces:
Sleeping
Sleeping
| """ | |
| TTS Hub: XTTS2 + Bark Text-to-Speech | |
| CPU inference for HuggingFace Spaces free tier | |
| Models: | |
| - XTTS2: Voice cloning with reference audio (default) | |
| - Bark: Preset voices with non-speech sounds [laughter], [music], etc. | |
| Bark implementation matches original C0untFloyd/bark-gui features: | |
| - Temperature controls (text_temp, waveform_temp) | |
| - Text chunking for long inputs | |
| - Seed control for reproducibility | |
| """ | |
| import argparse | |
| import gc | |
| import os | |
| import re | |
| import sys | |
| import tempfile | |
| from pathlib import Path | |
| import numpy as np | |
| import torch | |
| import scipy.io.wavfile as wavfile | |
# Runtime environment. An empty CUDA_VISIBLE_DEVICES hides every GPU so that
# inference stays on the CPU.
# NOTE(review): COQUI_TOS_AGREED presumably answers Coqui's licence prompt
# non-interactively and SUNO_USE_SMALL_MODELS presumably selects Bark's small
# checkpoints -- confirm against the respective libraries.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "",
        "COQUI_TOS_AGREED": "1",
        "SUNO_USE_SMALL_MODELS": "1",
    }
)

DEVICE = "cpu"

# Sample rate (Hz) used when writing Bark output and sizing inter-chunk silence.
BARK_SAMPLE_RATE = 24_000

# Model singletons, filled in on first use by load_xtts() / load_bark().
XTTS_MODEL = None
BARK_MODEL = None
BARK_PROCESSOR = None

# Display name -> language code passed to XTTS2.
XTTS_LANGUAGES = {
    "English": "en", "Spanish": "es", "French": "fr", "German": "de",
    "Italian": "it", "Portuguese": "pt", "Polish": "pl", "Turkish": "tr",
    "Russian": "ru", "Dutch": "nl", "Czech": "cs", "Arabic": "ar",
    "Chinese": "zh-cn", "Japanese": "ja", "Korean": "ko", "Hungarian": "hu",
}

# Number of "v2/<lang>_speaker_<n>" presets offered per language, in the order
# they appear in the voice dropdown.
_BARK_SPEAKERS_PER_LANGUAGE = (
    ("en", 10),
    ("de", 3),
    ("fr", 2),
    ("es", 2),
    ("zh", 2),
    ("ja", 1),
    ("ko", 1),
)

# Bark voice presets offered in the UI.
BARK_VOICES = [
    f"v2/{lang}_speaker_{index}"
    for lang, count in _BARK_SPEAKERS_PER_LANGUAGE
    for index in range(count)
]
def load_xtts():
    """Return the shared XTTS2 model, creating it on the first call.

    The model is cached in the module-level ``XTTS_MODEL`` so later calls
    return the same instance without reloading.
    """
    global XTTS_MODEL

    if XTTS_MODEL is None:
        print("Loading XTTS2 model...")
        # Imported lazily so the TTS package is only needed when XTTS2 is used.
        from TTS.api import TTS

        xtts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
        XTTS_MODEL = xtts.to(DEVICE)
        gc.collect()
        print("XTTS2 loaded!")

    return XTTS_MODEL
def load_bark():
    """Return the shared ``(model, processor)`` pair for Bark.

    Both objects are created from the ``suno/bark-small`` checkpoint on the
    first call and cached in ``BARK_MODEL`` / ``BARK_PROCESSOR`` afterwards.
    """
    global BARK_MODEL, BARK_PROCESSOR

    if BARK_MODEL is None:
        print("Loading Bark model...")
        # Imported lazily so transformers is only needed when Bark is used.
        from transformers import AutoProcessor, BarkModel

        checkpoint = "suno/bark-small"
        BARK_PROCESSOR = AutoProcessor.from_pretrained(checkpoint)
        BARK_MODEL = BarkModel.from_pretrained(checkpoint).to(DEVICE)
        gc.collect()
        print("Bark loaded!")

    return BARK_MODEL, BARK_PROCESSOR
def split_and_recombine_text(text, desired_length=200, max_length=300):
    """
    Split text into chunks for processing.

    Sentences (split after ``.``, ``!`` or ``?`` followed by whitespace) are
    packed greedily into chunks. A chunk is emitted as soon as it reaches
    ``desired_length`` and ends in sentence punctuation, or when adding the
    next sentence would push it past ``max_length``. Sentences longer than
    ``max_length`` are split on whitespace, and a single whitespace-free
    token longer than ``max_length`` (a URL, or text in a script written
    without spaces) is cut into ``max_length``-sized pieces so that no chunk
    exceeds the limit.

    Modelled on bark-gui's ``split_and_recombine_text`` (per the original
    comment in this file).

    Args:
        text: Input text to split. ``None`` is treated like an empty string.
        desired_length: Target chunk length
        max_length: Maximum chunk length

    Returns:
        List of text chunks (empty list for empty or whitespace-only input)
    """
    if text is None:
        return []
    text = text.strip()
    if not text:
        return []

    chunks = []
    current = ""

    # Split on sentence boundaries, keeping the punctuation with the sentence.
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue

        # Close the running chunk if this sentence would overflow it.
        if current and len(current) + len(sentence) + 1 > max_length:
            chunks.append(current)
            current = ""

        if len(sentence) > max_length:
            # The sentence alone is too long: fall back to word packing.
            for word in sentence.split():
                if current and len(current) + len(word) + 1 > max_length:
                    chunks.append(current)
                    current = ""
                # A word longer than max_length can only arrive here with an
                # empty running chunk (the check above just flushed it), so
                # full-size slices can be emitted directly.
                while max_length > 0 and len(word) > max_length:
                    chunks.append(word[:max_length])
                    word = word[max_length:]
                current = f"{current} {word}" if current else word
        else:
            current = f"{current} {sentence}" if current else sentence

        # Long enough and cleanly terminated: emit the chunk now.
        if len(current) >= desired_length and current[-1] in '.!?':
            chunks.append(current)
            current = ""

    if current:
        chunks.append(current)

    return chunks if chunks else [text]
def generate_bark_audio(
    model,
    processor,
    text: str,
    voice_preset: str,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    seed: int = None,
    fine_temp: float = 0.5,
) -> np.ndarray:
    """
    Generate audio for a single text chunk using Bark.

    Args:
        model: BarkModel instance
        processor: AutoProcessor instance
        text: Text to synthesize
        voice_preset: Voice preset string (e.g., "v2/en_speaker_6")
        text_temp: Temperature forwarded as ``semantic_temperature``
        waveform_temp: Temperature forwarded as ``coarse_temperature``
        seed: Random seed for reproducibility. A positive value seeds the
            global torch and NumPy RNGs before generation; ``None`` or a
            non-positive value leaves the RNG state untouched.
        fine_temp: Temperature forwarded as ``fine_temperature``. Defaults to
            0.5, the value that was previously hard-coded.

    Returns:
        Audio array (numpy), with singleton dimensions squeezed out
    """
    # Seed both RNGs so repeated calls with the same seed start from the same
    # random state.
    if seed is not None and seed > 0:
        torch.manual_seed(seed)
        np.random.seed(seed)

    # Tokenize the text together with the voice preset.
    inputs = processor(text, voice_preset=voice_preset, return_tensors="pt").to(DEVICE)

    # The prefixed temperature kwargs are passed straight to model.generate().
    with torch.no_grad():
        audio_array = model.generate(
            **inputs,
            do_sample=True,
            semantic_temperature=text_temp,
            coarse_temperature=waveform_temp,
            fine_temperature=fine_temp,
        )

    return audio_array.cpu().numpy().squeeze()
def synthesize_xtts(
    text: str,
    reference_audio: str,
    language: str = "English",
    speed: float = 1.0,
    progress=None,
) -> tuple:
    """
    XTTS2 synthesis with voice cloning.

    Args:
        text: Text to synthesize
        reference_audio: Path to the reference voice recording
        language: Key of ``XTTS_LANGUAGES``; unknown names fall back to "en"
        speed: Speed value passed through to ``tts_to_file``
        progress: Optional callable invoked as ``progress(fraction, message)``

    Returns:
        Tuple of (audio_path, status_message). ``audio_path`` is ``None``
        when validation or generation fails.
    """
    def report(fraction, message):
        # Compare against None rather than relying on truthiness, so any
        # callable is accepted no matter how it behaves in a boolean context.
        if progress is not None:
            progress(fraction, message)

    if not text or not text.strip():
        return None, "Please enter text to synthesize"
    if reference_audio is None:
        return None, "XTTS2 requires a reference voice audio file"

    report(0.1, "Loading XTTS2...")
    tts = load_xtts()
    report(0.3, "Processing...")

    lang_code = XTTS_LANGUAGES.get(language, "en")

    # mkstemp creates the file atomically; the deprecated mktemp only returned
    # a name that another process could claim before it was written.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        report(0.5, "Generating speech (XTTS2)...")
        tts.tts_to_file(
            text=text,
            file_path=output_path,
            speaker_wav=reference_audio,
            language=lang_code,
            speed=speed,
        )
        report(1.0, "Done!")
        return output_path, "XTTS2: Speech generated successfully!"
    except Exception as e:
        # Best-effort cleanup so failed requests do not leave temp files behind.
        try:
            os.remove(output_path)
        except OSError:
            pass
        return None, f"XTTS2 Error: {str(e)}"
    finally:
        gc.collect()
def synthesize_bark(
    text: str,
    voice_preset: str = "v2/en_speaker_6",
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    seed: int = -1,
    progress=None,
) -> tuple:
    """
    Bark synthesis with preset voices.

    The text is split into chunks, each chunk is generated separately, and the
    pieces are joined with 250 ms of silence before being written to a
    temporary WAV file.

    Args:
        text: Text to synthesize (can be long, will be chunked)
        voice_preset: Voice preset (e.g., "v2/en_speaker_6")
        text_temp: Text/semantic temperature (0.1-1.0, default 0.7)
        waveform_temp: Waveform/coarse temperature (0.1-1.0, default 0.7)
        seed: Random seed (None or <= 0 = random, > 0 = fixed seed)
        progress: Optional callable invoked as ``progress(fraction, message)``

    Returns:
        Tuple of (audio_path, status_message). ``audio_path`` is ``None``
        when validation or generation fails.
    """
    def report(fraction, message):
        # Compare against None rather than relying on truthiness, so any
        # callable is accepted no matter how it behaves in a boolean context.
        if progress is not None:
            progress(fraction, message)

    if not text or not text.strip():
        return None, "Please enter text to synthesize"

    report(0.1, "Loading Bark...")
    model, processor = load_bark()
    report(0.2, "Processing text...")

    # Pick a random seed when none was requested, then fold it into
    # [1, 2**32 - 2] so it is always a positive value the generator will use.
    seed_modulus = 2**32 - 1
    if seed is None or seed <= 0:
        seed = np.random.default_rng().integers(1, seed_modulus)
    seed = int(seed) % seed_modulus
    if seed <= 0:
        seed = 1

    output_path = None
    try:
        text_chunks = split_and_recombine_text(text, desired_length=200, max_length=300)
        total = len(text_chunks)
        report(0.3, f"Generating {total} chunk(s)...")

        # 250 ms of silence inserted between consecutive chunks.
        silence = np.zeros(int(0.25 * BARK_SAMPLE_RATE), dtype=np.float32)

        all_audio_parts = []
        current_seed = seed
        for i, chunk in enumerate(text_chunks):
            report(0.3 + 0.6 * (i / total), f"Generating chunk {i+1}/{total}...")

            audio_array = generate_bark_audio(
                model=model,
                processor=processor,
                text=chunk,
                voice_preset=voice_preset,
                text_temp=text_temp,
                waveform_temp=waveform_temp,
                seed=current_seed,
            )
            all_audio_parts.append(audio_array)

            # Silence between chunks, but not after the last one.
            if i < total - 1:
                all_audio_parts.append(silence)

            # Update seed for next chunk
            # (matches original: currentseed = torch.random.initial_seed())
            current_seed = torch.random.initial_seed() % seed_modulus
            if current_seed <= 0:
                current_seed = 1

        final_audio = np.concatenate(all_audio_parts)

        # The file is created only after generation succeeded. mkstemp makes
        # it atomically, unlike the deprecated, race-prone mktemp.
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        wavfile.write(output_path, BARK_SAMPLE_RATE, final_audio)

        report(1.0, "Done!")

        chunk_info = f" ({total} chunks)" if total > 1 else ""
        return output_path, f"Bark: Generated with {voice_preset}, seed={seed}{chunk_info}"
    except Exception as e:
        # Best-effort cleanup so a failed write does not leave a temp file.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None, f"Bark Error: {str(e)}"
    finally:
        gc.collect()
def synthesize(
    text: str,
    model_choice: str = "XTTS2 (Voice Cloning)",
    reference_audio: str = None,
    language: str = "English",
    speed: float = 1.0,
    voice_preset: str = "v2/en_speaker_6",
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    seed: int = -1,
    progress=None,
) -> tuple:
    """
    Single entry point that routes a request to XTTS2 or Bark.

    Any ``model_choice`` containing "XTTS2" selects XTTS2; every other value
    selects Bark. Only the arguments relevant to the chosen model are
    forwarded.

    Args:
        text: Text to convert to speech
        model_choice: "XTTS2 (Voice Cloning)" or "Bark (Preset Voices)"
        reference_audio: Reference audio for XTTS2 voice cloning
        language: Target language (XTTS2 only)
        speed: Speech speed (XTTS2 only)
        voice_preset: Bark voice preset
        text_temp: Bark text/semantic temperature (0.1-1.0)
        waveform_temp: Bark waveform/coarse temperature (0.1-1.0)
        seed: Bark random seed (-1 = random)
        progress: Progress callback handed to the selected backend

    Returns:
        Tuple of (audio_path, status_message)
    """
    wants_xtts = "XTTS2" in model_choice
    if not wants_xtts:
        return synthesize_bark(text, voice_preset, text_temp, waveform_temp, seed, progress)
    return synthesize_xtts(text, reference_audio, language, speed, progress)
def _cli_run_xtts(args, preview):
    """Synthesize ``args.text`` with XTTS2 and write it to ``args.output``."""
    # Use the shared loader instead of constructing a second model here.
    tts = load_xtts()
    lang_code = XTTS_LANGUAGES.get(args.language, "en")

    print(f"Text: {preview}")
    print(f"Reference: {args.reference}")
    print(f"Language: {args.language} ({lang_code})")
    print(f"Speed: {args.speed}")
    print("Generating speech...")

    tts.tts_to_file(
        text=args.text,
        file_path=args.output,
        speaker_wav=args.reference,
        language=lang_code,
        speed=args.speed,
    )


def _cli_run_bark(args, preview):
    """Synthesize ``args.text`` chunk by chunk with Bark and write ``args.output``."""
    # load_bark() prints its own "Loading Bark model..." message.
    model, processor = load_bark()

    # Same seed normalisation as synthesize_bark(): random when not given,
    # then folded into [1, 2**32 - 2] so the value is always positive.
    seed_modulus = 2**32 - 1
    seed = args.seed
    if seed is None or seed <= 0:
        seed = np.random.default_rng().integers(1, seed_modulus)
    seed = int(seed) % seed_modulus
    if seed <= 0:
        seed = 1

    print(f"Text: {preview}")
    print(f"Voice: {args.voice}")
    print(f"Text temp: {args.text_temp}")
    print(f"Waveform temp: {args.waveform_temp}")
    print(f"Seed: {seed}")

    text_chunks = split_and_recombine_text(args.text, desired_length=200, max_length=300)
    total = len(text_chunks)
    print(f"Processing {total} chunk(s)...")

    # 250 ms of silence inserted between consecutive chunks.
    silence = np.zeros(int(0.25 * BARK_SAMPLE_RATE), dtype=np.float32)

    all_audio_parts = []
    current_seed = seed
    for i, chunk in enumerate(text_chunks):
        print(f" Chunk {i+1}/{total}: {chunk[:50]}...")
        audio_array = generate_bark_audio(
            model=model,
            processor=processor,
            text=chunk,
            voice_preset=args.voice,
            text_temp=args.text_temp,
            waveform_temp=args.waveform_temp,
            seed=current_seed,
        )
        all_audio_parts.append(audio_array)
        if i < total - 1:
            all_audio_parts.append(silence)
        current_seed = torch.random.initial_seed() % seed_modulus
        if current_seed <= 0:
            current_seed = 1

    final_audio = np.concatenate(all_audio_parts)
    wavfile.write(args.output, BARK_SAMPLE_RATE, final_audio)


def cli_synthesize(args):
    """
    CLI mode for synthesis.

    Validates the parsed arguments, runs the selected model and writes the
    result to ``args.output``. Exits with status 1 when the text is empty or
    when XTTS2 is selected without a reference recording.

    Args:
        args: Parsed ``tts`` sub-command namespace (text, output, model,
            reference, language, speed, voice, text_temp, waveform_temp, seed)
    """
    # Reject empty text up front: it would produce zero chunks and make
    # np.concatenate() fail on the Bark path.
    if not args.text or not args.text.strip():
        print("Error: --text must not be empty")
        sys.exit(1)

    use_xtts = args.model == "xtts2"
    if use_xtts and not args.reference:
        print("Error: XTTS2 requires --reference audio file")
        sys.exit(1)

    preview = f"{args.text[:100]}{'...' if len(args.text) > 100 else ''}"
    if use_xtts:
        _cli_run_xtts(args, preview)
    else:
        _cli_run_bark(args, preview)

    print(f"Output saved to: {args.output}")
def launch_gradio():
    """
    Build the Gradio UI and start serving it.

    The page has a text box, a model selector that shows either the XTTS2 or
    the Bark option group, and a button that runs ``synthesize`` and fills the
    audio player and status box. The button handler requests a
    ``gr.Progress`` tracker so the progress messages emitted by the synthesis
    functions are shown in the UI.
    """
    import gradio as gr

    # Blank lines separate the Markdown blocks; without one after the table
    # the following line would be parsed as another table row.
    description = "\n".join(
        (
            "# TTS Hub: XTTS2 + Bark",
            "",
            "Two powerful TTS models in one space:",
            "",
            "| Model | Voice Source | Special Features |",
            "|-------|--------------|------------------|",
            "| **XTTS2** (default) | Your audio sample | Voice cloning, 16 languages |",
            "| **Bark** | Preset voices | [laughter], [music], temperature control, seed |",
            "",
            "**Bark special tokens:** `[laughter]` `[laughs]` `[sighs]` `[music]` `[gasps]` `[clears throat]` `♪ singing ♪`",
        )
    )

    voice_codes = (
        "**Voice codes:** `en`=English, `de`=German, `fr`=French, `es`=Spanish, "
        "`zh`=Chinese, `ja`=Japanese, `ko`=Korean"
    )

    tips = "\n".join(
        (
            "## Tips",
            "",
            "- **XTTS2:** Use 3-30 seconds of clear speech as reference. No background noise.",
            "- **Bark:** Long text is automatically split into chunks. Use temperature controls to adjust output diversity.",
            "- **Bark Seed:** Use a positive seed number to get reproducible results.",
            "- **CPU Speed:** XTTS2 ~1-2 min, Bark ~30-60 sec per chunk.",
        )
    )

    with gr.Blocks(title="TTS Hub") as demo:
        gr.Markdown(description)

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text to synthesize",
                    placeholder="Enter text... Use [laughter] [music] for Bark. Long text will be automatically chunked.",
                    lines=4,
                    value="Hello! This is a test of the text to speech system.",
                )
                model_choice = gr.Radio(
                    choices=["XTTS2 (Voice Cloning)", "Bark (Preset Voices)"],
                    value="XTTS2 (Voice Cloning)",
                    label="Model",
                )

                # XTTS2 options (visible by default)
                with gr.Group(visible=True) as xtts_options:
                    reference_audio = gr.Audio(
                        label="Reference Voice (3-30 seconds)",
                        type="filepath",
                    )
                    language = gr.Dropdown(
                        choices=list(XTTS_LANGUAGES.keys()),
                        value="English",
                        label="Language",
                    )
                    speed = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speed",
                    )

                # Bark options (hidden until Bark is selected)
                with gr.Group(visible=False) as bark_options:
                    voice_preset = gr.Dropdown(
                        choices=BARK_VOICES,
                        value="v2/en_speaker_6",
                        label="Voice Preset",
                    )
                    with gr.Row():
                        text_temp = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            value=0.7,
                            step=0.1,
                            label="Text Temperature",
                            info="Higher = more diverse, lower = more conservative",
                        )
                        waveform_temp = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            value=0.7,
                            step=0.1,
                            label="Waveform Temperature",
                            info="Higher = more diverse, lower = more conservative",
                        )
                    seed = gr.Number(
                        value=-1,
                        label="Seed",
                        info="-1 = random, positive number = reproducible",
                        precision=0,
                    )
                    gr.Markdown(voice_codes)

                generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

            with gr.Column():
                output_audio = gr.Audio(label="Generated Speech")
                status = gr.Textbox(label="Status", interactive=False)

        def toggle_options(choice):
            """Show the option group that belongs to the selected model."""
            show_xtts = "XTTS2" in choice
            return gr.update(visible=show_xtts), gr.update(visible=not show_xtts)

        def run_synthesis(
            text,
            model_name,
            reference,
            lang,
            rate,
            preset,
            semantic_temp,
            coarse_temp,
            seed_value,
            progress=gr.Progress(),
        ):
            """Call synthesize() with a progress callback bound to this event."""

            # NOTE(review): the tracker is wrapped in a plain function so the
            # synthesis code receives an ordinary callable taking
            # (fraction, message); gr.Progress may not be safe to test with
            # `if progress:` -- confirm against the installed Gradio version.
            def report(fraction, message):
                progress(fraction, desc=message)

            return synthesize(
                text,
                model_name,
                reference,
                lang,
                rate,
                preset,
                semantic_temp,
                coarse_temp,
                seed_value,
                progress=report,
            )

        model_choice.change(
            fn=toggle_options,
            inputs=[model_choice],
            outputs=[xtts_options, bark_options],
        )

        generate_btn.click(
            fn=run_synthesis,
            inputs=[text_input, model_choice, reference_audio, language, speed,
                    voice_preset, text_temp, waveform_temp, seed],
            outputs=[output_audio, status],
            api_name="synthesize",
        )

        gr.Markdown(tips)

    demo.queue().launch()
def _build_arg_parser():
    """Create the command-line parser with its single ``tts`` sub-command."""
    parser = argparse.ArgumentParser(description="TTS Hub: XTTS2 + Bark")
    commands = parser.add_subparsers(dest="command")

    tts = commands.add_parser("tts", help="Text-to-speech synthesis")
    tts.add_argument("-t", "--text", required=True, help="Text to synthesize")
    tts.add_argument("-o", "--output", required=True, help="Output audio path")
    tts.add_argument(
        "-m", "--model",
        default="xtts2",
        choices=["xtts2", "bark"],
        help="TTS model (default: xtts2)",
    )

    # Options that only apply to XTTS2.
    tts.add_argument("-r", "--reference", help="Reference voice audio (XTTS2)")
    tts.add_argument("-l", "--language", default="English", help="Language (XTTS2)")
    tts.add_argument("-s", "--speed", type=float, default=1.0, help="Speed (XTTS2)")

    # Options that only apply to Bark.
    tts.add_argument("-v", "--voice", default="v2/en_speaker_6", help="Voice preset (Bark)")
    tts.add_argument(
        "--text-temp",
        type=float,
        default=0.7,
        help="Text/semantic temperature 0.1-1.0 (Bark, default: 0.7)",
    )
    tts.add_argument(
        "--waveform-temp",
        type=float,
        default=0.7,
        help="Waveform/coarse temperature 0.1-1.0 (Bark, default: 0.7)",
    )
    tts.add_argument("--seed", type=int, default=-1, help="Random seed, -1=random (Bark)")

    return parser


def main():
    """Parse the command line, then run the ``tts`` command or start the UI.

    When no sub-command is given the Gradio interface is launched instead.
    """
    args = _build_arg_parser().parse_args()

    if args.command != "tts":
        launch_gradio()
        return

    cli_synthesize(args)
if __name__ == "__main__":
    # main() launches the Gradio UI itself when no sub-command is supplied,
    # so the bare invocation needs no separate branch here.
    main()