# app.py
"""
Local XTTS-v2 + Bark voice-cloning web UI (Gradio).

- Upload your voice (wav) as a reference clip
- Choose model: XTTS-v2 (Coqui) or Bark (Suno)
- No external API token required if models are installed locally

HOW THIS WORKS (quick):
1) Install requirements from requirements.txt
2) Download XTTS-v2 model files into ./models/xtts_v2/ (see notes below)
   OR install Coqui TTS via pip and let it download pretrained models
3) Bark will use the Hugging Face model cache (or a local model dir)
4) Run: python app.py -> open http://localhost:7860

NOTES:
- Generating ~2 hours of audio is possible by splitting text into chunks
  and concatenating outputs, but it is VERY memory/CPU/GPU intensive.
- For best results use a decent GPU and enough disk space (several GBs
  for models).
"""

import io
import os
import re
import shutil
import tempfile
from typing import List, Optional

import numpy as np
import soundfile as sf
import torch  # noqa: F401 -- both backends require torch to be importable
import gradio as gr

# Optional backends: the UI still loads if one of them is missing, and the
# generation functions raise a clear error instead of failing at import time.
try:
    from TTS.api import TTS  # Coqui TTS
    COQUI_AVAILABLE = True
except Exception:
    COQUI_AVAILABLE = False

try:
    from transformers import pipeline
    TRANSFORMERS_AVAILABLE = True
except Exception:
    TRANSFORMERS_AVAILABLE = False


def split_text(text: str, max_tokens: int = 2000) -> List[str]:
    """Split *text* into chunks of at most roughly *max_tokens* characters.

    Naive splitter: breaks on sentence-ending punctuation and greedily packs
    sentences into chunks. A single sentence longer than *max_tokens* becomes
    its own (oversized) chunk rather than being cut mid-sentence.

    NOTE: despite the parameter name, the limit is measured in characters,
    not model tokens.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks: List[str] = []
    cur = ""
    for sentence in sentences:
        if len(cur) + len(sentence) < max_tokens:
            cur = f"{cur} {sentence}" if cur else sentence
        else:
            # Guard against appending an empty chunk when the very first
            # sentence already exceeds the limit (cur is still "").
            if cur:
                chunks.append(cur)
            cur = sentence
    if cur:
        chunks.append(cur)
    return chunks


def generate_xtts(reference_wav_path: str,
                  text: str,
                  model_path: Optional[str] = None,
                  out_path: str = "output_xtts.wav",
                  language: str = "en") -> str:
    """Synthesize *text* with Coqui XTTS, cloning the voice in the reference WAV.

    Args:
        reference_wav_path: path to a short (3-10 s) speaker reference clip.
        model_path: optional local model directory; falls back to the first
            XTTS model Coqui knows about (or the first listed model).
        out_path: where the concatenated WAV is written.
        language: ISO language code required by XTTS (new keyword, defaults
            to English so existing callers are unaffected).

    Returns:
        The path of the written WAV file.

    Raises:
        RuntimeError: if the Coqui TTS package is not installed.
    """
    if not COQUI_AVAILABLE:
        raise RuntimeError(
            "Coqui TTS package not available. Install 'coqui-tts' from requirements.txt"
        )

    # Initialize TTS - if a local model path is provided, load it directly.
    if model_path and os.path.exists(model_path):
        tts = TTS(model_path)
    else:
        # Fall back to a pretrained model; prefer XTTS if present.
        models = TTS.list_models()
        selected = next((m for m in models if 'xtts' in m.lower()), None)
        if selected is None:
            selected = models[0]
        tts = TTS(selected)

    # Coqui's cloning API: TTS.tts(text=..., speaker_wav=..., language=...).
    # (The previous `tts_with_vocoder` call is not part of the public API.)
    chunks = split_text(text, max_tokens=1500)
    audio_pieces = []
    for chunk in chunks:
        wav = tts.tts(text=chunk, speaker_wav=reference_wav_path, language=language)
        audio_pieces.append(np.asarray(wav))

    full = np.concatenate(audio_pieces, axis=0)
    sf.write(out_path, full, samplerate=tts.synthesizer.output_sample_rate)
    return out_path


def generate_bark(text: str, out_path: str = "output_bark.wav") -> str:
    """Synthesize *text* with Suno Bark via the transformers TTS pipeline.

    Bark does not use the uploaded reference clip; it synthesizes with its
    own built-in voices. Text is chunked because Bark degrades on long input.

    Returns:
        The path of the written WAV file.

    Raises:
        RuntimeError: if transformers is not installed.
    """
    if not TRANSFORMERS_AVAILABLE:
        raise RuntimeError("transformers not installed. Install from requirements.txt")

    tts = pipeline('text-to-speech', model='suno/bark')

    chunks = split_text(text, max_tokens=600)
    audio_buffers = []
    sr = None
    for chunk in chunks:
        result = tts(chunk)
        # The pipeline returns {'audio': ndarray, 'sampling_rate': int};
        # keep a defensive path for raw bytes just in case.
        if isinstance(result, dict) and 'audio' in result:
            audio = result['audio']
            if sr is None:
                # Bark natively outputs 24 kHz; trust the pipeline's value
                # instead of hard-coding one.
                sr = result.get('sampling_rate', 24000)
        else:
            audio = result
        if isinstance(audio, np.ndarray):
            # Pipeline output is shaped (1, n); flatten for concatenation.
            arr = np.squeeze(audio)
        else:
            # Assume WAV bytes.
            arr, sr = sf.read(io.BytesIO(audio))
        audio_buffers.append(arr)

    full = np.concatenate(audio_buffers, axis=0)
    if sr is None:
        sr = 24000
    sf.write(out_path, full, samplerate=sr)
    return out_path


def _resolve_reference_path(uploaded_ref) -> str:
    """Copy the uploaded reference into a stable temp file and return its path.

    Gradio may hand us a plain path string or a file-like object depending on
    version; handle both. On failure, fall back to the object itself.
    """
    try:
        tmp_ref = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        tmp_ref.close()
        source = uploaded_ref.name if hasattr(uploaded_ref, 'name') else uploaded_ref
        if isinstance(source, str) and os.path.exists(source):
            shutil.copy(source, tmp_ref.name)
        else:
            with open(tmp_ref.name, 'wb') as f:
                f.write(uploaded_ref.read())
        return tmp_ref.name
    except Exception:
        return uploaded_ref


def run_generate(model_choice, uploaded_ref, text_input, xtts_model_dir):
    """Gradio callback: dispatch to the chosen backend.

    Returns:
        (file_path_or_None, status_message) -- wired to the output File
        component and the Status textbox. Errors go to the status box
        instead of being fed into the File component.
    """
    if uploaded_ref is None:
        return None, "Please upload a reference WAV (at least 3-6 seconds)."

    ref_path = _resolve_reference_path(uploaded_ref)
    out_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav').name
    try:
        if model_choice == 'XTTS-v2 (Coqui)':
            generated = generate_xtts(
                ref_path,
                text_input,
                model_path=xtts_model_dir if xtts_model_dir else None,
                out_path=out_file,
            )
        else:
            generated = generate_bark(text_input, out_path=out_file)
    except Exception as e:
        return None, f"Error during generation: {e}"
    return generated, "Done."


with gr.Blocks() as demo:
    gr.Markdown(
        "# Local Voice Clone — XTTS-v2 & Bark (Gradio)\n"
        "Upload a short reference WAV (3–10s) and enter text. "
        "No external API token required if models are local."
    )
    with gr.Row():
        with gr.Column(scale=1):
            model_choice = gr.Radio(
                choices=['XTTS-v2 (Coqui)', 'Bark (Suno)'],
                value='XTTS-v2 (Coqui)',
                label='Model',
            )
            ref_upload = gr.File(label='Upload reference WAV (3-10 sec preferred)')
            xtts_dir = gr.Textbox(
                label='XTTS local model dir (optional)',
                placeholder='./models/xtts_v2/',
            )
            text_input = gr.Textbox(
                label='Text to synthesize',
                lines=10,
                placeholder='Type the text you want spoken...',
            )
            run_btn = gr.Button('Generate Voice Now')
        with gr.Column(scale=1):
            out_file = gr.File(label='Generated WAV (download)')
            status = gr.Textbox(label='Status', interactive=False)

    # Wire BOTH outputs so errors surface in the status box rather than
    # being returned into the File component.
    run_btn.click(
        fn=run_generate,
        inputs=[model_choice, ref_upload, text_input, xtts_dir],
        outputs=[out_file, status],
    )

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0', share=False)

# ------------------------- requirements.txt -------------------------
# Put this block into a separate 'requirements.txt' file when ready.
# For convenience it's included in this single file package.

# requirements.txt content (copy to a file):
# gradio>=3.30
# torch
# numpy
# soundfile
# scipy
# transformers>=4.31.0
# coqui-tts
# librosa
# typing-extensions
# accelerate
#
# Notes: Installing 'coqui-tts' may pull many dependencies and requires Git LFS to download large pretrained files.
# For Bark use the 'suno/bark' model via transformers pipeline (Transformers>=4.31). If you prefer the official 'bark' repo,
# follow instructions at https://github.com/suno-ai/bark