# app.py — exported from a Hugging Face Space (the "Spaces: Sleeping" status
# banner that preceded this file was page-scrape residue, not source code).
| # app.py | |
| """ | |
| Hugging Face / Local XTTS-v2 + Bark Gradio Web UI | |
| - Upload your voice (wav) as reference | |
| - Choose model: XTTS-v2 (Coqui) or Bark (Suno) | |
| - No external API token required if models are installed locally | |
| HOW THIS WORKS (quick): | |
| 1) Install requirements from requirements.txt | |
| 2) Download XTTS-v2 model files into ./models/xtts_v2/ (see notes below) | |
| OR install Coqui TTS via pip and let it download pretrained models | |
| 3) Bark will use the Hugging Face model cache (or local model dir) | |
| 4) Run: python app.py -> open http://localhost:7860 | |
| NOTES: | |
| - Generating ~2 hours of audio is possible by splitting text into chunks and concatenating outputs, but it is VERY memory/CPU/GPU intensive. | |
| - For best results use a decent GPU and enough disk space (several GBs for models). | |
| """ | |
| import os | |
| import tempfile | |
| import math | |
| from pathlib import Path | |
| from typing import Optional | |
| import torch | |
| import numpy as np | |
| import soundfile as sf | |
| import gradio as gr | |
| # Try imports for Coqui XTTS and Bark/Transformers | |
# Optional backends: probe at import time so the UI can start with either (or
# neither) installed; the generate_* functions raise a clear error if missing.
try:
    from TTS.api import TTS  # Coqui TTS
    COQUI_AVAILABLE = True
except Exception:
    # Broad catch on purpose: a broken install should degrade, not crash the app.
    COQUI_AVAILABLE = False
try:
    from transformers import pipeline
    TRANSFORMERS_AVAILABLE = True
except Exception:
    TRANSFORMERS_AVAILABLE = False
| # Utility: chunk text into smaller pieces | |
def split_text(text: str, max_tokens: int = 2000):
    """Split *text* into chunks of at most ``max_tokens`` characters.

    Splits on sentence boundaries (., !, ?) while keeping the punctuation.
    Despite the parameter name, the limit is measured in characters.

    Args:
        text: Input text; an empty string yields an empty list.
        max_tokens: Maximum characters per chunk.

    Returns:
        List of non-empty text chunks, each no longer than ``max_tokens``.
    """
    import re
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    cur = ""
    for s in sentences:
        # FIX: a single sentence longer than the limit used to produce an
        # oversized chunk; hard-split it so no chunk ever exceeds max_tokens.
        while len(s) > max_tokens:
            if cur:
                chunks.append(cur)
                cur = ""
            chunks.append(s[:max_tokens])
            s = s[max_tokens:]
        if not s:
            continue  # avoid appending a stray trailing space for empty pieces
        # FIX: account for the joining space so chunks stay within the limit.
        needed = len(s) + (1 if cur else 0)
        if len(cur) + needed <= max_tokens:
            cur = f"{cur} {s}" if cur else s
        else:
            if cur:
                chunks.append(cur)
            cur = s
    if cur:
        chunks.append(cur)
    return chunks
| # Generate with Coqui XTTS (local) | |
def generate_xtts(reference_wav_path: str, text: str, model_path: Optional[str] = None, out_path: str = "output_xtts.wav", language: str = "en"):
    """Clone the voice from *reference_wav_path* and speak *text* with Coqui XTTS.

    Args:
        reference_wav_path: Short (3-10 s) reference WAV of the target voice.
        text: Text to synthesize; split into ~1500-char chunks, then concatenated.
        model_path: Optional local model directory; otherwise falls back to an
            installed pretrained XTTS model (or the first listed model).
        out_path: Destination WAV path.
        language: Language code for the synthesizer (new parameter, defaults to
            "en" for backward compatibility; XTTS is multilingual and needs one).

    Returns:
        The path of the written WAV file.

    Raises:
        RuntimeError: If the Coqui TTS package is not installed.
    """
    if not COQUI_AVAILABLE:
        raise RuntimeError("Coqui TTS package not available. Install 'coqui-tts' from requirements.txt")
    if model_path and os.path.exists(model_path):
        tts = TTS(model_path)
    else:
        # Fall back to whatever pretrained models the installation offers,
        # preferring an XTTS checkpoint since it supports voice cloning.
        models = TTS.list_models()
        selected = next((m for m in models if 'xtts' in m.lower()), None)
        tts = TTS(selected if selected is not None else models[0])
    pieces = []
    for chunk in split_text(text, max_tokens=1500):
        # FIX: TTS.api.TTS has no 'tts_with_vocoder' method; the public cloning
        # API is tts(), which accepts speaker_wav (reference) and language.
        wav = tts.tts(text=chunk, speaker_wav=reference_wav_path, language=language)
        pieces.append(np.asarray(wav))
    full = np.concatenate(pieces, axis=0)
    sf.write(out_path, full, samplerate=tts.synthesizer.output_sample_rate)
    return out_path
| # Generate with Bark via transformers TTS pipeline (local) | |
def generate_bark(text: str, out_path: str = "output_bark.wav"):
    """Synthesize *text* with Suno Bark via the transformers TTS pipeline.

    Text is split into ~600-char chunks, synthesized chunk by chunk, and the
    audio is concatenated into a single WAV written to *out_path*.

    Returns:
        The path of the written WAV file.

    Raises:
        RuntimeError: If transformers is not installed.
        ValueError: If *text* contains nothing synthesizable.
    """
    if not TRANSFORMERS_AVAILABLE:
        raise RuntimeError("transformers not installed. Install from requirements.txt")
    # Model weights come from the Hugging Face cache (or a local 'suno/bark' dir).
    tts = pipeline('text-to-speech', model='suno/bark')
    pieces = []
    sr = None
    for chunk in split_text(text, max_tokens=600):
        result = tts(chunk)
        if isinstance(result, dict) and 'audio' in result:
            audio = result['audio']
            # FIX: use the pipeline-reported rate — Bark outputs 24 kHz, not
            # the 22.05 kHz that was previously hard-coded for ndarray output.
            sr = result.get('sampling_rate', sr)
        else:
            audio = result
        if isinstance(audio, np.ndarray):
            # Pipeline arrays may be shaped (1, n); flatten to mono 1-D so
            # concatenation along axis 0 joins time, not channels.
            pieces.append(np.squeeze(audio))
        else:
            # Fallback: assume an in-memory WAV byte string.
            import io
            data, file_sr = sf.read(io.BytesIO(audio))
            pieces.append(data)
            sr = file_sr
    if not pieces:
        raise ValueError("No synthesizable text was provided.")
    full = np.concatenate(pieces, axis=0)
    # Last-resort default matches Bark's native 24 kHz output.
    sf.write(out_path, full, samplerate=sr if sr is not None else 24000)
    return out_path
| # Gradio UI callbacks | |
def run_generate(model_choice, uploaded_ref, text_input, xtts_model_dir):
    """Gradio callback: resolve the uploaded reference WAV and run the chosen model.

    Args:
        model_choice: 'XTTS-v2 (Coqui)' selects XTTS; anything else runs Bark.
        uploaded_ref: Upload from gr.File — a path string or a wrapper with .name.
        text_input: Text to synthesize.
        xtts_model_dir: Optional local XTTS model directory ('' means unset).

    Returns:
        Path to the generated WAV on success, or a human-readable error string
        (which Gradio will show in the File output slot).
    """
    if uploaded_ref is None:
        return "Please upload a reference WAV (at least 3-6 seconds)."
    # Gradio may hand us a plain path string or a tempfile-like wrapper.
    src = uploaded_ref.name if hasattr(uploaded_ref, 'name') else uploaded_ref
    try:
        # Copy to a stable temp file so the reference survives Gradio's cleanup.
        tmp_ref = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        tmp_ref.close()
        if isinstance(src, str) and os.path.exists(src):
            import shutil
            shutil.copy(src, tmp_ref.name)
        else:
            # Last resort: treat the upload as a readable file-like object.
            with open(tmp_ref.name, 'wb') as f:
                f.write(uploaded_ref.read())
        ref_path = tmp_ref.name
    except Exception:
        # FIX: the previous fallback assigned the raw upload *object* to
        # ref_path; fall back to the resolved path, which synthesizers can open.
        ref_path = src
    out_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav').name
    try:
        if model_choice == 'XTTS-v2 (Coqui)':
            return generate_xtts(ref_path, text_input, model_path=xtts_model_dir if xtts_model_dir else None, out_path=out_file)
        return generate_bark(text_input, out_path=out_file)
    except Exception as e:
        return f"Error during generation: {e}"
# Build the Gradio UI: inputs (model choice, reference WAV, optional local
# XTTS dir, text) on the left, generated-file output on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Local Voice Clone — XTTS-v2 & Bark (Gradio)\nUpload a short reference WAV (3–10s) and enter text. No external API token required if models are local.")
    with gr.Row():
        with gr.Column(scale=1):
            model_choice = gr.Radio(choices=['XTTS-v2 (Coqui)', 'Bark (Suno)'], value='XTTS-v2 (Coqui)', label='Model')
            ref_upload = gr.File(label='Upload reference WAV (3-10 sec preferred)')
            xtts_dir = gr.Textbox(label='XTTS local model dir (optional)', placeholder='./models/xtts_v2/')
            text_input = gr.Textbox(label='Text to synthesize', lines=10, placeholder='Type the text you want spoken...')
            run_btn = gr.Button('Generate Voice Now')
        with gr.Column(scale=1):
            out_file = gr.File(label='Generated WAV (download)')
            # NOTE(review): `status` is never wired to any event handler, so
            # run_generate's error strings land in the File output — confirm
            # whether a second output mapping to this textbox was intended.
            status = gr.Textbox(label='Status', interactive=False)
    # Single click handler; run_generate returns either a WAV path or an error string.
    run_btn.click(fn=run_generate, inputs=[model_choice, ref_upload, text_input, xtts_dir], outputs=[out_file])
if __name__ == '__main__':
    # Bind all interfaces so the app is reachable on the LAN; share=False
    # avoids creating a public Gradio tunnel.
    demo.launch(server_name='0.0.0.0', share=False)
| # ------------------------- requirements.txt ------------------------- | |
| # Put this block into a separate 'requirements.txt' file when ready. | |
| # For convenience it's included in this single file package. | |
| # requirements.txt content (copy to a file): | |
| # gradio>=3.30 | |
| # torch | |
| # numpy | |
| # soundfile | |
| # scipy | |
| # transformers>=4.31.0 | |
| # coqui-tts | |
| # librosa | |
| # typing-extensions | |
| # accelerate | |
| # | |
| # Notes: Installing 'coqui-tts' may pull many dependencies and requires Git LFS to download large pretrained files. | |
| # For Bark use the 'suno/bark' model via transformers pipeline (Transformers>=4.31). If you prefer the official 'bark' repo, | |
| # follow instructions at https://github.com/suno-ai/bark | |