# cvoice-clone / app.py
# Source: Hugging Face Space by awais7777 ("Update app.py", commit 269fa9d, verified).
# NOTE: the lines above were page-scrape residue from the Space's web UI and have
# been commented out so the file is valid Python.
# app.py
"""
Hugging Face / Local XTTS-v2 + Bark Gradio Web UI
- Upload your voice (wav) as reference
- Choose model: XTTS-v2 (Coqui) or Bark (Suno)
- No external API token required if models are installed locally
HOW THIS WORKS (quick):
1) Install requirements from requirements.txt
2) Download XTTS-v2 model files into ./models/xtts_v2/ (see notes below)
OR install Coqui TTS via pip and let it download pretrained models
3) Bark will use the Hugging Face model cache (or local model dir)
4) Run: python app.py -> open http://localhost:7860
NOTES:
- Generating ~2 hours of audio is possible by splitting text into chunks and concatenating outputs, but it is VERY memory/CPU/GPU intensive.
- For best results use a decent GPU and enough disk space (several GBs for models).
"""
import os
import tempfile
import math
from pathlib import Path
from typing import Optional
import torch
import numpy as np
import soundfile as sf
import gradio as gr
# Optional back-ends: probe at import time which TTS engines are installed.
# Each flag gates the corresponding generate_* function below.
try:
    from TTS.api import TTS  # Coqui TTS
except Exception:
    COQUI_AVAILABLE = False
else:
    COQUI_AVAILABLE = True

try:
    from transformers import pipeline
except Exception:
    TRANSFORMERS_AVAILABLE = False
else:
    TRANSFORMERS_AVAILABLE = True
# Utility: chunk text into smaller pieces
def split_text(text: str, max_tokens: int = 2000):
    """Split *text* into chunks of at most ``max_tokens`` characters.

    Despite the name (kept for backward compatibility), the limit is
    measured in characters, not model tokens.  Text is split on sentence
    boundaries (``.``, ``!``, ``?``) and sentences are greedily packed
    into chunks.  A sentence that alone exceeds the limit is hard-split,
    so no returned chunk is ever longer than ``max_tokens``.

    Returns a list of non-empty strings; an empty input yields ``[]``.
    """
    import re

    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    cur = ""
    for s in sentences:
        # Bug fix: the old code emitted an oversized chunk (and, when it was
        # the first sentence, a spurious empty chunk) for sentences longer
        # than the budget.  Hard-split those sentences instead.
        while len(s) > max_tokens:
            if cur:
                chunks.append(cur)
                cur = ""
            chunks.append(s[:max_tokens])
            s = s[max_tokens:]
        if not s:
            continue
        if len(cur) + len(s) < max_tokens:
            cur = cur + " " + s if cur else s
        else:
            if cur:
                chunks.append(cur)
            cur = s
    if cur:
        chunks.append(cur)
    return chunks
# Generate with Coqui XTTS (local)
def generate_xtts(reference_wav_path: str, text: str, model_path: Optional[str] = None, out_path: str = "output_xtts.wav"):
    """Clone the voice in *reference_wav_path* and synthesize *text* with Coqui XTTS.

    Args:
        reference_wav_path: Path to a short WAV used as the speaker reference.
        text: Text to synthesize; split into chunks to bound memory use.
        model_path: Optional local model path; when absent, the first XTTS
            model registered with Coqui TTS (or the first model at all) is used.
        out_path: Destination WAV path for the concatenated output.

    Returns:
        ``out_path`` on success.

    Raises:
        RuntimeError: If the Coqui TTS package is not installed.
        ValueError: If *text* produced no synthesizable chunks.
    """
    if not COQUI_AVAILABLE:
        raise RuntimeError("Coqui TTS package not available. Install 'coqui-tts' from requirements.txt")
    # Initialize TTS - if model_path provided, try to load it
    if model_path and os.path.exists(model_path):
        tts = TTS(model_path)
    else:
        # Fall back to a pretrained model; prefer XTTS for voice cloning.
        models = TTS.list_models()
        selected = next((m for m in models if 'xtts' in m.lower()), None)
        if selected is None:
            selected = models[0]
        tts = TTS(selected)
    chunks = split_text(text, max_tokens=1500)
    audio_pieces = []
    for chunk in chunks:
        # Bug fix: TTS objects have no `tts_with_vocoder` method; the
        # documented cloning API is `tts(...)` with `speaker_wav`, and XTTS
        # (multilingual) additionally requires a `language` argument.
        try:
            wav = tts.tts(text=chunk, speaker_wav=reference_wav_path, language="en")
        except TypeError:
            # Single-language models do not accept `language`.
            wav = tts.tts(text=chunk, speaker_wav=reference_wav_path)
        audio_pieces.append(np.asarray(wav))
    if not audio_pieces:
        raise ValueError("No text to synthesize")
    # Concatenate chunk outputs into one waveform and write it out.
    full = np.concatenate(audio_pieces, axis=0)
    sf.write(out_path, full, samplerate=tts.synthesizer.output_sample_rate)
    return out_path
# Generate with Bark via transformers TTS pipeline (local)
def generate_bark(text: str, out_path: str = "output_bark.wav"):
    """Synthesize *text* with Suno Bark via the transformers TTS pipeline.

    Args:
        text: Text to synthesize; split into ~600-char chunks for Bark.
        out_path: Destination WAV path for the concatenated output.

    Returns:
        ``out_path`` on success.

    Raises:
        RuntimeError: If transformers is not installed.
        ValueError: If *text* produced no synthesizable chunks.
    """
    if not TRANSFORMERS_AVAILABLE:
        raise RuntimeError("transformers not installed. Install from requirements.txt")
    # Load Bark through the generic text-to-speech pipeline (model is pulled
    # from the Hugging Face cache or a local 'suno/bark' checkout).
    tts = pipeline('text-to-speech', model='suno/bark')
    chunks = split_text(text, max_tokens=600)
    audio_buffers = []
    sr = None
    for ch in chunks:
        result = tts(ch)
        # The pipeline returns a dict with 'audio' and 'sampling_rate';
        # tolerate a bare array for older/other implementations.
        audio = result['audio'] if isinstance(result, dict) and 'audio' in result else result
        if isinstance(result, dict) and sr is None:
            # Bug fix: read the real sample rate from the pipeline output
            # instead of assuming 22050 (Bark produces 24 kHz audio).
            sr = result.get('sampling_rate')
        if isinstance(audio, np.ndarray):
            # Pipeline output may be shaped (1, n); flatten to 1-D mono.
            arr = np.squeeze(audio)
        else:
            # Assume raw WAV bytes; decode via soundfile.
            import io
            arr, samplerate = sf.read(io.BytesIO(audio))
            sr = samplerate
        audio_buffers.append(arr)
    if not audio_buffers:
        raise ValueError("No text to synthesize")
    full = np.concatenate(audio_buffers, axis=0)
    if sr is None:
        sr = 24000  # Bark's native sample rate
    sf.write(out_path, full, samplerate=sr)
    return out_path
# Gradio UI callbacks
def run_generate(model_choice, uploaded_ref, text_input, xtts_model_dir):
    """Gradio click handler: dispatch to the selected TTS back-end.

    Args mirror the UI widgets: *model_choice* is the radio string,
    *uploaded_ref* is the upload (a path string or a file-like object with
    ``.name``), *text_input* is the text to speak, *xtts_model_dir* an
    optional local XTTS model directory.

    Returns the path of the generated WAV on success, or a human-readable
    error string that Gradio surfaces in the output component.
    """
    # Bug fix: only XTTS performs voice cloning; Bark never reads the
    # reference WAV, so do not demand one for it.
    if model_choice == 'XTTS-v2 (Coqui)' and uploaded_ref is None:
        return "Please upload a reference WAV (at least 3-6 seconds)."
    if not text_input or not text_input.strip():
        return "Please enter some text to synthesize."
    ref_path = None
    if uploaded_ref is not None:
        # Gradio may hand us a plain path string or a file wrapper exposing
        # `.name` — normalise to a source path/object first.
        src = uploaded_ref.name if hasattr(uploaded_ref, 'name') else uploaded_ref
        try:
            # Copy the upload to a stable temp file so the back-end can
            # reopen it after Gradio cleans up its own temp files.
            tmp_ref = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            tmp_ref.close()
            if isinstance(src, str) and os.path.exists(src):
                import shutil
                shutil.copy(src, tmp_ref.name)
            else:
                # Fall back to reading a file-like object directly.
                with open(tmp_ref.name, 'wb') as f:
                    f.write(uploaded_ref.read())
            ref_path = tmp_ref.name
        except Exception:
            # Bug fix: fall back to the source *path*, not the upload
            # object itself (the old code handed the object to the TTS).
            ref_path = src
    out_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav').name
    try:
        if model_choice == 'XTTS-v2 (Coqui)':
            generated = generate_xtts(ref_path, text_input, model_path=xtts_model_dir if xtts_model_dir else None, out_path=out_file)
        else:
            generated = generate_bark(text_input, out_path=out_file)
    except Exception as e:
        return f"Error during generation: {e}"
    return generated
# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        "# Local Voice Clone — XTTS-v2 & Bark (Gradio)\nUpload a short reference WAV (3–10s) and enter text. No external API token required if models are local."
    )
    with gr.Row():
        # Left column: all inputs and the trigger button.
        with gr.Column(scale=1):
            model_choice = gr.Radio(
                choices=['XTTS-v2 (Coqui)', 'Bark (Suno)'],
                value='XTTS-v2 (Coqui)',
                label='Model',
            )
            ref_upload = gr.File(label='Upload reference WAV (3-10 sec preferred)')
            xtts_dir = gr.Textbox(
                label='XTTS local model dir (optional)',
                placeholder='./models/xtts_v2/',
            )
            text_input = gr.Textbox(
                label='Text to synthesize',
                lines=10,
                placeholder='Type the text you want spoken...',
            )
            run_btn = gr.Button('Generate Voice Now')
        # Right column: result download plus a (currently unwired) status box.
        with gr.Column(scale=1):
            out_file = gr.File(label='Generated WAV (download)')
            status = gr.Textbox(label='Status', interactive=False)
    # Wire the button to the dispatcher defined above.
    run_btn.click(
        fn=run_generate,
        inputs=[model_choice, ref_upload, text_input, xtts_dir],
        outputs=[out_file],
    )

if __name__ == '__main__':
    # Bind on all interfaces so the app is reachable inside containers/Spaces.
    demo.launch(server_name='0.0.0.0', share=False)
# ------------------------- requirements.txt -------------------------
# Put this block into a separate 'requirements.txt' file when ready.
# For convenience it's included in this single file package.
# requirements.txt content (copy to a file):
# gradio>=3.30
# torch
# numpy
# soundfile
# scipy
# transformers>=4.31.0
# coqui-tts
# librosa
# typing-extensions
# accelerate
#
# Notes: Installing 'coqui-tts' may pull many dependencies and requires Git LFS to download large pretrained files.
# For Bark use the 'suno/bark' model via transformers pipeline (Transformers>=4.31). If you prefer the official 'bark' repo,
# follow instructions at https://github.com/suno-ai/bark