# app.py — exported from a Hugging Face Space (the "Spaces: Sleeping" status
# banner that preceded this file was page-scrape residue, not source code).
| # app.py | |
| """ | |
| Hugging Face / Local XTTS-v2 + Bark Gradio Web UI | |
| - Upload your voice (wav) as reference | |
| - Choose model: XTTS-v2 (Coqui) or Bark (Suno) | |
| - No external API token required if models are installed locally | |
| HOW THIS WORKS (quick): | |
| 1) Install requirements from requirements.txt | |
| 2) Download XTTS-v2 model files into ./models/xtts_v2/ (see notes below) | |
| OR install Coqui TTS via pip and let it download pretrained models | |
| 3) Bark will use the Hugging Face model cache (or local model dir) | |
| 4) Run: python app.py -> open http://localhost:7860 | |
| NOTES: | |
| - Generating ~2 hours of audio is possible by splitting text into chunks and concatenating outputs, but it is VERY memory/CPU/GPU intensive. | |
| - For best results use a decent GPU and enough disk space (several GBs for models). | |
| """ | |
| import os | |
| import tempfile | |
| import math | |
| from pathlib import Path | |
| from typing import Optional | |
| import torch | |
| import numpy as np | |
| import soundfile as sf | |
| import gradio as gr | |
| # Try imports for Coqui XTTS and Bark/Transformers | |
# Optional backends: probe at import time so the UI can start with either (or
# neither) installed; the generate_* functions raise a clear error if missing.
try:
    from TTS.api import TTS  # Coqui TTS
    COQUI_AVAILABLE = True
except Exception:
    # Broad catch on purpose: a broken install should degrade, not crash the app.
    COQUI_AVAILABLE = False
try:
    from transformers import pipeline
    TRANSFORMERS_AVAILABLE = True
except Exception:
    TRANSFORMERS_AVAILABLE = False
| # Utility: chunk text into smaller pieces | |
def split_text(text: str, max_tokens: int = 2000):
    """Split *text* into chunks of at most ``max_tokens`` characters.

    Splits on sentence boundaries (., !, ?) while keeping the punctuation.
    Despite the parameter name, the limit is measured in characters.

    Args:
        text: Input text; an empty string yields an empty list.
        max_tokens: Maximum characters per chunk.

    Returns:
        List of non-empty text chunks, each no longer than ``max_tokens``.
    """
    import re
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    cur = ""
    for s in sentences:
        # FIX: a single sentence longer than the limit used to produce an
        # oversized chunk; hard-split it so no chunk ever exceeds max_tokens.
        while len(s) > max_tokens:
            if cur:
                chunks.append(cur)
                cur = ""
            chunks.append(s[:max_tokens])
            s = s[max_tokens:]
        if not s:
            continue  # avoid appending a stray trailing space for empty pieces
        # FIX: account for the joining space so chunks stay within the limit.
        needed = len(s) + (1 if cur else 0)
        if len(cur) + needed <= max_tokens:
            cur = f"{cur} {s}" if cur else s
        else:
            if cur:
                chunks.append(cur)
            cur = s
    if cur:
        chunks.append(cur)
    return chunks
| # Generate with Coqui XTTS (local) | |
def generate_xtts(reference_wav_path: str, text: str, model_path: Optional[str] = None, out_path: str = "output_xtts.wav", language: str = "en"):
    """Clone the voice from *reference_wav_path* and speak *text* with Coqui XTTS.

    Args:
        reference_wav_path: Short (3-10 s) reference WAV of the target voice.
        text: Text to synthesize; split into ~1500-char chunks, then concatenated.
        model_path: Optional local model directory; otherwise falls back to an
            installed pretrained XTTS model (or the first listed model).
        out_path: Destination WAV path.
        language: Language code for the synthesizer (new parameter, defaults to
            "en" for backward compatibility; XTTS is multilingual and needs one).

    Returns:
        The path of the written WAV file.

    Raises:
        RuntimeError: If the Coqui TTS package is not installed.
    """
    if not COQUI_AVAILABLE:
        raise RuntimeError("Coqui TTS package not available. Install 'coqui-tts' from requirements.txt")
    if model_path and os.path.exists(model_path):
        tts = TTS(model_path)
    else:
        # Fall back to whatever pretrained models the installation offers,
        # preferring an XTTS checkpoint since it supports voice cloning.
        models = TTS.list_models()
        selected = next((m for m in models if 'xtts' in m.lower()), None)
        tts = TTS(selected if selected is not None else models[0])
    pieces = []
    for chunk in split_text(text, max_tokens=1500):
        # FIX: TTS.api.TTS has no 'tts_with_vocoder' method; the public cloning
        # API is tts(), which accepts speaker_wav (reference) and language.
        wav = tts.tts(text=chunk, speaker_wav=reference_wav_path, language=language)
        pieces.append(np.asarray(wav))
    full = np.concatenate(pieces, axis=0)
    sf.write(out_path, full, samplerate=tts.synthesizer.output_sample_rate)
    return out_path
| # Generate with Bark via transformers TTS pipeline (local) | |
def generate_bark(text: str, out_path: str = "output_bark.wav"):
    """Synthesize *text* with Suno Bark via the transformers TTS pipeline.

    Text is split into ~600-char chunks, synthesized chunk by chunk, and the
    audio is concatenated into a single WAV written to *out_path*.

    Returns:
        The path of the written WAV file.

    Raises:
        RuntimeError: If transformers is not installed.
        ValueError: If *text* contains nothing synthesizable.
    """
    if not TRANSFORMERS_AVAILABLE:
        raise RuntimeError("transformers not installed. Install from requirements.txt")
    # Model weights come from the Hugging Face cache (or a local 'suno/bark' dir).
    tts = pipeline('text-to-speech', model='suno/bark')
    pieces = []
    sr = None
    for chunk in split_text(text, max_tokens=600):
        result = tts(chunk)
        if isinstance(result, dict) and 'audio' in result:
            audio = result['audio']
            # FIX: use the pipeline-reported rate — Bark outputs 24 kHz, not
            # the 22.05 kHz that was previously hard-coded for ndarray output.
            sr = result.get('sampling_rate', sr)
        else:
            audio = result
        if isinstance(audio, np.ndarray):
            # Pipeline arrays may be shaped (1, n); flatten to mono 1-D so
            # concatenation along axis 0 joins time, not channels.
            pieces.append(np.squeeze(audio))
        else:
            # Fallback: assume an in-memory WAV byte string.
            import io
            data, file_sr = sf.read(io.BytesIO(audio))
            pieces.append(data)
            sr = file_sr
    if not pieces:
        raise ValueError("No synthesizable text was provided.")
    full = np.concatenate(pieces, axis=0)
    # Last-resort default matches Bark's native 24 kHz output.
    sf.write(out_path, full, samplerate=sr if sr is not None else 24000)
    return out_path
| # Gradio UI callbacks | |
def run_generate(model_choice, uploaded_ref, text_input, xtts_model_dir):
    """Gradio callback: resolve the uploaded reference WAV and run the chosen model.

    Args:
        model_choice: 'XTTS-v2 (Coqui)' selects XTTS; anything else runs Bark.
        uploaded_ref: Upload from gr.File — a path string or a wrapper with .name.
        text_input: Text to synthesize.
        xtts_model_dir: Optional local XTTS model directory ('' means unset).

    Returns:
        Path to the generated WAV on success, or a human-readable error string
        (which Gradio will show in the File output slot).
    """
    if uploaded_ref is None:
        return "Please upload a reference WAV (at least 3-6 seconds)."
    # Gradio may hand us a plain path string or a tempfile-like wrapper.
    src = uploaded_ref.name if hasattr(uploaded_ref, 'name') else uploaded_ref
    try:
        # Copy to a stable temp file so the reference survives Gradio's cleanup.
        tmp_ref = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        tmp_ref.close()
        if isinstance(src, str) and os.path.exists(src):
            import shutil
            shutil.copy(src, tmp_ref.name)
        else:
            # Last resort: treat the upload as a readable file-like object.
            with open(tmp_ref.name, 'wb') as f:
                f.write(uploaded_ref.read())
        ref_path = tmp_ref.name
    except Exception:
        # FIX: the previous fallback assigned the raw upload *object* to
        # ref_path; fall back to the resolved path, which synthesizers can open.
        ref_path = src
    out_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav').name
    try:
        if model_choice == 'XTTS-v2 (Coqui)':
            return generate_xtts(ref_path, text_input, model_path=xtts_model_dir if xtts_model_dir else None, out_path=out_file)
        return generate_bark(text_input, out_path=out_file)
    except Exception as e:
        return f"Error during generation: {e}"
# Build the Gradio UI: inputs (model choice, reference WAV, optional local
# XTTS dir, text) on the left, generated-file output on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Local Voice Clone — XTTS-v2 & Bark (Gradio)\nUpload a short reference WAV (3–10s) and enter text. No external API token required if models are local.")
    with gr.Row():
        with gr.Column(scale=1):
            model_choice = gr.Radio(choices=['XTTS-v2 (Coqui)', 'Bark (Suno)'], value='XTTS-v2 (Coqui)', label='Model')
            ref_upload = gr.File(label='Upload reference WAV (3-10 sec preferred)')
            xtts_dir = gr.Textbox(label='XTTS local model dir (optional)', placeholder='./models/xtts_v2/')
            text_input = gr.Textbox(label='Text to synthesize', lines=10, placeholder='Type the text you want spoken...')
            run_btn = gr.Button('Generate Voice Now')
        with gr.Column(scale=1):
            out_file = gr.File(label='Generated WAV (download)')
            # NOTE(review): `status` is never wired to any event handler, so
            # run_generate's error strings land in the File output — confirm
            # whether a second output mapping to this textbox was intended.
            status = gr.Textbox(label='Status', interactive=False)
    # Single click handler; run_generate returns either a WAV path or an error string.
    run_btn.click(fn=run_generate, inputs=[model_choice, ref_upload, text_input, xtts_dir], outputs=[out_file])
if __name__ == '__main__':
    # Bind all interfaces so the app is reachable on the LAN; share=False
    # avoids creating a public Gradio tunnel.
    demo.launch(server_name='0.0.0.0', share=False)
| # ------------------------- requirements.txt ------------------------- | |
| # Put this block into a separate 'requirements.txt' file when ready. | |
| # For convenience it's included in this single file package. | |
| # requirements.txt content (copy to a file): | |
| # gradio>=3.30 | |
| # torch | |
| # numpy | |
| # soundfile | |
| # scipy | |
| # transformers>=4.31.0 | |
| # coqui-tts | |
| # librosa | |
| # typing-extensions | |
| # accelerate | |
| # | |
| # Notes: Installing 'coqui-tts' may pull many dependencies and requires Git LFS to download large pretrained files. | |
| # For Bark use the 'suno/bark' model via transformers pipeline (Transformers>=4.31). If you prefer the official 'bark' repo, | |
| # follow instructions at https://github.com/suno-ai/bark | |