# app.py
"""
Local XTTS-v2 + Bark voice-cloning web UI (Gradio).

- Upload your voice (wav) as a reference clip
- Choose model: XTTS-v2 (Coqui) or Bark (Suno)
- No external API token required if models are installed locally

HOW THIS WORKS (quick):
1) Install requirements from requirements.txt
2) Download XTTS-v2 model files into ./models/xtts_v2/ (see notes below)
   OR install Coqui TTS via pip and let it download pretrained models
3) Bark will use the Hugging Face model cache (or a local model dir)
4) Run: python app.py -> open http://localhost:7860

NOTES:
- Generating ~2 hours of audio is possible by splitting text into chunks
  and concatenating outputs, but it is VERY memory/CPU/GPU intensive.
- For best results use a decent GPU and enough disk space (several GBs
  for models).
"""

import io
import os
import re
import shutil
import tempfile
from typing import List, Optional

import numpy as np
import soundfile as sf
import torch  # noqa: F401 -- both backends require torch to be importable
import gradio as gr

# Optional backends: the UI still loads if one of them is missing, and the
# generation functions raise a clear error instead of failing at import time.
try:
    from TTS.api import TTS  # Coqui TTS
    COQUI_AVAILABLE = True
except Exception:
    COQUI_AVAILABLE = False

try:
    from transformers import pipeline
    TRANSFORMERS_AVAILABLE = True
except Exception:
    TRANSFORMERS_AVAILABLE = False


def split_text(text: str, max_tokens: int = 2000) -> List[str]:
    """Split *text* into chunks of at most roughly *max_tokens* characters.

    Naive splitter: breaks on sentence-ending punctuation and greedily packs
    sentences into chunks. A single sentence longer than *max_tokens* becomes
    its own (oversized) chunk rather than being cut mid-sentence.

    NOTE: despite the parameter name, the limit is measured in characters,
    not model tokens.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks: List[str] = []
    cur = ""
    for sentence in sentences:
        if len(cur) + len(sentence) < max_tokens:
            cur = f"{cur} {sentence}" if cur else sentence
        else:
            # Guard against appending an empty chunk when the very first
            # sentence already exceeds the limit (cur is still "").
            if cur:
                chunks.append(cur)
            cur = sentence
    if cur:
        chunks.append(cur)
    return chunks


def generate_xtts(reference_wav_path: str,
                  text: str,
                  model_path: Optional[str] = None,
                  out_path: str = "output_xtts.wav",
                  language: str = "en") -> str:
    """Synthesize *text* with Coqui XTTS, cloning the voice in the reference WAV.

    Args:
        reference_wav_path: path to a short (3-10 s) speaker reference clip.
        model_path: optional local model directory; falls back to the first
            XTTS model Coqui knows about (or the first listed model).
        out_path: where the concatenated WAV is written.
        language: ISO language code required by XTTS (new keyword, defaults
            to English so existing callers are unaffected).

    Returns:
        The path of the written WAV file.

    Raises:
        RuntimeError: if the Coqui TTS package is not installed.
    """
    if not COQUI_AVAILABLE:
        raise RuntimeError(
            "Coqui TTS package not available. Install 'coqui-tts' from requirements.txt"
        )

    # Initialize TTS - if a local model path is provided, load it directly.
    if model_path and os.path.exists(model_path):
        tts = TTS(model_path)
    else:
        # Fall back to a pretrained model; prefer XTTS if present.
        models = TTS.list_models()
        selected = next((m for m in models if 'xtts' in m.lower()), None)
        if selected is None:
            selected = models[0]
        tts = TTS(selected)

    # Coqui's cloning API: TTS.tts(text=..., speaker_wav=..., language=...).
    # (The previous `tts_with_vocoder` call is not part of the public API.)
    chunks = split_text(text, max_tokens=1500)
    audio_pieces = []
    for chunk in chunks:
        wav = tts.tts(text=chunk, speaker_wav=reference_wav_path, language=language)
        audio_pieces.append(np.asarray(wav))

    full = np.concatenate(audio_pieces, axis=0)
    sf.write(out_path, full, samplerate=tts.synthesizer.output_sample_rate)
    return out_path


def generate_bark(text: str, out_path: str = "output_bark.wav") -> str:
    """Synthesize *text* with Suno Bark via the transformers TTS pipeline.

    Bark does not use the uploaded reference clip; it synthesizes with its
    own built-in voices. Text is chunked because Bark degrades on long input.

    Returns:
        The path of the written WAV file.

    Raises:
        RuntimeError: if transformers is not installed.
    """
    if not TRANSFORMERS_AVAILABLE:
        raise RuntimeError("transformers not installed. Install from requirements.txt")

    tts = pipeline('text-to-speech', model='suno/bark')

    chunks = split_text(text, max_tokens=600)
    audio_buffers = []
    sr = None
    for chunk in chunks:
        result = tts(chunk)
        # The pipeline returns {'audio': ndarray, 'sampling_rate': int};
        # keep a defensive path for raw bytes just in case.
        if isinstance(result, dict) and 'audio' in result:
            audio = result['audio']
            if sr is None:
                # Bark natively outputs 24 kHz; trust the pipeline's value
                # instead of hard-coding one.
                sr = result.get('sampling_rate', 24000)
        else:
            audio = result
        if isinstance(audio, np.ndarray):
            # Pipeline output is shaped (1, n); flatten for concatenation.
            arr = np.squeeze(audio)
        else:
            # Assume WAV bytes.
            arr, sr = sf.read(io.BytesIO(audio))
        audio_buffers.append(arr)

    full = np.concatenate(audio_buffers, axis=0)
    if sr is None:
        sr = 24000
    sf.write(out_path, full, samplerate=sr)
    return out_path


def _resolve_reference_path(uploaded_ref) -> str:
    """Copy the uploaded reference into a stable temp file and return its path.

    Gradio may hand us a plain path string or a file-like object depending on
    version; handle both. On failure, fall back to the object itself.
    """
    try:
        tmp_ref = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        tmp_ref.close()
        source = uploaded_ref.name if hasattr(uploaded_ref, 'name') else uploaded_ref
        if isinstance(source, str) and os.path.exists(source):
            shutil.copy(source, tmp_ref.name)
        else:
            with open(tmp_ref.name, 'wb') as f:
                f.write(uploaded_ref.read())
        return tmp_ref.name
    except Exception:
        return uploaded_ref


def run_generate(model_choice, uploaded_ref, text_input, xtts_model_dir):
    """Gradio callback: dispatch to the chosen backend.

    Returns:
        (file_path_or_None, status_message) -- wired to the output File
        component and the Status textbox. Errors go to the status box
        instead of being fed into the File component.
    """
    if uploaded_ref is None:
        return None, "Please upload a reference WAV (at least 3-6 seconds)."

    ref_path = _resolve_reference_path(uploaded_ref)
    out_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav').name
    try:
        if model_choice == 'XTTS-v2 (Coqui)':
            generated = generate_xtts(
                ref_path,
                text_input,
                model_path=xtts_model_dir if xtts_model_dir else None,
                out_path=out_file,
            )
        else:
            generated = generate_bark(text_input, out_path=out_file)
    except Exception as e:
        return None, f"Error during generation: {e}"
    return generated, "Done."


with gr.Blocks() as demo:
    gr.Markdown(
        "# Local Voice Clone — XTTS-v2 & Bark (Gradio)\n"
        "Upload a short reference WAV (3–10s) and enter text. "
        "No external API token required if models are local."
    )
    with gr.Row():
        with gr.Column(scale=1):
            model_choice = gr.Radio(
                choices=['XTTS-v2 (Coqui)', 'Bark (Suno)'],
                value='XTTS-v2 (Coqui)',
                label='Model',
            )
            ref_upload = gr.File(label='Upload reference WAV (3-10 sec preferred)')
            xtts_dir = gr.Textbox(
                label='XTTS local model dir (optional)',
                placeholder='./models/xtts_v2/',
            )
            text_input = gr.Textbox(
                label='Text to synthesize',
                lines=10,
                placeholder='Type the text you want spoken...',
            )
            run_btn = gr.Button('Generate Voice Now')
        with gr.Column(scale=1):
            out_file = gr.File(label='Generated WAV (download)')
            status = gr.Textbox(label='Status', interactive=False)

    # Wire BOTH outputs so errors surface in the status box rather than
    # being returned into the File component.
    run_btn.click(
        fn=run_generate,
        inputs=[model_choice, ref_upload, text_input, xtts_dir],
        outputs=[out_file, status],
    )

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0', share=False)

# ------------------------- requirements.txt -------------------------
# Put this block into a separate 'requirements.txt' file when ready.
# For convenience it's included in this single file package.

# requirements.txt content (copy to a file):
# gradio>=3.30
# torch
# numpy
# soundfile
# scipy
# transformers>=4.31.0
# coqui-tts
# librosa
# typing-extensions
# accelerate
#
# Notes: Installing 'coqui-tts' may pull many dependencies and requires Git LFS to download large pretrained files.
# For Bark use the 'suno/bark' model via transformers pipeline (Transformers>=4.31). If you prefer the official 'bark' repo,
# follow instructions at https://github.com/suno-ai/bark