import gc
import re
import threading
import traceback

import gradio as gr
import numpy as np
import torch
from transformers import pipeline

# Model identifier handed to `pipeline(...)` in load_pipeline().
MODEL_ID = "fishaudio/s2-pro"
# Fallback sample rate used when the model output does not report one
# (see run_one_chunk) and as the initial rate in synthesize_long.
DEFAULT_SR = 24000
# Length, in milliseconds, of the silent gap inserted between stitched chunks.
SILENCE_MS = 180
# Default per-chunk character budget used by chunk_text().
CHUNK_CHARS = 280

# Lazily created pipeline object; stays None until load_pipeline() succeeds.
_pipe = None
# Message of a failed load attempt; once set, load_pipeline() re-raises it
# instead of trying to load the model again.
_pipe_error = None
# Held by load_pipeline() while the pipeline is being created.
_pipe_lock = threading.Lock()
# Held by synthesize_long() for the whole generation loop of one request.
_gen_lock = threading.Lock()


def load_pipeline():
    """Return the shared text-to-audio pipeline, creating it on first use.

    The pipeline is built at most once and stored in the module-level
    ``_pipe``. If building it fails, the failure message is stored in
    ``_pipe_error`` and every later call raises that same message without
    attempting another load.

    Returns:
        The pipeline object created by ``pipeline(...)``.

    Raises:
        RuntimeError: If the pipeline could not be created, now or on an
            earlier call.
    """
    global _pipe, _pipe_error

    def _settled():
        # Give back the loaded pipeline, raise the remembered failure, or
        # return None when no load attempt has finished yet.
        if _pipe is not None:
            return _pipe
        if _pipe_error is not None:
            raise RuntimeError(_pipe_error)
        return None

    # Cheap check first so callers skip the lock once a result exists.
    ready = _settled()
    if ready is not None:
        return ready

    with _pipe_lock:
        # Another thread may have finished loading while we waited.
        ready = _settled()
        if ready is not None:
            return ready
        try:
            _pipe = pipeline(
                task="text-to-audio",
                model=MODEL_ID,
                device=-1,
                trust_remote_code=True,
            )
        except Exception as exc:
            _pipe_error = f"Failed to load {MODEL_ID}: {exc}"
            raise RuntimeError(_pipe_error) from exc
        return _pipe


def normalize_audio(audio):
    """Convert model output to a float32 waveform with peak magnitude <= 1.

    Singleton axes are squeezed away, and the result is always at least
    one-dimensional: a scalar or an all-singleton array such as shape
    ``(1, 1)`` becomes shape ``(1,)`` instead of a 0-d array, which
    ``np.concatenate`` cannot join with other pieces. Arrays with more than
    one non-singleton axis keep those axes.

    Args:
        audio: Array-like of samples (anything ``np.asarray`` accepts).

    Returns:
        A float32 ``numpy.ndarray``. Samples are divided by the peak absolute
        value only when that peak exceeds 1.0; quieter audio is unchanged.
    """
    audio = np.asarray(audio, dtype=np.float32)
    if audio.ndim != 1:
        # squeeze() can collapse all the way to 0-d; atleast_1d restores a
        # length-1 axis so the result is always concatenable.
        audio = np.atleast_1d(audio.squeeze())
    max_abs = np.max(np.abs(audio)) if audio.size else 0.0
    if max_abs > 1.0:
        audio = audio / max_abs
    return audio


def split_long_sentence(sentence: str, limit: int):
    """Break one sentence into chunks of at most ``limit`` characters.

    Whitespace-separated words are packed greedily, joined by single spaces.
    A single token that is itself longer than ``limit`` (a long URL, or text
    in a script written without spaces) is cut into ``limit``-sized slices so
    that no returned chunk exceeds the limit.

    Args:
        sentence: Text to split.
        limit: Maximum chunk length in characters. For ``limit < 1`` no
            slicing is possible, so every word becomes its own chunk.

    Returns:
        A list of non-empty strings; empty when ``sentence`` has no words.
    """
    words = sentence.split()
    if not words:
        return []

    if limit >= 1:
        # Pre-slice oversized tokens. Every slice except possibly the last is
        # exactly `limit` long, so the packing loop below can never glue two
        # slices of the same token together with a space.
        units = []
        for word in words:
            if len(word) > limit:
                units.extend(
                    word[start:start + limit]
                    for start in range(0, len(word), limit)
                )
            else:
                units.append(word)
    else:
        units = words

    chunks = []
    current = units[0]
    for unit in units[1:]:
        trial = current + " " + unit
        if len(trial) <= limit:
            current = trial
        else:
            chunks.append(current)
            current = unit
    chunks.append(current)
    return chunks


def chunk_text(text: str, limit: int = CHUNK_CHARS):
    """Split ``text`` into chunks of roughly ``limit`` characters.

    Runs of whitespace are collapsed to single spaces, the text is cut after
    sentence-ending punctuation (``.``, ``!``, ``?`` or the danda ``।``)
    followed by whitespace, and consecutive sentences are packed into a chunk
    while they fit. Sentences longer than ``limit`` are first broken up by
    ``split_long_sentence``.

    Args:
        text: Input text; ``None`` and blank strings yield an empty list.
        limit: Character budget per chunk.

    Returns:
        A list of chunk strings in reading order.
    """
    collapsed = re.sub(r"\s+", " ", (text or "").strip())
    if not collapsed:
        return []

    packed = []
    buffer = ""
    for raw in re.split(r"(?<=[.!?।])\s+", collapsed):
        candidate = raw.strip()
        if not candidate:
            continue

        if len(candidate) > limit:
            fragments = split_long_sentence(candidate, limit)
        else:
            fragments = [candidate]

        for fragment in fragments:
            if not buffer:
                # Nothing pending yet: start a new chunk.
                buffer = fragment
                continue
            if len(buffer) + 1 + len(fragment) > limit:
                # Would overflow: emit the pending chunk and start over.
                packed.append(buffer)
                buffer = fragment
            else:
                buffer += " " + fragment

    if buffer:
        packed.append(buffer)
    return packed


def run_one_chunk(pipe, text_chunk: str):
    """Run the pipeline on one text chunk and return ``(sample_rate, audio)``.

    Two output shapes are understood: a dict carrying ``"audio"`` plus a
    ``"sampling_rate"`` or ``"sample_rate"`` entry (falling back to
    ``DEFAULT_SR`` when neither is set), or a 2-tuple of ``(rate, audio)``.

    Args:
        pipe: Callable pipeline object.
        text_chunk: Text to synthesize.

    Returns:
        A tuple of the integer sample rate and the normalized waveform.

    Raises:
        gr.Error: If the output has an unexpected type or contains no audio.
    """
    output = pipe(text_chunk)

    if isinstance(output, dict):
        waveform = output.get("audio")
        rate = DEFAULT_SR
        # First truthy rate entry wins; otherwise keep the default.
        for key in ("sampling_rate", "sample_rate"):
            reported = output.get(key)
            if reported:
                rate = reported
                break
    elif isinstance(output, tuple) and len(output) == 2:
        rate, waveform = output
    else:
        raise gr.Error(f"Unexpected model output type: {type(output)}")

    if waveform is None:
        raise gr.Error("Model returned no audio.")

    rate = int(rate)
    return rate, normalize_audio(waveform)


def synthesize_long(text: str):
    """Synthesize speech for text of any length by chunking and stitching.

    The text is split with ``chunk_text``, each chunk is run through the
    shared pipeline while ``_gen_lock`` is held, and the resulting waveforms
    are concatenated with ``SILENCE_MS`` of silence between chunks.

    Args:
        text: Text to speak.

    Returns:
        A pair ``((sample_rate, waveform), info)`` where ``waveform`` is the
        concatenated audio and ``info`` is a one-line status summary. The
        reported sample rate is the one returned for the last chunk.

    Raises:
        gr.Error: For empty input, a model that cannot be loaded, a failed
            generation, or when no audio was produced.
    """
    text = (text or "").strip()
    if not text:
        raise gr.Error("Please enter some text.")

    chunks = chunk_text(text)
    if not chunks:
        raise gr.Error("Could not split input text.")

    try:
        pipe = load_pipeline()
    except RuntimeError as e:
        # load_pipeline reports failures as RuntimeError. Convert it so the
        # load failure is reported through gr.Error like every other failure
        # in this function and its message reaches the user.
        raise gr.Error(str(e)) from e

    silence = None
    pieces = []
    sr = DEFAULT_SR

    with _gen_lock:
        try:
            for idx, chunk in enumerate(chunks, start=1):
                sr, audio = run_one_chunk(pipe, chunk)
                if silence is None:
                    # Gap length is derived from the first chunk's sample rate.
                    silence = np.zeros(int(sr * SILENCE_MS / 1000), dtype=np.float32)
                pieces.append(audio)
                if idx < len(chunks):
                    # Separate chunks with a pause, but not after the last one.
                    pieces.append(silence)
        except Exception as e:
            tb = traceback.format_exc(limit=2)
            raise gr.Error(f"Generation failed: {e}\n\n{tb}") from e
        finally:
            # Release memory after every request, successful or not.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    if not pieces:
        raise gr.Error("No audio was generated.")

    final_audio = np.concatenate(pieces)
    info = (
        f"Done. Model: {MODEL_ID} | Segments: {len(chunks)} | "
        f"Input characters: {len(text)} | Output seconds: {len(final_audio) / sr:.1f}"
    )
    return (sr, final_audio), info


def app_info():
    """Return the two-sentence description shown below the page title."""
    sentences = (
        "Long text is supported by auto-splitting your input into smaller chunks and stitching the audio together.",
        "There is no small textbox cap or single-pass text cap in the app itself, but the machine and model still have practical limits.",
    )
    return " ".join(sentences)


# Build the page: title, description, text input, trigger button, audio output
# and a read-only status line. Components are created in display order.
with gr.Blocks() as demo:
    gr.Markdown("# Fish Audio S2 Pro Text to Speech")
    gr.Markdown(app_info())
    text = gr.Textbox(
        label="Text",
        lines=14,
        placeholder="Type very long text here. The app will split it into chunks automatically.",
    )
    btn = gr.Button("Generate Speech")
    audio = gr.Audio(label="Audio", type="numpy", show_download_button=True)
    status = gr.Textbox(label="Status", interactive=False)

    # Clicking the button passes the textbox value to synthesize_long; its two
    # return values fill the audio component and the status box respectively.
    btn.click(synthesize_long, inputs=text, outputs=[audio, status], api_name="tts")


# Launch the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()