import spaces
import gradio as gr
import torch
import numpy as np
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Run on the first CUDA device when one is visible, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Identifier of the checkpoint; both the model and its tokenizer are loaded from it.
repo_id = "PHBJT/french_parler_tts_mini_v0.1"

# Loaded once at import time and shared by every call to generate_segment.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Sampling rate reported by the model config; returned alongside the audio by gen_tts.
SAMPLE_RATE = model.config.sampling_rate
# Passed to torch.manual_seed before each generation so every segment starts from the same seed.
SEED = 42
# Default upper bound (in characters) on a segment produced by split_text.
MAX_CHARS_PER_SEGMENT = 200

# Values pre-filled in the two input boxes of the interface.
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = "A male voice speaks very fast with very clear audio, neutral tone, no background noise."

# Example sentences and, at the same index, the voice description paired with each.
_example_texts = (
    default_text,
    "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
    "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
    "Le progrès fait naître plus de besoins qu'il n'en satisfait.",
)
_example_descriptions = (
    default_description,
    (
        "A male voice delivers a slightly expressive and animated speech "
        "with a moderate speed. The recording features a low-pitch voice, "
        "creating a close-sounding audio experience."
    ),
    (
        "A male voice provides a monotone yet slightly fast delivery, "
        "with a very close recording that almost has no background noise."
    ),
    (
        "A female voice, in a very poor recording quality, delivers slightly "
        "expressive and animated words with a fast pace. There's a high level "
        "of background noise and a very distant-sounding reverberation. "
        "The voice is slightly higher pitched than average."
    ),
)

# One [text, description] row per example, in the order of the tuples above.
examples = [
    [sentence, voice]
    for sentence, voice in zip(_example_texts, _example_descriptions)
]


# Single-pass character table used by preprocess_text, built once at import.
_TTS_CHAR_TABLE = str.maketrans(
    {
        # Apostrophes (curly and straight) become spaces.
        "\u2019": " ", "\u2018": " ", "'": " ",
        "À": "A", "Â": "A", "Ä": "A", "à": "a", "â": "a", "ä": "a",
        # É and È are only lower-cased (the accent is kept); lowercase é and è
        # are deliberately absent from the table and pass through unchanged.
        "É": "é", "È": "è", "Ê": "E", "Ë": "E", "ê": "e", "ë": "e",
        "Î": "I", "Ï": "I", "î": "i", "ï": "i",
        "Ô": "O", "Ö": "O", "ô": "o", "ö": "o",
        "Ù": "U", "Û": "U", "Ü": "U", "ù": "u", "û": "u", "ü": "u",
        "Ÿ": "Y", "ÿ": "y",
        "Ç": "C", "ç": "c",
        # Ligatures expand to two plain letters.
        "Œ": "OE", "œ": "oe",
        "Æ": "AE", "æ": "ae",
    }
)


def preprocess_text(text):
    """Rewrite characters of *text* before it is handed to the TTS model.

    Apostrophes (straight and curly) are replaced by spaces, most accented
    letters are replaced by their unaccented form and the ligatures
    œ/æ are expanded. The letters é and è are kept as they are, and
    É/È are mapped to é/è.

    The text is first normalised to NFC so that accented letters written as
    a base letter followed by a combining mark (as produced by some
    keyboards and copy/paste sources) are handled exactly like their
    precomposed form instead of slipping past the table.

    Args:
        text: The raw input string.

    Returns:
        The rewritten string.
    """
    import unicodedata

    composed = unicodedata.normalize("NFC", text)
    return composed.translate(_TTS_CHAR_TABLE)


def split_text(text, max_chars=MAX_CHARS_PER_SEGMENT):
    """Split *text* into segments of at most *max_chars* characters.

    Text that already fits is returned unchanged as a single-element list.
    Otherwise each cut is placed, inside the first *max_chars* characters of
    what is left, at the first of these that exists:

    1. the last sentence-ending mark (``.``, ``!`` or ``?``),
    2. the last secondary mark (``;``, ``:`` or ``,``),
    3. the last space,
    4. the hard limit itself (mid-word cut).

    The punctuation mark stays at the end of the segment it closes. Segments
    are stripped of surrounding whitespace and empty ones are dropped.

    Args:
        text: The string to split.
        max_chars: Maximum length of a segment.

    Returns:
        A list of segment strings in their original order.

    Raises:
        ValueError: If *text* does not fit in one segment and *max_chars*
            is smaller than 1 (no cut could ever make progress).
    """
    if len(text) <= max_chars:
        return [text]

    # Without this guard a non-positive limit yields an empty window on every
    # pass, the remaining text never shrinks and the loop below never ends.
    if max_chars < 1:
        raise ValueError(f"max_chars must be at least 1, got {max_chars!r}")

    segments = []
    remaining = text

    while len(remaining) > max_chars:
        window = remaining[:max_chars]

        # rfind returns -1 when a mark is absent, so max() is -1 only when
        # none of the marks in the group occurs in the window.
        cut_pos = max(window.rfind(mark) for mark in ".!?")
        if cut_pos == -1:
            cut_pos = max(window.rfind(mark) for mark in ";:,")
        if cut_pos == -1:
            cut_pos = window.rfind(" ")
        if cut_pos == -1:
            # Last resort: cut exactly at the limit.
            cut_pos = max_chars - 1

        head = remaining[: cut_pos + 1].strip()
        if head:
            segments.append(head)
        remaining = remaining[cut_pos + 1 :].strip()

    # Whatever is left now fits in a single segment.
    if remaining:
        segments.append(remaining.strip())

    return segments


def trim_silence(audio_arr, threshold=0.005):
    """Strip leading and trailing silence from an audio array.

    A sample counts as silent when its absolute value does not exceed
    *threshold*. If every sample is silent the array is returned untouched.

    Args:
        audio_arr: NumPy array of audio samples.
        threshold: Amplitude above which a sample is considered audible.

    Returns:
        The slice of *audio_arr* running from the first audible sample to
        the last audible sample (inclusive), or *audio_arr* itself when no
        sample is audible.
    """
    audible = np.abs(audio_arr) > threshold
    if not audible.any():
        return audio_arr

    # argmax on a boolean array gives the index of the first True value;
    # applied to the reversed array it counts the silent samples at the end.
    first_audible = np.argmax(audible)
    trailing_silent = np.argmax(audible[::-1])
    return audio_arr[first_audible : len(audible) - trailing_silent]


@spaces.GPU  # <- important for ZeroGPU
def generate_segment(text_segment, description):
    """Generate the audio for one text segment.

    The global torch seed is reset to SEED before every call. The voice
    description is tokenized and passed as ``input_ids``; the text to speak
    is tokenized and passed as ``prompt_input_ids``.

    Args:
        text_segment: The text to synthesise (surrounding whitespace is stripped).
        description: The voice description (surrounding whitespace is stripped).

    Returns:
        The generated audio as a squeezed NumPy array, with leading and
        trailing silence removed by trim_silence.
    """
    torch.manual_seed(SEED)

    voice_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt_tokens = tokenizer(text_segment.strip(), return_tensors="pt").to(device)

    generate_kwargs = {
        "input_ids": voice_tokens.input_ids,
        "prompt_input_ids": prompt_tokens.input_ids,
        "attention_mask": voice_tokens.attention_mask,
        "prompt_attention_mask": prompt_tokens.attention_mask,
        "do_sample": True,
        "temperature": 1.0,
        "max_new_tokens": 1024,
    }
    waveform = model.generate(**generate_kwargs)

    # Move the result to host memory and drop singleton dimensions.
    samples = waveform.cpu().numpy().squeeze()
    return trim_silence(samples)


def gen_tts(text, description):
    """Synthesise *text* with the voice described by *description*.

    The text is preprocessed, split into segments, each segment is generated
    separately and the resulting audio arrays are concatenated in order.

    Args:
        text: The text to speak. ``None``, empty or blank text produces no audio.
        description: The voice description forwarded to generate_segment.

    Returns:
        ``None`` when there is nothing to speak, otherwise a tuple
        ``(SAMPLE_RATE, audio)`` where *audio* is the concatenated NumPy array.
    """
    if not text or not text.strip():
        return None

    text = preprocess_text(text)
    # Preprocessing turns apostrophes into spaces, so input made only of
    # apostrophes is blank at this point; stop here rather than sending a
    # blank prompt to the model or concatenating an empty list.
    if not text.strip():
        return None

    audio_segments = [
        generate_segment(segment, description) for segment in split_text(text)
    ]

    # Join the per-segment audio into one array.
    full_audio = np.concatenate(audio_segments)

    return SAMPLE_RATE, full_audio


# Build the Gradio interface: a header, an input column (text, voice
# description, button) and an output column holding the audio component.
with gr.Blocks() as demo:
    # Page header: title plus a one-line French tagline.
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
          <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
            French Parler-TTS 🗣️
          </h1>
          <p>Génération de voix française avec Parler-TTS Mini (ZeroGPU).</p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column():
            # Text to synthesise, pre-filled with default_text.
            input_text = gr.Textbox(
                label="Texte d'entrée",
                lines=3,
                value=default_text,
            )
            # Voice description, pre-filled with default_description.
            description = gr.Textbox(
                label="Description de la voix",
                lines=3,
                value=default_description,
            )
            run_button = gr.Button("Générer l'audio", variant="primary")

        with gr.Column():
            # Receives the value returned by gen_tts: a (SAMPLE_RATE, array)
            # tuple, or None when there is nothing to speak.
            audio_out = gr.Audio(
                label="Parler-TTS generation",
                type="numpy",
            )

    # Clicking the button runs gen_tts on the two textboxes and sends the
    # result to the audio component.
    run_button.click(
        fn=gen_tts,
        inputs=[input_text, description],
        outputs=[audio_out],
        queue=True,
    )

    # Example rows use the same inputs, output and function as the button.
    # cache_examples=False: example outputs are not cached.
    gr.Examples(
        examples=examples,
        inputs=[input_text, description],
        outputs=[audio_out],
        fn=gen_tts,
        cache_examples=False,
    )

# Executed at import time (there is no __main__ guard): enable the queue,
# then start the app.
demo.queue()
demo.launch()