import gc
import io
import asyncio
import numpy as np
import soundfile as sf
import torch
import os

import bark
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from bark import generate_audio, preload_models
from typing import List, Dict
from pydantic import BaseModel
from typing import List, Optional
# =====================================================
# APP
# =====================================================
# ASGI application object; the route handlers below register themselves on it.
app = FastAPI(title="Bark TTS Production API")

# =====================================================
# LOAD MODELS ON STARTUP
# =====================================================
# Runs once at import time, before the server can dispatch any request to the
# routes defined below.
preload_models()

# =====================================================
# CONCURRENCY CONTROL
# =====================================================

# Caps how many /tts requests may be inside the generation section at once (20).
GLOBAL_SEMAPHORE = asyncio.Semaphore(20)

# One single-slot semaphore per voice name, created lazily by get_voice_lock(),
# so that requests for the same voice are processed one at a time.
VOICE_LOCKS: Dict[str, asyncio.Semaphore] = {}

VOICE_LOCKS_LOCK = asyncio.Lock()  # serializes lookups/insertions in VOICE_LOCKS


async def get_voice_lock(voice: str) -> asyncio.Semaphore:
    """Return the single-slot semaphore for ``voice``, creating it on first use.

    The registry is only read or written while ``VOICE_LOCKS_LOCK`` is held,
    so every caller asking for the same voice receives the same semaphore.
    """
    async with VOICE_LOCKS_LOCK:
        try:
            return VOICE_LOCKS[voice]
        except KeyError:
            # First request for this voice: register a fresh one-at-a-time gate.
            created = VOICE_LOCKS[voice] = asyncio.Semaphore(1)
            return created

# =====================================================
# METADATA
# =====================================================
# Maps the expression keys accepted in TTSRequest.expressions to the literal
# text inserted into the prompt sent to Bark. The whole table is also returned
# to clients by GET /meta so they can discover the available keys.
EXPRESSIONS = {
    # --------------------------------------------------
    # Breathing & Airflow
    # --------------------------------------------------
    "breathing": "[breathing]",
    "slow_breathing": "[slow breathing]",
    "heavy_breathing": "[heavy breathing]",
    "shallow_breathing": "[shallow breathing]",
    "deep_inhale": "[inhales deeply]",
    "deep_exhale": "[exhales]",
    "sharp_inhale": "[sharp inhale]",
    "gasp": "[gasps]",
    "panting": "[panting]",
    "catching_breath": "[catching breath]",

    # --------------------------------------------------
    # Sighs & Relief
    # --------------------------------------------------
    "sigh": "[sighs]",
    "relieved_sigh": "[sighs with relief]",
    "frustrated_sigh": "[frustrated sigh]",
    "tired_sigh": "[tired sigh]",

    # --------------------------------------------------
    # Hesitation & Thinking
    # --------------------------------------------------
    "hesitation": "...",
    "hesitation_dots": "...",
    "hesitation_dashes": "—",
    "thinking": "[thinking]",
    "thinking_hum": "[hums thoughtfully]",
    "unsure_hum": "[hesitant hum]",
    "pause": "[pause]",

    # --------------------------------------------------
    # Laughter & Amusement
    # --------------------------------------------------
    "laughter": "[laughter]",
    "laughs": "[laughs]",
    "soft_laugh": "[soft laugh]",
    "nervous_laugh": "[nervous laugh]",
    "awkward_laugh": "[awkward laugh]",
    "giggle": "[giggles]",
    "chuckle": "[chuckles]",
    "snort": "[snorts]",

    # --------------------------------------------------
    # Effort & Exertion
    # --------------------------------------------------
    "grunt": "[grunts]",
    "strain": "[straining]",
    "effort": "[exertion]",
    "groan_effort": "[groans]",
    "release_breath": "[releases breath]",
    "fatigue": "[fatigued breath]",

    # --------------------------------------------------
    # Surprise & Reaction
    # --------------------------------------------------
    "surprised_gasp": "[surprised gasp]",
    "startled": "[startled]",
    "shock": "[shocked inhale]",
    "oh_reaction": "[sudden inhale]",

    # --------------------------------------------------
    # Pain / Distress (non-graphic)
    # --------------------------------------------------
    "pain_grunt": "[painful grunt]",
    "wince": "[winces]",
    "distress": "[distressed breathing]",
    "strained_voice": "[strained voice]",

    # --------------------------------------------------
    # Crying & Sadness (non-graphic)
    # --------------------------------------------------
    "sob": "[sobs]",
    "quiet_sob": "[quiet sob]",
    "whimper": "[whimpers]",
    "soft_cry": "[soft crying]",
    "tearful_breath": "[tearful breathing]",

    # --------------------------------------------------
    # Approval / Disapproval
    # --------------------------------------------------
    "approval_hum": "[approving hum]",
    "satisfied_hum": "[satisfied hum]",
    "disappointed": "[disappointed sigh]",
    "annoyed": "[annoyed exhale]",
    "dismissive": "[dismissive huff]",

    # --------------------------------------------------
    # Sleep & Tiredness
    # --------------------------------------------------
    "yawn": "[yawns]",
    "sleepy_breath": "[sleepy breathing]",
    "groggy": "[groggy inhale]",
    "relaxed_exhale": "[relaxed exhale]",

    # --------------------------------------------------
    # Ambient / Texture
    # --------------------------------------------------
    "murmur": "[murmuring]",
    "background_voice": "[background voice]",
    "indistinct_vocal": "[indistinct vocal sounds]",
    "muffled_voice": "[muffled voice]",

    # --------------------------------------------------
    # Voice Modifiers (approximate)
    # --------------------------------------------------
    "whisper": "(whispers)",
    "shout": "(shouting)",
    "breathy_voice": "[breathy voice]",
    "raspy_voice": "[raspy voice]",
    "trembling_voice": "[trembling voice]",

    # --------------------------------------------------
    # Misc
    # --------------------------------------------------
    "clears_throat": "[clears throat]"
}


# Maps the sound-effect keys accepted in TTSRequest.sounds to the literal text
# inserted into the prompt sent to Bark. The whole table is also returned to
# clients by GET /meta.
# NOTE(review): "song_notes" carries a "{text}" placeholder -- presumably meant
# to wrap the spoken text; confirm the consumer substitutes it.
SOUND_EFFECTS = {
    # --------------------------------------------------
    # Music & Musical Cues (approximate)
    # --------------------------------------------------
    "music": "[music]",
    "background_music": "[background music]",
    "music_fade_in": "[music fades in]",
    "music_fade_out": "[music fades out]",
    "humming_music": "[humming]",
    "song_notes": "♪ {text} ♪",
    "melody": "[melodic humming]",

    # --------------------------------------------------
    # Cinematic / Dramatic
    # --------------------------------------------------
    "dramatic_hit": "*dramatic cinematic sound*",
    "dramatic_pause": "*dramatic pause*",
    "dramatic_silence": "*sudden silence*",
    "impact": "*impact sound*",
    "tension_rise": "*rising tension sound*",
    "reveal": "*cinematic reveal sound*",

    # --------------------------------------------------
    # Radio / Audio Artifacts
    # --------------------------------------------------
    "static": "*radio static noise*",
    "interference": "*audio interference*",
    "signal_drop": "*signal breaking up*",
    "distortion": "*distorted audio*",
    "low_quality": "*low quality audio*",
    "glitch": "*audio glitch*",

    # --------------------------------------------------
    # Echo / Space / Environment
    # --------------------------------------------------
    "echo": "*echoing voice*",
    "reverb": "*reverberation*",
    "hall_echo": "*large hall echo*",
    "distance_voice": "*distant voice*",
    "muffled": "*muffled sound*",
    "underwater": "*underwater sound*",

    # --------------------------------------------------
    # Movement / Presence (vocal-texture based)
    # --------------------------------------------------
    "footsteps": "*footsteps*",
    "approaching": "*approaching sound*",
    "departing": "*fading footsteps*",
    "rustle": "*rustling sound*",
    "cloth_movement": "*fabric movement*",

    # --------------------------------------------------
    # Environmental Ambience (approximate)
    # --------------------------------------------------
    "wind": "*wind blowing*",
    "rain": "*rain falling*",
    "storm": "*distant thunder*",
    "crowd": "*crowd murmuring*",
    "room_tone": "*room tone*",
    "outdoor_ambience": "*outdoor ambience*",

    # --------------------------------------------------
    # Mechanical / Tech (very rough)
    # --------------------------------------------------
    "beep": "*beep*",
    "alarm": "*alarm sound*",
    "machine_hum": "*machine humming*",
    "engine": "*engine rumble*",
    "power_on": "*powering on sound*",
    "power_off": "*powering down sound*",

    # --------------------------------------------------
    # Creature / Non-human (unstable)
    # --------------------------------------------------
    "growl": "*low growl*",
    "roar": "*distant roar*",
    "rumble": "*low rumble*",
    "alien": "*alien-like sound*",

    # --------------------------------------------------
    # Silence / Timing Control
    # --------------------------------------------------
    "pause": "*pause*",
    "short_pause": "*brief pause*",
    "long_pause": "*long pause*"
}


# Maps the optional TTSRequest.style key to a descriptive phrase that the /tts
# handler appends to the prompt prefix. Also returned to clients by GET /meta.
STYLE_HINTS = {
    "narration": "in a calm, professional narration tone",
    "emotional": "with deep emotional expression",
    "robotic": "with a robotic, synthetic delivery",
    "space": "as if speaking through a helmet in space with echo",
    "cinematic": "cinematic, dramatic delivery",
}

# Language names advertised to clients through GET /meta. Nothing else in this
# file reads the list; it is informational only.
LANGUAGES = [
    "English", "French", "Spanish", "German", "Italian",
    "Portuguese", "Polish", "Russian", "Turkish",
    "Chinese", "Japanese", "Korean", "Hindi"
]

def load_all_bark_voice_presets():
    """Return the sorted names of every voice preset bundled with Bark.

    Scans ``<bark package>/assets/prompts`` recursively for ``.npz`` files.
    Each returned name is the file's path relative to that directory with the
    trailing ``.npz`` removed and ``/`` as the separator, e.g.
    ``en_speaker_0`` for a top-level file or ``v2/en_speaker_0`` for one in
    the ``v2`` sub-folder (the naming Bark uses for its preset library).

    Returns:
        A sorted list of preset names; empty when the prompts directory
        does not exist.
    """
    suffix = ".npz"
    presets = []

    bark_root = os.path.dirname(bark.__file__)
    prompts_dir = os.path.join(bark_root, "assets", "prompts")

    if not os.path.isdir(prompts_dir):
        return presets

    # Walk sub-folders too: a flat listdir() would silently skip nested presets.
    for root, _dirs, files in os.walk(prompts_dir):
        rel_root = os.path.relpath(root, prompts_dir)
        # Top-level presets get no prefix; nested ones get "sub/dir/" with
        # forward slashes regardless of the OS path separator.
        if rel_root == os.curdir:
            prefix = ""
        else:
            prefix = rel_root.replace(os.sep, "/") + "/"

        for file in files:
            if file.endswith(suffix):
                # Strip only the trailing extension; str.replace() would also
                # remove ".npz" occurring in the middle of a file name.
                presets.append(prefix + file[: -len(suffix)])

    presets.sort()
    return presets


# Computed once at import time. /tts validates the requested voice against this
# list and GET /meta returns it to clients.
VOICE_PRESETS = load_all_bark_voice_presets()

# =====================================================
# UTILS
# =====================================================
def chunk_text(text: str, max_chars: int = 200) -> List[str]:
    """Split ``text`` into whitespace-normalized chunks of at most ``max_chars``.

    Words are never broken: chunks are built greedily from whole words joined
    by single spaces, and a new chunk is started as soon as adding the next
    word would push the current one past ``max_chars``. A single word longer
    than ``max_chars`` therefore becomes its own (oversized) chunk.

    Args:
        text: The text to split. Any run of whitespace counts as one separator.
        max_chars: Upper bound on the length of each chunk.

    Returns:
        The chunks in order; an empty list for empty or whitespace-only text.
    """
    chunks: List[str] = []
    current: List[str] = []
    current_len = 0  # length of " ".join(current), tracked incrementally

    for word in text.split():
        # +1 accounts for the joining space when the chunk already has words.
        projected = current_len + len(word) + (1 if current else 0)
        if current and projected > max_chars:
            # Flush BEFORE adding the word so the chunk stays within the limit.
            chunks.append(" ".join(current))
            current = [word]
            current_len = len(word)
        else:
            current.append(word)
            current_len = projected

    if current:
        chunks.append(" ".join(current))
    return chunks


def clean_memory():
    """Run a garbage-collection pass and, when CUDA is present, empty its cache."""
    gc.collect()
    if not torch.cuda.is_available():
        # CPU-only environment: nothing further to release.
        return
    torch.cuda.empty_cache()

# =====================================================
# ROUTES
# =====================================================
@app.get("/meta")
def meta():
    """Describe the options a client may send to /tts and the service limits."""
    payload = {}
    payload["languages"] = LANGUAGES
    payload["voices"] = VOICE_PRESETS
    payload["expressions"] = EXPRESSIONS
    payload["sound_effects"] = SOUND_EFFECTS
    payload["styles"] = STYLE_HINTS
    # Concurrency characteristics advertised to clients.
    payload["limits"] = {
        "max_concurrent_users": 20,
        "per_voice_sequential": True,
    }
    return payload


class TTSRequest(BaseModel):
    """JSON body accepted by ``POST /tts``."""

    # Text to synthesize; the handler rejects empty or whitespace-only values.
    text: str
    # Voice preset name; the handler requires it to be listed in VOICE_PRESETS.
    voice: str
    # Keys of EXPRESSIONS to add to the prompt; unknown keys are ignored.
    expressions: List[str] = []
    # Keys of SOUND_EFFECTS to add to the prompt; unknown keys are ignored.
    sounds: List[str] = []
    # Optional key of STYLE_HINTS; ignored when missing or unknown.
    style: Optional[str] = None


def _generate_chunk(chunk: str, voice: str):
    """Synthesize one text chunk with Bark, then release cached memory.

    Meant to run in a worker thread so neither the generation nor the
    garbage-collection pass blocks the event loop.
    """
    audio_array = generate_audio(chunk, history_prompt=voice)
    clean_memory()
    return audio_array


def _encode_wav(audio_parts) -> bytes:
    """Concatenate the audio arrays and encode them as one in-memory WAV file.

    Meant to run in a worker thread (array copy + encoding are CPU work).
    """
    buffer = io.BytesIO()
    # Bark's default sample rate is 24kHz
    sf.write(buffer, np.concatenate(audio_parts), 24000, format="WAV")
    clean_memory()
    return buffer.getvalue()


@app.post("/tts")
async def tts(req: TTSRequest):
    """Synthesize ``req.text`` with the requested voice and return WAV audio.

    The text is split into chunks, each chunk is generated sequentially with
    Bark, and the results are concatenated into a single ``audio/wav``
    response. Known expression/sound/style keys are turned into a prompt
    prefix on the first chunk; unknown keys are ignored. Sound effects whose
    template contains ``{text}`` (e.g. ``song_notes``) wrap every chunk
    instead of being added to the prefix.

    Raises:
        HTTPException: 400 when the text is blank or the voice is unknown;
            500 when no audio could be generated.
    """
    text = req.text
    voice = req.voice

    # 1. Validation
    if not text.strip():
        raise HTTPException(status_code=400, detail="Text is required")

    if voice not in VOICE_PRESETS:
        raise HTTPException(status_code=400, detail=f"Invalid voice preset: {voice}")

    # 2. Build the prompt prefix (expressions, sounds, style - in that order)
    prefix_parts = [EXPRESSIONS[e] for e in req.expressions if e in EXPRESSIONS]

    wrappers = []
    for s in req.sounds:
        if s not in SOUND_EFFECTS:
            continue
        template = SOUND_EFFECTS[s]
        if "{text}" in template:
            # A template is applied around the spoken text; sending it as-is
            # would feed the literal "{text}" placeholder to the model.
            if template not in wrappers:
                wrappers.append(template)
        else:
            prefix_parts.append(template)

    # NOTE(review): style hints are added as plain prompt text; confirm Bark
    # treats them as direction rather than speaking them aloud.
    if req.style and req.style in STYLE_HINTS:
        prefix_parts.append(STYLE_HINTS[req.style])

    prefix = " ".join(prefix_parts).strip()

    # 3. Chunk ONLY the spoken text, then apply any wrapping templates
    chunks = chunk_text(text)
    for template in wrappers:
        chunks = [template.replace("{text}", c) for c in chunks]

    # Prepend the prefix (sounds/expressions) only to the very first chunk
    if prefix and chunks:
        chunks[0] = f"{prefix} {chunks[0]}"

    # 4. Acquire locks. The per-voice lock is taken FIRST so that requests
    # queued behind a busy voice do not occupy global slots and starve
    # requests for other voices.
    voice_lock = await get_voice_lock(voice)

    async with voice_lock:
        async with GLOBAL_SEMAPHORE:
            audio_out = []

            # 5. Generate audio sequentially, one worker-thread call per chunk
            for chunk in chunks:
                # Bark can sometimes fail on completely empty strings, skip if empty
                if not chunk.strip():
                    continue
                audio_out.append(
                    await asyncio.to_thread(_generate_chunk, chunk, voice)
                )

            # 6. Combine and package the audio
            if not audio_out:
                raise HTTPException(status_code=500, detail="Failed to generate audio.")

            wav_bytes = await asyncio.to_thread(_encode_wav, audio_out)

    return Response(content=wav_bytes, media_type="audio/wav")