"""HARP / Gradio endpoint: background noise removal with DeepFilterNet3.

Audio is converted to mono 48 kHz, denoised by DeepFilterNet3, and blended
with the original via a wet/dry slider expressed in dB of attenuation.
"""

import os
import tempfile
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
from df import enhance, init_df
from pyharp import ModelCard, build_endpoint

# DeepFilterNet operates at 48 kHz; all input is resampled to this rate.
TARGET_SR = 48000

# -----------------------------
# Model metadata for HARP
# -----------------------------
model_card = ModelCard(
    name="Background Noise Remover (DeepFilterNet3)",
    description=(
        "Background noise suppression / speech enhancement using DeepFilterNet3. "
        "Input is converted to mono 48kHz. Slider controls strength through wet/dry blend."
    ),
    author="Derek Llanes",
    tags=["denoise", "speech enhancement", "deepfilternet", "v3"],
)

# -----------------------------
# Device & Model Initialization
# -----------------------------
# Auto-detect GPU for Hugging Face deployment.
DEVICE_STR = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(DEVICE_STR)

# Load the model once into global memory so every request reuses it.
MODEL, DF_STATE, _ = init_df()
MODEL = MODEL.to(DEVICE)


def load_audio_mono_48k(path: str) -> tuple[np.ndarray, int]:
    """Load audio from *path* as mono float32 at 48 kHz.

    Tries soundfile first; falls back to librosa for formats soundfile
    cannot decode.

    Returns:
        (audio, sr): 1-D float32 samples and the sample rate (always 48000).
    """
    try:
        audio, sr = sf.read(path, always_2d=False)
        audio = np.asarray(audio)
        # soundfile returns (frames, channels) for multichannel files,
        # so averaging axis=1 is the correct downmix HERE ONLY.
        if audio.ndim == 2:
            audio = audio.mean(axis=1)
    except Exception:
        # librosa returns channels-FIRST arrays, so averaging axis=1 would
        # average over time instead of channels. Let mono=True perform the
        # correct channel downmix for this decoder.
        audio, sr = librosa.load(path, sr=None, mono=True)
        audio = np.asarray(audio)

    audio = audio.astype(np.float32)

    if sr != TARGET_SR:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR
    return audio, sr


def apply_attenuation_db(
    noisy: np.ndarray,
    enhanced: np.ndarray,
    noise_atten_db: float,
    max_db: float = 30.0,
) -> np.ndarray:
    """Crossfade *noisy* and *enhanced* by a dB-slider-derived strength.

    Args:
        noisy: original mono signal.
        enhanced: denoised mono signal (may differ slightly in length).
        noise_atten_db: slider value; 0 = passthrough, max_db = fully enhanced.
        max_db: slider value that maps to full strength (1.0).

    Returns:
        float32 blend of the two signals, trimmed to the shorter length.
    """
    # Map the dB slider onto a wet/dry mix factor in [0, 1].
    # Guard max_db == 0 to avoid a ZeroDivisionError (treat as full strength).
    strength = float(noise_atten_db) / float(max_db) if max_db else 1.0
    strength = max(0.0, min(1.0, strength))

    # The enhancer can return a slightly different length; trim both signals
    # so the crossfade is sample-aligned.
    n = min(len(noisy), len(enhanced))
    mixed = (1.0 - strength) * noisy[:n] + strength * enhanced[:n]
    return mixed.astype(np.float32)


@torch.inference_mode()
def process_fn(input_audio_path: str, noise_atten_db: float) -> str:
    """Denoise the file at *input_audio_path* and return the output WAV path.

    Raises:
        ValueError: if no input path was provided.
    """
    if not input_audio_path:
        raise ValueError("No input audio provided.")

    # Load and normalize to mono 48 kHz.
    noisy, sr = load_audio_mono_48k(input_audio_path)

    # [T] -> [1, T]: enhance() expects a leading channel dimension.
    noisy_t = torch.from_numpy(noisy).float().unsqueeze(0).to(DEVICE)

    # Denoise, then drop the channel dim and move back to numpy.
    enhanced_t = enhance(MODEL, DF_STATE, noisy_t)
    enhanced = enhanced_t.squeeze(0).detach().cpu().numpy()

    # Apply slider strength as a wet/dry blend.
    out = apply_attenuation_db(noisy, enhanced, noise_atten_db, max_db=30.0)

    # Write to a UNIQUE temp file so concurrent/queued requests cannot
    # clobber each other's output (a fixed filename would race).
    out_dir = Path(tempfile.gettempdir()) / "pyharp_dfnet_outputs"
    out_dir.mkdir(parents=True, exist_ok=True)
    fd, out_path = tempfile.mkstemp(suffix=".wav", dir=out_dir)
    os.close(fd)  # sf.write reopens the file by path
    sf.write(out_path, out, sr)
    return out_path


# -----------------------------
# Gradio endpoint
# -----------------------------
with gr.Blocks() as demo:
    input_components = [
        gr.Audio(type="filepath", label="Input Audio").harp_required(True),
        gr.Slider(
            minimum=0,
            maximum=30,
            step=1,
            value=12,
            label="Noise Attenuation (dB)",
            info="0 = no change, 30 = strongest. Implemented as wet/dry strength.",
        ),
    ]
    output_components = [
        gr.Audio(type="filepath", label="Output Audio")
        .set_info("Denoised audio output."),
    ]
    app = build_endpoint(
        model_card=model_card,
        input_components=input_components,
        output_components=output_components,
        process_fn=process_fn,
    )

demo.queue().launch(show_error=False, pwa=True)