import gradio as gr
from pydub import AudioSegment, effects
import numpy as np
import librosa
import soundfile as sf
import matplotlib.pyplot as plt
import io
from PIL import Image


def generate_preset_text(rms, zcr, centroid, hp_cutoff, notch_freq):
    preset = "\U0001F39B️ AI Vocal Chain Preset (FL Studio Style)\n\n"

    if rms < 0.01:
        preset += "- Gain: +6dB\n"
        gain_db = 6
    else:
        preset += "- Gain: OK\n"
        gain_db = 0

    if zcr > 0.08:
        preset += "- De-esser: Target 5kHz–8kHz\n"
    else:
        preset += "- De-esser: Not needed\n"

    preset += f"- High-pass filter at {hp_cutoff}Hz (remove mud)\n"
    preset += f"- Notch EQ: -5dB around {int(notch_freq)}Hz (reduce harshness)\n"
    preset += "- High-shelf boost: +4dB above 10kHz (add air)\n"
    preset += "- Fruity Compressor: Ratio 3:1, Threshold -20dB, Fast Attack\n"
    preset += "- Limiter: Output ceiling -1dB (prevent clipping)\n"

    return preset, gain_db


def find_harsh_frequency(y, sr, freq_range=(2000, 5000)):
    S = np.abs(librosa.stft(y))**2
    freqs = librosa.fft_frequencies(sr=sr)
    power = np.mean(S, axis=1)

    mask = (freqs >= freq_range[0]) & (freqs <= freq_range[1])
    freqs_in_range = freqs[mask]
    power_in_range = power[mask]

    if len(freqs_in_range) == 0:
        return 3000

    peak_idx = np.argmax(power_in_range)
    return freqs_in_range[peak_idx]


def apply_notch(audio, center_freq, width=200, reduction_db=-5):
    notch_band = audio.high_pass_filter(center_freq - width).low_pass_filter(center_freq + width)
    notch_cut = notch_band.apply_gain(reduction_db)
    return audio.overlay(notch_cut, gain_during_overlay=reduction_db)


def apply_smart_eq(audio, sr, y):
    try:
        pitch = librosa.yin(y, fmin=50, fmax=300, sr=sr)
        min_pitch = np.nanmin(pitch)
        hp_cutoff = max(70, int(min_pitch * 0.8)) if not np.isnan(min_pitch) else 80
    except:
        hp_cutoff = 80

    notch_freq = find_harsh_frequency(y, sr)

    hp_filtered = audio.high_pass_filter(hp_cutoff)
    notched = apply_notch(hp_filtered, center_freq=notch_freq, width=200, reduction_db=-5)
    air = audio.high_pass_filter(10000).apply_gain(+4)
    eq_result = notched.overlay(air)

    return eq_result, hp_cutoff, notch_freq


def apply_limiter(audio, threshold_db=-1.0):
    normalized = effects.normalize(audio)
    peak_db = normalized.max_dBFS

    if peak_db > threshold_db:
        reduction = threshold_db - peak_db
        limited = normalized.apply_gain(reduction)
    else:
        limited = normalized

    return limited


def generate_waveform_plot(file_path):
    y, sr = librosa.load(file_path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)
    times = np.linspace(0, duration, num=len(y))

    fig, ax = plt.subplots(figsize=(10, 2), facecolor="#1e1e1e")
    ax.plot(times, y, color="violet", linewidth=0.8)
    ax.set_xlim([0, duration])
    ax.set_ylim([-1, 1])
    ax.set_xlabel("Time (s)", color="white")
    ax.set_ylabel("Amplitude", color="white")
    ax.set_title("Processed Vocal Waveform", color="white")
    ax.tick_params(colors='white')
    fig.patch.set_facecolor('#1e1e1e')
    ax.set_facecolor('#1e1e1e')
    plt.tight_layout()

    buf = io.BytesIO()
    plt.savefig(buf, format="png", facecolor=fig.get_facecolor())
    buf.seek(0)
    plt.close()

    return Image.open(buf)


def process_vocal(file):
    if file is None:
        raise ValueError("No file was uploaded.")

    audio = AudioSegment.from_file(file)
    audio.export("input.wav", format="wav")
    y, sr = librosa.load("input.wav", sr=None)

    rms = librosa.feature.rms(y=y)[0]
    zcr = librosa.feature.zero_crossing_rate(y=y)[0]
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]

    eq_audio, hp_cutoff, notch_freq = apply_smart_eq(audio, sr, y)
    preset_text, gain_db = generate_preset_text(np.mean(rms), np.mean(zcr), np.mean(centroid), hp_cutoff, notch_freq)

    with open("vocal_chain_preset.txt", "w") as f:
        f.write(preset_text)

    processed = eq_audio.apply_gain(gain_db)
    processed = effects.compress_dynamic_range(processed, threshold=-20.0, ratio=3.0, attack=5.0, release=50.0)
    processed = apply_limiter(processed, threshold_db=-1.0)
    processed.export("processed_output.wav", format="wav")

    waveform_plot = generate_waveform_plot("processed_output.wav")

    return "processed_output.wav", preset_text, "vocal_chain_preset.txt", waveform_plot

with gr.Blocks(css="""
body {
  background-color: #121212;
  color: white;
  font-family: 'Segoe UI', sans-serif;
}
.gr-button {
  background-color: #4f46e5;
  color: white;
  border-radius: 12px;
  border: none;
  padding: 8px 16px;
  font-weight: bold;
  transition: 0.3s;
}
.gr-button:hover {
  background-color: #6366f1;
}
.gr-textbox, .gr-audio, .gr-file, .gr-image {
  border-radius: 10px;
  background-color: #1e1e1e;
  color: white;
  border: 1px solid #6b21a8;
  width: 100%;
}
.gr-row {
  display: flex;
  justify-content: space-between;
  gap: 10px;
}
""") as interface:
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Your Vocal")
        submit_btn = gr.Button("Submit")

    with gr.Row():
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Processed Vocal")
        with gr.Column(scale=1):
            preset_box = gr.Textbox(label="Suggested Vocal Chain / Preset")

    with gr.Row():
        preset_file = gr.File(label="Download Preset (.txt)")

    with gr.Row():
        waveform_plot = gr.Image(label="Waveform Preview")

    submit_btn.click(
        fn=process_vocal,
        inputs=[audio_input],
        outputs=[audio_output, preset_box, preset_file, waveform_plot]
    )

if __name__ == "__main__":
    interface.launch()