"""Qwen3-TTS Gradio demo: custom-voice / voice-design / voice-clone TTS tabs
plus an audio-to-MP4 "social media video" renderer (Pillow frames + ffmpeg)."""

import gradio as gr
import torch
import numpy as np
import soundfile as sf
from pathlib import Path
import os
import warnings
import subprocess
import shutil
import tempfile
import math

warnings.filterwarnings("ignore", category=UserWarning)

# ────────────────────────────────────────────────
# Lazy import for TTS model (not required for video tab)
# ────────────────────────────────────────────────
try:
    from qwen_tts import Qwen3TTSModel
    TTS_AVAILABLE = True
except ImportError:
    TTS_AVAILABLE = False

# ────────────────────────────────────────────────
# Globals & Model Loader
# ────────────────────────────────────────────────
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}

# Cache of already-loaded models keyed by "<model_key>_<dtype>".
loaded_models = {}


def _tmp_path(filename: str) -> str:
    """Return a writable path in the system temp dir (portable '/tmp/...')."""
    return os.path.join(tempfile.gettempdir(), filename)


def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Load (or fetch from cache) the requested Qwen3-TTS variant on CPU.

    Args:
        model_key: key into ``MODELS``.
        dtype_str: "float32" or "float16".
        progress: Gradio progress reporter.

    Raises:
        gr.Error: if ``qwen_tts`` is unavailable or loading fails.
    """
    if not TTS_AVAILABLE:
        raise gr.Error("qwen_tts is not installed. TTS tabs unavailable.")
    key = f"{model_key}_{dtype_str}"
    if key in loaded_models:
        return loaded_models[key]
    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) …")
    repo_id = MODELS[model_key]
    dtype = torch.float32 if dtype_str == "float32" else torch.float16
    try:
        # NOTE(review): both `dtype` and `torch_dtype` are passed, presumably
        # to cover multiple qwen_tts versions — confirm which one the installed
        # release actually accepts; an extra kwarg may be silently ignored.
        model = Qwen3TTSModel.from_pretrained(
            repo_id,
            device_map="cpu",
            dtype=dtype,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
    except Exception as e:
        raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant.")
    loaded_models[key] = model
    progress(0.9, desc="Model ready.")
    return model


# ────────────────────────────────────────────────
# TTS Inference
# ────────────────────────────────────────────────
def infer_custom_voice(text, lang, speaker, instruct, model_key, precision,
                       progress=gr.Progress()):
    """Synthesize speech with a preset speaker + optional style instruction.

    Returns (audio_path | None, markdown_info_or_error).
    """
    if not text.strip():
        return None, "Please enter some text."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        wavs, sr = model.generate_custom_voice(
            text=text,
            language=lang if lang != "Auto" else None,
            speaker=speaker,
            instruct=instruct.strip() or None,
            max_new_tokens=1500,
        )
        path = _tmp_path("output_custom.wav")
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = (f"**Generated with {model_key}** \nlang: {lang} \n"
                f"speaker: {speaker} \ninstruct: {instruct or '(none)'}")
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"


def infer_voice_design(text, lang, instruct, model_key, precision,
                       progress=gr.Progress()):
    """Synthesize speech for a voice described in natural language.

    Returns (audio_path | None, markdown_info_or_error).
    """
    if not text.strip() or not instruct.strip():
        return None, "Text and voice instruction required."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        wavs, sr = model.generate_voice_design(
            text=text,
            language=lang if lang != "Auto" else None,
            instruct=instruct,
            max_new_tokens=1500,
        )
        path = _tmp_path("output_design.wav")
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Voice Design – {model_key}** \nlang: {lang} \ninstruct: {instruct}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"


def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only,
                      model_key, precision, progress=gr.Progress()):
    """Clone a voice from a short reference clip and speak ``text`` with it.

    Returns (audio_path | None, markdown_info_or_error).
    """
    if not text.strip():
        return None, "Enter text to synthesize."
    if not ref_audio:
        return None, "Upload reference audio."
    model = get_model(model_key, precision, progress)
    progress(0.3, desc="Processing reference …")
    try:
        wavs, sr = model.generate_voice_clone(
            text=text,
            language=lang if lang != "Auto" else None,
            ref_audio=ref_audio,
            ref_text=ref_text.strip() or None,
            x_vector_only_mode=x_vector_only,
            max_new_tokens=1500,
        )
        path = _tmp_path("output_clone.wav")
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = (f"**Voice Clone – {model_key}** \nlang: {lang} \n"
                f"x-vector-only: {x_vector_only}")
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"


# ────────────────────────────────────────────────
# Video Generation Helpers
# ────────────────────────────────────────────────
RESOLUTIONS = {
    "1080×1920 (TikTok/Reels 9:16)": (1080, 1920),
    "1080×1080 (Instagram Square)": (1080, 1080),
    "1920×1080 (YouTube Landscape)": (1920, 1080),
    "1280×720 (YouTube 720p)": (1280, 720),
}

VISUAL_STYLES = [
    "🎙 Solid + Waveform",
    "🌊 Animated Spectrum Bars",
    "⚡ Oscilloscope Line",
    "🌈 Gradient Pulse",
    "🔲 Minimal Dark + Title",
]


def hex_to_rgb(h: str):
    """Convert '#rrggbb' (or shorthand '#rgb') to an (r, g, b) int tuple."""
    h = h.strip().lstrip("#")
    if len(h) == 3:  # expand shorthand, e.g. "f0a" -> "ff00aa"
        h = "".join(c * 2 for c in h)
    return tuple(int(h[i:i + 2], 16) for i in (0, 2, 4))


def _load_font(size: int):
    """Best-effort TrueType font; None falls back to PIL's built-in default."""
    from PIL import ImageFont
    try:
        return ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", size)
    except Exception:
        return None


def render_frame_solid_waveform(w, h, audio_chunk, bg_color, accent_color,
                                title, frame_idx, fps, total_frames=None):
    """Solid background with a centered waveform line."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)
    n = len(audio_chunk)
    if n == 0:
        return img
    cx = h // 2
    bar_w = max(1, w // max(n, 1))
    for i, amp in enumerate(audio_chunk):
        x = int(i * w / n)
        bar_h = int(abs(amp) * h * 0.4)
        draw.rectangle([x, cx - bar_h, x + bar_w - 1, cx + bar_h], fill=accent_color)
    if title:
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255),
                  anchor="mm", font=_load_font(max(20, h // 25)))
    return img


def render_frame_spectrum(w, h, audio_chunk, bg_color, accent_color,
                          title, frame_idx, fps, total_frames=None):
    """Animated spectrum-like bars using FFT."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)
    N_BARS = 64
    n = len(audio_chunk)
    if n > 0:
        spectrum = np.abs(np.fft.rfft(audio_chunk, n=512))[:N_BARS]
        spectrum = spectrum / (spectrum.max() + 1e-9)
    else:
        spectrum = np.zeros(N_BARS)
    bar_w = w // N_BARS
    r, g, b = accent_color
    for i, val in enumerate(spectrum):
        bar_h = int(val * h * 0.8)
        x0 = i * bar_w
        x1 = x0 + bar_w - 2
        # gradient colour: blend from accent toward white as the bar grows
        cr = int(r + val * (255 - r))
        cg = int(g + val * (255 - g))
        cb = int(b + val * (255 - b))
        draw.rectangle([x0, h - bar_h, x1, h], fill=(cr, cg, cb))
    if title:
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255),
                  anchor="mm", font=_load_font(max(20, h // 25)))
    return img


def render_frame_oscilloscope(w, h, audio_chunk, bg_color, accent_color,
                              title, frame_idx, fps, total_frames=None):
    """Classic green-phosphor oscilloscope line (always near-black backdrop)."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), (10, 10, 10))
    draw = ImageDraw.Draw(img)
    n = len(audio_chunk)
    cx = h // 2
    pts = []
    for i in range(n):
        x = int(i * w / n)
        y = int(cx - audio_chunk[i] * h * 0.4)
        y = max(0, min(h - 1, y))
        pts.append((x, y))
    if len(pts) > 1:
        draw.line(pts, fill=accent_color, width=max(2, h // 200))
    if title:
        draw.text((w // 2, h // 12), title, fill=(200, 255, 200),
                  anchor="mm", font=_load_font(max(20, h // 25)))
    return img


def render_frame_gradient_pulse(w, h, audio_chunk, bg_color, accent_color,
                                title, frame_idx, fps, total_frames=None):
    """Radial gradient that pulses with RMS energy (vectorized with NumPy)."""
    from PIL import Image, ImageDraw
    rms = float(np.sqrt(np.mean(audio_chunk ** 2))) if len(audio_chunk) > 0 else 0
    r0, g0, b0 = bg_color
    r1, g1, b1 = accent_color
    cx, cy = w // 2, h // 2
    max_r = math.sqrt(cx ** 2 + cy ** 2)
    pulse = 0.3 + rms * 2.5
    # Vectorized replacement of the former per-pixel Python loop (same math:
    # channel = int(c0 + t * (c1 - c0)) with t clipped to [0, 1]).
    ys, xs = np.mgrid[0:h, 0:w]
    dist = np.sqrt((xs - cx) ** 2 + (ys - cy) ** 2) / max_r
    t = np.clip(1.0 - dist / pulse, 0.0, 1.0)
    arr = np.empty((h, w, 3), dtype=np.uint8)
    arr[..., 0] = (r0 + t * (r1 - r0)).astype(np.uint8)
    arr[..., 1] = (g0 + t * (g1 - g0)).astype(np.uint8)
    arr[..., 2] = (b0 + t * (b1 - b0)).astype(np.uint8)
    img = Image.fromarray(arr, "RGB")
    draw = ImageDraw.Draw(img)
    if title:
        draw.text((cx, h // 10), title, fill=(255, 255, 255),
                  anchor="mm", font=_load_font(max(20, h // 25)))
    return img


def render_frame_minimal_dark(w, h, audio_chunk, bg_color, accent_color,
                              title, frame_idx, fps, total_frames=None):
    """Minimal dark with thin waveform strip + duration-scaled progress bar."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), (18, 18, 22))
    draw = ImageDraw.Draw(img)
    # thin horizontal waveform strip
    strip_h = max(4, h // 15)
    cy = h // 2
    n = len(audio_chunk)
    for i in range(n):
        x = int(i * w / n)
        amp = int(audio_chunk[i] * strip_h)
        draw.rectangle([x, cy - abs(amp), x, cy + abs(amp)], fill=accent_color)
    # bottom progress indicator scaled over the whole clip duration
    if total_frames:
        prog_w = int(w * frame_idx / max(total_frames - 1, 1))
    else:
        # legacy fallback (fills within ~1 s) when total length is unknown
        prog_w = int(frame_idx * w / max(fps * 1, 1))
    draw.rectangle([0, h - 4, prog_w, h], fill=accent_color)
    if title:
        draw.text((w // 2, h * 2 // 5), title, fill=(240, 240, 245),
                  anchor="mm", font=_load_font(max(24, h // 18)))
    return img


RENDERERS = {
    "🎙 Solid + Waveform": render_frame_solid_waveform,
    "🌊 Animated Spectrum Bars": render_frame_spectrum,
    "⚡ Oscilloscope Line": render_frame_oscilloscope,
    "🌈 Gradient Pulse": render_frame_gradient_pulse,
    "🔲 Minimal Dark + Title": render_frame_minimal_dark,
}


def audio_to_video(
    audio_path, style, resolution_label, bg_hex, accent_hex,
    title_text, fps_str, progress=gr.Progress()
):
    """Render an audio file into an MP4 with the chosen visual style.

    Frames are drawn with Pillow into a temp dir, then assembled with ffmpeg
    (libx264 + AAC). Returns (video_path | None, markdown_info_or_error).
    """
    if not audio_path:
        return None, "❌ No audio file provided. Generate or upload audio first."
    if shutil.which("ffmpeg") is None:
        return None, "❌ ffmpeg not found on PATH — install it to enable video export."
    fps = int(fps_str)
    w, h = RESOLUTIONS[resolution_label]
    bg_color = hex_to_rgb(bg_hex)
    accent_color = hex_to_rgb(accent_hex)
    render_fn = RENDERERS[style]

    # ---- Load audio ----
    progress(0.05, desc="Reading audio …")
    try:
        audio_data, sr = sf.read(audio_path, dtype="float32")
    except Exception as e:
        return None, f"❌ Could not read audio: {e}"
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)  # mono
    duration = len(audio_data) / sr
    n_frames = max(1, int(duration * fps))
    n_samples = len(audio_data)

    # ---- Write frames to temp dir ----
    progress(0.10, desc="Rendering frames …")
    with tempfile.TemporaryDirectory() as tmpdir:
        frame_dir = Path(tmpdir) / "frames"
        frame_dir.mkdir()
        # gradient_pulse used to be very slow (per-pixel); now vectorized
        for fi in range(n_frames):
            if fi % max(1, n_frames // 20) == 0:
                progress(0.10 + 0.65 * fi / n_frames, desc=f"Frame {fi}/{n_frames} …")
            # proportional slicing covers every sample (no truncated tail)
            start = fi * n_samples // n_frames
            end = (fi + 1) * n_samples // n_frames
            chunk = audio_data[start:end] if end > start else np.zeros(64)
            img = render_fn(w, h, chunk, bg_color, accent_color, title_text,
                            fi, fps, total_frames=n_frames)
            img.save(str(frame_dir / f"frame_{fi:06d}.png"))

        # ---- Assemble with ffmpeg ----
        progress(0.80, desc="Encoding video …")
        out_path = _tmp_path("tts_video.mp4")
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-framerate", str(fps),
            "-i", str(frame_dir / "frame_%06d.png"),
            "-i", audio_path,
            "-c:v", "libx264", "-preset", "fast", "-crf", "23",
            "-pix_fmt", "yuv420p",
            "-c:a", "aac", "-b:a", "192k",
            "-shortest",
            "-movflags", "+faststart",
            out_path,
        ]
        result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
        if result.returncode != 0:
            return None, f"❌ ffmpeg error:\n```\n{result.stderr[-1500:]}\n```"

    progress(1.0, desc="Done!")
    info = (
        f"✅ **Video ready!** \n"
        f"Style: `{style}` · Resolution: `{w}×{h}` · FPS: `{fps}` · Duration: `{duration:.1f}s`"
    )
    return out_path, info


# ────────────────────────────────────────────────
# UI
# ────────────────────────────────────────────────
css = """
.radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
.radio-row > div { min-width: 140px; }
"""

with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants • CPU-friendly • No streaming")

    # ── Tab 1: Custom Voice ────────────────────────────────────────────────
    with gr.Tab("CustomVoice – Preset speakers + instruct"):
        gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
        with gr.Row(elem_classes="radio-row"):
            cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"],
                                value="1.7B-CustomVoice", label="Model")
            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        with gr.Row():
            cv_text = gr.Textbox(label="Text to speak", lines=4,
                                 value="这是一个测试。希望声音听起来自然一些。")
            cv_lang = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"],
                                  value="Auto", label="Language")
        cv_speaker = gr.Dropdown(
            ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
            value="Vivian", label="Speaker"
        )
        cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2,
                                 placeholder="用特别愤怒的语气说")
        cv_btn = gr.Button("Generate", variant="primary")
        cv_audio = gr.Audio(label="Generated Speech", type="filepath")
        cv_info = gr.Markdown()
        cv_btn.click(infer_custom_voice,
                     inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
                     outputs=[cv_audio, cv_info])

    # ── Tab 2: Voice Design ───────────────────────────────────────────────
    with gr.Tab("Voice Design – Describe any voice"):
        gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)")
        with gr.Row(elem_classes="radio-row"):
            vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        vd_text = gr.Textbox(label="Text to speak", lines=4,
                             value="哥哥,你回来啦,人家等了好久,要抱抱!")
        vd_lang = gr.Dropdown(["Auto", "Chinese", "English"], value="Chinese", label="Language")
        vd_instruct = gr.Textbox(label="Voice description / instruction", lines=4,
                                 value="体现撒娇稚嫩的萝莉女声,音调偏高且起伏明显,黏人、做作又刻意卖萌的感觉")
        vd_btn = gr.Button("Generate", variant="primary")
        vd_audio = gr.Audio(label="Generated Speech", type="filepath")
        vd_info = gr.Markdown()
        vd_btn.click(infer_voice_design,
                     inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
                     outputs=[vd_audio, vd_info])

    # ── Tab 3: Voice Clone ────────────────────────────────────────────────
    with gr.Tab("Base – Voice Clone from reference audio"):
        gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)")
        with gr.Row(elem_classes="radio-row"):
            cl_model = gr.Radio(["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model")
            cl_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        cl_text = gr.Textbox(label="Text to synthesize", lines=4,
                             value="This is my cloned voice now speaking normally.")
        cl_lang = gr.Dropdown(["Auto", "English", "Chinese"], value="Auto", label="Language")
        with gr.Row():
            cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath",
                                    sources=["upload", "microphone"])
            cl_ref_text = gr.Textbox(label="Transcript of reference (optional)", lines=2)
        cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, lower quality)", value=False)
        cl_btn = gr.Button("Clone & Generate", variant="primary")
        cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
        cl_info = gr.Markdown()
        cl_btn.click(infer_voice_clone,
                     inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only,
                             cl_model, cl_precision],
                     outputs=[cl_audio, cl_info])

    # ── Tab 4: Audio → Video ──────────────────────────────────────────────
    with gr.Tab("🎬 Audio → Video"):
        gr.Markdown(
            "## Audio → Social Media Video\n"
            "Upload **any WAV/MP3** (or paste the path from a generated clip above) "
            "and render it into a shareable MP4 with a visual style.\n\n"
            "> ⚠️ **Gradient Pulse** renders per-pixel and is slow for long audio — "
            "prefer other styles for > 30 s clips."
        )
        with gr.Row():
            with gr.Column(scale=3):
                vid_audio = gr.Audio(
                    label="Input audio (upload or record)",
                    type="filepath",
                    sources=["upload", "microphone"],
                )
                vid_title = gr.Textbox(
                    label="Title / caption text (shown on video)",
                    placeholder="My AI Voice · Qwen3-TTS", value=""
                )
            with gr.Column(scale=2):
                vid_style = gr.Radio(
                    VISUAL_STYLES,
                    value="🌊 Animated Spectrum Bars",
                    label="Visual style",
                )
                vid_res = gr.Dropdown(
                    list(RESOLUTIONS.keys()),
                    value="1080×1920 (TikTok/Reels 9:16)",
                    label="Resolution / aspect ratio",
                )
                vid_fps = gr.Radio(["24", "30"], value="24", label="FPS")
                with gr.Row():
                    vid_bg = gr.ColorPicker(value="#0d0d1a", label="Background colour")
                    vid_accent = gr.ColorPicker(value="#7c3aed", label="Accent / waveform colour")
        vid_btn = gr.Button("🎬 Render Video", variant="primary", size="lg")
        vid_out = gr.Video(label="Output video")
        vid_info = gr.Markdown()
        vid_btn.click(
            audio_to_video,
            inputs=[vid_audio, vid_style, vid_res, vid_bg, vid_accent, vid_title, vid_fps],
            outputs=[vid_out, vid_info],
        )
        gr.Markdown("""
**Style guide:**

| Style | Best for | Notes |
|---|---|---|
| 🎙 Solid + Waveform | Podcasts, quotes | Fast, clean |
| 🌊 Animated Spectrum Bars | Music / speech highlights | FFT-based, energetic |
| ⚡ Oscilloscope Line | Dark/techy aesthetic | Classic green-on-black |
| 🌈 Gradient Pulse | Ambient / ASMR | Slow render — use short clips |
| 🔲 Minimal Dark + Title | Branded content | Great with a title caption |
""")

    # ── Footer ────────────────────────────────────────────────────────────
    gr.Markdown("""
**Notes**
• First generation per model loads weights (may take 1–5 min).
• Use **float32** if **float16** causes crashes (common on CPU).
• **0.6B** models are faster / lighter on CPU.
• Video tab requires `ffmpeg` and `Pillow` (both standard on most systems).
• Repo & docs: https://github.com/QwenLM/Qwen3-TTS
""")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)