import os
import base64
import tempfile
import gradio as gr
from pathlib import Path
import requests
from mistralai.client import Mistral

def list_user_voices():
    """Return a Markdown summary of the voices available to the account.

    Requests the first 100 voices from the Mistral API and renders one bullet
    per voice (name, ID, languages). Kept as a debugging aid.

    Returns:
        str: The Markdown listing, a "no voices" notice when the reported
        total is zero, or an ``"Error fetching voices: ..."`` message when
        the lookup fails (ordinary exceptions are not propagated).
    """
    try:
        client = get_client()
        result = client.audio.voices.list(limit=100, offset=0)
        if result.total == 0:
            return "No voices found in your account."
        # Collect fragments and join once instead of growing a string with +=.
        # Note: the header shows the reported total, which can exceed the
        # (at most 100) items actually listed below.
        parts = [f"**Total Voices:** {result.total}\n\n"]
        for voice in result.items:
            # A missing *or* None `languages` falls back to "unknown"; the
            # previous hasattr() check let None through to ', '.join(None),
            # which turned the whole listing into an error message.
            languages = getattr(voice, "languages", None)
            lang_text = ", ".join(languages) if languages is not None else "unknown"
            parts.append(
                f"- **{voice.name}**\n  - ID: `{voice.id}`\n  - Languages: {lang_text}\n"
            )
        return "".join(parts)
    except Exception as e:
        return f"Error fetching voices: {str(e)}"

def get_voice_choices():
    """Build the ``(label, voice_id)`` choices for the TTS voice dropdown.

    Only voices whose name starts with one of the official speaker names
    (Paul, Oliver, Jane, Marie) and contains ``" - "`` are kept, which hides
    arbitrary cloned user voices.

    Returns:
        list: ``(voice name, voice id)`` tuples, or an empty list when the
        voices cannot be fetched (missing API key, network/API error, ...).
    """
    official_names = ("Paul", "Oliver", "Jane", "Marie")
    try:
        client = get_client()
        res = client.audio.voices.list(limit=100, offset=0)
        return [
            (v.name, v.id)
            for v in res.items
            # Skip nameless entries instead of letting one of them discard
            # every other choice via an AttributeError.
            if v.name and v.name.startswith(official_names) and " - " in v.name
        ]
    except Exception:
        # Deliberately best-effort: callers need a list even when the API is
        # unreachable. `except Exception` (rather than a bare `except`) keeps
        # KeyboardInterrupt / SystemExit propagating.
        return []


# ─── Client ───────────────────────────────────────────────────────────────────
def get_client():
    """Create a Mistral client from the ``MISTRAL_API_KEY`` environment variable.

    Returns:
        Mistral: A client constructed with the configured API key.

    Raises:
        gr.Error: If ``MISTRAL_API_KEY`` is unset or empty.
    """
    key = os.environ.get("MISTRAL_API_KEY")
    if key:
        return Mistral(api_key=key)
    raise gr.Error("MISTRAL_API_KEY secret is not set. Add it in Space Settings → Secrets.")


# ─── STT ──────────────────────────────────────────────────────────────────────
def transcribe_audio(audio_path, language):
    """Transcribe an audio file with the ``voxtral-mini-latest`` model.

    Args:
        audio_path: Filesystem path of the recording, or ``None`` when no
            audio was supplied.
        language: Language code to send to the API. ``"Auto-detect"`` (or an
            empty value) means no language hint is sent.

    Returns:
        str: The transcription text on success, otherwise a user-facing
        warning (no audio) or ``"❌ Error: ..."`` message.
    """
    if audio_path is None:
        return "⚠️ Please record or upload an audio file first."

    try:
        mistral = get_client()
        with open(audio_path, "rb") as audio_file:
            request = {
                "model": "voxtral-mini-latest",
                "file": {"content": audio_file, "file_name": Path(audio_path).name},
            }
            # Only forward a real language code; the sentinel is dropped.
            if language != "Auto-detect" and language:
                request["language"] = language
            # The call stays inside the `with` so the file is still open
            # while the request is made.
            result = mistral.audio.transcriptions.complete(**request)
        return result.text
    except Exception as exc:
        return f"❌ Error: {str(exc)}"


# ─── TTS ──────────────────────────────────────────────────────────────────────
# Display label → voice ID mapping; its single entry maps to None (no voice
# ID). Not referenced by the other code visible in this file.
BUILTIN_VOICES = {"Default (no voice clone)": None}

def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
    """Convert text → speech using Voxtral Mini TTS.

    Args:
        text: Text to speak. ``None``, empty and whitespace-only values are
            rejected with a warning.
        voice_id_input: ID of a saved voice; ``None`` or blank means no saved
            voice is requested.
        ref_audio_path: Optional path of a reference clip; when given, its
            bytes are sent base64-encoded as ``ref_audio``.
        audio_format: Output format (e.g. ``"mp3"``); sent as
            ``response_format`` and used as the temp file's suffix.

    Returns:
        tuple: ``(audio_file_path, status_message)`` on success, or
        ``(None, message)`` when validation or the API call fails.
    """
    # `not text` guards against None, which would otherwise raise
    # AttributeError on .strip() before the try block is entered.
    if not text or not text.strip():
        return None, "⚠️ Please enter some text."
    try:
        client = get_client()
        voice_id = (voice_id_input or "").strip() or None

        # Raised inside the try on purpose: the handler below turns it into
        # the regular "❌ Error: ..." status instead of an exception.
        if not voice_id and not ref_audio_path:
            raise gr.Error("Mistral API requires a voice! Please either upload a short 'Reference Audio' clip to define the voice tone zero-shot, OR paste a valid Voice ID you cloned in the Voice Cloning tab. There are no built-in standard voices.")

        kwargs = dict(
            model="voxtral-mini-tts-2603",
            input=text,
            response_format=audio_format,
        )
        if voice_id:
            kwargs["voice_id"] = voice_id

        # Add Reference Audio for Zero-shot tone/voice cloning
        if ref_audio_path:
            with open(ref_audio_path, "rb") as f:
                kwargs["ref_audio"] = base64.b64encode(f.read()).decode("utf-8")

        response = client.audio.speech.complete(**kwargs)
        audio_bytes = base64.b64decode(response.audio_data)

        # delete=False because the path is handed back to the caller and must
        # outlive this function; the context manager still guarantees the
        # handle is closed even if the write fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{audio_format}") as tmp:
            tmp.write(audio_bytes)
        return tmp.name, f"✅ Generated {len(audio_bytes):,} bytes of {audio_format.upper()} audio."
    except Exception as e:
        return None, f"❌ Error: {str(e)}"


# ─── Voice Cloning ────────────────────────────────────────────────────────────
def _stream_url_to_file(url, destination):
    """Stream the HTTP response body of *url* into the file at *destination*."""
    with requests.get(url, stream=True, timeout=15) as r:
        r.raise_for_status()
        with open(destination, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)


def _extract_audio_with_ytdlp(url, base_out):
    """Download *url* with yt-dlp, convert it to MP3 and return the MP3 path.

    Raises ``gr.Error`` with a user-facing message when the download fails.
    """
    import yt_dlp
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': base_out + '.%(ext)s',
        'quiet': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '128',
        }],
        'postprocessor_args': [
            '-t', '60' # Limit to first 60 seconds
        ],
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(url, download=True)
    except Exception as e:
        err_msg = str(e)
        if "Sign in to confirm" in err_msg or "bot" in err_msg.lower() or "youtube" in err_msg.lower():
            raise gr.Error("YouTube blocked the Hugging Face Server. Please use a TikTok/Twitter link, OR paste a direct .MP3 URL, OR upload the file manually.") from e
        raise gr.Error(f"Video download failed: {err_msg}") from e
    # The FFmpegExtractAudio post-processor is configured for mp3 output.
    return base_out + '.mp3'


def clone_voice(audio_path, url_input, voice_name, gender, languages_str):
    """Upload a sample audio or provide a URL to create a reusable cloned voice.

    When a URL is given it takes precedence over ``audio_path``: direct audio
    links (.mp3/.wav/.flac/.ogg/.m4a) are streamed with ``requests``, any
    other URL is handed to yt-dlp.

    Args:
        audio_path: Path of an uploaded/recorded sample, or ``None``.
        url_input: Media URL, or blank/``None`` when a file is used instead.
        voice_name: Name for the new voice (required).
        gender: Gender label; sent lower-cased to the API.
        languages_str: Comma-separated language codes; defaults to ``["en"]``
            when blank.

    Returns:
        tuple: ``(status_markdown, dropdown_update)``. On success the update
        carries the refreshed choices with the new voice selected; on any
        failure it is a no-op ``gr.update()`` and the status holds the error.
    """
    # Normalise once so None inputs are treated as blank instead of raising
    # AttributeError before the try block.
    url = (url_input or "").strip()
    name = (voice_name or "").strip()
    if not audio_path and not url:
        return "⚠️ Please upload an audio clip or provide a media URL.", gr.update()
    if not name:
        return "⚠️ Please enter a name for the voice.", gr.update()

    try:
        # Downloads land in a private temporary directory instead of a
        # tempfile.mktemp() name (deprecated and race-prone). The directory is
        # removed on every exit path, so failed downloads or API errors no
        # longer leave files behind.
        with tempfile.TemporaryDirectory() as work_dir:
            final_audio_path = audio_path
            if url:
                # NOTE(review): `url` is user-supplied and fetched server-side
                # with no host allow-list or size cap — consider restricting
                # this before exposing the app publicly.
                base_out = os.path.join(work_dir, "sample")
                # If it's a direct audio file link, bypass yt-dlp and download it directly
                if url.lower().endswith(('.mp3', '.wav', '.flac', '.ogg', '.m4a')):
                    final_audio_path = f"{base_out}.{url.split('.')[-1]}"
                    try:
                        _stream_url_to_file(url, final_audio_path)
                    except Exception as e:
                        return f"❌ Error downloading direct audio link: {str(e)}", gr.update()
                # Otherwise use yt-dlp for TikTok, Twitter, YouTube (if not blocked), etc.
                else:
                    final_audio_path = _extract_audio_with_ytdlp(url, base_out)

            client = get_client()
            # Read the sample while the temporary directory still exists.
            sample_path = Path(final_audio_path)
            sample_b64 = base64.b64encode(sample_path.read_bytes()).decode()
            sample_filename = sample_path.name

        langs = [l.strip() for l in (languages_str or "").split(",") if l.strip()] or ["en"]
        voice = client.audio.voices.create(
            name=name,
            sample_audio=sample_b64,
            sample_filename=sample_filename,
            languages=langs,
            gender=gender.lower(),
        )

        # Build new choices specifically for this user session: Official Voices + Their new clone
        new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
        # Fall back to the requested languages if the response carries none,
        # so an already-created voice is not reported as a failure.
        languages_text = ', '.join(voice.languages or langs)
        return (
            f"✅ Voice created!\n\n**Voice ID:** `{voice.id}`\n**Name:** {voice.name}\n**Languages:** {languages_text}\n\nThis voice has been automatically selected in the Text-to-Speech tab!",
            gr.update(choices=new_session_choices, value=voice.id)
        )
    except Exception as e:
        return f"❌ Error: {str(e)}", gr.update()


# ─── UI ───────────────────────────────────────────────────────────────────────
# Choices for the Speech → Text language dropdown, in display order.
# "Auto-detect" is the sentinel that transcribe_audio maps to "send no
# language"; the remaining entries are language codes passed through as-is.
_LANGUAGE_CODES = (
    "en", "fr", "es", "de", "it", "pt",
    "zh", "ja", "ko", "ar", "ru", "hi", "nl",
)
LANGUAGES = ["Auto-detect", *_LANGUAGE_CODES]

# Custom stylesheet handed to gr.Blocks(css=...): dark gradient background,
# glass-style panels, gradient buttons, styled tabs/inputs, and a rule that
# hides the default footer. The selectors other than the app's own classes
# (.app-header, .highlight-badge, .tabs-container, .status-text,
# .audio-component) target Gradio-generated markup — presumably tied to the
# installed Gradio version; verify after upgrades.
css = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;500;600;700;800&display=swap');

* { font-family: 'Outfit', sans-serif; }

body, .gradio-container {
    background: radial-gradient(circle at 10% 20%, #150f24 0%, #07040d 100%) !important;
    min-height: 100vh;
}

.gradio-container {
    max-width: 1050px !important;
    margin: 0 auto !important;
}

/* App Header */
.app-header {
    text-align: center;
    padding: 3.5rem 1rem 1.5rem;
    position: relative;
    z-index: 10;
}
.app-header h1 {
    font-size: 3.2rem;
    font-weight: 800;
    letter-spacing: -1.5px;
    background: linear-gradient(135deg, #c084fc 0%, #ec4899 50%, #facc15 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    margin-bottom: 0.5rem;
    animation: glow-pulse 3s infinite alternate;
}
.app-header p {
    color: #94a3b8;
    font-size: 1.25rem;
    font-weight: 500;
    margin-top: 0;
}
.highlight-badge {
    background: linear-gradient(135deg, #f59e0b, #ef4444);
    color: white;
    padding: 2px 8px;
    border-radius: 8px;
    font-size: 0.8rem;
    font-weight: 800;
    vertical-align: top;
    margin-left: 10px;
    box-shadow: 0 0 10px rgba(239, 68, 68, 0.6);
}

/* Glass panel wrapper */
div.tabs-container, .panel-box {
    background: rgba(255, 255, 255, 0.02) !important;
    border: 1px solid rgba(255, 255, 255, 0.05) !important;
    border-radius: 20px !important;
    box-shadow: 0 10px 40px 0 rgba(0, 0, 0, 0.4) !important;
    backdrop-filter: blur(15px) !important;
    -webkit-backdrop-filter: blur(15px) !important;
    overflow: hidden;
}

/* Tabs */
.tab-nav {
    border-bottom: 1px solid rgba(255,255,255,0.05) !important;
    padding: 10px 10px 0 10px !important;
}
.tab-nav button {
    background: transparent !important;
    border: none !important;
    border-bottom: 3px solid transparent !important;
    color: #64748b !important;
    border-radius: 0 !important;
    margin: 0 !important;
    padding: 1rem 2rem !important;
    font-weight: 600 !important;
    font-size: 1.05rem !important;
    transition: all 0.3s ease !important;
    box-shadow: none !important;
}
.tab-nav button.selected, .tab-nav button:hover {
    color: #f8fafc !important;
    border-bottom: 3px solid #ec4899 !important;
    box-shadow: 0 20px 20px -20px rgba(236,72,153,0.3) !important;
    background: linear-gradient(0deg, rgba(236,72,153,0.1) 0%, transparent 100%) !important;
}

/* Inputs & Textareas */
textarea, input[type="text"], .dropdown-menu {
    background: rgba(0,0,0,0.25) !important;
    border: 1px solid rgba(255,255,255,0.08) !important;
    border-radius: 14px !important;
    color: #f8fafc !important;
    font-size: 1.05rem !important;
    transition: all 0.2s ease !important;
    padding: 0.75rem !important;
}
textarea:focus, input[type="text"]:focus {
    border-color: #ec4899 !important;
    box-shadow: 0 0 0 3px rgba(236,72,153,0.2) !important;
    background: rgba(0,0,0,0.4) !important;
}

/* Override Gradio layout borders */
div.form {
    border: none !important;
    box-shadow: none !important;
    background: transparent !important;
}

/* Cool gradient buttons */
button.primary {
    background: linear-gradient(135deg, #a78bfa 0%, #ec4899 100%) !important;
    border: none !important;
    color: white !important;
    border-radius: 14px !important;
    font-weight: 700 !important;
    font-size: 1.15rem !important;
    padding: 0.9rem !important;
    letter-spacing: 0.5px !important;
    box-shadow: 0 4px 15px rgba(236,72,153,0.3) !important;
    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
}
button.primary:hover {
    transform: translateY(-3px) !important;
    box-shadow: 0 8px 25px rgba(236,72,153,0.5) !important;
}

/* Secondary Button */
button.secondary {
    background: rgba(255,255,255,0.05) !important;
    border: 1px solid rgba(255,255,255,0.1) !important;
    border-radius: 14px !important;
    color: #e2e8f0 !important;
    transition: all 0.2s ease !important;
    font-weight: 600 !important;
}
button.secondary:hover {
    background: rgba(255,255,255,0.15) !important;
    border-color: rgba(255,255,255,0.3) !important;
}

/* Status text box */
.status-text {
    background: rgba(0,0,0,0.4);
    padding: 1.5rem;
    border-radius: 16px;
    border-left: 5px solid #a78bfa;
    color: #e2e8f0;
    font-size: 1rem;
    line-height: 1.6;
}

/* Highlight labels */
label span {
    color: #cbd5e1 !important;
    font-weight: 500 !important;
    letter-spacing: 0.2px !important;
}

/* Clean audio components */
.audio-component {
    border-radius: 16px !important;
    overflow: hidden !important;
    border: 1px solid rgba(255,255,255,0.05) !important;
}

/* Global Animations */
@keyframes glow-pulse {
    0% { filter: drop-shadow(0 0 15px rgba(167,139,250,0.3)); }
    100% { filter: drop-shadow(0 0 30px rgba(236,72,153,0.6)); }
}

/* Footer Hide */
footer { display: none !important; }
"""

# Voice choices fetched once, at import time, to pre-populate the TTS dropdown.
# get_voice_choices() returns [] when the lookup fails (e.g. no API key), so
# the UI below still builds; the "Refresh List" button re-queries later.
INITIAL_VOICES = get_voice_choices()

# Build the three-tab Gradio UI (Speech → Text, Text → Speech, Voice Cloning)
# and wire each tab's button to its handler function defined above.
with gr.Blocks(title="Voxtral Studio — Mistral AI Audio", css=css) as demo:

    # Page header; styled by the .app-header / .highlight-badge rules in `css`.
    gr.HTML("""
    <div class="app-header">
        <h1>🎙️ Voxtral Studio <span class="highlight-badge">VOICE CLONING</span></h1>
        <p>Powered by Mistral AI · STT & Elite Text-to-Speech + Instant Zero-Shot Cloning</p>

    </div>
    """)

    with gr.Tabs():

        # ── TAB 1: Speech to Text ──────────────────────────────────────────
        # NOTE(review): unlike tabs 2 and 3, this TabItem has no
        # elem_classes=["tabs-container"] — confirm whether that is intended.
        with gr.TabItem("🎤 Speech → Text"):
            gr.Markdown("""
            **Upload or record audio** and Voxtral Mini will transcribe it with high accuracy.
            Supports 13 languages, handles noise, and can detect the language automatically.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    # type="filepath" → the handler receives a path string (or None).
                    stt_audio = gr.Audio(
                        label="Audio Input",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    stt_language = gr.Dropdown(
                        choices=LANGUAGES,
                        value="Auto-detect",
                        label="Language",
                    )
                    stt_btn = gr.Button("✨ Transcribe", variant="primary")

                with gr.Column(scale=1):
                    stt_output = gr.Textbox(
                        label="Transcription",
                        lines=12,
                        placeholder="Your transcribed text will appear here...",
                    )

            # transcribe_audio returns a single string (text, warning or error).
            stt_btn.click(
                fn=transcribe_audio,
                inputs=[stt_audio, stt_language],
                outputs=stt_output,
            )

        # ── TAB 2: Text to Speech ──────────────────────────────────────────
        with gr.TabItem("🔊 Text → Speech", elem_classes=["tabs-container"]):
            # NOTE(review): synthesize_speech rejects requests that have neither
            # a voice ID nor reference audio, so "Optionally" below may mislead
            # users — confirm the wording.
            gr.Markdown("""
            **Type text** and Voxtral Mini TTS converts it to natural speech.
            Optionally paste a **Voice ID** from the Voice Cloning tab to use your own cloned voice.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    tts_text = gr.Textbox(
                        label="Text to speak",
                        lines=8,
                        placeholder="Enter text here (max ~300 words for best results). Avoid markdown or special characters.",
                        value="Hello! Welcome to Voxtral Studio, powered by Mistral AI. This is a demonstration of high-quality neural text-to-speech synthesis.",
                    )
                    with gr.Row():
                        # Choices are (label, voice_id) pairs; the first fetched
                        # voice is preselected when any were fetched.
                        # allow_custom_value lets users type/paste an ID that is
                        # not in the list.
                        tts_voice_id = gr.Dropdown(
                            label="Select a Mistral Voice or Your Clones",
                            choices=INITIAL_VOICES,
                            value=INITIAL_VOICES[0][1] if INITIAL_VOICES else None,
                            allow_custom_value=True,
                            scale=3,
                        )
                        voices_btn = gr.Button("🔄 Refresh List", size="sm", scale=1)
                        
                    # NOTE(review): not used as an input or output by any event
                    # handler visible in this file.
                    voices_list_out = gr.Markdown(visible=False) # Hide text list since we use dropdown now
                    
                    tts_ref_audio = gr.Audio(
                        label="OR: Reference Audio (Set voice tone instantly)",
                        sources=["upload", "microphone"],
                        type="filepath",
                    )
                    tts_format = gr.Dropdown(
                        choices=["mp3", "wav", "flac", "opus"],
                        value="mp3",
                        label="Audio Format",
                    )
                    tts_btn = gr.Button("🎵 Generate Speech", variant="primary")

                with gr.Column(scale=1):
                    tts_audio_out = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    tts_status = gr.Markdown(elem_classes=["status-text"])

            # synthesize_speech returns (audio file path or None, status text).
            tts_btn.click(
                fn=synthesize_speech,
                inputs=[tts_text, tts_voice_id, tts_ref_audio, tts_format],
                outputs=[tts_audio_out, tts_status],
            )
            # Re-query the API and replace the dropdown's choices only; no new
            # value is sent. The refreshed list comes from get_voice_choices(),
            # which keeps official voices only.
            voices_btn.click(
                fn=lambda: gr.update(choices=get_voice_choices()),
                inputs=[],
                outputs=tts_voice_id,
            )

        # ── TAB 3: Voice Cloning ───────────────────────────────────────────
        with gr.TabItem("🧬 Voice Cloning", elem_classes=["tabs-container"]):
            # NOTE(review): clone_voice already selects the new voice in the TTS
            # dropdown, so the "copy and paste" instruction below may be
            # outdated — confirm the wording.
            gr.Markdown("""
            **Clone any voice** by uploading a short audio sample (10–60 seconds recommended).
            The model will save it as a reusable voice. Copy the Voice ID and paste it in the TTS tab.

            > ⚠️ Only clone voices with **explicit consent**. Do not impersonate real people.
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    clone_audio = gr.Audio(
                        label="Voice Sample (upload or record)",
                        sources=["microphone", "upload"],
                        type="filepath",
                        elem_classes=["audio-component"],
                    )
                    # When filled in, clone_voice uses this URL instead of the
                    # uploaded sample.
                    clone_url = gr.Textbox(
                        label="OR: Media URL (TikTok, Twitter, or direct .MP3/.WAV link)",
                        placeholder="https://...link_to_audio_or_video...",
                    )
                    clone_name = gr.Textbox(
                        label="Voice Name",
                        placeholder="e.g. my-assistant-voice",
                    )
                    clone_gender = gr.Dropdown(
                        choices=["Female", "Male"],
                        value="Female",
                        label="Gender",
                    )
                    clone_langs = gr.Textbox(
                        label="Languages (comma-separated)",
                        value="en",
                        placeholder="en, fr, es",
                    )
                    clone_btn = gr.Button("🧬 Clone Voice", variant="primary")

                with gr.Column(scale=1):
                    clone_result = gr.Markdown(
                        value="Your new Voice ID will appear here after cloning.",
                        elem_classes=["status-text"],
                    )

            # clone_voice returns (status markdown, dropdown update); the second
            # output targets the TTS tab's voice dropdown so a successful clone
            # is added to its choices and selected there.
            clone_btn.click(
                fn=clone_voice,
                inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
                outputs=[clone_result, tts_voice_id],
            )

    # Custom footer with attribution links.
    gr.HTML("""
    <div style="text-align:center; padding: 1.5rem; color: #475569; font-size: 0.85rem;">
        Built with <a href="https://docs.mistral.ai/capabilities/audio/" target="_blank" style="color:#a78bfa;">Mistral Voxtral</a>
        · <a href="https://huggingface.co/" target="_blank" style="color:#60a5fa;">Hugging Face Spaces</a>
    </div>
    """)


if __name__ == "__main__":
    # When run as a script, serve the app on all network interfaces
    # (0.0.0.0) at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)