import os
import gradio as gr
import asyncio
import tempfile
import edge_tts
import requests
from langdetect import detect, LangDetectException
from transformers import pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer

# ----------------------------
# 1. SPEECH TO TEXT (Whisper)
# ----------------------------
stt_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small")


def transcribe(audio):
    """Transcribe an audio file to text with Whisper.

    Args:
        audio: Path to the recorded audio file, or None when nothing
            was recorded.

    Returns:
        The transcribed text, or None when no audio was provided.
    """
    if audio is None:
        return None
    result = stt_pipeline(audio)
    return result["text"]


# ----------------------------
# 2. TRANSLATION (M2M100)
# ----------------------------
m2m_model_name = "facebook/m2m100_418M"
m2m_tokenizer = M2M100Tokenizer.from_pretrained(m2m_model_name)
m2m_model = M2M100ForConditionalGeneration.from_pretrained(m2m_model_name)

# UI language label -> M2M100 language code.
LANG_UI_TO_CODE = {"English": "en", "Spanish": "es", "French": "fr"}


def translate_text(user_text, target_lang_ui):
    """Translate *user_text* into the UI-selected target language via M2M100.

    Args:
        user_text: Source text in any language (auto-detected).
        target_lang_ui: UI label of the target language (e.g. "Spanish");
            unknown labels fall back to English.

    Returns:
        The translated string; the original text unchanged when source and
        target languages match; "" for blank input.
    """
    if not user_text.strip():
        return ""
    target_code = LANG_UI_TO_CODE.get(target_lang_ui, "en")
    try:
        src_code = detect(user_text)
    except LangDetectException:
        # detect() raises on very short / ambiguous input; assume English.
        src_code = "en"
    # BUG FIX: langdetect can emit region-tagged codes (e.g. "zh-cn",
    # "pt-br") that M2M100 does not know, which would make get_lang_id /
    # src_lang blow up. Fall back to the bare language, then to English.
    if src_code not in m2m_tokenizer.lang_code_to_id:
        src_code = src_code.split("-")[0]
        if src_code not in m2m_tokenizer.lang_code_to_id:
            src_code = "en"
    if src_code == target_code:
        return user_text
    m2m_tokenizer.src_lang = src_code
    encoded = m2m_tokenizer(user_text, return_tensors="pt")
    generated = m2m_model.generate(
        **encoded, forced_bos_token_id=m2m_tokenizer.get_lang_id(target_code)
    )
    return m2m_tokenizer.decode(generated[0], skip_special_tokens=True)


# ----------------------------
# 3.
# EMOTION DETECTION (Groq API)
# ----------------------------
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
API_URL = "https://api.groq.ai/v1/text/analyze"


def detect_emotion_tone(text):
    """Classify the dominant emotion of *text* via the Groq analyze endpoint.

    Args:
        text: Text to analyze.

    Returns:
        The highest-scoring emotion label reported by the API, or
        "neutral" for blank input, request failures, or an empty result.
    """
    if not text.strip():
        return "neutral"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {"text": text, "features": ["emotion"]}
    try:
        # BUG FIX: a timeout so a stalled API call cannot hang the whole
        # pipeline indefinitely (requests has no default timeout).
        r = requests.post(API_URL, headers=headers, json=payload, timeout=15)
        r.raise_for_status()
        result = r.json()
        emotions = result.get("emotion", {})
        if not emotions:
            return "neutral"
        # Pick the label with the highest score.
        return max(emotions, key=emotions.get)
    except Exception:
        # Emotion detection is best-effort; degrade to neutral on any failure.
        return "neutral"


# ----------------------------
# 4. TEXT TO SPEECH (Edge TTS)
# ----------------------------
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to an MP3 file with Edge TTS.

    Args:
        text: Text to speak.
        voice: Voice label, either a bare short name ("en-US-AriaNeural")
            or "ShortName - Locale"; only the short name is used.
        rate: Speaking-rate delta in percent (coerced to int).
        pitch: Pitch delta in Hz (coerced to int).

    Returns:
        Path to the generated MP3 file, or None for blank input.
    """
    if not text.strip():
        return None
    voice_short_name = voice.split(" - ")[0]
    # BUG FIX: coerce to int — Gradio sliders deliver floats, and the
    # "+d" format spec raises ValueError on a float.
    communicate = edge_tts.Communicate(
        text,
        voice_short_name,
        rate=f"{int(rate):+d}%",
        pitch=f"{int(pitch):+d}Hz",
    )
    # BUG FIX: create the temp file, then close the handle BEFORE edge_tts
    # writes to the path — on Windows an open NamedTemporaryFile cannot be
    # reopened by another writer.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tmp.close()
    await communicate.save(tmp.name)
    return tmp.name


def tts_sync(text, voice, rate, pitch):
    """Blocking wrapper around text_to_speech for non-async callers."""
    return asyncio.run(text_to_speech(text, voice, rate, pitch))


# ----------------------------
# 5.
# PIPELINE FUNCTION
# ----------------------------
# Detected emotion -> preferred Edge TTS voice short name.
EMOTION_TO_VOICE = {
    "happy": "en-US-AriaNeural",
    "sad": "en-US-JennyNeural",
    "angry": "en-US-GuyNeural",
}


async def full_pipeline(audio, target_lang):
    """Run the full STT -> translate -> emotion -> TTS pipeline.

    Args:
        audio: Path to the recorded audio file, or None.
        target_lang: UI label of the target language.

    Returns:
        Path to the synthesized MP3 file, or None when transcription
        produced no text.
    """
    # Step 1: Speech-to-text.
    text = transcribe(audio)
    if not text:
        return None
    # Step 2: Translate into the requested language.
    translated = translate_text(text, target_lang)
    # Step 3: Emotion is detected on the ORIGINAL (pre-translation) text.
    emotion = detect_emotion_tone(text)
    # Step 4: Pick a voice for the detected emotion.
    voices = await edge_tts.list_voices()
    wanted = EMOTION_TO_VOICE.get(emotion)
    # BUG FIX: the original indexed a filtered list that could be empty
    # (voice not present in list_voices()), raising IndexError. Fall back
    # to the first available voice in that case.
    voice_choice = [v for v in voices if wanted and wanted in v["ShortName"]]
    if not voice_choice:
        voice_choice = [voices[0]]
    voice_final = f"{voice_choice[0]['ShortName']} - {voice_choice[0]['Locale']}"
    # Step 5: Generate the final audio.
    return await text_to_speech(translated, voice_final, 0, 0)


# ----------------------------
# 6. GRADIO UI
# ----------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎤 Speech Translator with Emotions")
    with gr.Row():
        audio_in = gr.Audio(sources=["microphone"], type="filepath", label="Record Speech")
        target_lang = gr.Dropdown(
            choices=["English", "Spanish", "French"],
            value="English",
            label="Target Language",
        )
    final_speech = gr.Audio(label="🔊 Final Speech", type="filepath")
    run_btn = gr.Button("🚀 Translate & Speak")
    # Gradio supports async callbacks, so full_pipeline is passed directly.
    run_btn.click(fn=full_pipeline, inputs=[audio_in, target_lang], outputs=[final_speech])

if __name__ == "__main__":
    demo.launch()