import base64
import json
import os
import subprocess
import wave

import gradio as gr
import requests

# Gemini TTS configuration. The key is read once at import time; without it
# every TTS request will fail with an HTTP error.
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    print("GEMINI_API_KEY not found in environment; TTS requests will fail.")
API_URL = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    f"gemini-2.5-flash-preview-tts:generateContent?key={API_KEY}"
)


def _parse_sample_rate(mime_type, default=24000):
    """Extract the PCM sample rate from a mime type string.

    The API returns e.g. "audio/L16;codec=pcm;rate=24000". Parse each
    ';'-separated parameter instead of assuming "rate=" is the last one.
    Falls back to *default* when the parameter is absent or malformed.
    """
    for part in mime_type.split(";"):
        part = part.strip()
        if part.startswith("rate="):
            try:
                return int(part[len("rate="):])
            except ValueError:
                break
    return default


def generate_tts(text_to_speak, output_file="output.wav"):
    """Synthesize *text_to_speak* with the Gemini TTS API into a WAV file.

    Writes 16-bit mono PCM to *output_file* and returns that path.
    Raises requests.HTTPError on an API failure and KeyError if the
    response is missing the expected inline audio payload.
    """
    payload = {
        "contents": [{"parts": [{"text": text_to_speak}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": "Kore"}}
            },
        },
        "model": "gemini-2.5-flash-preview-tts",
    }
    # json= lets requests serialize the body and set Content-Type for us.
    response = requests.post(API_URL, json=payload, timeout=120)
    # Fail loudly on HTTP errors instead of a cryptic KeyError below.
    response.raise_for_status()
    result = response.json()

    audio_part = result["candidates"][0]["content"]["parts"][0]
    base64_audio_data = audio_part["inlineData"]["data"]
    mime_type = audio_part["inlineData"]["mimeType"]
    sample_rate = _parse_sample_rate(mime_type)

    pcm_data = base64.b64decode(base64_audio_data)
    with wave.open(output_file, "wb") as wf:
        wf.setnchannels(1)  # mono
        wf.setsampwidth(2)  # 16-bit samples (2 bytes)
        wf.setframerate(sample_rate)
        wf.writeframes(pcm_data)
    return output_file


def preprocess_video(input_path, output_path="processed_face.mp4"):
    """Standardize input video so Wav2Lip can process it reliably.

    - Resizes to 640x640
    - Sets FPS to 25
    - Forces compatible codec/pixel format

    Returns *output_path*. Raises subprocess.CalledProcessError if
    ffmpeg exits non-zero.
    """
    cmd = [
        "ffmpeg", "-y",
        "-i", input_path,
        "-vf", "scale=640:640,fps=25",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        output_path,
    ]
    subprocess.run(cmd, check=True)
    return output_path


def make_talking_video(video_file, text):
    """Full pipeline: text -> Gemini TTS audio -> Wav2Lip lip-synced video.

    Returns the path of the rendered video. Raises on any stage failure
    (HTTP error from TTS, non-zero ffmpeg or Wav2Lip exit).
    """
    # Step 1: Generate TTS audio
    audio_path = "output.wav"
    generate_tts(text, audio_path)

    # Step 2: Preprocess input video
    preprocessed_video = preprocess_video(video_file)

    # Step 3: Run Wav2Lip inference
    output_video = "result_voice.mp4"
    command = [
        "python", "-m", "Wav2Lip.inference",
        "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
        "--face", preprocessed_video,
        "--audio", audio_path,
        "--outfile", output_video,
    ]
    subprocess.run(command, check=True)
    return output_video


with gr.Blocks() as demo:
    gr.Markdown("## 🎥 Text-to-Speech Avatar with Wav2Lip + Gemini TTS")
    with gr.Row():
        video_input = gr.Video(label="Upload 30s Video")
        text_input = gr.Textbox(
            label="Enter Prompt", placeholder="What should the avatar say?"
        )
    output_video = gr.Video(label="Result", autoplay=True)
    submit_btn = gr.Button("Generate Talking Video")
    submit_btn.click(
        make_talking_video,
        inputs=[video_input, text_input],
        outputs=output_video,
    )

if __name__ == "__main__":
    demo.launch()