"""2nd-Host AI — generate a two-host video podcast from a script.

Pipeline: parse a "HOST 1:/HOST 2:" script, synthesize each segment with
NeuTTS-Air voice cloning, concatenate the segments into one WAV, then
(placeholder) lip-sync the result onto the HOST 1 reference video.
Served through a Gradio UI.
"""

import os
import re  # noqa: F401  (kept from original; may be used elsewhere)
import subprocess  # noqa: F401  (reserved for the real lip-sync pipeline)
import tempfile

import gradio as gr
import soundfile as sf
from pydub import AudioSegment

# BUG FIX: the original `import NeuTTS_Air_q4_GGUF` never brought `NeuTTSAir`
# into scope, so the constructor call below raised NameError at startup.
# The class is published by Neuphonic in the `neuttsair` package —
# TODO confirm this import path matches the installed version.
from neuttsair.neutts import NeuTTSAir

# Load NeuTTS-Air (Q4-GGUF backbone — the original comment said Q8 by mistake).
print("🧠 Loading NeuTTS-Air (Q4-GGUF)...")
tts = NeuTTSAir(
    backbone_repo="neuphonic/neutts-air-q4-gguf",
    backbone_device="cpu",
    codec_repo="neuphonic/neucodec",
    codec_device="cpu",
)

SAMPLE_RATE = 24_000  # sample rate the original code wrote WAVs at


def parse_host_script(script):
    """Split *script* into (speaker_tag, text) pairs.

    Lines starting with "HOST 1:" / "HOST 2:" are attributed accordingly;
    any other non-empty line defaults to HOST 1. Blank lines are dropped.
    """
    segments = []
    for raw in script.strip().split("\n"):
        line = raw.strip()
        if not line:
            continue
        for tag in ("HOST 1", "HOST 2"):
            prefix = tag + ":"
            if line.startswith(prefix):
                segments.append((tag, line[len(prefix):].strip()))
                break
        else:
            # No recognized speaker prefix — attribute to HOST 1.
            segments.append(("HOST 1", line))
    return segments


def generate_audio_from_script(script, ref1_wav, ref1_txt, ref2_wav, ref2_txt):
    """Synthesize every script segment and return the path of the joined WAV.

    Fixes vs. original:
    - HOST 2 is optional in the UI; if no HOST 2 reference was supplied,
      fall back to HOST 1's voice instead of calling encode_reference(None).
    - Each reference voice is encoded once and cached, not once per segment.
    - Output goes to a fresh temp directory instead of fixed /tmp paths,
      which collided across concurrent requests and broke on Windows.
    """
    segments = parse_host_script(script)
    ref_map = {"HOST 1": (ref1_wav, ref1_txt)}
    ref_map["HOST 2"] = (ref2_wav, ref2_txt) if ref2_wav else ref_map["HOST 1"]

    work_dir = tempfile.mkdtemp(prefix="podcast_")
    codes_cache = {}  # reference path -> encoded reference codes
    segment_paths = []
    for i, (tag, text) in enumerate(segments):
        ref_wav, ref_text = ref_map.get(tag, ref_map["HOST 1"])
        if ref_wav not in codes_cache:
            codes_cache[ref_wav] = tts.encode_reference(ref_wav)
        wav = tts.infer(text, codes_cache[ref_wav], ref_text)
        out_path = os.path.join(work_dir, f"seg_{i}.wav")
        sf.write(out_path, wav, SAMPLE_RATE)
        segment_paths.append(out_path)

    # Concatenate all per-segment WAVs into one file.
    combined = AudioSegment.empty()
    for path in segment_paths:
        combined += AudioSegment.from_wav(path)
    final_path = os.path.join(work_dir, "script_audio.wav")
    combined.export(final_path, format="wav")
    return final_path


def generate_video_with_lipsync(audio_path, ref_video_path):
    """Lip-sync *audio_path* onto *ref_video_path* (placeholder).

    TODO: plug in the real lip-sync pipeline (e.g. SadTalker / Wav2Lip via
    subprocess.run([...], shell=False)). Until then the reference video is
    returned unchanged.
    """
    # BUG FIX: the original returned the undefined name `ref_video`
    # (the parameter is `ref_video_path`), raising NameError on every call.
    return ref_video_path


def generate_video_podcast(script, ref1_video, ref1_transcript,
                           ref2_video=None, ref2_transcript=None):
    """Gradio callback: script + reference videos -> (video, audio, status).

    NOTE(review): the reference *video* file paths are forwarded straight to
    `tts.encode_reference`, which presumably expects an audio reference —
    confirm it can read the audio track of a video container, or extract the
    audio (e.g. with pydub/ffmpeg) before encoding.
    """
    audio_path = generate_audio_from_script(
        script, ref1_video, ref1_transcript, ref2_video, ref2_transcript
    )
    video_path = generate_video_with_lipsync(audio_path, ref1_video)
    return video_path, audio_path, "✅ Video podcast generated!"


# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks(title="2nd-Host AI - Video Podcast Generator") as demo:
    gr.Markdown("# 🎥 2nd-Host AI — Video Podcast Generator")
    gr.Markdown("Upload reference videos + transcripts. Enter script. Get video with lip-sync.")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### HOST 1 Reference")
            ref1_video = gr.Video(label="Reference Video (15s, face)", sources=["upload"])
            ref1_txt = gr.Textbox(label="Transcript", placeholder="What they said in video")
        with gr.Column():
            gr.Markdown("### HOST 2 Reference (Optional)")
            ref2_video = gr.Video(label="Reference Video (15s, face)", sources=["upload"])
            ref2_txt = gr.Textbox(label="Transcript", placeholder="What they said in video")
    script = gr.Textbox(label="Script (HOST 1: / HOST 2:)", lines=8)
    btn = gr.Button("Generate Video Podcast")
    video_out = gr.Video(label="Generated Video")
    audio_out = gr.Audio(label="Generated Audio")
    status = gr.Textbox(label="Status")
    btn.click(
        generate_video_podcast,
        inputs=[script, ref1_video, ref1_txt, ref2_video, ref2_txt],
        outputs=[video_out, audio_out, status],
    )

# Guard the launch so importing this module (e.g. for testing) does not
# start the web server as a side effect.
if __name__ == "__main__":
    demo.launch()