import os

# Pin OpenMP to one thread BEFORE any numeric library is imported;
# avoids CPU thread oversubscription on small shared instances (e.g. a T4 Space).
os.environ["OMP_NUM_THREADS"] = "1"

import subprocess
import sys
from datetime import datetime
from pathlib import Path

import gradio as gr
from huggingface_hub import snapshot_download

ROOT = Path(__file__).parent.resolve()
REPO_DIR = ROOT / "LatentSync"
CKPT_DIR = REPO_DIR / "checkpoints"
TEMP_DIR = REPO_DIR / "temp"

# Use 1.5 on T4 16GB
HF_CKPT_REPO = "ByteDance/LatentSync-1.5"

# Run-once guard: setup() is invoked from generate() on every click, but the
# clone / checkpoint download only needs to happen once per process.
_SETUP_DONE = False


def run(cmd, cwd=None):
    """Echo *cmd* (space-joined) and execute it.

    Raises subprocess.CalledProcessError on a non-zero exit status.
    """
    print(" ".join(map(str, cmd)))
    subprocess.check_call(cmd, cwd=cwd)


def create_mask_file():
    """Create the missing mask.png file"""
    mask_dir = REPO_DIR / "latentsync" / "utils"
    mask_path = mask_dir / "mask.png"
    if mask_path.exists():
        return
    mask_dir.mkdir(parents=True, exist_ok=True)
    # Create mask using numpy and PIL
    try:
        import numpy as np
        from PIL import Image

        # 256x256 mask (white = inpaint mouth area, black = keep).
        # Ellipse for the mouth region (lower face):
        #   ((x-cx)/rx)^2 + ((y-cy)/ry)^2 <= 1
        # Vectorized with ogrid — same pixels as the original per-pixel loop,
        # but one C-level pass instead of 65k Python iterations.
        center_x, center_y = 128, 180
        yy, xx = np.ogrid[:256, :256]
        inside = ((xx - center_x) / 90) ** 2 + ((yy - center_y) / 64) ** 2 <= 1
        mask = np.where(inside, 255, 0).astype(np.uint8)
        Image.fromarray(mask, mode='L').save(str(mask_path))
        print(f"✓ Created mask at {mask_path}")
    except Exception as e:
        # Best-effort: log and continue; inference may still work without it.
        print(f"Warning: Could not create mask: {e}")


def setup():
    """Clone the LatentSync repo, create dirs/mask, and download checkpoints.

    Idempotent and cheap after the first successful call (module-level guard).
    """
    global _SETUP_DONE
    if _SETUP_DONE:
        return
    if not REPO_DIR.exists():
        print("Cloning LatentSync repository...")
        run(["git", "clone", "--depth", "1",
             "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])
    CKPT_DIR.mkdir(parents=True, exist_ok=True)
    TEMP_DIR.mkdir(parents=True, exist_ok=True)
    # Create mask file before running inference
    create_mask_file()
    # Download checkpoints
    print("Downloading model checkpoints...")
    snapshot_download(
        repo_id=HF_CKPT_REPO,
        local_dir=str(CKPT_DIR),
        # NOTE: deprecated (ignored) on recent huggingface_hub; kept so older
        # versions still materialize real files instead of cache symlinks.
        local_dir_use_symlinks=False,
    )
    print("✓ Setup complete")
    _SETUP_DONE = True


def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
    """Convert static image + audio to video.

    Loops the image for the duration of the audio, center-crops to 256x256,
    and returns the path of the resulting H.264/AAC mp4 in TEMP_DIR.
    """
    out_path = TEMP_DIR / f"still_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
    cmd = [
        "ffmpeg", "-y",
        "-loop", "1",          # repeat the single image as video frames
        "-i", image_path,
        "-i", audio_path,
        "-shortest",           # stop when the audio ends
        "-r", str(fps),
        "-vf", "scale=256:256:force_original_aspect_ratio=increase,crop=256:256",
        "-pix_fmt", "yuv420p",
        "-c:v", "libx264",
        "-c:a", "aac",
        str(out_path),
    ]
    run(cmd)
    return str(out_path)


def generate(avatar_img, audio_wav, steps, guidance, seed, use_deepcache):
    """Gradio callback: produce a lip-synced video from an image + audio.

    Returns a (video_path_or_None, status_message) tuple for the UI outputs.
    """
    try:
        setup()
        if avatar_img is None:
            return None, "❌ Please upload an avatar image!"
        if audio_wav is None:
            return None, "❌ Please upload an audio file!"

        img_path = str(Path(avatar_img).resolve())
        wav_path = str(Path(audio_wav).resolve())

        # Create video from image + audio
        print("Creating input video...")
        video_path = make_still_video(img_path, wav_path, fps=25)

        out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

        # Fixed config path for LatentSync 1.5.
        # sys.executable (not bare "python") guarantees the subprocess uses the
        # same interpreter/venv as this app.
        cmd = [
            sys.executable, "-m", "scripts.inference",
            "--unet_config_path", "configs/unet/stage2.yaml",
            "--inference_ckpt_path", "checkpoints/latentsync_unet.pt",
            "--video_path", video_path,
            "--audio_path", wav_path,
            "--video_out_path", str(out_path),
            "--inference_steps", str(int(steps)),
            "--guidance_scale", str(float(guidance)),
            "--seed", str(int(seed)),
            "--temp_dir", "temp",
        ]
        if use_deepcache:
            cmd.append("--enable_deepcache")

        print("Generating lip-synced video...")
        run(cmd, cwd=str(REPO_DIR))

        if out_path.exists():
            return str(out_path), "✅ Video generated successfully!"
        else:
            return None, "❌ Video generation failed - output file not created"

    except subprocess.CalledProcessError as e:
        error_msg = f"❌ Command failed with return code {e.returncode}"
        return None, error_msg
    except Exception as e:
        return None, f"❌ Error: {str(e)}"


# Gradio Interface - Compatible with Gradio 4.44.1
with gr.Blocks(title="LatentSync Lip Sync") as demo:
    gr.Markdown(
        """
        # 🎬 LatentSync 1.5 - AI Lip Sync Generator

        Upload an avatar image and audio file to generate a lip-synced video!

        **Tips:**
        - Use clear frontal face images for best results
        - Keep audio under 30 seconds for faster processing
        - Higher inference steps = better quality but slower
        """
    )

    with gr.Row():
        with gr.Column():
            avatar = gr.Image(
                type="filepath",
                label="📷 Avatar Image (JPG/PNG)"
            )
            audio = gr.Audio(
                type="filepath",
                label="🎵 Audio File (WAV)"
            )

        with gr.Column():
            gr.Markdown("### ⚙️ Generation Settings")
            steps = gr.Slider(
                10, 40, value=20, step=1,
                label="Inference Steps (Higher = Better Quality)"
            )
            guidance = gr.Slider(
                0.8, 2.0, value=1.0, step=0.1,
                label="Guidance Scale (Higher = Stronger Lip Sync)"
            )
            seed = gr.Number(
                value=1247, precision=0,
                label="Seed (For Reproducibility)"
            )
            deepcache = gr.Checkbox(
                value=True,
                label="Enable DeepCache (Faster - Recommended for T4)"
            )

    btn = gr.Button("🚀 Generate Lip-Synced Video", variant="primary")
    status = gr.Textbox(label="Status", interactive=False)
    out = gr.Video(label="Generated Video")

    btn.click(
        generate,
        inputs=[avatar, audio, steps, guidance, seed, deepcache],
        outputs=[out, status]
    )

    gr.Markdown(
        """
        ---
        ### 📝 Notes:
        - First run will download models (~7GB) - this may take a few minutes
        - Generation takes 30-90 seconds depending on settings
        - Works best with T4 GPU (16GB)
        - Based on [LatentSync by ByteDance](https://github.com/bytedance/LatentSync)
        """
    )

if __name__ == "__main__":
    demo.queue(max_size=3)
    demo.launch()