import os
import uuid
import shutil
import tempfile
import gradio as gr
import torch
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from diffusers import AutoencoderKL, DDIMScheduler
from latentsync.models.unet import UNet3DConditionModel
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
from latentsync.whisper.audio2feature import Audio2Feature
from accelerate.utils import set_seed
# Attribution metadata; interpolated into the UI header and footer below.
AUTHOR = "Lý Trần"
COMMUNITY = "LTTEAM"
COMMUNITY_LINK = "https://www.facebook.com/groups/622526090937760"
# Hugging Face Hub repository whose snapshot is downloaded into ./checkpoints.
REPO_ID = "LTTEAM/Nhep_Mieng"
# Import-time side effect: make sure the checkpoint directory exists, then
# download the repository snapshot into it. main() later reads files such as
# checkpoints/latentsync_unet.pt from this directory.
os.makedirs("checkpoints", exist_ok=True)
snapshot_download(
    repo_id=REPO_ID,
    local_dir="./checkpoints"
)
def process_video(input_video_path: str, temp_dir: str = "temp_video") -> str:
    """Re-encode a video into ``temp_dir``, keeping at most its first 10 seconds.

    Args:
        input_video_path: Path of the source video file.
        temp_dir: Directory that receives the re-encoded copy; created when
            it does not exist yet.

    Returns:
        Path of the written file: ``<temp_dir>/crop_<source file name>``.
    """
    os.makedirs(temp_dir, exist_ok=True)
    output_path = os.path.join(
        temp_dir, f"crop_{os.path.basename(input_video_path)}"
    )
    video = VideoFileClip(input_video_path)
    try:
        # Only trim when the source is longer than the 10-second cap.
        clip = video.subclip(0, 10) if video.duration > 10 else video
        clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
    finally:
        # Release the file/ffmpeg resources held by the source clip, even
        # when encoding fails; previously the clip was never closed.
        video.close()
    return output_path
def process_audio(input_audio_path: str, temp_dir: str) -> str:
    """Write the audio to ``<temp_dir>/trim_audio.wav``, capped at 8 seconds.

    Args:
        input_audio_path: Path of the source audio file.
        temp_dir: Directory that receives the WAV file; created when it does
            not exist yet.

    Returns:
        Path of the exported WAV file.
    """
    os.makedirs(temp_dir, exist_ok=True)
    # pydub measures and slices segments in milliseconds.
    limit_ms = 8 * 1000
    segment = AudioSegment.from_file(input_audio_path)
    trimmed = segment[:limit_ms] if len(segment) > limit_ms else segment
    wav_path = os.path.join(temp_dir, "trim_audio.wav")
    trimmed.export(wav_path, format="wav")
    return wav_path
def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
    """Lip-sync ``video_path`` to ``audio_path`` with the LatentSync pipeline.

    Each call loads the config, scheduler, Whisper audio encoder, VAE and
    UNet, runs the pipeline, and writes the result to a uniquely named
    ``output_<hex>.mp4`` in the current working directory (a matching
    ``output_<hex>_mask.mp4`` path is passed to the pipeline as well).

    Args:
        video_path: Path of the input video.
        audio_path: Path of the driving audio.
        progress: Gradio progress tracker. It is not referenced in the body;
            presumably it is declared so Gradio mirrors tqdm progress in the
            UI -- confirm against the Gradio version in use.

    Returns:
        Path of the generated video file.

    Raises:
        gr.Error: If either input is missing.
        NotImplementedError: If ``config.model.cross_attention_dim`` is
            neither 768 nor 384.
    """
    # Fail early with a readable UI error instead of crashing deep inside
    # moviepy or the pipeline when an input was left empty.
    if not video_path or not audio_path:
        raise gr.Error("Vui lòng cung cấp cả video và âm thanh đầu vào.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Half precision on GPU, full precision on CPU; shared by VAE, UNet and
    # the pipeline call so the three cannot drift apart.
    weight_dtype = torch.float16 if device == "cuda" else torch.float32
    print(f"[INFO] Chạy trên device: {device}")
    space_id = os.environ.get("SPACE_ID", "")
    is_shared_ui = "fffiloni/LatentSync" in space_id

    # On the shared UI, inputs are copied into a scratch directory and
    # shortened before inference.
    temp_dir = tempfile.mkdtemp() if is_shared_ui else None
    try:
        if temp_dir is not None:
            video_path = process_video(video_path, temp_dir)
            audio_path = process_audio(audio_path, temp_dir)

        # Load configuration and checkpoints.
        config = OmegaConf.load("configs/unet/second_stage.yaml")
        unet_ckpt = "checkpoints/latentsync_unet.pt"
        scheduler = DDIMScheduler.from_pretrained("configs")

        # Pick the Whisper checkpoint that matches cross_attention_dim.
        dim = config.model.cross_attention_dim
        if dim == 768:
            whisper_ckpt = "checkpoints/whisper/small.pt"
        elif dim == 384:
            whisper_ckpt = "checkpoints/whisper/tiny.pt"
        else:
            raise NotImplementedError("cross_attention_dim phải là 768 hoặc 384")

        # Build the audio encoder.
        audio_encoder = Audio2Feature(
            model_path=whisper_ckpt,
            device=device,
            num_frames=config.data.num_frames,
        )

        # Load the VAE.
        vae = AutoencoderKL.from_pretrained(
            "stabilityai/sd-vae-ft-mse",
            torch_dtype=weight_dtype,
        )
        vae.config.scaling_factor = 0.18215
        vae.config.shift_factor = 0

        # Load the UNet and cast it to the dtype chosen above.
        unet, _ = UNet3DConditionModel.from_pretrained(
            OmegaConf.to_container(config.model),
            unet_ckpt,
            device=device,
        )
        unet = unet.to(dtype=weight_dtype)

        # Assemble the pipeline and move it to the device.
        pipeline = LipsyncPipeline(
            vae=vae,
            audio_encoder=audio_encoder,
            unet=unet,
            scheduler=scheduler,
        ).to(device)

        # Seeding: -1 means "no fixed seed", so torch picks a fresh one.
        seed = -1
        if seed != -1:
            set_seed(seed)
        else:
            torch.seed()
        print(f"[INFO] Seed khởi tạo: {torch.initial_seed()}")

        # Run the pipeline; the uuid keeps concurrent requests from
        # overwriting each other's output files.
        output_id = uuid.uuid4().hex
        result_path = f"output_{output_id}.mp4"
        pipeline(
            video_path=video_path,
            audio_path=audio_path,
            video_out_path=result_path,
            video_mask_path=f"output_{output_id}_mask.mp4",
            num_frames=config.data.num_frames,
            num_inference_steps=config.run.inference_steps,
            guidance_scale=1.0,
            weight_dtype=weight_dtype,
            width=config.data.resolution,
            height=config.data.resolution,
        )
    finally:
        # Remove the scratch directory on failure as well as on success.
        # Best-effort: a cleanup problem must not mask the real error or
        # discard a finished result.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)

    return result_path
# Stylesheet handed to gr.Blocks(css=...) below: colour variables, the
# centred #main-container card, button/box styling, footer and example area.
custom_css = """
:root {
    --primary: #4CAF50;
    --secondary: #8BC34A;
    --accent: #FFC107;
    --dark: #1E1E1E;
    --light: #F5F5F5;
}

body {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    background-color: var(--light);
}

div#main-container {
    margin: 0 auto;
    max-width: 900px;
    background: white;
    padding: 2rem;
    border-radius: 12px;
    box-shadow: 0 4px 12px rgba(0,0,0,0.1);
}

h1 {
    color: var(--primary);
    border-bottom: 2px solid var(--secondary);
    padding-bottom: 0.5rem;
}

.gr-button {
    background: var(--primary) !important;
    color: white !important;
    border: none !important;
    padding: 0.75rem 1.5rem !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
    transition: all 0.3s ease !important;
}

.gr-button:hover {
    background: var(--secondary) !important;
    transform: translateY(-2px) !important;
    box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important;
}

.gr-box {
    border-radius: 8px !important;
    border: 1px solid #e0e0e0 !important;
}

footer {
    text-align: center;
    margin-top: 2rem;
    color: #666;
    font-size: 0.9rem;
}

.example-container {
    background: #f9f9f9;
    padding: 1rem;
    border-radius: 8px;
    margin-top: 1rem;
}
"""

# Build the Gradio UI. Components and event handlers must be created inside
# the Blocks context; the server is started only after the context closes.
with gr.Blocks(css=custom_css, title="LatentSync - Đồng bộ môi bằng AI") as demo:
    with gr.Column(elem_id="main-container"):
        # Header
        gr.Markdown("# 🎤 LatentSync - Đồng bộ môi bằng AI")
        gr.Markdown(f"**Tác giả:** {AUTHOR} | **Cộng đồng:** [{COMMUNITY}]({COMMUNITY_LINK})")
        
        # Introduction
        with gr.Accordion("ℹ️ Giới thiệu ứng dụng", open=False):
            gr.Markdown("""
            Ứng dụng sử dụng mô hình AI tiên tiến để đồng bộ chuyển động môi trong video với âm thanh đầu vào. 
            
            **Cách sử dụng:**
            1. Tải lên video chứa khuôn mặt cần đồng bộ môi
            2. Tải lên file âm thanh hoặc ghi âm trực tiếp
            3. Nhấn nút "Chạy đồng bộ" và chờ kết quả
            
            **Lưu ý:**
            - Video nên có khuôn mặt rõ ràng, ánh sáng tốt
            - Âm thanh cần rõ ràng, không nhiễu
            - Thời gian xử lý phụ thuộc vào độ dài video và cấu hình máy
            """)
        
        # Inputs on the left, result on the right
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 🎥 Đầu vào")
                video_in = gr.Video(label="Video đầu vào (MP4)", format="mp4", interactive=True)
                audio_in = gr.Audio(label="Âm thanh đầu vào", type="filepath", interactive=True)
                with gr.Row():
                    btn = gr.Button("🚀 Chạy đồng bộ", variant="primary")
                    clear_btn = gr.Button("🔄 Xóa hết")
            
            with gr.Column():
                gr.Markdown("### 📼 Kết quả")
                video_out = gr.Video(label="Video kết quả", interactive=False)
                with gr.Row():
                    download_btn = gr.Button("💾 Tải xuống")
        
        # Sample inputs (a bug was previously fixed here)
        with gr.Accordion("📂 Ví dụ mẫu", open=True):
            gr.Examples(
                examples=[
                    ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
                    ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
                    ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
                ],
                inputs=[video_in, audio_in],
                outputs=[video_out],
                fn=main,  # main processing function
                label="Nhấn vào ví dụ để thử ngay",
                # cache_examples is left unset: enabling it needs extra configuration.
            )
        
        # Footer
        gr.Markdown(f"""
        ---
        *Ứng dụng được phát triển bởi {AUTHOR} và cộng đồng {COMMUNITY}*  
        *Phiên bản 1.0 | [Tham gia nhóm]({COMMUNITY_LINK}) để cập nhật và hỗ trợ*
        """)
    
    # Event wiring
    btn.click(fn=main, inputs=[video_in, audio_in], outputs=[video_out])
    # Reset both inputs and the result in one click.
    clear_btn.click(lambda: [None, None, None], outputs=[video_in, audio_in, video_out])
    # NOTE(review): this handler only echoes the current result back into the
    # same component; it does not by itself start a file download.
    download_btn.click(lambda x: x, inputs=[video_out], outputs=[video_out])

# Start the server once the Blocks context has been closed, as the Gradio
# documentation does; previously launch() was called inside the context.
demo.launch(
    share=True,
    show_error=True,
    server_name="0.0.0.0",
    server_port=int(os.environ.get("PORT", 7860)),
)