# Hugging Face Space: alpha-ai/face-coordinates-extraction (status: Sleeping)
# File size: 5,252 Bytes
# Revision: 1ce4e4b

import os
from pathlib import Path

import gradio as gr

from pipeline import (
    copy_file_to_dir,
    extract_face_and_coords,
    make_run_dir,
    merge_synced_face,
)

# Folder containing this file; all per-run data is kept beneath it.
BASE_DIR = Path(__file__).resolve().parents[0]
# Root under which each step creates its own run directory.
WORK_DIR = BASE_DIR.joinpath("work")
# Create the root at import time; a pre-existing folder is not an error.
WORK_DIR.mkdir(exist_ok=True, parents=True)


def _normalize_upload_path(file_obj):
    """Coerce an uploaded-file value into a filesystem path string.

    Args:
        file_obj: ``None``, a path string, an ``os.PathLike`` object, or a
            file-like object that exposes its path through a non-empty
            string ``name`` attribute (e.g. a ``tempfile`` wrapper).

    Returns:
        ``None`` when ``file_obj`` is ``None``; otherwise the path as a
        string. Strings (including empty ones) are returned unchanged, and
        values of any other kind fall back to ``str(file_obj)``.
    """
    if file_obj is None:
        return None
    if isinstance(file_obj, str):
        return file_obj
    # PathLike must be handled before looking at ``.name``: on a ``Path``
    # that attribute is only the final component, not the full path.
    if isinstance(file_obj, os.PathLike):
        return os.fsdecode(file_obj)
    # File-like upload wrappers carry the real path in ``.name``; calling
    # str() on them would yield an object repr rather than a usable path.
    # A non-string name (e.g. an integer file descriptor) is ignored.
    name = getattr(file_obj, "name", None)
    if isinstance(name, str) and name:
        return name
    return str(file_obj)


def run_step1(main_video):
    """Gradio callback for Step 1: crop the face and save its coordinates.

    Args:
        main_video: Upload value for the main/original video.

    Returns:
        A 4-tuple ``(status, preview, face_file, coords_file)``. On success
        the cropped face path appears twice (preview and download slots)
        followed by the coordinates path. On any exception the status holds
        the error text and the remaining three items are ``None``.
    """
    try:
        source = _normalize_upload_path(main_video)
        if not source:
            raise ValueError("Please upload the main/original video.")

        # Stage the upload inside a fresh run directory before processing.
        workspace = make_run_dir(WORK_DIR, "step1")
        staged_video = copy_file_to_dir(source, workspace, "main_video.mp4")

        extraction = extract_face_and_coords(
            video_path=str(staged_video),
            output_dir=str(workspace),
            coords_name="face_coords_avg.pkl",
            cropped_name="cropped_face.mp4",
        )
        coords_file, face_clip, box = extraction

        return (
            f"Step 1 completed. Face bbox saved: {box}",
            face_clip,
            face_clip,
            coords_file,
        )
    except Exception as err:
        # UI boundary: report the failure in the status box instead of raising.
        return (f"Step 1 failed: {err}", None, None, None)


def run_step3(main_video, synced_face_video, face_coords):
    """Gradio callback for Step 3: merge the synced face into the original.

    Args:
        main_video: Upload value for the original/main video.
        synced_face_video: Upload value for the synced face video produced
            in the manual Step 2.
        face_coords: Upload value for the ``.pkl`` coordinates from Step 1.

    Returns:
        A 3-tuple ``(status, preview, download)``. On success the merged
        output path fills both the preview and download slots. On any
        exception the status holds the error text and the other two items
        are ``None``.
    """
    try:
        # Normalize every upload first, then validate in a fixed order so the
        # first missing input determines the error message.
        required = (
            (
                _normalize_upload_path(main_video),
                "Please upload the original/main video.",
            ),
            (
                _normalize_upload_path(synced_face_video),
                "Please upload the synced face video from manual Step 2.",
            ),
            (
                _normalize_upload_path(face_coords),
                "Please upload face coordinates (.pkl) from Step 1.",
            ),
        )
        for candidate, complaint in required:
            if not candidate:
                raise ValueError(complaint)
        original_src, synced_src, coords_src = (item[0] for item in required)

        # Stage all inputs under fixed names inside a fresh run directory.
        workspace = make_run_dir(WORK_DIR, "step3")
        staged_original = copy_file_to_dir(original_src, workspace, "original_video.mp4")
        staged_synced = copy_file_to_dir(synced_src, workspace, "synced_face_video.mp4")
        staged_coords = copy_file_to_dir(coords_src, workspace, "face_coords_avg.pkl")

        target = workspace / "final_output_with_audio.mp4"
        merged_path, audio_source = merge_synced_face(
            original_video_path=str(staged_original),
            synced_face_video_path=str(staged_synced),
            face_coords_path=str(staged_coords),
            final_output_path=str(target),
        )

        # Any value other than "synced_face_video" is reported as "no audio".
        message = (
            "Step 3 completed. Final video generated with audio from synced face video."
            if audio_source == "synced_face_video"
            else "Step 3 completed. Final video generated without muxed audio (audio track not found)."
        )
        return (message, merged_path, merged_path)
    except Exception as err:
        # UI boundary: report the failure in the status box instead of raising.
        return (f"Step 3 failed: {err}", None, None)


# UI definition. Components are declared in display order inside one Blocks
# container bound to `demo`, with one tab per workflow step.
with gr.Blocks(title="Dub Module - Step 1 and Step 3") as demo:
    # Intro text shown above the tabs; summarizes what runs in-app vs. manually.
    gr.Markdown(
        """
# Dub Module Gradio App (Step 1 + Step 3)
Workflow follows `how_to.txt` in this repo with these app boundaries:
- Step 1 is in-app: extract cropped face + `face_coords_avg.pkl`.
- Step 2 is manual and outside the app.
- Step 3 is in-app: merge synced face video back to original and produce final video.
- Separate audio upload is skipped because synced face video audio is used.
        """
    )

    # Tab 1: one video input, one status box, and three result components.
    with gr.Tab("Step 1 - Extract Face + Coordinates"):
        gr.Markdown("Upload the main video to generate cropped face video and face coordinates.")
        # NOTE(review): type="filepath" is expected to hand the callback a path
        # rather than file contents -- confirm against the installed Gradio version.
        s1_video = gr.File(label="Main Video", file_types=["video"], type="filepath")
        s1_run = gr.Button("Run Step 1")
        s1_status = gr.Textbox(label="Status", interactive=False)
        s1_preview = gr.Video(label="Cropped Face Preview")
        s1_face_file = gr.File(label="Download Cropped Face Video")
        s1_coords_file = gr.File(label="Download Face Coordinates (.pkl)")

        # The four outputs line up with run_step1's 4-tuple:
        # (status, cropped face, cropped face again, coordinates file).
        s1_run.click(
            fn=run_step1,
            inputs=[s1_video],
            outputs=[s1_status, s1_preview, s1_face_file, s1_coords_file],
        )

    # Tab 2: instructions only; no inputs, outputs, or callbacks are wired here.
    with gr.Tab("Step 2 - Manual (Outside App)"):
        gr.Markdown(
            """
Do manual lip-sync generation outside this app using the Step 1 cropped face video.
Then return to Step 3 tab with:
1. Original main video
2. Synced face video (with audio)
3. `face_coords_avg.pkl`
            """
        )

    # Tab 3: three uploads (original video, synced face video, coordinates)
    # feeding the merge callback.
    with gr.Tab("Step 3 - Merge and Final Video"):
        gr.Markdown("Upload inputs from Step 1 and manual Step 2 to generate final output video.")
        s3_main_video = gr.File(label="Original Main Video", file_types=["video"], type="filepath")
        s3_synced_video = gr.File(label="Synced Face Video", file_types=["video"], type="filepath")
        s3_coords = gr.File(label="Face Coordinates (.pkl)", file_types=[".pkl"], type="filepath")
        s3_run = gr.Button("Run Step 3")
        s3_status = gr.Textbox(label="Status", interactive=False)
        s3_preview = gr.Video(label="Final Output Preview")
        s3_file = gr.File(label="Download Final Video")

        # Input order matches run_step3's parameters; the three outputs line up
        # with its 3-tuple: (status, final video, final video again).
        s3_run.click(
            fn=run_step3,
            inputs=[s3_main_video, s3_synced_video, s3_coords],
            outputs=[s3_status, s3_preview, s3_file],
        )


# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()