version 1
Browse files- README.md +58 -8
- app.py +198 -0
- extract_coordinates.py +26 -0
- extract_face.py +23 -0
- merge_lips.py +30 -0
- pipeline.py +323 -0
- requirements.txt +5 -0
README.md
CHANGED
|
@@ -1,13 +1,63 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: 👁
|
| 4 |
-
colorFrom: yellow
|
| 5 |
-
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 6.6.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Dub Module Gradio App
|
|
|
|
|
|
|
|
|
|
| 3 |
sdk: gradio
|
|
|
|
| 4 |
app_file: app.py
|
| 5 |
pinned: false
|
|
|
|
| 6 |
---
|
| 7 |
|
| 8 |
+
# Dub Module Gradio App (HF Space Ready)
|
| 9 |
+
|
| 10 |
+
This folder provides a Hugging Face Spaces-ready Gradio app for the pipeline in `instructions.txt`.
|
| 11 |
+
|
| 12 |
+
Implemented workflow:
|
| 13 |
+
- Step 1: Extract face and lip coordinates from the original video.
|
| 14 |
+
- Step 2: Extract cropped face video using face coordinates and allow downloads of:
|
| 15 |
+
- face coordinates (`.pkl`)
|
| 16 |
+
- lip coordinates (`.pkl`)
|
| 17 |
+
- cropped face video (`.mp4`)
|
| 18 |
+
- Step 3: Manual only (not part of the app).
|
| 19 |
+
- Step 4: Merge manual Step 3 output back into the original video and generate final downloadable video.
|
| 20 |
+
- Step 5: Not part of the app.
|
| 21 |
+
|
| 22 |
+
## Files
|
| 23 |
+
- `app.py`: Gradio interface (Step 1, 2, and 4).
|
| 24 |
+
- `pipeline.py`: Core logic shared by UI and CLI.
|
| 25 |
+
- `extract_coordinates.py`: CLI wrapper for Step 1.
|
| 26 |
+
- `extract_face.py`: CLI wrapper for Step 2.
|
| 27 |
+
- `merge_lips.py`: CLI wrapper for Step 4.
|
| 28 |
+
- `requirements.txt`: Python dependencies.
|
| 29 |
+
|
| 30 |
+
## Local Run
|
| 31 |
+
|
| 32 |
+
```bash
|
| 33 |
+
pip install -r requirements.txt
|
| 34 |
+
python app.py
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
## Hugging Face Space Setup
|
| 38 |
+
1. Create a new Gradio Space.
|
| 39 |
+
2. Upload all files from this folder to the root of the Space.
|
| 40 |
+
3. Ensure `README.md` and `requirements.txt` are present.
|
| 41 |
+
4. Space will auto-build and run `app.py`.
|
| 42 |
+
|
| 43 |
+
## CLI Usage (Optional)
|
| 44 |
+
|
| 45 |
+
Step 1:
|
| 46 |
+
```bash
|
| 47 |
+
python extract_coordinates.py --video input.mp4 --output-dir outputs
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
Step 2:
|
| 51 |
+
```bash
|
| 52 |
+
python extract_face.py --video input.mp4 --face-coords outputs/face_coords_avg.pkl --output outputs/cropped_face.mp4
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
Step 4:
|
| 56 |
+
```bash
|
| 57 |
+
python merge_lips.py --original-video input.mp4 --lip-synced-video lipsynced_face.mp4 --face-coords outputs/face_coords_avg.pkl --lip-coords outputs/lip_coords_avg.pkl --output outputs/final.mp4 --audio tts.wav
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
## Notes
|
| 61 |
+
- Coordinates are generated per video and should not be reused across unrelated videos.
|
| 62 |
+
- If no external audio is uploaded in Step 4, the app attempts to pull audio from the lip-synced video, then from the original video.
|
| 63 |
+
- Generated files are stored in `work/` during runtime.
|
app.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
|
| 5 |
+
from pipeline import (
|
| 6 |
+
copy_file_to_dir,
|
| 7 |
+
extract_coordinates,
|
| 8 |
+
extract_face_video,
|
| 9 |
+
make_run_dir,
|
| 10 |
+
merge_lips,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
BASE_DIR = Path(__file__).resolve().parent
|
| 14 |
+
WORK_DIR = BASE_DIR / "work"
|
| 15 |
+
WORK_DIR.mkdir(parents=True, exist_ok=True)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _normalize_upload_path(file_obj):
|
| 19 |
+
if file_obj is None:
|
| 20 |
+
return None
|
| 21 |
+
if isinstance(file_obj, str):
|
| 22 |
+
return file_obj
|
| 23 |
+
return str(file_obj)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def run_step1(original_video):
    """Step 1 UI handler: detect face/lip boxes and save them as .pkl files.

    Returns (status_message, face_pkl_path, lip_pkl_path); on any failure
    the status carries the error and the file outputs are None.
    """
    try:
        source = _normalize_upload_path(original_video)
        if not source:
            raise ValueError("Please upload an original video.")

        # Stage the upload into an isolated per-run directory.
        workdir = make_run_dir(WORK_DIR, "step1")
        staged_video = copy_file_to_dir(source, workdir)

        face_pkl, lip_pkl, face_box, lip_box = extract_coordinates(
            video_path=str(staged_video),
            output_dir=str(workdir),
            face_name="face_coords_avg.pkl",
            lip_name="lip_coords_avg.pkl",
        )

        message = f"Step 1 completed. Face bbox: {face_box}. Lip bbox: {lip_box}."
        return message, face_pkl, lip_pkl
    except Exception as exc:
        # UI boundary: surface the error as a status string instead of crashing.
        return f"Step 1 failed: {exc}", None, None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def run_step2(original_video, face_coords, lip_coords):
    """Step 2 UI handler: crop the face region out of the original video.

    Lip coordinates are optional; when supplied they are staged and handed
    back so the user can download everything from one tab. Returns
    (status, preview_path, video_file, face_pkl, lip_pkl_or_None).
    """
    try:
        source = _normalize_upload_path(original_video)
        face_src = _normalize_upload_path(face_coords)
        lip_src = _normalize_upload_path(lip_coords)

        if not source:
            raise ValueError("Please upload the original video.")
        if not face_src:
            raise ValueError("Please upload face coordinates (.pkl).")

        workdir = make_run_dir(WORK_DIR, "step2")
        staged_video = copy_file_to_dir(source, workdir)
        staged_face = copy_file_to_dir(face_src, workdir, "face_coords_avg.pkl")
        staged_lip = (
            copy_file_to_dir(lip_src, workdir, "lip_coords_avg.pkl") if lip_src else None
        )

        cropped = workdir / "cropped_face.mp4"
        extract_face_video(
            video_path=str(staged_video),
            face_coords_path=str(staged_face),
            output_path=str(cropped),
        )

        status = "Step 2 completed. Download cropped face video and coordinate files below."
        # Same cropped video feeds both the preview player and the download slot.
        return (
            status,
            str(cropped),
            str(cropped),
            str(staged_face),
            str(staged_lip) if staged_lip else None,
        )
    except Exception as exc:
        return f"Step 2 failed: {exc}", None, None, None, None
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def run_step4(original_video, lip_synced_video, face_coords, lip_coords, audio_file):
    """Step 4 UI handler: paste the lip-synced lips back into the original
    video, attach an audio track, and return the final file.

    Returns (status, preview_path, download_path); on failure the file
    outputs are None and the status carries the error.
    """
    try:
        uploads = {
            "original": _normalize_upload_path(original_video),
            "lipsynced": _normalize_upload_path(lip_synced_video),
            "face": _normalize_upload_path(face_coords),
            "lip": _normalize_upload_path(lip_coords),
            "audio": _normalize_upload_path(audio_file),
        }

        # Validate the mandatory inputs in a fixed order so error messages
        # are deterministic; audio is the only optional upload.
        required = (
            ("original", "Please upload the original video."),
            ("lipsynced", "Please upload the lip-synced face video from Step 3."),
            ("face", "Please upload face coordinates (.pkl)."),
            ("lip", "Please upload lip coordinates (.pkl)."),
        )
        for key, message in required:
            if not uploads[key]:
                raise ValueError(message)

        workdir = make_run_dir(WORK_DIR, "step4")
        staged_original = copy_file_to_dir(uploads["original"], workdir, "original_video.mp4")
        staged_lipsynced = copy_file_to_dir(uploads["lipsynced"], workdir, "lip_synced_face_video.mp4")
        staged_face = copy_file_to_dir(uploads["face"], workdir, "face_coords_avg.pkl")
        staged_lip = copy_file_to_dir(uploads["lip"], workdir, "lip_coords_avg.pkl")
        staged_audio = copy_file_to_dir(uploads["audio"], workdir) if uploads["audio"] else None

        destination = workdir / "final_synced_output.mp4"
        final_video, audio_used = merge_lips(
            original_video_path=str(staged_original),
            lip_synced_video_path=str(staged_lipsynced),
            face_coords_path=str(staged_face),
            lip_coords_path=str(staged_lip),
            final_output_path=str(destination),
            audio_path=str(staged_audio) if staged_audio else None,
        )

        status = f"Step 4 completed. Final video generated. Audio source used: {audio_used}"
        return status, final_video, final_video
    except Exception as exc:
        return f"Step 4 failed: {exc}", None, None
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Gradio UI: one tab per pipeline step. Step 3 is deliberately manual and
# its tab only shows instructions; all handlers live above this block.
with gr.Blocks(title="Dub Module - Steps 1, 2, and 4") as demo:
    gr.Markdown(
        """
        # Dub Module Gradio App (HF Ready)
        This app implements Step 1, Step 2, and Step 4 from your pipeline.
        - Step 3 must be done manually outside this app.
        - Step 5 is not included.
        """
    )

    # Step 1: one video in, two coordinate pickles + a status line out.
    with gr.Tab("Step 1 - Extract Coordinates"):
        gr.Markdown("Upload the original video to generate `face_coords_avg.pkl` and `lip_coords_avg.pkl`.")
        s1_video = gr.File(label="Original Video", file_types=["video"], type="filepath")
        s1_run = gr.Button("Run Step 1")
        s1_status = gr.Textbox(label="Status", interactive=False)
        s1_face = gr.File(label="Face Coordinates (.pkl)")
        s1_lip = gr.File(label="Lip Coordinates (.pkl)")
        s1_run.click(fn=run_step1, inputs=[s1_video], outputs=[s1_status, s1_face, s1_lip])

    # Step 2: video + face pkl in; cropped-face video (preview + download)
    # and both coordinate files back out for download.
    with gr.Tab("Step 2 - Extract Cropped Face Video"):
        gr.Markdown(
            "Upload original video and face coordinates. Lip coordinates are optional here, "
            "but if provided they are returned for download as requested."
        )
        s2_video = gr.File(label="Original Video", file_types=["video"], type="filepath")
        s2_face = gr.File(label="Face Coordinates (.pkl)", file_types=[".pkl"], type="filepath")
        s2_lip = gr.File(label="Lip Coordinates (.pkl) - optional", file_types=[".pkl"], type="filepath")
        s2_run = gr.Button("Run Step 2")
        s2_status = gr.Textbox(label="Status", interactive=False)
        s2_preview = gr.Video(label="Cropped Face Video Preview")
        s2_video_file = gr.File(label="Download Cropped Face Video")
        s2_face_out = gr.File(label="Download Face Coordinates")
        s2_lip_out = gr.File(label="Download Lip Coordinates")
        s2_run.click(
            fn=run_step2,
            inputs=[s2_video, s2_face, s2_lip],
            outputs=[s2_status, s2_preview, s2_video_file, s2_face_out, s2_lip_out],
        )

    # Step 3: instructions only; the lip-sync itself happens outside this app.
    with gr.Tab("Step 3 - Manual (Outside App)"):
        gr.Markdown(
            """
            Run your Step 3 lip-sync process manually using the cropped face video from Step 2.
            After Step 3, return to Step 4 and upload:
            1. Original video
            2. Lip-synced face video from your external tool
            3. Face coordinates pkl
            4. Lip coordinates pkl
            5. Optional audio file used during lip-sync
            """
        )

    # Step 4: merge the lip-synced face back into the original and mux audio.
    with gr.Tab("Step 4 - Merge and Final Output"):
        gr.Markdown("Merge the lip-synced lips back to original video and download the final output.")
        s4_original = gr.File(label="Original Video", file_types=["video"], type="filepath")
        s4_lipsynced = gr.File(label="Lip-synced Face Video", file_types=["video"], type="filepath")
        s4_face = gr.File(label="Face Coordinates (.pkl)", file_types=[".pkl"], type="filepath")
        s4_lip = gr.File(label="Lip Coordinates (.pkl)", file_types=[".pkl"], type="filepath")
        s4_audio = gr.File(label="Audio from Step 3 (optional)", file_types=["audio"], type="filepath")
        s4_run = gr.Button("Run Step 4")
        s4_status = gr.Textbox(label="Status", interactive=False)
        s4_preview = gr.Video(label="Final Video Preview")
        s4_file = gr.File(label="Download Final Video")
        s4_run.click(
            fn=run_step4,
            inputs=[s4_original, s4_lipsynced, s4_face, s4_lip, s4_audio],
            outputs=[s4_status, s4_preview, s4_file],
        )


if __name__ == "__main__":
    demo.launch()
|
extract_coordinates.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
from pipeline import extract_coordinates
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def main() -> None:
    """CLI entry point for Step 1: write averaged face/lip bboxes as .pkl files."""
    parser = argparse.ArgumentParser(description="Step 1: Extract face/lip coordinates from a video.")
    parser.add_argument("--video", required=True, help="Path to input video")
    parser.add_argument("--output-dir", default=".", help="Directory to store output pkl files")
    parser.add_argument("--face-name", default="face_coords_avg.pkl", help="Output face coordinates filename")
    parser.add_argument("--lip-name", default="lip_coords_avg.pkl", help="Output lip coordinates filename")
    opts = parser.parse_args()

    results = extract_coordinates(
        video_path=opts.video,
        output_dir=opts.output_dir,
        face_name=opts.face_name,
        lip_name=opts.lip_name,
    )
    face_path, lip_path, face_bbox, lip_bbox = results

    print(f"Face coordinates: {face_path} -> {face_bbox}")
    print(f"Lip coordinates: {lip_path} -> {lip_bbox}")
|
extract_face.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
from pipeline import extract_face_video
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def main() -> None:
    """CLI entry point for Step 2: crop the averaged face box out of a video."""
    parser = argparse.ArgumentParser(description="Step 2: Extract cropped face video using face coordinates.")
    parser.add_argument("--video", required=True, help="Path to original video")
    parser.add_argument("--face-coords", required=True, help="Path to face coordinates pkl")
    parser.add_argument("--output", default="extracted_face.mp4", help="Output cropped video path")
    opts = parser.parse_args()

    result_path = extract_face_video(
        video_path=opts.video,
        face_coords_path=opts.face_coords,
        output_path=opts.output,
    )
    print(f"Cropped face video: {result_path}")
|
merge_lips.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
from pipeline import merge_lips
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def main() -> None:
    """CLI entry point for Step 4: merge lip-synced lips back into the original."""
    parser = argparse.ArgumentParser(description="Step 4: Merge lip-synced face region back into original video.")
    parser.add_argument("--original-video", required=True, help="Path to original video")
    parser.add_argument("--lip-synced-video", required=True, help="Path to lip-synced face video from external module")
    parser.add_argument("--face-coords", required=True, help="Path to face coordinates pkl")
    parser.add_argument("--lip-coords", required=True, help="Path to lip coordinates pkl")
    parser.add_argument("--output", default="final_synced_output.mp4", help="Output final merged video path")
    parser.add_argument("--audio", default=None, help="Optional external audio path from Step 3")
    opts = parser.parse_args()

    merged_path, audio_used = merge_lips(
        original_video_path=opts.original_video,
        lip_synced_video_path=opts.lip_synced_video,
        face_coords_path=opts.face_coords,
        lip_coords_path=opts.lip_coords,
        final_output_path=opts.output,
        audio_path=opts.audio,
    )

    print(f"Final merged video: {merged_path}")
    print(f"Audio source used: {audio_used}")
|
pipeline.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle
|
| 2 |
+
import shutil
|
| 3 |
+
import subprocess
|
| 4 |
+
import uuid
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Optional, Sequence, Tuple
|
| 7 |
+
|
| 8 |
+
import cv2
|
| 9 |
+
import imageio_ffmpeg
|
| 10 |
+
import mediapipe as mp
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
# MediaPipe FaceMesh landmark indices used to bound the mouth region.
# NOTE(review): these appear to trace the outer lip contour — confirm against
# the MediaPipe FaceMesh landmark map before editing.
LIP_INDICES = [
    61, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    291, 409, 270, 269, 267, 0, 37, 39, 40, 185,
]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def ensure_dir(path: Path) -> Path:
    """Create *path* (including parents) if it does not exist, and return it."""
    path.mkdir(parents=True, exist_ok=True)
    return path
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def make_run_dir(base_dir: Path, prefix: str) -> Path:
    """Create and return a unique per-run subdirectory of *base_dir*.

    The directory name is ``<prefix>_<uuid4 hex>``, so concurrent runs
    never collide. Both *base_dir* and the run directory are created.
    """
    unique_name = f"{prefix}_{uuid.uuid4().hex}"
    return ensure_dir(ensure_dir(base_dir) / unique_name)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def copy_file_to_dir(source_path: str, target_dir: Path, target_name: Optional[str] = None) -> Path:
|
| 30 |
+
source = Path(source_path)
|
| 31 |
+
if not source.exists():
|
| 32 |
+
raise FileNotFoundError(f"Input file not found: {source_path}")
|
| 33 |
+
|
| 34 |
+
if target_name is None:
|
| 35 |
+
target_name = source.name
|
| 36 |
+
|
| 37 |
+
target_path = target_dir / target_name
|
| 38 |
+
shutil.copy2(source, target_path)
|
| 39 |
+
return target_path
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def get_bbox(
    landmarks,
    indices: Sequence[int],
    iw: int,
    ih: int,
    scale_w: float = 1.5,
    scale_h: float = 1.5,
    top_padding: int = 0,
) -> Tuple[int, int, int, int]:
    """Return a scaled, frame-clipped pixel bbox (x, y, w, h).

    *landmarks* holds objects with normalized ``.x``/``.y`` in [0, 1];
    only those at *indices* are used. The tight box is enlarged by
    *scale_w*/*scale_h* about its center, its top edge is raised by
    *top_padding* pixels, and the result is clamped to the iw x ih frame.
    """
    pts = np.array([(landmarks[i].x * iw, landmarks[i].y * ih) for i in indices])
    x_min, y_min = pts.min(axis=0)
    x_max, y_max = pts.max(axis=0)

    width = x_max - x_min
    height = y_max - y_min
    scaled_w = int(width * scale_w)
    scaled_h = int(height * scale_h)

    # Re-center the enlarged box, lift the top by top_padding, then clamp
    # so the box never extends past the frame edges.
    left = max(0, int(x_min - (scaled_w - width) // 2))
    top = max(0, int(y_min - (scaled_h - height) // 2) - top_padding)
    clipped_w = min(scaled_w, iw - left)
    clipped_h = min(scaled_h + top_padding, ih - top)
    return (left, top, clipped_w, clipped_h)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _load_coords(coords_path: str) -> Tuple[int, int, int, int]:
|
| 68 |
+
with open(coords_path, "rb") as handle:
|
| 69 |
+
coords = pickle.load(handle)
|
| 70 |
+
|
| 71 |
+
if len(coords) != 4:
|
| 72 |
+
raise ValueError(f"Invalid coordinates in {coords_path}: expected 4 values, got {len(coords)}")
|
| 73 |
+
|
| 74 |
+
return tuple(int(v) for v in coords)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def extract_coordinates(
    video_path: str,
    output_dir: str,
    face_name: str = "face_coords_avg.pkl",
    lip_name: str = "lip_coords_avg.pkl",
) -> Tuple[str, str, Tuple[int, int, int, int], Tuple[int, int, int, int]]:
    """Step 1: detect per-frame face and lip boxes with MediaPipe FaceMesh,
    average them over the whole video, and pickle the two averaged boxes.

    Returns (face_pkl_path, lip_pkl_path, face_bbox, lip_bbox) where each
    bbox is an (x, y, w, h) tuple of ints in original-frame pixels.
    Raises ValueError if the video cannot be opened or no face is found.
    """
    output_root = ensure_dir(Path(output_dir))
    face_out = output_root / face_name
    lip_out = output_root / lip_name

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video: {video_path}")

    # Tracking mode (static_image_mode=False) with a single face; high
    # detection confidence filters out spurious detections.
    mp_face_mesh = mp.solutions.face_mesh
    face_mesh = mp_face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=1,
        refine_landmarks=True,
        min_detection_confidence=0.8,
    )

    face_bbox_list = []
    lip_bbox_list = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # MediaPipe expects RGB; OpenCV decodes BGR.
        image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(image_rgb)

        if results.multi_face_landmarks:
            for face_landmarks in results.multi_face_landmarks:
                ih, iw, _ = frame.shape
                # Face box: all landmarks, mild 1.2x enlargement.
                face_bbox = get_bbox(
                    face_landmarks.landmark,
                    range(len(face_landmarks.landmark)),
                    iw,
                    ih,
                    scale_w=1.2,
                    scale_h=1.2,
                )
                # Lip box: lip landmarks only, 1.5x with extra headroom on top.
                lip_bbox_unclipped = get_bbox(
                    face_landmarks.landmark,
                    LIP_INDICES,
                    iw,
                    ih,
                    scale_w=1.5,
                    scale_h=1.5,
                    top_padding=20,
                )

                x_face, y_face, w_face, h_face = face_bbox
                x_lip, y_lip, w_lip, h_lip = lip_bbox_unclipped

                # Clip the lip box so it lies entirely inside the face box;
                # Step 4 relies on lip coords being expressible relative to
                # the face crop.
                x_lip = max(x_face, x_lip)
                y_lip = max(y_face, y_lip)
                w_lip = min(w_lip, x_face + w_face - x_lip)
                h_lip = min(h_lip, y_face + h_face - y_lip)

                # Only keep frames where clipping left a non-degenerate lip box.
                if w_lip > 0 and h_lip > 0:
                    face_bbox_list.append(face_bbox)
                    lip_bbox_list.append((x_lip, y_lip, w_lip, h_lip))

    cap.release()
    face_mesh.close()

    if not face_bbox_list or not lip_bbox_list:
        raise ValueError("No faces detected in the video. Check the video quality and framing.")

    # Average across frames to get one stable box per video.
    avg_face_bbox = np.mean(np.array(face_bbox_list), axis=0).astype(int)
    avg_lip_bbox = np.mean(np.array(lip_bbox_list), axis=0).astype(int)

    # Persist as plain int tuples so downstream loads are numpy-free.
    with open(face_out, "wb") as handle:
        pickle.dump(tuple(int(v) for v in avg_face_bbox), handle)
    with open(lip_out, "wb") as handle:
        pickle.dump(tuple(int(v) for v in avg_lip_bbox), handle)

    return (
        str(face_out),
        str(lip_out),
        tuple(int(v) for v in avg_face_bbox),
        tuple(int(v) for v in avg_lip_bbox),
    )
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def extract_face_video(video_path: str, face_coords_path: str, output_path: str) -> str:
    """Step 2: crop every frame of *video_path* to the pickled face box.

    The box is clamped to the frame so the crop is never empty. Output is
    written with the mp4v codec at the source FPS. Returns *output_path*;
    raises ValueError when the video cannot be opened or yields no frames.
    """
    x, y, w, h = _load_coords(face_coords_path)

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video: {video_path}")

    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps <= 0:
        # Some containers report a non-positive FPS; fall back to a default.
        fps = 25.0

    frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Clamp the stored bbox into this video's bounds — the coords may come
    # from a separate upload and could be slightly out of range.
    x = max(0, min(x, frame_w - 1))
    y = max(0, min(y, frame_h - 1))
    w = max(1, min(w, frame_w - x))
    h = max(1, min(h, frame_h - y))

    out = cv2.VideoWriter(
        output_path,
        cv2.VideoWriter_fourcc(*"mp4v"),
        fps,
        (w, h),
    )

    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # Fixed crop: same (x, y, w, h) window for every frame.
        face_img = frame[y:y + h, x:x + w]
        out.write(face_img)
        frame_count += 1

    cap.release()
    out.release()

    if frame_count == 0:
        raise ValueError("No frames were written for cropped face video.")

    return output_path
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def _mux_audio(video_no_audio: str, audio_source: str, output_path: str) -> bool:
    """Mux the first audio stream of *audio_source* onto *video_no_audio*.

    The video stream is copied untouched; audio is re-encoded to AAC and
    trimmed to the shorter stream. Returns True only when ffmpeg exits 0
    and the output file exists.
    """
    command = [
        imageio_ffmpeg.get_ffmpeg_exe(),
        "-y",
        "-i", video_no_audio,
        "-i", audio_source,
        "-map", "0:v:0",
        "-map", "1:a:0",
        "-c:v", "copy",
        "-c:a", "aac",
        "-shortest",
        output_path,
    ]
    # List-form argv (shell=False) — paths are never shell-interpreted.
    proc = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    return proc.returncode == 0 and Path(output_path).exists()
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def merge_lips(
    original_video_path: str,
    lip_synced_video_path: str,
    face_coords_path: str,
    lip_coords_path: str,
    final_output_path: str,
    audio_path: Optional[str] = None,
) -> Tuple[str, str]:
    """Step 4: paste the lip region of the lip-synced face video back into
    the original video, then mux an audio track onto the result.

    Audio preference order: explicit *audio_path*, then the lip-synced
    video's own track, then the original's; if every mux fails the silent
    merge is copied to *final_output_path* and "none" is reported.

    Returns (final_output_path, audio_source_used).
    """
    x_face, y_face, w_face, h_face = _load_coords(face_coords_path)
    x_lip, y_lip, w_lip, h_lip = _load_coords(lip_coords_path)

    # Lip box re-expressed relative to the face crop so it can be located
    # inside the lip-synced face frames. max(1, ...) guards division by zero.
    lip_rel_x = (x_lip - x_face) / max(1, w_face)
    lip_rel_y = (y_lip - y_face) / max(1, h_face)
    lip_rel_w = w_lip / max(1, w_face)
    lip_rel_h = h_lip / max(1, h_face)

    original_cap = cv2.VideoCapture(original_video_path)
    lip_synced_cap = cv2.VideoCapture(lip_synced_video_path)

    if not original_cap.isOpened():
        raise ValueError(f"Could not open original video: {original_video_path}")
    if not lip_synced_cap.isOpened():
        raise ValueError(f"Could not open lip-synced video: {lip_synced_video_path}")

    fps = original_cap.get(cv2.CAP_PROP_FPS)
    if fps <= 0:
        # Fall back when the container reports a non-positive FPS.
        fps = 25.0

    frame_w = int(original_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_h = int(original_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Write the video-only merge next to the final output; audio is muxed in
    # as a second pass below.
    intermediate_path = str(Path(final_output_path).with_name("merged_no_audio.mp4"))
    out_final = cv2.VideoWriter(
        intermediate_path,
        cv2.VideoWriter_fourcc(*"mp4v"),
        fps,
        (frame_w, frame_h),
    )

    frames_written = 0
    while original_cap.isOpened():
        ret, original_frame = original_cap.read()
        if not ret:
            break

        # If the lip-synced video is shorter, remaining original frames are
        # passed through unchanged.
        ret_lip, lip_synced_frame = lip_synced_cap.read()
        if ret_lip:
            # NOTE(review): these scale the relative coords by the *face box*
            # size, assuming the lip-synced frames match the face crop's
            # dimensions — confirm the Step 3 tool preserves that size.
            lip_x_in_face = int(lip_rel_x * w_face)
            lip_y_in_face = int(lip_rel_y * h_face)
            lip_w_in_face = int(lip_rel_w * w_face)
            lip_h_in_face = int(lip_rel_h * h_face)

            # Clamp the lip window into the actual lip-synced frame bounds.
            lip_x_in_face = max(0, lip_x_in_face)
            lip_y_in_face = max(0, lip_y_in_face)
            lip_w_in_face = max(1, min(lip_w_in_face, lip_synced_frame.shape[1] - lip_x_in_face))
            lip_h_in_face = max(1, min(lip_h_in_face, lip_synced_frame.shape[0] - lip_y_in_face))

            lip_synced_lip = lip_synced_frame[
                lip_y_in_face:lip_y_in_face + lip_h_in_face,
                lip_x_in_face:lip_x_in_face + lip_w_in_face,
            ]

            if lip_synced_lip.size > 0:
                # Clamp the paste target into the original frame, resize the
                # lip patch to fit, and overwrite in place (hard composite,
                # no blending).
                target_x = max(0, min(x_lip, frame_w - 1))
                target_y = max(0, min(y_lip, frame_h - 1))
                target_w = max(1, min(w_lip, frame_w - target_x))
                target_h = max(1, min(h_lip, frame_h - target_y))
                lip_synced_lip_resized = cv2.resize(lip_synced_lip, (target_w, target_h))
                original_frame[target_y:target_y + target_h, target_x:target_x + target_w] = lip_synced_lip_resized

        out_final.write(original_frame)
        frames_written += 1

    original_cap.release()
    lip_synced_cap.release()
    out_final.release()

    if frames_written == 0:
        raise ValueError("No frames written while merging lips.")

    # Try audio sources in preference order; first successful mux wins.
    audio_candidates = []
    if audio_path:
        audio_candidates.append(audio_path)
    audio_candidates.extend([lip_synced_video_path, original_video_path])

    for candidate in audio_candidates:
        if candidate and Path(candidate).exists() and _mux_audio(intermediate_path, candidate, final_output_path):
            return final_output_path, candidate

    # Every mux failed (e.g. no audio stream anywhere): ship the silent merge.
    shutil.copy2(intermediate_path, final_output_path)
    return final_output_path, "none"
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=5.0.0
|
| 2 |
+
opencv-python-headless>=4.8.0
|
| 3 |
+
mediapipe>=0.10.0
|
| 4 |
+
numpy>=1.24.0
|
| 5 |
+
imageio-ffmpeg>=0.4.9
|