Spaces:

MogensR
/

VideoBackgroundReplacer2

Paused

App Files Files Community

MogensR committed on Oct 2, 2025

Commit

ec9ba45

verified ·

1 Parent(s): a43d32f

Delete pipeline/two_stage_pipeline.py

Browse files

Files changed (1) hide show

pipeline/two_stage_pipeline.py +0 -388

pipeline/two_stage_pipeline.py DELETED Viewed

@@ -1,388 +0,0 @@
-#!/usr/bin/env python3
-"""
-two_stage_pipeline.py — Ephemeral SAM2 stage + MatAnyone stage
-- Stage 1: SAM2 -> lossless mask stream (FFV1 .mkv) + meta.json, then unload SAM2
-- Stage 2: read mask stream -> (optional) MatAnyone refine -> composite -> mux audio
-"""
-import os, sys, gc, json, cv2, time, uuid, torch, shutil, logging, subprocess, threading
-import numpy as np
-from pathlib import Path
-from typing import Optional, Callable, Tuple, Dict, Any
-from PIL import Image
-# Module-level logger. A stream handler with a timestamped format is attached
-# only when the logger has no handlers yet, so an already-configured logger is
-# left with the handlers it has.
-logger = logging.getLogger("backgroundfx_pro.two_stage")
-if not logger.handlers:
-    h = logging.StreamHandler()
-    h.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s: %(message)s"))
-    logger.addHandler(h)
-# The level is set unconditionally, whether or not a handler was added above.
-logger.setLevel(logging.INFO)
-# ---------------------------
-# Env & CUDA helpers
-# ---------------------------
-def setup_env():
-    """Apply process-wide defaults for the CUDA allocator, BLAS threading and torch.
-    Environment variables are written with ``setdefault``, so values already
-    present in the environment are kept. Gradient tracking is switched off
-    globally. The cuDNN/TF32 tuning flags and the per-process CUDA memory cap
-    (``CUDA_MEMORY_FRACTION``, default ``0.88``) are best-effort: a failure is
-    logged and processing continues.
-    """
-    env_defaults = {
-        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,max_split_size_mb:256,garbage_collection_threshold:0.7",
-        "OMP_NUM_THREADS": "1",
-        "OPENBLAS_NUM_THREADS": "1",
-        "MKL_NUM_THREADS": "1",
-    }
-    for key, value in env_defaults.items():
-        os.environ.setdefault(key, value)
-    torch.set_grad_enabled(False)
-    try:
-        torch.backends.cudnn.benchmark = True
-        torch.backends.cuda.matmul.allow_tf32 = True
-        torch.backends.cudnn.allow_tf32 = True
-        torch.set_float32_matmul_precision("high")
-    except Exception as e:
-        # Optional tuning only; report instead of hiding the reason.
-        logger.debug(f"Skipping torch backend tuning: {e}")
-    if not torch.cuda.is_available():
-        return
-    raw_fraction = os.getenv("CUDA_MEMORY_FRACTION", "0.88")
-    try:
-        torch.cuda.set_per_process_memory_fraction(float(raw_fraction))
-    except Exception as e:
-        # Covers a non-numeric or out-of-range value as well as driver errors.
-        logger.warning(f"Ignoring CUDA_MEMORY_FRACTION={raw_fraction!r}: {e}")
-def free_cuda():
-    """Run CUDA IPC collection and empty the CUDA cache; no-op when CUDA is unavailable."""
-    if not torch.cuda.is_available():
-        return
-    torch.cuda.ipc_collect()
-    torch.cuda.empty_cache()
-def unload_sam2_modules():
-    """Remove every module whose name starts with ``sam2`` from ``sys.modules``.
-    Afterwards the import caches are invalidated, the garbage collector runs and
-    the CUDA cache is freed. Best-effort: any error is logged as a warning and
-    not re-raised.
-    """
-    try:
-        import importlib
-        # Snapshot the keys first; sys.modules must not change size while iterated.
-        sam2_names = [name for name in tuple(sys.modules) if name.startswith("sam2")]
-        for name in sam2_names:
-            sys.modules.pop(name, None)
-        importlib.invalidate_caches()
-        gc.collect()
-        free_cuda()
-        logger.info("SAM2 modules unloaded.")
-    except Exception as e:
-        logger.warning(f"Unloading SAM2 modules: {e}")
-# ---------------------------
-# Video probing
-# ---------------------------
-def probe_video(path:str) -> Tuple[int,int,float,int]:
-    """Return ``(width, height, fps, frame_count)`` of the video at *path*.
-    FPS falls back to 25.0 when OpenCV reports zero, a negative value or a
-    non-finite value. A negative frame count is reported as 0.
-    Raises:
-        RuntimeError: if OpenCV cannot open *path*.
-    """
-    cap = cv2.VideoCapture(path)
-    try:
-        if not cap.isOpened():
-            raise RuntimeError(f"Cannot open video: {path}")
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        w   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        h   = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        n   = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    finally:
-        # Release the capture on the failure path as well.
-        cap.release()
-    if not np.isfinite(fps) or fps <= 0:
-        fps = 25.0
-    return w,h,float(fps),max(n, 0)
-# ---------------------------
-# FFmpeg mask writers/readers
-# ---------------------------
-class MaskFFV1Writer:
-    """Write uint8 binary/gray masks to an FFV1 lossless .mkv through an ffmpeg pipe.
-    Use as a context manager: entering starts ffmpeg, ``write`` sends one frame
-    of raw gray pixels, leaving closes the pipe and waits for ffmpeg to finish.
-    """
-    def __init__(self, path:str, w:int, h:int, fps:float):
-        self.path = path
-        self.w, self.h, self.fps = w,h,fps
-        self.proc = None
-    def __enter__(self):
-        cmd = [
-            "ffmpeg","-y","-hide_banner","-loglevel","error",
-            "-f","rawvideo","-pix_fmt","gray","-s",f"{self.w}x{self.h}","-r",f"{self.fps}",
-            "-i","-",
-            "-c:v","ffv1","-level","3","-g","1", self.path
-        ]
-        self.proc = subprocess.Popen(cmd, stdin=subprocess.PIPE)
-        return self
-    def write(self, mask_u8: np.ndarray):
-        """Send one mask frame (``w*h`` uint8 values) to ffmpeg.
-        Raises:
-            RuntimeError: if called before the context manager was entered.
-            ValueError: if the frame does not hold exactly ``w*h`` pixels; a
-                short or long frame would shift every following frame in the
-                raw stream.
-        """
-        if self.proc is None or self.proc.stdin is None:
-            raise RuntimeError("MaskFFV1Writer.write() called outside of its 'with' block")
-        if mask_u8.dtype != np.uint8:
-            mask_u8 = mask_u8.astype(np.uint8)
-        if mask_u8.size != self.w * self.h:
-            raise ValueError(f"Mask has {mask_u8.size} pixels, expected {self.w}x{self.h}")
-        # tobytes() always emits C-order bytes, so non-contiguous input is fine.
-        self.proc.stdin.write(mask_u8.tobytes())
-    def __exit__(self, exc_type, exc, tb):
-        """Close the pipe and wait for ffmpeg.
-        When the ``with`` body finished cleanly, a failed shutdown or a non-zero
-        ffmpeg exit code is raised so callers do not treat a broken file as
-        valid. When the body already raised, that exception is left untouched.
-        """
-        if not self.proc:
-            return
-        try:
-            self.proc.stdin.flush()
-            self.proc.stdin.close()
-            self.proc.wait(timeout=120)
-        except Exception:
-            self.proc.kill()
-            self.proc.wait()  # reap the killed child
-            if exc_type is None:
-                raise
-            return
-        if exc_type is None and self.proc.returncode != 0:
-            raise RuntimeError(f"ffmpeg exited with code {self.proc.returncode} while writing {self.path}")
-class MaskFFV1Reader:
-    """Read uint8 masks from an FFV1 .mkv through an ffmpeg pipe.
-    Use as a context manager: entering starts ffmpeg decoding to raw gray
-    frames on stdout; ``read`` returns one ``h`` x ``w`` frame per call.
-    """
-    def __init__(self, path:str, w:int, h:int):
-        self.path = path
-        self.w,self.h = w,h
-        self.proc = None
-        self.frame_bytes = w*h
-    def __enter__(self):
-        cmd = [
-            "ffmpeg","-nostdin","-hide_banner","-loglevel","error","-i", self.path,
-            "-f","rawvideo","-pix_fmt","gray","-"
-        ]
-        # ffmpeg must not read from the parent's stdin while decoding.
-        self.proc = subprocess.Popen(cmd, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE)
-        return self
-    def read(self) -> Optional[np.ndarray]:
-        """Return the next mask as an ``(h, w)`` uint8 array, or None at end of stream.
-        A trailing partial frame is treated as end of stream. The array wraps
-        the bytes read from the pipe and is read-only.
-        Raises:
-            RuntimeError: if called before the context manager was entered.
-        """
-        if self.proc is None or self.proc.stdout is None:
-            raise RuntimeError("MaskFFV1Reader.read() called outside of its 'with' block")
-        buf = self.proc.stdout.read(self.frame_bytes)
-        if not buf or len(buf) < self.frame_bytes:
-            return None
-        return np.frombuffer(buf, dtype=np.uint8).reshape(self.h, self.w)
-    def __exit__(self, exc_type, exc, tb):
-        if self.proc:
-            try:
-                self.proc.stdout.close()
-                self.proc.wait(timeout=30)
-            except Exception:
-                self.proc.kill()
-                self.proc.wait()  # reap the killed child
-# Fallback: PNG sequence (disk heavy but simple & robust)
-class MaskPNGWriter:
-    """Write masks as a numbered PNG sequence (``000000.png``, ``000001.png``, ...).
-    The target directory is created on construction, including parents.
-    """
-    def __init__(self, dirpath: Path):
-        self.dir = dirpath
-        self.dir.mkdir(parents=True, exist_ok=True)
-        self.idx = 0
-    def write(self, mask_u8: np.ndarray):
-        """Write one mask to the next numbered file.
-        Raises:
-            IOError: if OpenCV reports that the file could not be written. The
-                index is not advanced in that case.
-        """
-        target = self.dir / f"{self.idx:06d}.png"
-        # cv2.imwrite signals failure through its return value, not an exception.
-        if not cv2.imwrite(str(target), mask_u8):
-            raise IOError(f"Failed to write mask PNG: {target}")
-        self.idx += 1
-class MaskPNGReader:
-    """Read a numbered PNG mask sequence back, one grayscale image per ``read`` call."""
-    def __init__(self, dirpath: Path):
-        self.dir = dirpath
-        self.idx = 0
-    def read(self) -> Optional[np.ndarray]:
-        """Return the next mask, or None when the next numbered file does not exist."""
-        frame_file = self.dir / f"{self.idx:06d}.png"
-        if frame_file.exists():
-            mask = cv2.imread(str(frame_file), cv2.IMREAD_GRAYSCALE)
-            self.idx += 1
-            return mask
-        return None
-# ---------------------------
-# Stage 1 — SAM2 → mask dump
-# ---------------------------
-def _object_mask_u8(out_ids, out_logits, obj_id:int, h:int, w:int) -> np.ndarray:
-    """Return the uint8 (0/255) mask of *obj_id* for one propagation step.
-    An all-zero ``h`` x ``w`` mask is returned when *obj_id* is not in *out_ids*.
-    """
-    if isinstance(out_ids, torch.Tensor):
-        hits = (out_ids == obj_id).nonzero(as_tuple=False)
-        idx = hits[0].item() if hits.numel() > 0 else None
-    else:
-        ids = list(out_ids)
-        idx = ids.index(obj_id) if obj_id in ids else None
-    if idx is None:
-        return np.zeros((h,w), np.uint8)
-    binary = (out_logits[idx] > 0).detach()
-    # Blocking copy on purpose: with non_blocking=True the device-to-host
-    # transfer may still be in flight when numpy reads the buffer.
-    mask_u8 = binary.to(torch.uint8).mul_(255).to("cpu").numpy()
-    # NOTE(review): presumably the predictor may return a leading singleton
-    # channel axis (1xHxW); drop it so writers get HxW. Confirm against the wrapper.
-    if mask_u8.ndim == 3 and mask_u8.shape[0] == 1:
-        mask_u8 = mask_u8[0]
-    return mask_u8
-def stage1_dump_masks(video_path:str, out_dir:Path, obj_point:Optional[Tuple[int,int]]=None) -> Dict[str,Any]:
-    """
-    Run only SAM2, save masks as FFV1 (preferred) or PNG sequence + meta.json.
-    Args:
-        video_path: input video.
-        out_dir: directory receiving ``mask.mkv`` or ``masks_png/`` plus ``meta.json``.
-        obj_point: ``(x, y)`` positive click on frame 0; defaults to the frame centre.
-    Returns meta dict with keys video, width, height, fps, frames, storage, mask_path.
-    If anything fails while the FFV1 stream is written, propagation is run
-    again from the same state and the masks go to a PNG sequence instead.
-    SAM2 is released and its modules unloaded even when the stage raises.
-    """
-    setup_env()
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    w,h,fps,n = probe_video(video_path)
-    out_dir.mkdir(parents=True, exist_ok=True)
-    meta = {"video":video_path, "width":w,"height":h,"fps":fps,"frames":n, "storage":None}
-    logger.info(f"[Stage1] {w}x{h}@{fps:.2f} | frames={n}")
-    # Load SAM2 (your wrapper)
-    from models.sam2_loader import SAM2Predictor
-    predictor = SAM2Predictor(device=device)
-    state = None
-    try:
-        state = predictor.init_state(video_path=video_path)
-        # Prompt: center positive if not provided
-        if obj_point is None:
-            obj_point = (w//2, h//2)
-        pts = np.array([[obj_point[0], obj_point[1]]], dtype=np.float32)
-        labels = np.array([1], dtype=np.int32)
-        ann_obj_id = 1
-        with torch.inference_mode():
-            predictor.add_new_points(state, 0, ann_obj_id, pts, labels)
-        autocast_dtype = torch.float16 if device.type=="cuda" else None
-        # Preferred: FFV1 mask stream
-        mask_mkv = out_dir / "mask.mkv"
-        try:
-            with MaskFFV1Writer(str(mask_mkv), w, h, fps) as writer, \
-                 torch.inference_mode(), torch.autocast("cuda", dtype=autocast_dtype):
-                for _, out_ids, out_logits in predictor.propagate_in_video(state):
-                    writer.write(_object_mask_u8(out_ids, out_logits, ann_obj_id, h, w))
-            meta["storage"] = "ffv1"
-            meta["mask_path"] = str(mask_mkv)
-            logger.info("[Stage1] Masks saved as FFV1 .mkv")
-        except Exception as e:
-            logger.warning(f"FFV1 writer failed ({e}), falling back to PNG sequence.")
-            png_dir = out_dir / "masks_png"
-            wr = MaskPNGWriter(png_dir)
-            with torch.inference_mode(), torch.autocast("cuda", dtype=autocast_dtype):
-                for _, out_ids, out_logits in predictor.propagate_in_video(state):
-                    wr.write(_object_mask_u8(out_ids, out_logits, ann_obj_id, h, w))
-            meta["storage"] = "png"
-            meta["mask_path"] = str(png_dir)
-        # Persist meta
-        with open(out_dir / "meta.json","w") as f:
-            json.dump(meta, f)
-    finally:
-        # Unload SAM2 completely, also on failure, so GPU memory is not left pinned
-        del predictor, state
-        free_cuda(); unload_sam2_modules()
-    return meta
-# ---------------------------
-# Stage 2 — refine + compose
-# ---------------------------
-def _refined_to_u8(refined, fallback_u8: np.ndarray) -> np.ndarray:
-    """Convert a refiner result (tensor or ndarray, values in 0..1) to a uint8 mask.
-    Singleton axes are dropped. *fallback_u8* is returned when the result has
-    another type or does not end up with the fallback's shape.
-    """
-    if isinstance(refined, torch.Tensor):
-        arr = refined.detach().float().clamp(0,1).mul(255).to("cpu").numpy()
-    elif isinstance(refined, np.ndarray):
-        arr = np.clip(refined,0,1)*255
-    else:
-        return fallback_u8
-    arr = np.squeeze(arr)
-    if arr.shape != fallback_u8.shape:
-        return fallback_u8
-    return arr.astype(np.uint8)
-def stage2_refine_and_compose(video_path:str, mask_dir:Path, background_image:Image.Image,
-                              out_path:str, use_matany:bool=True) -> str:
-    """
-    Read the stage-1 masks, optionally refine them, composite over the background, mux audio.
-    Args:
-        video_path: original input video (frames and audio source).
-        mask_dir: directory holding ``meta.json`` written by stage 1.
-        background_image: replacement background; converted to RGB and resized to the video size.
-        out_path: final output file.
-        use_matany: try to load a MatAnyone session for per-frame mask refinement.
-    Returns the final output path. Frames without a mask are written unchanged.
-    If the audio mux fails, the silent composite is moved to *out_path* instead.
-    Raises:
-        RuntimeError: if the input video or the intermediate video writer cannot be opened.
-    """
-    w,h,fps,n = probe_video(video_path)
-    # Force 3 channels: RGBA or grayscale backgrounds would not broadcast against RGB frames.
-    bg = background_image.convert("RGB").resize((w,h), Image.LANCZOS)
-    bg_np = np.array(bg).astype(np.float32)
-    # Read meta
-    with open(mask_dir / "meta.json","r") as f:
-        meta = json.load(f)
-    storage = meta["storage"]; mask_path = meta["mask_path"]
-    # Optional MatAnyone
-    session = None
-    if use_matany:
-        try:
-            from models.matanyone_loader import MatAnyoneSession as _M
-        except Exception:
-            try:
-                from models.matanyone_loader import MatAnyoneLoader as _M
-            except Exception:
-                _M = None
-        if _M:
-            session = _M(device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-            if hasattr(session,"model") and session.model is not None:
-                session.model.eval()
-    # Open video + writer
-    cap = cv2.VideoCapture(video_path)
-    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-    tmp_out = str(Path(out_path).with_suffix(".noaudio.mp4"))
-    writer = cv2.VideoWriter(tmp_out, fourcc, fps, (w,h))
-    # Mask reader (the ffmpeg pipe is started inside the try so cleanup always runs)
-    if storage == "ffv1":
-        mreader = MaskFFV1Reader(mask_path, w, h)
-    else:
-        mreader = MaskPNGReader(Path(mask_path))
-    i = 0
-    try:
-        if not cap.isOpened():
-            raise RuntimeError(f"Cannot open video: {video_path}")
-        if not writer.isOpened():
-            raise RuntimeError(f"Cannot open video writer: {tmp_out}")
-        if isinstance(mreader, MaskFFV1Reader):
-            mreader.__enter__()
-        while True:
-            ok, frame_bgr = cap.read()
-            if not ok: break
-            mask_u8 = mreader.read()
-            if mask_u8 is None:
-                # out of masks; write original
-                writer.write(frame_bgr); i+=1; continue
-            # Optional refine
-            if session is not None:
-                try:
-                    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
-                    # Provide a float mask 0..1 to session; adapt if your API differs
-                    mask_f = (mask_u8.astype(np.float32) / 255.0)
-                    if hasattr(session,"refine_mask"):
-                        mask_refined = session.refine_mask(frame_rgb, mask_f)
-                    elif hasattr(session,"process_frame"):
-                        mask_refined = session.process_frame(frame_rgb, mask_f)
-                    else:
-                        mask_refined = mask_f
-                    mask_u8 = _refined_to_u8(mask_refined, mask_u8)
-                except Exception as e:
-                    logger.debug(f"MatAnyone refine failed @frame {i}: {e}")
-            # Composite
-            m = (mask_u8.astype(np.float32)/255.0)[...,None]  # HxWx1
-            fr = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB).astype(np.float32)
-            comp = fr*m + bg_np*(1.0-m)
-            comp_bgr = cv2.cvtColor(comp.astype(np.uint8), cv2.COLOR_RGB2BGR)
-            writer.write(comp_bgr)
-            if i % 50 == 0:
-                logger.info(f"[Stage2] frame {i}/{n}")
-            i += 1
-    finally:
-        cap.release(); writer.release()
-        if isinstance(mreader, MaskFFV1Reader):
-            mreader.__exit__(None,None,None)
-    # Mux audio
-    final_out = str(Path(out_path))
-    cmd = [
-        "ffmpeg","-y","-hide_banner","-loglevel","error",
-        "-i", tmp_out, "-i", video_path,
-        # Trailing '?' makes the audio map optional, so silent inputs still mux cleanly.
-        "-map","0:v:0","-map","1:a:0?","-c:v","copy","-c:a","aac","-shortest", final_out
-    ]
-    try:
-        r = subprocess.run(cmd, capture_output=True, text=True, timeout=180)
-        if r.returncode != 0:
-            logger.warning(f"Audio mux failed: {r.stderr.strip()}")
-            shutil.move(tmp_out, final_out)
-        else:
-            os.remove(tmp_out)
-    except Exception as e:
-        logger.warning(f"Audio mux failed: {e}")
-        shutil.move(tmp_out, final_out)
-    return final_out
-# ---------------------------
-# Orchestrator
-# ---------------------------
-def process_two_stage(
-    video_path:str,
-    background_image: Image.Image,
-    workdir: Optional[Path]=None,
-    progress: Optional[Callable[[str,float],None]] = None,
-    use_matany: bool = True,
-) -> str:
-    """
-    Run stage 1 (SAM2 mask dump) and stage 2 (refine + composite) back to back.
-    Args:
-        video_path: input video.
-        background_image: replacement background.
-        workdir: job directory; a ``str`` is accepted too. Defaults to
-            ``./tmp/job_<random>`` under the current working directory.
-        progress: optional callback receiving ``(message, fraction)``.
-        use_matany: forwarded to stage 2.
-    Returns the path of the final video, written inside *workdir*.
-    """
-    setup_env()
-    if workdir is None:
-        workdir = Path.cwd()/ "tmp" / f"job_{uuid.uuid4().hex[:8]}"
-    else:
-        # Accept plain strings as well; Path(Path) is a no-op.
-        workdir = Path(workdir)
-    workdir.mkdir(parents=True, exist_ok=True)
-    # Stage 1
-    if progress: progress("Stage 1: SAM2 mask pass", 0.05)
-    mask_dir = workdir / "sam2_masks"
-    stage1_dump_masks(video_path, mask_dir)
-    if progress: progress("Stage 1 complete", 0.45)
-    # Stage 2
-    if progress: progress("Stage 2: refine + compose", 0.50)
-    out_path = workdir / f"final_{int(time.time())}.mp4"
-    final_video = stage2_refine_and_compose(video_path, mask_dir, background_image, str(out_path), use_matany=use_matany)
-    if progress: progress("Done", 1.0)
-    logger.info(f"Output: {final_video}")
-    return final_video
-# ---------------------------
-# CLI
-# ---------------------------
-if __name__ == "__main__":
-    # Command-line entry point: parse arguments, load the background as RGB,
-    # run both stages and print the path of the resulting video.
-    import argparse
-    parser = argparse.ArgumentParser(description="Two-stage BackgroundFX Pro")
-    parser.add_argument("--video", required=True)
-    parser.add_argument("--background", required=True)
-    parser.add_argument("--outdir", default=None)
-    parser.add_argument("--no-matany", action="store_true")
-    args = parser.parse_args()
-    bg = Image.open(args.background).convert("RGB")
-    if args.outdir:
-        out = process_two_stage(args.video, bg, Path(args.outdir), use_matany=not args.no_matany)
-    else:
-        out = process_two_stage(args.video, bg, None, use_matany=not args.no_matany)
-    print(out)