Nekochu commited on
Commit
0b6961f
·
1 Parent(s): c2d53e4

add ZeroGPU GPU inference (FP16, flash-attn, batch=32@1024/16@2048)

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.onnx
CorridorKeyModule/__init__.py ADDED
File without changes
CorridorKeyModule/core/__init__.py ADDED
File without changes
CorridorKeyModule/core/model_transformer.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ import timm
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class MLP(nn.Module):
    """Linear token embedding: projects C_in features to C_out.

    Despite the name, this is a single ``nn.Linear`` applied to flattened
    spatial tokens (SegFormer-style per-scale projection).
    """

    def __init__(self, input_dim: int = 2048, embed_dim: int = 768) -> None:
        super().__init__()
        self.proj = nn.Linear(input_dim, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [..., input_dim] -> [..., embed_dim]
        embedded = self.proj(x)
        return embedded
22
+
23
+
24
class DecoderHead(nn.Module):
    """SegFormer-style all-MLP decoder head.

    Projects four multi-scale feature maps to a shared embedding width,
    upsamples the coarser three to the c1 (highest) resolution, fuses the
    concatenation with a 1x1 conv + BN + ReLU, and predicts ``output_dim``
    channels at c1 resolution (H/4 of the network input).
    """

    def __init__(
        self, feature_channels: list[int] | None = None, embedding_dim: int = 256, output_dim: int = 1
    ) -> None:
        super().__init__()
        channels = [112, 224, 448, 896] if feature_channels is None else feature_channels

        # Per-scale linear projections to a common embedding width.
        self.linear_c4 = MLP(input_dim=channels[3], embed_dim=embedding_dim)
        self.linear_c3 = MLP(input_dim=channels[2], embed_dim=embedding_dim)
        self.linear_c2 = MLP(input_dim=channels[1], embed_dim=embedding_dim)
        self.linear_c1 = MLP(input_dim=channels[0], embed_dim=embedding_dim)

        # Fuse the four concatenated scales back to embedding_dim.
        self.linear_fuse = nn.Conv2d(embedding_dim * 4, embedding_dim, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm2d(embedding_dim)
        self.relu = nn.ReLU(inplace=True)

        # Prediction head.
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Conv2d(embedding_dim, output_dim, kernel_size=1)

    def forward(self, features: list[torch.Tensor]) -> torch.Tensor:
        c1, c2, c3, c4 = features
        batch = c4.shape[0]
        target_hw = c1.shape[2:]  # all scales resized to the c1 grid (H/4)

        def project(mlp: MLP, feat: torch.Tensor) -> torch.Tensor:
            # [B, C, H, W] -> tokens [B, HW, C] -> MLP -> back to [B, E, H, W].
            tokens = feat.flatten(2).transpose(1, 2)
            embedded = mlp(tokens).transpose(1, 2)
            return embedded.view(batch, -1, feat.shape[2], feat.shape[3])

        _c4 = F.interpolate(project(self.linear_c4, c4), size=target_hw, mode="bilinear", align_corners=False)
        _c3 = F.interpolate(project(self.linear_c3, c3), size=target_hw, mode="bilinear", align_corners=False)
        _c2 = F.interpolate(project(self.linear_c2, c2), size=target_hw, mode="bilinear", align_corners=False)
        _c1 = project(self.linear_c1, c1)

        fused = self.relu(self.bn(self.linear_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1))))
        return self.classifier(self.dropout(fused))
72
+
73
+
74
class RefinerBlock(nn.Module):
    """Dilated residual block normalized with GroupNorm.

    GroupNorm statistics are independent of batch size, so the block stays
    numerically stable even at very small batches (e.g. batch size 2).
    """

    def __init__(self, channels: int, dilation: int = 1) -> None:
        super().__init__()
        conv_kwargs = dict(kernel_size=3, padding=dilation, dilation=dilation)
        self.conv1 = nn.Conv2d(channels, channels, **conv_kwargs)
        self.gn1 = nn.GroupNorm(8, channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(channels, channels, **conv_kwargs)
        self.gn2 = nn.GroupNorm(8, channels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # conv -> norm -> relu -> conv -> norm, then the identity skip.
        out = self.relu(self.gn1(self.conv1(x)))
        out = self.gn2(self.conv2(out))
        out = out + x
        return self.relu(out)
97
+
98
+
99
class CNNRefinerModule(nn.Module):
    """Dilated residual refiner (receptive field ~65 px).

    Designed to remove macroblocking artifacts from the Hiera backbone's
    coarse predictions.
    Structure: Stem -> Res(d1) -> Res(d2) -> Res(d4) -> Res(d8) -> Projection.
    """

    def __init__(self, in_channels: int = 7, hidden_channels: int = 64, out_channels: int = 4) -> None:
        super().__init__()

        # Stem: lift the concatenated RGB + coarse-prediction input.
        self.stem = nn.Sequential(
            nn.Conv2d(in_channels, hidden_channels, kernel_size=3, padding=1),
            nn.GroupNorm(8, hidden_channels),
            nn.ReLU(inplace=True),
        )

        # Dilated residual blocks: each doubling of dilation expands the
        # receptive field without extra downsampling.
        self.res1 = RefinerBlock(hidden_channels, dilation=1)
        self.res2 = RefinerBlock(hidden_channels, dilation=2)
        self.res3 = RefinerBlock(hidden_channels, dilation=4)
        self.res4 = RefinerBlock(hidden_channels, dilation=8)

        # Final projection: no activation — the output is purely additive logits.
        self.final = nn.Conv2d(hidden_channels, out_channels, kernel_size=1)

        # "Whisper" init: tiny noise gives usable gradients without shocking
        # the pretrained coarse predictions at the start of training.
        nn.init.normal_(self.final.weight, mean=0.0, std=1e-3)
        nn.init.constant_(self.final.bias, 0)

    def forward(self, img: torch.Tensor, coarse_pred: torch.Tensor) -> torch.Tensor:
        # img: [B, 3, H, W]; coarse_pred: [B, 4, H, W]
        feat = self.stem(torch.cat([img, coarse_pred], dim=1))
        for block in (self.res1, self.res2, self.res3, self.res4):
            feat = block(feat)

        # 10x output scaling: lets the refiner predict small stable values
        # (e.g. 0.5) that act as strong logit corrections (5.0).
        return self.final(feat) * 10.0
143
+
144
+
145
class GreenFormer(nn.Module):
    """Hiera-backed matting network producing an alpha matte and clean foreground.

    Pipeline: timm Hiera encoder (features_only) -> two SegFormer-style decoder
    heads (alpha: 1 channel, foreground: 3 channels) -> optional dilated CNN
    refiner that predicts residual corrections in logit space.
    """

    def __init__(
        self,
        encoder_name: str = "hiera_base_plus_224.mae_in1k_ft_in1k",
        in_channels: int = 4,
        img_size: int = 512,
        use_refiner: bool = True,
    ) -> None:
        super().__init__()

        # --- Encoder ---
        # Create the target model (img_size grid, random weights).
        # features_only=True wraps it in timm's FeatureGetterNet.
        logger.info("Initializing %s (img_size=%d)", encoder_name, img_size)
        self.encoder = timm.create_model(encoder_name, pretrained=False, features_only=True, img_size=img_size)
        # Base weights are intentionally not downloaded: the custom checkpoint
        # loaded immediately after construction contains all weights, including
        # correctly trained/sized pos-embeds. Keeps the project offline-capable.
        logger.info("Skipped downloading base weights (relying on custom checkpoint)")

        # Patch the first layer when the input is not plain RGB (e.g. RGB + mask).
        if in_channels != 3:
            self._patch_input_layer(in_channels)

        # Feature channel widths; Hiera Base Plus is [112, 224, 448, 896].
        # Fetch dynamically when timm exposes feature_info, else fall back.
        try:
            feature_channels = self.encoder.feature_info.channels()
        except (AttributeError, TypeError):
            feature_channels = [112, 224, 448, 896]
        logger.info("Feature channels: %s", feature_channels)

        # --- Decoders ---
        embedding_dim = 256

        # Alpha decoder (1 channel) and foreground decoder (3 channels).
        self.alpha_decoder = DecoderHead(feature_channels, embedding_dim, output_dim=1)
        self.fg_decoder = DecoderHead(feature_channels, embedding_dim, output_dim=3)

        # --- Refiner ---
        # Refiner input: 3 (RGB) + 4 (coarse alpha+fg probabilities) = 7 channels.
        self.use_refiner = use_refiner
        if self.use_refiner:
            self.refiner = CNNRefinerModule(in_channels=7, hidden_channels=64, out_channels=4)
        else:
            self.refiner = None
            logger.info("Refiner module DISABLED (backbone-only mode)")

    def _patch_input_layer(self, in_channels: int) -> None:
        """
        Modifies the first convolution layer to accept `in_channels`.
        Copies existing RGB weights and initializes extras to zero.
        """
        # Hiera: self.encoder.model.patch_embed.proj
        try:
            patch_embed = self.encoder.model.patch_embed.proj
        except AttributeError:
            # Fallback if timm changes structure or for other models
            patch_embed = self.encoder.patch_embed.proj
        weight = patch_embed.weight.data  # [Out, C_orig, kH, kW]
        bias = patch_embed.bias.data if patch_embed.bias is not None else None

        # BUGFIX: unpack kernel height/width separately. The original bound the
        # same name twice (`out_channels, _, k, k = ...`), silently assuming a
        # square kernel and using the width for both dimensions.
        out_channels, orig_in, kh, kw = weight.shape

        # Create the replacement conv with the same geometry.
        new_conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=(kh, kw),
            stride=patch_embed.stride,
            padding=patch_embed.padding,
            bias=(bias is not None),
        )

        # Copy as many of the original input channels as fit; any extra new
        # channels start at zero so the patched layer initially ignores them.
        copy_c = min(orig_in, in_channels)
        new_conv.weight.data[:, :copy_c] = weight[:, :copy_c]
        if in_channels > copy_c:
            new_conv.weight.data[:, copy_c:] = 0.0

        if bias is not None:
            new_conv.bias.data = bias

        # Replace in module
        try:
            self.encoder.model.patch_embed.proj = new_conv
        except AttributeError:
            self.encoder.patch_embed.proj = new_conv

        logger.info("Patched input layer: 3 → %d channels (extra initialized to 0)", in_channels)

    def forward(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
        """Run the coarse -> refine matting pipeline.

        Args:
            x: [B, 4, H, W] input (RGB plus one auxiliary channel).

        Returns:
            Dict with "alpha" ([B, 1, H, W]) and "fg" ([B, 3, H, W]),
            both sigmoid-activated into [0, 1].
        """
        input_size = x.shape[2:]

        # Encode: list of multi-scale feature maps.
        features = self.encoder(x)

        # Decode streams at H/4 resolution.
        alpha_logits = self.alpha_decoder(features)  # [B, 1, H/4, W/4]
        fg_logits = self.fg_decoder(features)  # [B, 3, H/4, W/4]

        # Upsample the coarse LOGITS to full resolution (bilinear).
        alpha_logits_up = F.interpolate(alpha_logits, size=input_size, mode="bilinear", align_corners=False)
        fg_logits_up = F.interpolate(fg_logits, size=input_size, mode="bilinear", align_corners=False)

        # NOTE: no "humility clamp" here (removed in Phase 3) — the refiner sees
        # raw logits (-inf, +inf) to preserve all backbone detail.

        # Coarse probabilities: used for the loss and as refiner input features,
        # because they are normalized to [0, 1].
        alpha_coarse = torch.sigmoid(alpha_logits_up)
        fg_coarse = torch.sigmoid(fg_logits_up)

        # --- Refinement (CNN hybrid) ---
        # Refiner input: RGB image (first 3 channels of x) + coarse predictions.
        rgb = x[:, :3, :, :]
        coarse_pred = torch.cat([alpha_coarse, fg_coarse], dim=1)  # [B, 4, H, W]

        # The refiner outputs DELTA logits: corrections in score space (-inf, inf).
        if self.use_refiner and self.refiner is not None:
            delta_logits = self.refiner(rgb, coarse_pred)
        else:
            # Backbone-only mode: zero deltas.
            delta_logits = torch.zeros_like(coarse_pred)

        delta_alpha = delta_logits[:, 0:1]
        delta_fg = delta_logits[:, 1:4]

        # Residual addition in logit space: unlimited correction capability and
        # no saturation blocking at the sigmoid extremes.
        alpha_final_logits = alpha_logits_up + delta_alpha
        fg_final_logits = fg_logits_up + delta_fg

        # Final activation.
        alpha_final = torch.sigmoid(alpha_final_logits)
        fg_final = torch.sigmoid(fg_final_logits)

        return {"alpha": alpha_final, "fg": fg_final}
README.md CHANGED
@@ -17,20 +17,27 @@ tags:
17
  - corridor-digital
18
  - transparency
19
  - onnx
 
 
20
  - mcp-server
21
  short_description: Remove green background from video, even transparent objects
22
  ---
23
 
24
- # CorridorKey Green Screen Matting (CPU)
25
 
26
- Remove green screen backgrounds from video on free CPU. Handles transparent objects (glass, water, cloth) that traditional chroma key cannot.
27
 
28
  Based on [CorridorKey](https://github.com/nikopueringer/CorridorKey) by Corridor Digital.
29
 
 
 
 
 
 
30
  ## Pipeline
31
 
32
- 1. **BiRefNet** - Generates coarse foreground mask
33
- 2. **CorridorKey GreenFormer** - Refines alpha matte + extracts clean foreground
34
  3. **Compositing** - Despill, despeckle, composite on new background
35
 
36
  ## API
@@ -41,7 +48,7 @@ Based on [CorridorKey](https://github.com/nikopueringer/CorridorKey) by Corridor
41
  ```bash
42
  curl -X POST "https://luminia-corridorkey.hf.space/gradio_api/call/process_video" \
43
  -H "Content-Type: application/json" \
44
- -d '{"data": ["video.mp4", 5, 10, true, 400, "Composite on checkerboard (MP4)"]}'
45
  ```
46
 
47
  **Step 2: Get result**
@@ -51,27 +58,11 @@ curl "https://luminia-corridorkey.hf.space/gradio_api/call/process_video/{event_
51
 
52
  ### MCP (Model Context Protocol)
53
 
54
- **Tool schema:**
55
- ```json
56
- {
57
- "name": "process_video",
58
- "description": "Remove green screen background from video using CorridorKey AI matting.",
59
- "parameters": {
60
- "video_path": "Path to green screen video",
61
- "despill_val": "Despill strength 0-10 (default 5)",
62
- "refiner_val": "Refiner scale 0-20 (default 10)",
63
- "auto_despeckle": "Remove small artifacts (default true)",
64
- "despeckle_size": "Min pixel area to keep (default 400)",
65
- "output_mode": "Composite on checkerboard (MP4) | Alpha matte (MP4) | Transparent video (WebM) | PNG sequence (ZIP)"
66
- }
67
- }
68
- ```
69
-
70
  **MCP Config:**
71
  ```json
72
  {
73
  "mcpServers": {
74
- "corridorkey-cpu": {
75
  "url": "https://luminia-corridorkey.hf.space/gradio_api/mcp/"
76
  }
77
  }
 
17
  - corridor-digital
18
  - transparency
19
  - onnx
20
+ - pytorch
21
+ - zerogpu
22
  - mcp-server
23
  short_description: Remove green background from video, even transparent objects
24
  ---
25
 
26
+ # CorridorKey Green Screen Matting
27
 
28
+ Remove green screen backgrounds from video. Handles transparent objects (glass, water, cloth) that traditional chroma key cannot.
29
 
30
  Based on [CorridorKey](https://github.com/nikopueringer/CorridorKey) by Corridor Digital.
31
 
32
+ ## Inference Paths
33
+
34
+ - **GPU (ZeroGPU H200)**: PyTorch GreenFormer with batched inference (batch 32 at 1024, batch 8 at 2048)
35
+ - **CPU (fallback)**: ONNX Runtime sequential inference (batch 1)
36
+
37
  ## Pipeline
38
 
39
+ 1. **BiRefNet** - Generates coarse foreground mask (ONNX)
40
+ 2. **CorridorKey GreenFormer** - Refines alpha matte + extracts clean foreground (PyTorch on GPU, ONNX on CPU)
41
  3. **Compositing** - Despill, despeckle, composite on new background
42
 
43
  ## API
 
48
  ```bash
49
  curl -X POST "https://luminia-corridorkey.hf.space/gradio_api/call/process_video" \
50
  -H "Content-Type: application/json" \
51
+ -d '{"data": ["video.mp4", "1024", 5, "Hybrid (auto)", true, 400]}'
52
  ```
53
 
54
  **Step 2: Get result**
 
58
 
59
  ### MCP (Model Context Protocol)
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  **MCP Config:**
62
  ```json
63
  {
64
  "mcpServers": {
65
+ "corridorkey": {
66
  "url": "https://luminia-corridorkey.hf.space/gradio_api/mcp/"
67
  }
68
  }
app.py CHANGED
@@ -1,7 +1,8 @@
1
  """CorridorKey Green Screen Matting - HuggingFace Space.
2
 
3
- Self-contained Gradio app using ONNX Runtime for inference.
4
- Supports CPU (free tier) and GPU (community grant).
 
5
 
6
  Usage:
7
  python app.py # Launch Gradio UI
@@ -10,6 +11,7 @@ Usage:
10
 
11
  import os
12
  import sys
 
13
  import shutil
14
  import gc
15
  import time
@@ -28,6 +30,12 @@ import cv2
28
  import gradio as gr
29
  import onnxruntime as ort
30
 
 
 
 
 
 
 
31
  # Workaround: Gradio cache_examples bug with None outputs.
32
  _original_read_from_flag = gr.components.Component.read_from_flag
33
  def _patched_read_from_flag(self, payload):
@@ -52,13 +60,35 @@ CORRIDORKEY_MODELS = {
52
  "1024": os.path.join(MODELS_DIR, "corridorkey_1024.onnx"),
53
  "2048": os.path.join(MODELS_DIR, "corridorkey_2048.onnx"),
54
  }
 
 
55
  IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(1, 1, 3)
56
  IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(1, 1, 3)
57
  MAX_DURATION_CPU = 5
58
- MAX_DURATION_GPU = 30
59
- MAX_FRAMES = 150
60
  HAS_CUDA = "CUDAExecutionProvider" in ort.get_available_providers()
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  # ---------------------------------------------------------------------------
63
  # Color utilities (numpy-only)
64
  # ---------------------------------------------------------------------------
@@ -134,41 +164,167 @@ def fast_greenscreen_mask(frame_rgb_f32):
134
  return mask_f32, confidence
135
 
136
  # ---------------------------------------------------------------------------
137
- # Model loading
138
  # ---------------------------------------------------------------------------
139
  _birefnet_session = None
140
  _corridorkey_sessions = {}
 
 
 
 
 
 
 
 
141
 
142
  def _ort_opts():
143
  opts = ort.SessionOptions()
144
- opts.intra_op_num_threads = 2
145
- opts.inter_op_num_threads = 1
 
 
 
 
146
  opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
147
  opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
148
  opts.enable_mem_pattern = True
149
  return opts
150
 
151
- def get_birefnet():
 
 
 
 
 
 
 
 
 
 
152
  global _birefnet_session
153
- if _birefnet_session is None:
154
- logger.info("Downloading BiRefNet-Lite ONNX...")
155
- path = hf_hub_download(repo_id=BIREFNET_REPO, filename=BIREFNET_FILE)
156
- logger.info("Loading BiRefNet ONNX: %s", path)
157
- _birefnet_session = ort.InferenceSession(path, _ort_opts(), providers=["CPUExecutionProvider"])
 
 
 
 
158
  return _birefnet_session
159
 
160
- def get_corridorkey(resolution="1024"):
161
  global _corridorkey_sessions
162
  if resolution not in _corridorkey_sessions:
163
  onnx_path = CORRIDORKEY_MODELS.get(resolution)
164
  if not onnx_path or not os.path.exists(onnx_path):
165
  raise gr.Error(f"CorridorKey ONNX model for {resolution} not found.")
166
- logger.info("Loading CorridorKey ONNX (%s): %s", resolution, onnx_path)
167
- _corridorkey_sessions[resolution] = ort.InferenceSession(onnx_path, _ort_opts(), providers=["CPUExecutionProvider"])
 
168
  return _corridorkey_sessions[resolution]
169
 
170
  # ---------------------------------------------------------------------------
171
- # Per-frame inference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  # ---------------------------------------------------------------------------
173
  def birefnet_frame(session, image_rgb_uint8):
174
  h, w = image_rgb_uint8.shape[:2]
@@ -179,8 +335,9 @@ def birefnet_frame(session, image_rgb_uint8):
179
  pred = 1.0 / (1.0 + np.exp(-session.run(None, {inp.name: img})[-1]))
180
  return (cv2.resize(pred[0, 0], (w, h)) > 0.04).astype(np.float32)
181
 
182
- def corridorkey_frame(session, image_f32, mask_f32, img_size,
183
- despill_strength=0.5, auto_despeckle=True, despeckle_size=400):
 
184
  h, w = image_f32.shape[:2]
185
  img_r = cv2.resize(image_f32, (img_size, img_size))
186
  mask_r = cv2.resize(mask_f32, (img_size, img_size))[:, :, np.newaxis]
@@ -196,6 +353,70 @@ def corridorkey_frame(session, image_f32, mask_f32, img_size,
196
  fg = despill(fg, green_limit_mode="average", strength=despill_strength)
197
  return {"alpha": alpha, "fg": fg}
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  # ---------------------------------------------------------------------------
200
  # Video stitching
201
  # ---------------------------------------------------------------------------
@@ -213,20 +434,112 @@ def _stitch_ffmpeg(frame_dir, out_path, fps, pattern="%05d.png", pix_fmt="yuv420
213
  logger.warning("ffmpeg failed: %s", e)
214
  return False
215
 
 
216
  # ---------------------------------------------------------------------------
217
- # Main pipeline: generates ALL professional outputs
218
  # ---------------------------------------------------------------------------
219
- def process_video(video_path, resolution, despill_val, mask_mode,
220
- auto_despeckle, despeckle_size, progress=gr.Progress()):
221
- """Remove green screen background from video using CorridorKey AI matting.
222
- Returns: comp_video, matte_video, download_zip, status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  """
224
  if video_path is None:
225
  raise gr.Error("Please upload a video.")
226
 
227
- max_dur = MAX_DURATION_GPU if HAS_CUDA else MAX_DURATION_CPU
 
 
 
 
 
 
 
 
 
 
228
  img_size = int(resolution)
 
 
229
 
 
230
  cap = cv2.VideoCapture(video_path)
231
  fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
232
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
@@ -235,132 +548,325 @@ def process_video(video_path, resolution, despill_val, mask_mode,
235
  cap.release()
236
 
237
  if total_frames == 0:
238
- raise gr.Error("Could not read video frames. Check file format.")
239
  duration = total_frames / fps
240
  if duration > max_dur:
241
- raise gr.Error(f"Video too long ({duration:.1f}s). Max {max_dur}s on {'GPU' if HAS_CUDA else 'free CPU'} tier.")
242
-
243
  frames_to_process = min(total_frames, MAX_FRAMES)
244
- logger.info("Processing %d frames (%dx%d @ %.1f fps), resolution=%d, mask=%s",
245
- frames_to_process, w, h, fps, img_size, mask_mode)
246
 
247
- try:
248
- birefnet = None
249
- if mask_mode != "Fast (classical)":
250
- progress(0, desc="Loading BiRefNet...")
251
- birefnet = get_birefnet()
252
- progress(0.03, desc=f"Loading CorridorKey ({resolution})...")
253
- corridorkey = get_corridorkey(resolution)
254
- except Exception as e:
255
- raise gr.Error(f"Failed to load models: {e}")
 
 
 
 
 
 
256
 
257
- despill_strength = despill_val / 10.0
 
 
 
 
 
 
 
 
 
 
 
258
  tmpdir = tempfile.mkdtemp(prefix="ck_")
 
 
259
 
260
  try:
261
- # Output dirs matching original CorridorKey structure
262
- comp_dir = os.path.join(tmpdir, "Comp")
263
- fg_dir = os.path.join(tmpdir, "FG")
264
- matte_dir = os.path.join(tmpdir, "Matte")
265
- processed_dir = os.path.join(tmpdir, "Processed")
266
- for d in [comp_dir, fg_dir, matte_dir, processed_dir]:
267
- os.makedirs(d, exist_ok=True)
268
 
269
- bg_lin = srgb_to_linear(create_checkerboard(w, h))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
- cap = cv2.VideoCapture(video_path)
272
- frame_times = []
273
- total_start = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
- for i in range(frames_to_process):
276
- t0 = time.time()
277
- ret, frame_bgr = cap.read()
278
- if not ret:
279
- break
280
 
281
- frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
282
- frame_f32 = frame_rgb.astype(np.float32) / 255.0
283
-
284
- # Coarse mask
285
- if mask_mode == "Fast (classical)":
286
- mask, _ = fast_greenscreen_mask(frame_f32)
287
- if mask is None:
288
- raise gr.Error("Fast mask failed: no green screen detected. Try 'AI (BiRefNet)' mode.")
289
- elif mask_mode == "Hybrid (auto)":
290
- mask, conf = fast_greenscreen_mask(frame_f32)
291
- if mask is None or conf < 0.7:
292
- mask = birefnet_frame(birefnet, frame_rgb)
293
- else:
294
- mask = birefnet_frame(birefnet, frame_rgb)
295
-
296
- # CorridorKey inference
297
- result = corridorkey_frame(corridorkey, frame_f32, mask, img_size,
298
- despill_strength=despill_strength,
299
- auto_despeckle=auto_despeckle,
300
- despeckle_size=int(despeckle_size))
301
- alpha = result["alpha"]
302
- fg = result["fg"]
303
-
304
- # Ensure alpha is [H,W,1] and get 2D version
305
- if alpha.ndim == 2:
306
- alpha = alpha[:, :, np.newaxis]
307
- alpha_2d = alpha[:, :, 0]
308
-
309
- # -- Comp: composite on checkerboard (sRGB PNG) --
310
- fg_lin = srgb_to_linear(fg)
311
- comp = linear_to_srgb(composite_straight(fg_lin, bg_lin, alpha))
312
- cv2.imwrite(os.path.join(comp_dir, f"{i:05d}.png"),
313
- (np.clip(comp, 0, 1) * 255).astype(np.uint8)[:, :, ::-1])
314
-
315
- # -- FG: straight foreground, 100% opaque (sRGB PNG) --
316
- cv2.imwrite(os.path.join(fg_dir, f"{i:05d}.png"),
317
- (np.clip(fg, 0, 1) * 255).astype(np.uint8)[:, :, ::-1])
318
-
319
- # -- Matte: alpha channel (grayscale PNG) --
320
- cv2.imwrite(os.path.join(matte_dir, f"{i:05d}.png"),
321
- (np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8))
322
-
323
- # -- Processed: premultiplied RGBA (PNG with transparency) --
324
- fg_premul_lin = premultiply(fg_lin, alpha)
325
- fg_premul_srgb = linear_to_srgb(fg_premul_lin)
326
- fg_premul_u8 = (np.clip(fg_premul_srgb, 0, 1) * 255).astype(np.uint8)
327
- alpha_u8 = (np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8)
328
- rgba = np.concatenate([fg_premul_u8[:, :, ::-1], alpha_u8[:, :, np.newaxis]], axis=-1)
329
- cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba)
330
-
331
- # Progress with ETA
332
- elapsed = time.time() - t0
333
- frame_times.append(elapsed)
334
- avg_t = np.mean(frame_times[-5:]) if len(frame_times) >= 2 else elapsed
335
- remaining = (frames_to_process - i - 1) * avg_t
336
- eta = f"{remaining/60:.1f}min" if remaining > 60 else f"{remaining:.0f}s"
337
- pct = 0.05 + 0.85 * (i + 1) / frames_to_process
338
- progress(pct, desc=f"Frame {i+1}/{frames_to_process} ({elapsed:.1f}s) | ~{eta} left")
339
-
340
- cap.release()
341
- total_elapsed = time.time() - total_start
342
- total_min = total_elapsed / 60
343
-
344
- # Stitch preview videos
345
- progress(0.92, desc="Stitching videos...")
 
 
 
 
 
 
 
 
 
346
  comp_video = os.path.join(tmpdir, "comp_preview.mp4")
347
  matte_video = os.path.join(tmpdir, "matte_preview.mp4")
348
- _stitch_ffmpeg(comp_dir, comp_video, fps, extra_args=["-crf", "18"])
349
- _stitch_ffmpeg(matte_dir, matte_video, fps, extra_args=["-crf", "18"])
 
350
 
351
- # Package full professional ZIP
 
352
  progress(0.96, desc="Packaging ZIP...")
353
  zip_path = os.path.join(tmpdir, "CorridorKey_Output.zip")
354
  with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_STORED) as zf:
355
  for folder in ["Comp", "FG", "Matte", "Processed"]:
356
  src = os.path.join(tmpdir, folder)
357
- for f in sorted(os.listdir(src)):
358
- zf.write(os.path.join(src, f), f"Output/{folder}/{f}")
 
359
 
360
  progress(1.0, desc="Done!")
 
361
  n = len(frame_times)
362
  avg = np.mean(frame_times) if frame_times else 0
363
- status = f"Processed {n} frames in {total_min:.1f}min ({w}x{h}) at {img_size}px | {avg:.1f}s/frame"
 
 
 
364
 
365
  return (
366
  comp_video if os.path.exists(comp_video) else None,
@@ -372,8 +878,8 @@ def process_video(video_path, resolution, despill_val, mask_mode,
372
  except gr.Error:
373
  raise
374
  except Exception as e:
375
- logger.exception("Processing failed")
376
- raise gr.Error(f"Processing failed: {e}")
377
  finally:
378
  for d in ["Comp", "FG", "Matte", "Processed"]:
379
  p = os.path.join(tmpdir, d)
@@ -388,10 +894,9 @@ def process_video(video_path, resolution, despill_val, mask_mode,
388
  def process_example(video_path, resolution, despill, mask_mode, despeckle, despeckle_size):
389
  return process_video(video_path, resolution, despill, mask_mode, despeckle, despeckle_size)
390
 
391
- if HAS_CUDA:
392
- DESCRIPTION = "# CorridorKey Green Screen Matting\nRemove green backgrounds from video. Based on [CorridorKey](https://www.youtube.com/watch?v=3Ploi723hg4) by Corridor Digital. GPU mode: max {max_dur}s / {max_frames} frames.".format(max_dur=MAX_DURATION_GPU, max_frames=MAX_FRAMES)
393
- else:
394
- DESCRIPTION = "# CorridorKey Green Screen Matting\nRemove green backgrounds from video. Based on [CorridorKey](https://www.youtube.com/watch?v=3Ploi723hg4) by Corridor Digital. ~37min for 5s clip on free CPU."
395
 
396
  with gr.Blocks(title="CorridorKey") as demo:
397
  gr.Markdown(DESCRIPTION)
@@ -403,7 +908,7 @@ with gr.Blocks(title="CorridorKey") as demo:
403
  resolution = gr.Radio(
404
  choices=["1024", "2048"], value="1024",
405
  label="Processing Resolution",
406
- info="1024 = balanced (~8s/frame CPU), 2048 = max quality (fast on GPU)"
407
  )
408
  mask_mode = gr.Radio(
409
  choices=["Hybrid (auto)", "AI (BiRefNet)", "Fast (classical)"],
 
1
  """CorridorKey Green Screen Matting - HuggingFace Space.
2
 
3
+ Self-contained Gradio app with dual inference paths:
4
+ - GPU (ZeroGPU H200): PyTorch batched inference via GreenFormer
5
+ - CPU (fallback): ONNX Runtime sequential inference
6
 
7
  Usage:
8
  python app.py # Launch Gradio UI
 
11
 
12
  import os
13
  import sys
14
+ import math
15
  import shutil
16
  import gc
17
  import time
 
30
  import gradio as gr
31
  import onnxruntime as ort
32
 
33
+ try:
34
+ import spaces
35
+ HAS_SPACES = True
36
+ except ImportError:
37
+ HAS_SPACES = False
38
+
39
  # Workaround: Gradio cache_examples bug with None outputs.
40
  _original_read_from_flag = gr.components.Component.read_from_flag
41
  def _patched_read_from_flag(self, payload):
 
60
  "1024": os.path.join(MODELS_DIR, "corridorkey_1024.onnx"),
61
  "2048": os.path.join(MODELS_DIR, "corridorkey_2048.onnx"),
62
  }
63
+ CORRIDORKEY_PTH_REPO = "nikopueringer/CorridorKey_v1.0"
64
+ CORRIDORKEY_PTH_FILE = "CorridorKey_v1.0.pth"
65
  IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(1, 1, 3)
66
  IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(1, 1, 3)
67
  MAX_DURATION_CPU = 5
68
+ MAX_DURATION_GPU = 60
69
+ MAX_FRAMES = 1800
70
  HAS_CUDA = "CUDAExecutionProvider" in ort.get_available_providers()
71
 
72
# ---------------------------------------------------------------------------
# Preload model files at startup (OUTSIDE GPU function — don't waste GPU time on downloads)
# ---------------------------------------------------------------------------
# Both downloads are best-effort: on failure we keep None and the loaders
# (get_birefnet / _load_greenformer) retry hf_hub_download lazily.
logger.info("Preloading model files at startup...")
_preloaded_birefnet_path = None   # local cache path of the BiRefNet ONNX file, or None
_preloaded_pth_path = None        # local cache path of the CorridorKey .pth checkpoint, or None
try:
    _preloaded_birefnet_path = hf_hub_download(repo_id=BIREFNET_REPO, filename=BIREFNET_FILE)
    logger.info("BiRefNet cached: %s", _preloaded_birefnet_path)
except Exception as e:
    # Network/auth hiccups at startup must not kill the Space; retried on first use.
    logger.warning("BiRefNet preload failed (will retry later): %s", e)
try:
    _preloaded_pth_path = hf_hub_download(repo_id=CORRIDORKEY_PTH_REPO, filename=CORRIDORKEY_PTH_FILE)
    logger.info("CorridorKey.pth cached: %s", _preloaded_pth_path)
except Exception as e:
    logger.warning("CorridorKey.pth preload failed (will retry later): %s", e)

# Batch sizes for GPU inference (conservative for H200 80GB)
GPU_BATCH_SIZES = {"1024": 32, "2048": 16}  # 2048 uses only 5.7GB/batch=2, so 16 easily fits in 69.8GB
91
+
92
  # ---------------------------------------------------------------------------
93
  # Color utilities (numpy-only)
94
  # ---------------------------------------------------------------------------
 
164
  return mask_f32, confidence
165
 
166
  # ---------------------------------------------------------------------------
167
+ # ONNX model loading (CPU fallback + BiRefNet)
168
  # ---------------------------------------------------------------------------
169
  _birefnet_session = None
170
  _corridorkey_sessions = {}
171
+ _sessions_on_gpu = False
172
+
173
def _get_providers():
    """Return the preferred ONNX Runtime provider list for this process.

    Inside a @spaces.GPU call CUDA becomes visible, so prefer it (with a
    CPU fallback entry); otherwise run purely on CPU.
    """
    cuda_available = "CUDAExecutionProvider" in ort.get_available_providers()
    if not cuda_available:
        return ["CPUExecutionProvider"]
    return ["CUDAExecutionProvider", "CPUExecutionProvider"]
179
 
180
def _ort_opts():
    """Build ONNX Runtime session options tuned for the current host.

    When CUDA is available the thread counts are left at 0 so ORT picks
    its own defaults; on CPU-only hosts threads are capped to fit the
    small free-tier machines.
    """
    cuda_present = "CUDAExecutionProvider" in ort.get_available_providers()
    opts = ort.SessionOptions()
    # 0 = let ORT decide; small fixed counts on shared CPU hosts.
    opts.intra_op_num_threads = 0 if cuda_present else 2
    opts.inter_op_num_threads = 0 if cuda_present else 1
    opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    opts.enable_mem_pattern = True
    return opts
192
 
193
def _ensure_gpu_sessions():
    """Invalidate cached CPU ONNX sessions once CUDA appears (ZeroGPU attach).

    ZeroGPU only grants CUDA inside @spaces.GPU calls; any session built
    before that is CPU-bound. Dropping the caches here lets the getter
    functions rebuild with the GPU providers on next use.
    """
    global _birefnet_session, _corridorkey_sessions, _sessions_on_gpu
    if _sessions_on_gpu:
        return  # already migrated once; nothing to do
    if "CUDAExecutionProvider" not in ort.get_available_providers():
        return  # still CPU-only; keep existing sessions
    logger.info("CUDA available! Reloading ONNX sessions on GPU...")
    _birefnet_session = None
    _corridorkey_sessions = {}
    _sessions_on_gpu = True
202
+
203
def get_birefnet(force_cpu=False):
    """Return a cached BiRefNet ONNX ``InferenceSession``.

    Args:
        force_cpu: when True, return a session pinned to the CPU provider
            regardless of CUDA availability.

    Fix vs. previous version: ``force_cpu=True`` used to rebuild a new
    session on *every* call AND overwrite the shared cache, so a later
    ``force_cpu=False`` call silently received a CPU-bound session. The
    forced-CPU session is now cached separately (on the function object),
    and the download fallback only runs when a session is actually built.
    """
    global _birefnet_session
    if force_cpu:
        session = getattr(get_birefnet, "_cpu_session", None)
        if session is None:
            path = _preloaded_birefnet_path or hf_hub_download(repo_id=BIREFNET_REPO, filename=BIREFNET_FILE)
            logger.info("Loading BiRefNet ONNX: %s (providers: %s)", path, ["CPUExecutionProvider"])
            opts = _ort_opts()
            # Conservative thread counts for the CPU-only fallback session.
            opts.intra_op_num_threads = 2
            opts.inter_op_num_threads = 1
            session = ort.InferenceSession(path, opts, providers=["CPUExecutionProvider"])
            get_birefnet._cpu_session = session
        return session
    if _birefnet_session is None:
        path = _preloaded_birefnet_path or hf_hub_download(repo_id=BIREFNET_REPO, filename=BIREFNET_FILE)
        providers = _get_providers()
        logger.info("Loading BiRefNet ONNX: %s (providers: %s)", path, providers)
        _birefnet_session = ort.InferenceSession(path, _ort_opts(), providers=providers)
    return _birefnet_session
215
 
216
def get_corridorkey_onnx(resolution="1024"):
    """Return the CorridorKey ONNX session for *resolution*, creating it lazily.

    Sessions are cached per resolution key ("1024" / "2048").

    Raises:
        gr.Error: when no exported ONNX file exists for *resolution*.
    """
    global _corridorkey_sessions
    session = _corridorkey_sessions.get(resolution)
    if session is None:
        onnx_path = CORRIDORKEY_MODELS.get(resolution)
        if not (onnx_path and os.path.exists(onnx_path)):
            raise gr.Error(f"CorridorKey ONNX model for {resolution} not found.")
        providers = _get_providers()
        logger.info("Loading CorridorKey ONNX (%s): %s (providers: %s)", resolution, onnx_path, providers)
        session = ort.InferenceSession(onnx_path, _ort_opts(), providers=providers)
        _corridorkey_sessions[resolution] = session
    return session
226
 
227
  # ---------------------------------------------------------------------------
228
+ # PyTorch model loading (GPU path)
229
+ # ---------------------------------------------------------------------------
230
+ _pytorch_model = None
231
+ _pytorch_model_size = None
232
+
233
def _load_greenformer(img_size):
    """Load the GreenFormer PyTorch model for GPU inference.

    Downloads (or reuses the preloaded) .pth checkpoint, adapts its state
    dict to the current img_size, and returns the model on CUDA in FP16.

    Args:
        img_size: square model input resolution (1024 or 2048).

    Returns:
        GreenFormer module in eval mode on CUDA, FP16 weights.
    """
    import torch
    import torch.nn.functional as F
    from CorridorKeyModule.core.model_transformer import GreenFormer

    # Prefer the path cached at startup; fall back to a fresh download.
    checkpoint_path = _preloaded_pth_path or hf_hub_download(repo_id=CORRIDORKEY_PTH_REPO, filename=CORRIDORKEY_PTH_FILE)
    logger.info("Using checkpoint: %s", checkpoint_path)

    logger.info("Initializing GreenFormer (img_size=%d)...", img_size)
    model = GreenFormer(
        encoder_name="hiera_base_plus_224.mae_in1k_ft_in1k",
        img_size=img_size,
        use_refiner=True,
    )

    # Load weights. weights_only=True avoids arbitrary-code pickle execution.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    state_dict = checkpoint.get("state_dict", checkpoint)

    # Fix compiled model prefix & handle PosEmbed mismatch
    new_state_dict = {}
    model_state = model.state_dict()
    for k, v in state_dict.items():
        if k.startswith("_orig_mod."):
            # Checkpoint was saved from a torch.compile()d model; strip the
            # wrapper prefix (len("_orig_mod.") == 10).
            k = k[10:]
        if "pos_embed" in k and k in model_state:
            if v.shape != model_state[k].shape:
                # Positional embeddings were trained at a different grid size;
                # bicubically resample [1, N, C] -> [1, N', C] via a square image.
                logger.info("Resizing %s from %s to %s", k, v.shape, model_state[k].shape)
                N_src = v.shape[1]
                C = v.shape[2]
                # assumes token counts are perfect squares (square patch grid) — TODO confirm
                grid_src = int(math.sqrt(N_src))
                grid_dst = int(math.sqrt(model_state[k].shape[1]))
                v_img = v.permute(0, 2, 1).view(1, C, grid_src, grid_src)
                v_resized = F.interpolate(v_img, size=(grid_dst, grid_dst), mode="bicubic", align_corners=False)
                v = v_resized.flatten(2).transpose(1, 2)
        new_state_dict[k] = v

    # strict=False tolerates refiner/aux keys absent from the checkpoint.
    missing, unexpected = model.load_state_dict(new_state_dict, strict=False)
    if missing:
        logger.warning("Missing keys: %s", missing)
    if unexpected:
        logger.warning("Unexpected keys: %s", unexpected)

    model.eval()
    model = model.cuda().half()  # FP16 for speed on H200

    logger.info("Model loaded as FP16")
    # Log which attention backend will be used (informational only).
    try:
        import flash_attn
        logger.info("flash-attn v%s installed (prebuilt wheel)", getattr(flash_attn, '__version__', '?'))
    except ImportError:
        logger.info("flash-attn not available (using PyTorch SDPA)")
    logger.info("SDPA backends: flash=%s, mem_efficient=%s, math=%s",
                torch.backends.cuda.flash_sdp_enabled(),
                torch.backends.cuda.mem_efficient_sdp_enabled(),
                torch.backends.cuda.math_sdp_enabled())

    # Skip torch.compile on ZeroGPU — the 37s warmup eats too much of the 120s budget.
    if not HAS_SPACES and sys.platform in ("linux", "win32"):
        try:
            compiled = torch.compile(model)
            # One dummy forward to trigger compilation up front, outside the
            # timed inference path. Input is [B, 4, H, W]: RGB + mask channel.
            dummy = torch.zeros(1, 4, img_size, img_size, dtype=torch.float16, device="cuda")
            with torch.inference_mode():
                compiled(dummy)
            model = compiled
            logger.info("torch.compile() succeeded")
        except Exception as e:
            # Compilation is a best-effort optimization; eager mode is fine.
            logger.warning("torch.compile() failed, using eager mode: %s", e)
            torch.cuda.empty_cache()
    else:
        logger.info("Skipping torch.compile() (ZeroGPU: saving GPU time for inference)")

    logger.info("GreenFormer loaded on CUDA (img_size=%d)", img_size)
    return model
308
+
309
+
310
def get_pytorch_model(img_size):
    """Return the cached GreenFormer for *img_size*, (re)loading on a miss.

    When the requested resolution differs from the cached one, the old
    model is freed first so its VRAM is returned before the new load.
    """
    global _pytorch_model, _pytorch_model_size
    cache_hit = _pytorch_model is not None and _pytorch_model_size == img_size
    if not cache_hit:
        if _pytorch_model is not None:
            # Release the previous model's CUDA memory before loading anew.
            import torch
            del _pytorch_model
            _pytorch_model = None
            torch.cuda.empty_cache()
            gc.collect()
        _pytorch_model = _load_greenformer(img_size)
        _pytorch_model_size = img_size
    return _pytorch_model
324
+
325
+
326
+ # ---------------------------------------------------------------------------
327
+ # Per-frame inference: ONNX (CPU fallback)
328
  # ---------------------------------------------------------------------------
329
  def birefnet_frame(session, image_rgb_uint8):
330
  h, w = image_rgb_uint8.shape[:2]
 
335
  pred = 1.0 / (1.0 + np.exp(-session.run(None, {inp.name: img})[-1]))
336
  return (cv2.resize(pred[0, 0], (w, h)) > 0.04).astype(np.float32)
337
 
338
+ def corridorkey_frame_onnx(session, image_f32, mask_f32, img_size,
339
+ despill_strength=0.5, auto_despeckle=True, despeckle_size=400):
340
+ """ONNX inference for a single frame (CPU path)."""
341
  h, w = image_f32.shape[:2]
342
  img_r = cv2.resize(image_f32, (img_size, img_size))
343
  mask_r = cv2.resize(mask_f32, (img_size, img_size))[:, :, np.newaxis]
 
353
  fg = despill(fg, green_limit_mode="average", strength=despill_strength)
354
  return {"alpha": alpha, "fg": fg}
355
 
356
+
357
+ # ---------------------------------------------------------------------------
358
+ # Batched inference: PyTorch (GPU path)
359
+ # ---------------------------------------------------------------------------
360
def corridorkey_batch_pytorch(model, images_f32, masks_f32, img_size,
                              despill_strength=0.5, auto_despeckle=True, despeckle_size=400):
    """PyTorch batched inference for multiple frames on GPU.

    Args:
        model: GreenFormer model on CUDA (FP16 — see _load_greenformer)
        images_f32: list of [H, W, 3] float32 numpy arrays (0-1, sRGB)
        masks_f32: list of [H, W] float32 numpy arrays (0-1)
        img_size: model input resolution (1024 or 2048)
        despill_strength: 0-1 strength passed to despill()
        auto_despeckle: when True, clean_matte() removes small alpha islands
        despeckle_size: minimum connected-component area kept by clean_matte

    Returns:
        list of dicts with 'alpha' [H,W,1] and 'fg' [H,W,3], one per input,
        in the same order, resized back to each frame's original size.
    """
    import torch

    batch_size = len(images_f32)
    if batch_size == 0:
        return []

    # Store original sizes per frame (frames may, in principle, differ).
    orig_sizes = [(img.shape[1], img.shape[0]) for img in images_f32]  # (w, h)

    # Preprocess: resize, normalize, concatenate into batch tensor.
    # Input layout per frame is [4, H, W]: ImageNet-normalized RGB + raw mask.
    batch_inputs = []
    for img, mask in zip(images_f32, masks_f32):
        img_r = cv2.resize(img, (img_size, img_size))
        mask_r = cv2.resize(mask, (img_size, img_size))[:, :, np.newaxis]
        inp = np.concatenate([(img_r - IMAGENET_MEAN) / IMAGENET_STD, mask_r], axis=-1)
        batch_inputs.append(inp.transpose(2, 0, 1))  # HWC -> CHW: [4, H, W]

    batch_np = np.stack(batch_inputs, axis=0).astype(np.float32)  # [B, 4, H, W]
    batch_tensor = torch.from_numpy(batch_np).cuda().half()  # FP16 input

    # Forward pass — model is FP16, input is FP16, no autocast needed
    with torch.inference_mode():
        out = model(batch_tensor)

    # Extract results; upcast to float32 before the CPU round-trip.
    alphas_gpu = out["alpha"].float().cpu().numpy()  # [B, 1, H, W]
    fgs_gpu = out["fg"].float().cpu().numpy()        # [B, 3, H, W]

    del batch_tensor
    # Don't empty cache per batch - too expensive. Let PyTorch manage.

    # Postprocess each frame: resize back, optional despeckle, despill.
    results = []
    for i in range(batch_size):
        w, h = orig_sizes[i]
        # CHW -> HWC for cv2; Lanczos keeps matte edges sharp on upscale.
        alpha = cv2.resize(alphas_gpu[i].transpose(1, 2, 0), (w, h), interpolation=cv2.INTER_LANCZOS4)
        fg = cv2.resize(fgs_gpu[i].transpose(1, 2, 0), (w, h), interpolation=cv2.INTER_LANCZOS4)
        if alpha.ndim == 2:
            # cv2.resize drops a trailing singleton channel; restore [H, W, 1].
            alpha = alpha[:, :, np.newaxis]
        if auto_despeckle:
            alpha = clean_matte(alpha, area_threshold=despeckle_size, dilation=25, blur_size=5)
        fg = despill(fg, green_limit_mode="average", strength=despill_strength)
        results.append({"alpha": alpha, "fg": fg})

    return results
418
+
419
+
420
  # ---------------------------------------------------------------------------
421
  # Video stitching
422
  # ---------------------------------------------------------------------------
 
434
  logger.warning("ffmpeg failed: %s", e)
435
  return False
436
 
437
+
438
  # ---------------------------------------------------------------------------
439
+ # Output writing helper
440
  # ---------------------------------------------------------------------------
441
+ # Fastest PNG params: compression 1 (instead of default 3)
442
+ _PNG_FAST = [cv2.IMWRITE_PNG_COMPRESSION, 1]
443
+ # JPEG for opaque outputs (comp/fg) 10x faster than PNG at 4K
444
+ _JPG_QUALITY = [cv2.IMWRITE_JPEG_QUALITY, 95]
445
+
446
+
447
def _write_frame_fast(i, alpha, fg, w, h, bg_lin, comp_dir, matte_dir, fg_dir):
    """Fast write for one frame: Comp (JPEG) + FG (JPEG) + Matte (PNG).

    Skips the heavy premultiplied-RGBA PNG; used inside the GPU window
    where wall-clock time is budgeted. ``w``/``h`` are accepted for
    signature parity with the other writers — the arrays carry their own
    dimensions.
    """
    if alpha.ndim == 2:
        alpha = alpha[:, :, np.newaxis]

    def as_u8(img):
        # float [0, 1] -> uint8 [0, 255]
        return (np.clip(img, 0, 1) * 255).astype(np.uint8)

    stem = f"{i:05d}"
    fg_lin = srgb_to_linear(fg)
    # Composite over the checkerboard in linear light, preview in sRGB.
    comp = linear_to_srgb(composite_straight(fg_lin, bg_lin, alpha))
    # [:, :, ::-1] flips RGB -> BGR for OpenCV.
    cv2.imwrite(os.path.join(comp_dir, stem + ".jpg"), as_u8(comp)[:, :, ::-1], _JPG_QUALITY)
    cv2.imwrite(os.path.join(fg_dir, stem + ".jpg"), as_u8(fg)[:, :, ::-1], _JPG_QUALITY)
    cv2.imwrite(os.path.join(matte_dir, stem + ".png"), as_u8(alpha[:, :, 0]), _PNG_FAST)
460
+
461
+
462
def _write_frame_deferred(i, raw_path, w, h, bg_lin, fg_dir, processed_dir):
    """Deferred write: FG (JPEG) + Processed (premultiplied RGBA PNG).

    Runs after the GPU has been released. Reads one frame's raw arrays
    from the ``.npz`` at *raw_path* and deletes the file when done.

    Args:
        i: frame index, used for zero-padded filenames.
        raw_path: path to an ``.npz`` containing "alpha" and "fg" arrays.
        w, h, bg_lin: accepted for writer-signature parity; unused here.
        fg_dir, processed_dir: existing output directories.

    Fix: the npz handle is now closed (context manager) before
    ``os.remove`` — previously it was left open, leaking a file handle
    and making the deletion fail on Windows.
    """
    with np.load(raw_path) as d:
        alpha, fg = d["alpha"], d["fg"]
    if alpha.ndim == 2:
        alpha = alpha[:, :, np.newaxis]
    alpha_2d = alpha[:, :, 0]
    # Straight (un-premultiplied) foreground as JPEG; RGB -> BGR for OpenCV.
    cv2.imwrite(os.path.join(fg_dir, f"{i:05d}.jpg"),
                (np.clip(fg, 0, 1) * 255).astype(np.uint8)[:, :, ::-1], _JPG_QUALITY)
    # Premultiply in linear light, convert back to sRGB for the RGBA PNG.
    fg_lin = srgb_to_linear(fg)
    fg_premul = premultiply(fg_lin, alpha)
    fg_premul_srgb = linear_to_srgb(fg_premul)
    fg_u8 = (np.clip(fg_premul_srgb, 0, 1) * 255).astype(np.uint8)
    a_u8 = (np.clip(alpha_2d, 0, 1) * 255).astype(np.uint8)
    rgba = np.concatenate([fg_u8[:, :, ::-1], a_u8[:, :, np.newaxis]], axis=-1)
    cv2.imwrite(os.path.join(processed_dir, f"{i:05d}.png"), rgba, _PNG_FAST)
    os.remove(raw_path)  # cleanup: reclaim temp disk as we go
479
+
480
+
481
def _write_frame_outputs(i, alpha, fg, w, h, bg_lin, comp_dir, fg_dir, matte_dir, processed_dir):
    """Full write for one frame: Comp + FG (JPEG), Matte + Processed RGBA (PNG).

    Used by the CPU path, where there is no GPU-time budget to respect.
    ``w``/``h`` are accepted for signature parity; the arrays carry their
    own dimensions.
    """
    if alpha.ndim == 2:
        alpha = alpha[:, :, np.newaxis]
    alpha_2d = alpha[:, :, 0]
    stem = f"{i:05d}"

    def as_u8(arr):
        # float [0, 1] -> uint8 [0, 255]
        return (np.clip(arr, 0, 1) * 255).astype(np.uint8)

    # Composite over checkerboard in linear light, preview as sRGB JPEG.
    fg_lin = srgb_to_linear(fg)
    comp = linear_to_srgb(composite_straight(fg_lin, bg_lin, alpha))
    # [:, :, ::-1] flips RGB -> BGR for OpenCV.
    cv2.imwrite(os.path.join(comp_dir, stem + ".jpg"), as_u8(comp)[:, :, ::-1], _JPG_QUALITY)
    cv2.imwrite(os.path.join(fg_dir, stem + ".jpg"), as_u8(fg)[:, :, ::-1], _JPG_QUALITY)
    cv2.imwrite(os.path.join(matte_dir, stem + ".png"), as_u8(alpha_2d), _PNG_FAST)

    # Premultiplied RGBA PNG for compositing applications.
    fg_u8 = as_u8(linear_to_srgb(premultiply(fg_lin, alpha)))
    a_u8 = as_u8(alpha_2d)
    rgba = np.concatenate([fg_u8[:, :, ::-1], a_u8[:, :, np.newaxis]], axis=-1)
    cv2.imwrite(os.path.join(processed_dir, stem + ".png"), rgba, _PNG_FAST)
500
+
501
+
502
+ # ---------------------------------------------------------------------------
503
+ # Shared storage: GPU function stores results here instead of returning them.
504
+ # This avoids ZeroGPU serializing gigabytes of numpy arrays on return.
505
+ # ---------------------------------------------------------------------------
506
+ _shared_results = {"data": None}
507
+
508
+ # ---------------------------------------------------------------------------
509
+ # Main pipeline
510
+ # ---------------------------------------------------------------------------
511
def _gpu_decorator(fn):
    """Wrap *fn* with ``spaces.GPU(duration=120)`` when on a HF Space.

    Outside Spaces (the ``spaces`` package is absent) the function is
    returned unchanged so the CPU fallback path still works.
    """
    return spaces.GPU(duration=120)(fn) if HAS_SPACES else fn
515
+
516
+
517
+ @_gpu_decorator
518
+ def _gpu_phase(video_path, resolution, despill_val, mask_mode,
519
+ auto_despeckle, despeckle_size, progress=gr.Progress(),
520
+ precompute_dir=None, precompute_count=0):
521
+ """ALL GPU work: load models, read video, generate masks, run inference.
522
+ Returns raw numpy results in RAM. No disk I/O.
523
  """
524
  if video_path is None:
525
  raise gr.Error("Please upload a video.")
526
 
527
+ _ensure_gpu_sessions()
528
+
529
+ try:
530
+ import torch
531
+ has_torch_cuda = torch.cuda.is_available()
532
+ except ImportError:
533
+ has_torch_cuda = False
534
+ use_gpu = has_torch_cuda
535
+ logger.info("[GPU phase] CUDA=%s, mode=%s", has_torch_cuda,
536
+ "PyTorch batched" if use_gpu else "ONNX sequential")
537
+
538
  img_size = int(resolution)
539
+ max_dur = MAX_DURATION_GPU if use_gpu else MAX_DURATION_CPU
540
+ despill_strength = despill_val / 10.0
541
 
542
+ # Read video metadata
543
  cap = cv2.VideoCapture(video_path)
544
  fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
545
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 
548
  cap.release()
549
 
550
  if total_frames == 0:
551
+ raise gr.Error("Could not read video frames.")
552
  duration = total_frames / fps
553
  if duration > max_dur:
554
+ raise gr.Error(f"Video too long ({duration:.1f}s). Max {max_dur}s.")
 
555
  frames_to_process = min(total_frames, MAX_FRAMES)
 
 
556
 
557
+ # Load BiRefNet only if masks need it (skip if all precomputed)
558
+ birefnet = None
559
+ needs_birefnet = precompute_dir is None or precompute_count == 0
560
+ if not needs_birefnet and mask_mode != "Fast (classical)":
561
+ # Check if any frames need BiRefNet (missing mask files)
562
+ for i in range(min(frames_to_process, precompute_count)):
563
+ if not os.path.exists(os.path.join(precompute_dir, f"mask_{i:05d}.npy")):
564
+ needs_birefnet = True
565
+ break
566
+ if needs_birefnet:
567
+ progress(0.02, desc="Loading BiRefNet...")
568
+ birefnet = get_birefnet()
569
+ logger.info("BiRefNet loaded (needed for some frames)")
570
+ else:
571
+ logger.info("Skipping BiRefNet load (all masks precomputed)")
572
 
573
+ batch_size = GPU_BATCH_SIZES.get(resolution, 16) if use_gpu else 1
574
+ if use_gpu:
575
+ progress(0.05, desc=f"Loading GreenFormer ({resolution})...")
576
+ pytorch_model = get_pytorch_model(img_size)
577
+ else:
578
+ progress(0.05, desc=f"Loading CorridorKey ONNX ({resolution})...")
579
+ corridorkey_onnx = get_corridorkey_onnx(resolution)
580
+
581
+ logger.info("[GPU phase] %d frames (%dx%d @ %.1ffps), res=%d, mask=%s, batch=%d",
582
+ frames_to_process, w, h, fps, img_size, mask_mode, batch_size)
583
+
584
+ # Read all frames + generate masks + run inference
585
  tmpdir = tempfile.mkdtemp(prefix="ck_")
586
+ frame_times = []
587
+ total_start = time.time()
588
 
589
  try:
590
+ cap = cv2.VideoCapture(video_path)
 
 
 
 
 
 
591
 
592
+ if use_gpu:
593
+ import torch
594
+ vram_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
595
+ logger.info("VRAM: %.1f/%.1fGB",
596
+ torch.cuda.memory_allocated() / 1024**3, vram_total)
597
+
598
+ all_results = []
599
+ frame_idx = 0
600
+
601
+ # Load precomputed frames from disk (no serialization overhead)
602
+ use_precomputed = precompute_dir is not None and precompute_count > 0
603
+
604
+ while frame_idx < frames_to_process:
605
+ t_batch = time.time()
606
+
607
+ batch_images, batch_masks, batch_indices = [], [], []
608
+ t_mask = 0
609
+ fast_n, biref_n = 0, 0
610
+
611
+ for _ in range(batch_size):
612
+ if frame_idx >= frames_to_process:
613
+ break
614
+
615
+ if use_precomputed:
616
+ frame_f32 = np.load(os.path.join(precompute_dir, f"frame_{frame_idx:05d}.npy"))
617
+ mask_path = os.path.join(precompute_dir, f"mask_{frame_idx:05d}.npy")
618
+ if os.path.exists(mask_path):
619
+ mask = np.load(mask_path)
620
+ fast_n += 1
621
+ else:
622
+ # BiRefNet fallback — load original RGB, run on GPU
623
+ rgb_path = os.path.join(precompute_dir, f"rgb_{frame_idx:05d}.npy")
624
+ frame_rgb = np.load(rgb_path)
625
+ tm = time.time()
626
+ mask = birefnet_frame(birefnet, frame_rgb)
627
+ t_mask += time.time() - tm
628
+ biref_n += 1
629
+ else:
630
+ ret, frame_bgr = cap.read()
631
+ if not ret:
632
+ break
633
+ frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
634
+ frame_f32 = frame_rgb.astype(np.float32) / 255.0
635
+ tm = time.time()
636
+ if mask_mode == "Fast (classical)":
637
+ mask, _ = fast_greenscreen_mask(frame_f32)
638
+ fast_n += 1
639
+ elif mask_mode == "Hybrid (auto)":
640
+ mask, conf = fast_greenscreen_mask(frame_f32)
641
+ if mask is None or conf < 0.7:
642
+ mask = birefnet_frame(birefnet, frame_rgb)
643
+ biref_n += 1
644
+ else:
645
+ fast_n += 1
646
+ else:
647
+ mask = birefnet_frame(birefnet, frame_rgb)
648
+ biref_n += 1
649
+ t_mask += time.time() - tm
650
+
651
+ batch_images.append(frame_f32)
652
+ batch_masks.append(mask)
653
+ batch_indices.append(frame_idx)
654
+ frame_idx += 1
655
+
656
+ if not batch_images:
657
+ break
658
+
659
+ # Batched GPU inference
660
+ t_inf = time.time()
661
+ results = corridorkey_batch_pytorch(
662
+ pytorch_model, batch_images, batch_masks, img_size,
663
+ despill_strength=despill_strength,
664
+ auto_despeckle=auto_despeckle,
665
+ despeckle_size=int(despeckle_size),
666
+ )
667
+ t_inf = time.time() - t_inf
668
+
669
+ for j, result in enumerate(results):
670
+ all_results.append((batch_indices[j], result["alpha"], result["fg"]))
671
+
672
+ n = len(batch_images)
673
+ elapsed = time.time() - t_batch
674
+ vram_peak = torch.cuda.max_memory_allocated() / 1024**3
675
+ logger.info("Batch %d: mask=%.1fs(fast=%d,biref=%d) infer=%.1fs total=%.1fs(%.2fs/fr) VRAM=%.1fGB",
676
+ n, t_mask, fast_n, biref_n, t_inf, elapsed, elapsed/n, vram_peak)
677
+
678
+ per_frame = elapsed / n
679
+ frame_times.extend([per_frame] * n)
680
+ remaining = (frames_to_process - frame_idx) * (np.mean(frame_times[-20:]) if len(frame_times) > 1 else per_frame)
681
+ progress(0.10 + 0.75 * frame_idx / frames_to_process,
682
+ desc=f"Frame {frame_idx}/{frames_to_process} ({per_frame:.2f}s/fr) ~{remaining:.0f}s left")
683
+
684
+ cap.release()
685
+ gpu_elapsed = time.time() - total_start
686
+ logger.info("[GPU phase] done: %d frames in %.1fs (%.2fs/fr)",
687
+ len(all_results), gpu_elapsed, gpu_elapsed / max(len(all_results), 1))
688
+
689
+ # FAST WRITE inside GPU: only comp (JPEG) + matte (PNG) + raw numpy.
690
+ # FG + Processed written AFTER GPU release (deferred).
691
+ from concurrent.futures import ThreadPoolExecutor
692
+ bg_lin = srgb_to_linear(create_checkerboard(w, h))
693
+ comp_dir = os.path.join(tmpdir, "Comp")
694
+ matte_dir = os.path.join(tmpdir, "Matte")
695
+ fg_dir = os.path.join(tmpdir, "FG")
696
+ processed_dir = os.path.join(tmpdir, "Processed")
697
+ for d in [comp_dir, fg_dir, matte_dir, processed_dir]:
698
+ os.makedirs(d, exist_ok=True)
699
+
700
+ t_write = time.time()
701
+ progress(0.86, desc="Writing preview frames...")
702
+ with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as pool:
703
+ futs = [pool.submit(_write_frame_fast, idx, alpha, fg, w, h, bg_lin,
704
+ comp_dir, matte_dir, fg_dir)
705
+ for idx, alpha, fg in all_results]
706
+ for f in futs:
707
+ f.result()
708
+ del all_results
709
+ gc.collect()
710
+ logger.info("[GPU phase] Fast write in %.1fs", time.time() - t_write)
711
+
712
+ return {
713
+ "results": "written", "frame_times": frame_times,
714
+ "use_gpu": True, "batch_size": batch_size,
715
+ "w": w, "h": h, "fps": fps, "tmpdir": tmpdir,
716
+ }
717
+
718
+ else:
719
+ # CPU PATH: sequential ONNX + inline writes (no GPU budget concern)
720
+ bg_lin = srgb_to_linear(create_checkerboard(w, h))
721
+ comp_dir, fg_dir = os.path.join(tmpdir, "Comp"), os.path.join(tmpdir, "FG")
722
+ matte_dir, processed_dir = os.path.join(tmpdir, "Matte"), os.path.join(tmpdir, "Processed")
723
+ for d in [comp_dir, fg_dir, matte_dir, processed_dir]:
724
+ os.makedirs(d, exist_ok=True)
725
+
726
+ for i in range(frames_to_process):
727
+ t0 = time.time()
728
+ ret, frame_bgr = cap.read()
729
+ if not ret:
730
+ break
731
+ frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
732
+ frame_f32 = frame_rgb.astype(np.float32) / 255.0
733
+
734
+ if mask_mode == "Fast (classical)":
735
+ mask, _ = fast_greenscreen_mask(frame_f32)
736
+ if mask is None:
737
+ raise gr.Error("Fast mask failed. Try 'AI (BiRefNet)' mode.")
738
+ elif mask_mode == "Hybrid (auto)":
739
+ mask, conf = fast_greenscreen_mask(frame_f32)
740
+ if mask is None or conf < 0.7:
741
+ mask = birefnet_frame(birefnet, frame_rgb)
742
+ else:
743
+ mask = birefnet_frame(birefnet, frame_rgb)
744
 
745
+ result = corridorkey_frame_onnx(corridorkey_onnx, frame_f32, mask, img_size,
746
+ despill_strength=despill_strength,
747
+ auto_despeckle=auto_despeckle,
748
+ despeckle_size=int(despeckle_size))
749
+ _write_frame_outputs(i, result["alpha"], result["fg"],
750
+ w, h, bg_lin, comp_dir, fg_dir, matte_dir, processed_dir)
751
+
752
+ elapsed = time.time() - t0
753
+ frame_times.append(elapsed)
754
+ remaining = (frames_to_process - i - 1) * (np.mean(frame_times[-5:]) if len(frame_times) > 1 else elapsed)
755
+ progress(0.10 + 0.80 * (i+1) / frames_to_process,
756
+ desc=f"Frame {i+1}/{frames_to_process} ({elapsed:.1f}s) ~{remaining:.0f}s left")
757
+
758
+ cap.release()
759
+ return {
760
+ "results": None, "frame_times": frame_times,
761
+ "use_gpu": False, "batch_size": 1,
762
+ "w": w, "h": h, "fps": fps, "tmpdir": tmpdir,
763
+ }
764
 
765
+ except gr.Error:
766
+ raise
767
+ except Exception as e:
768
+ logger.exception("Inference failed")
769
+ raise gr.Error(f"Inference failed: {e}")
770
 
771
+
772
+ def process_video(video_path, resolution, despill_val, mask_mode,
773
+ auto_despeckle, despeckle_size, progress=gr.Progress()):
774
+ """Orchestrator: precompute fast masks (CPU) → GPU inference → CPU I/O."""
775
+ if video_path is None:
776
+ raise gr.Error("Please upload a video.")
777
+
778
+ # Phase 0: Precompute fast masks on CPU and save to disk.
779
+ # IMPORTANT: Can't pass large data as args to @spaces.GPU (ZeroGPU serializes args).
780
+ # Save to a numpy file, pass only the path.
781
+ logger.info("[Phase 0] Precomputing fast masks on CPU")
782
+ t_mask = time.time()
783
+ precompute_dir = tempfile.mkdtemp(prefix="ck_pre_")
784
+ cap = cv2.VideoCapture(video_path)
785
+ frame_count = 0
786
+ needs_birefnet = False
787
+ while True:
788
+ ret, frame_bgr = cap.read()
789
+ if not ret:
790
+ break
791
+ frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
792
+ frame_f32 = frame_rgb.astype(np.float32) / 255.0
793
+ if mask_mode == "Fast (classical)":
794
+ mask, _ = fast_greenscreen_mask(frame_f32)
795
+ if mask is None:
796
+ raise gr.Error("Fast mask failed. Try 'Hybrid' or 'AI' mode.")
797
+ elif mask_mode == "Hybrid (auto)":
798
+ mask, conf = fast_greenscreen_mask(frame_f32)
799
+ if mask is None or conf < 0.7:
800
+ mask = None
801
+ needs_birefnet = True
802
+ else:
803
+ mask = None
804
+ needs_birefnet = True
805
+ # Save as compressed numpy (fast to load, no serialization overhead)
806
+ np.save(os.path.join(precompute_dir, f"frame_{frame_count:05d}.npy"), frame_f32)
807
+ if mask is not None:
808
+ np.save(os.path.join(precompute_dir, f"mask_{frame_count:05d}.npy"), mask)
809
+ if mask is None:
810
+ np.save(os.path.join(precompute_dir, f"rgb_{frame_count:05d}.npy"), frame_rgb)
811
+ frame_count += 1
812
+ cap.release()
813
+ logger.info("[Phase 0] %d frames saved to %s in %.1fs (needs_birefnet=%s)",
814
+ frame_count, precompute_dir, time.time() - t_mask, needs_birefnet)
815
+
816
+ # Phase 1: GPU inference pass only paths (tiny strings), not data
817
+ logger.info("[Phase 1] Starting GPU phase")
818
+ t0 = time.time()
819
+ data = _gpu_phase(video_path, resolution, despill_val, mask_mode,
820
+ auto_despeckle, despeckle_size, progress,
821
+ precompute_dir=precompute_dir, precompute_count=frame_count)
822
+ logger.info("[process_video] GPU phase done in %.1fs", time.time() - t0)
823
+
824
+ tmpdir = data["tmpdir"]
825
+ w, h, fps = data["w"], data["h"], data["fps"]
826
+ frame_times = data["frame_times"]
827
+ use_gpu = data["use_gpu"]
828
+ batch_size = data["batch_size"]
829
+
830
+ comp_dir = os.path.join(tmpdir, "Comp")
831
+ fg_dir = os.path.join(tmpdir, "FG")
832
+ matte_dir = os.path.join(tmpdir, "Matte")
833
+ processed_dir = os.path.join(tmpdir, "Processed")
834
+ for d in [comp_dir, fg_dir, matte_dir, processed_dir]:
835
+ os.makedirs(d, exist_ok=True)
836
+
837
+ try:
838
+ from concurrent.futures import ThreadPoolExecutor
839
+
840
+ logger.info("[Phase 2] Frames written by GPU/CPU phase (comp+fg+matte)")
841
+
842
+ # Phase 3: stitch videos from written frames
843
+ logger.info("[Phase 3] Stitching videos")
844
+ progress(0.93, desc="Stitching videos...")
845
  comp_video = os.path.join(tmpdir, "comp_preview.mp4")
846
  matte_video = os.path.join(tmpdir, "matte_preview.mp4")
847
+ # Comp uses JPEG, Matte uses PNG
848
+ _stitch_ffmpeg(comp_dir, comp_video, fps, pattern="%05d.jpg", extra_args=["-crf", "18"])
849
+ _stitch_ffmpeg(matte_dir, matte_video, fps, pattern="%05d.png", extra_args=["-crf", "18"])
850
 
851
+ # Phase 4: ZIP (no GPU)
852
+ logger.info("[Phase 4] Packaging ZIP")
853
  progress(0.96, desc="Packaging ZIP...")
854
  zip_path = os.path.join(tmpdir, "CorridorKey_Output.zip")
855
  with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_STORED) as zf:
856
  for folder in ["Comp", "FG", "Matte", "Processed"]:
857
  src = os.path.join(tmpdir, folder)
858
+ if os.path.isdir(src):
859
+ for f in sorted(os.listdir(src)):
860
+ zf.write(os.path.join(src, f), f"Output/{folder}/{f}")
861
 
862
  progress(1.0, desc="Done!")
863
+ total_elapsed = sum(frame_times) if frame_times else 0
864
  n = len(frame_times)
865
  avg = np.mean(frame_times) if frame_times else 0
866
+ engine = "PyTorch GPU" if use_gpu else "ONNX CPU"
867
+ status = (f"Processed {n} frames ({w}x{h}) at {resolution}px | "
868
+ f"{avg:.2f}s/frame | {engine}" +
869
+ (f" batch={batch_size}" if use_gpu else ""))
870
 
871
  return (
872
  comp_video if os.path.exists(comp_video) else None,
 
878
  except gr.Error:
879
  raise
880
  except Exception as e:
881
+ logger.exception("Output writing failed")
882
+ raise gr.Error(f"Output failed: {e}")
883
  finally:
884
  for d in ["Comp", "FG", "Matte", "Processed"]:
885
  p = os.path.join(tmpdir, d)
 
894
  def process_example(video_path, resolution, despill, mask_mode, despeckle, despeckle_size):
895
  return process_video(video_path, resolution, despill, mask_mode, despeckle, despeckle_size)
896
 
897
+ DESCRIPTION = """# CorridorKey Green Screen Matting
898
+ Remove green backgrounds from video. Based on [CorridorKey](https://www.youtube.com/watch?v=3Ploi723hg4) by Corridor Digital.
899
+ ZeroGPU H200: batched PyTorch inference (up to 32 frames at once). CPU fallback via ONNX."""
 
900
 
901
  with gr.Blocks(title="CorridorKey") as demo:
902
  gr.Markdown(DESCRIPTION)
 
908
  resolution = gr.Radio(
909
  choices=["1024", "2048"], value="1024",
910
  label="Processing Resolution",
911
+ info="1024 = fast (batch 32 on GPU), 2048 = max quality (batch 8 on GPU)"
912
  )
913
  mask_mode = gr.Radio(
914
  choices=["Hybrid (auto)", "AI (BiRefNet)", "Fast (classical)"],
requirements.txt CHANGED
@@ -1,5 +1,10 @@
1
  numpy
2
  opencv-python-headless
3
  huggingface-hub
4
- onnxruntime
 
5
  gradio[mcp]
 
 
 
 
 
1
  numpy
2
  opencv-python-headless
3
  huggingface-hub
4
+ onnxruntime-gpu
5
+ spaces
6
  gradio[mcp]
7
+ torch
8
+ torchvision
9
+ timm
10
+ https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.9.0/flash_attn-2.8.3+cu126torch2.9-cp310-cp310-linux_x86_64.whl