Spaces:

toshas
/

windowseat-reflection-removal

Running on Zero

App Files Files Community

toshas committed 3 days ago

Commit

d283d84

1 Parent(s): 72cbe32

initial commit

Browse files

Files changed (26) hide show

LICENSE.txt +177 -0
README.md +5 -5
app.py +125 -0
example_images/0_bakery.jpg +3 -0
example_images/0_cafe.jpg +3 -0
example_images/0_car_wheel.jpg +3 -0
example_images/0_cats.png +3 -0
example_images/0_dog.jpg +3 -0
example_images/0_entrance.jpg +3 -0
example_images/0_misty_train.jpg +3 -0
example_images/0_museum.jpg +3 -0
example_images/0_park_cart.jpg +3 -0
example_images/0_pharaoh.jpg +3 -0
example_images/0_phone_booth.jpg +3 -0
example_images/0_store_front.jpg +3 -0
example_images/0_uniqlo.jpg +3 -0
example_images/0_wolf.jpg +3 -0
example_images/0_zoo.jpg +3 -0
example_images/1_window_airplane.png +3 -0
example_images/1_window_airport.jpg +3 -0
example_images/2_postcards_008.png +3 -0
example_images/2_postcards_050.png +3 -0
example_images/2_real_110.jpg +3 -0
example_images/2_wild_026.jpg +3 -0
requirements.txt +18 -0
windowseat_inference.py +871 -0

LICENSE.txt ADDED Viewed

	@@ -0,0 +1,177 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS

README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 ---
-title: Windowseat Reflection Removal
-emoji: 👁
-colorFrom: gray
-colorTo: purple
 sdk: gradio
-sdk_version: 6.0.2
 app_file: app.py
 pinned: false
 license: apache-2.0

 ---
+title: WindowSeat Reflection Removal
+emoji: 🪟
+colorFrom: purple
+colorTo: blue
 sdk: gradio
+sdk_version: 5.29.0
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import os
+os.system("pip freeze")
+import spaces
+import tempfile
+import shutil
+import gradio as gr
+import torch as torch
+from gradio_dualvision import DualVisionApp
+from huggingface_hub import login
+from PIL import Image
+from windowseat_inference import load_network, run_inference
+# Hub repositories passed to load_network(): the base model and the WindowSeat LoRA repo.
+uri_base = "Qwen/Qwen-Image-Edit-2509"
+uri_lora = "huawei-bayerlab/windowseat-reflection-removal-v1-0"
+# Authenticate against the Hugging Face Hub only when a token is supplied via the environment.
+if "HF_TOKEN_LOGIN" in os.environ:
+    login(token=os.environ["HF_TOKEN_LOGIN"])
+# Use CUDA when available, otherwise fall back to CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# NOTE(review): `dtype` is not referenced again in the visible part of this script.
+dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+# Everything is loaded once at import time and shared by every call to WindowSeatApp.process().
+vae, transformer, embeds_dict, processing_resolution = load_network(uri_base, uri_lora, device)
+# # As of transformers==4.57.1 , xformers is not supported in QwenImageTransformer2DModel
+# try:
+#     transformer.enable_xformers_memory_efficient_attention()
+#     print("xformers enabled")
+# except:
+#     print("xformers not enabled")
+class WindowSeatApp(DualVisionApp):
+    """DualVisionApp front end that runs WindowSeat reflection removal on one image.
+
+    The model objects (``vae``, ``transformer``, ``embeds_dict``,
+    ``processing_resolution``) are module-level globals loaded at import time.
+    """
+
+    # Not referenced in the visible part of this file; presumably consumed by
+    # DualVisionApp -- TODO confirm.
+    DEFAULT_SEED = 2025
+
+    def make_header(self):
+        """Render the page title, the row of project badges and the usage hint."""
+        gr.Markdown(
+            """
+            ## WindowSeat Reflection Removal
+            """
+        )
+        with gr.Row(elem_classes="remove-elements"):
+            gr.Markdown(
+                """
+                <p align="center">
+                <a title="Website" href="https://hf.co/spaces/huawei-bayerlab/windowseat-reflection-removal-web" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                    <img src="https://img.shields.io/badge/%E2%99%A5%20Project%20-Website-blue">
+                </a>
+                <a title="arXiv" href="https://arxiv.org/abs/2512.05000" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                    <img src="https://img.shields.io/badge/%F0%9F%93%84%20arXiv%20-Paper-AF3436">
+                </a>
+                <a title="Github" href="https://github.com/huawei-bayerlab/windowseat-reflection-removal" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                    <img src="https://img.shields.io/github/stars/huawei-bayerlab/windowseat-reflection-removal?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
+                </a>
+                <a title="Model weights" href="https://hf.co/huawei-bayerlab/windowseat-reflection-removal-v1-0" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                    <img src="https://img.shields.io/badge/%F0%9F%A4%97%20WindowSeat%20Model%20-Weights-yellow" alt="imagedepth">
+                </a>
+                <a title="Social" href="https://twitter.com/antonobukhov1" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                    <img src="https://shields.io/twitter/follow/:?label=Subscribe%20for%20updates!" alt="social">
+                </a>
+                </p>
+                <p align="center" style="margin-top: 0px;">
+                    Upload a photo or pick an example below to remove reflections, wait for the result, then explore it with the slider.
+                    If a quota limit appears, duplicate the space to continue.
+                </p>
+                """
+            )
+
+    def build_user_components(self):
+        """Return an empty mapping: this app defines no extra user controls."""
+        return {}
+
+    def process(self, image_in: Image.Image, **kwargs):
+        """Run reflection removal on ``image_in`` and return the result.
+
+        The image is written to a scratch input directory, ``run_inference``
+        processes that directory into a scratch output directory, and the
+        produced file is read back into memory.
+
+        Args:
+            image_in: Image to process.
+            **kwargs: Accepted for interface compatibility; not used here.
+
+        Returns:
+            A tuple ``(out_modalities, out_settings)`` where ``out_modalities``
+            maps ``"Result"`` to the output image and ``out_settings`` is an
+            empty dict.
+        """
+        # Context managers remove both directories on every exit path, and the
+        # first one cannot leak if creating the second one fails.
+        with tempfile.TemporaryDirectory() as input_temp_dir:
+            with tempfile.TemporaryDirectory() as output_temp_dir:
+                input_image_path = os.path.join(input_temp_dir, "image.png")
+                image_in.save(input_image_path)
+                run_inference(
+                    vae,
+                    transformer,
+                    embeds_dict,
+                    processing_resolution,
+                    input_temp_dir,
+                    output_temp_dir,
+                    use_short_edge_tile=True,
+                    save_comparison=False,
+                    save_alternating=False,
+                )
+                # Presumably run_inference names its output after the input
+                # file stem ("image") -- verify against run_inference.
+                output_image_path = os.path.join(output_temp_dir, "image_windowseat_output.png")
+                result_image = Image.open(output_image_path)
+                # Read the pixel data now, while the file still exists: the
+                # scratch directories are deleted when the with-blocks exit.
+                result_image.load()
+                out_modalities = {
+                    "Result": result_image,
+                }
+                out_settings = {}
+                return out_modalities, out_settings
+# Build the UI and start serving. This runs at import time of app.py.
+with WindowSeatApp(
+    title="WindowSeat Reflection Removal",
+    examples_path="example_images",
+    examples_per_page=12,
+    right_selector_visible=False,
+    advanced_settings_visible=False,
+    squeeze_canvas=True,
+    spaces_zero_gpu_enabled=True,
+) as demo:
+    # queue() is configured with api_open=False before launching.
+    demo.queue(
+        api_open=False,
+    ).launch(
+        # Listen on all interfaces, port 7860.
+        server_name="0.0.0.0",
+        server_port=7860,
+        ssr_mode=False,
+    )

example_images/0_bakery.jpg ADDED Viewed

Git LFS Details

SHA256: 69619874eef8986d8138255d71ea88009b284c99ef235d1cbd779e3c172232c6
Pointer size: 130 Bytes
Size of remote file: 69 kB

example_images/0_cafe.jpg ADDED Viewed

Git LFS Details

SHA256: 98e890212b1c3e792842b3800358fbead1d15b94454857bfaa2680be5e5e770b
Pointer size: 131 Bytes
Size of remote file: 165 kB

example_images/0_car_wheel.jpg ADDED Viewed

Git LFS Details

SHA256: eab333cbd083cc4cc43552ae8a94c3abbdfe1d5a9f324b748453b1dcb6beb617
Pointer size: 131 Bytes
Size of remote file: 104 kB

example_images/0_cats.png ADDED Viewed

Git LFS Details

SHA256: b4f530d181c3a2e7ee856b58ebff7b0bcff4c84a2266016cce1fc714ddaba57c
Pointer size: 132 Bytes
Size of remote file: 1.71 MB

example_images/0_dog.jpg ADDED Viewed

Git LFS Details

SHA256: d176019ae3e15012efcf22293d6ed5b4ad6ec314523acb1bcdd3e9f6b679f7cf
Pointer size: 130 Bytes
Size of remote file: 95.1 kB

example_images/0_entrance.jpg ADDED Viewed

Git LFS Details

SHA256: 0f3ff3846fda0d9731a6cc3e9fe96c03cda7d9067f9b60cb47d7ef740d38a653
Pointer size: 131 Bytes
Size of remote file: 169 kB

example_images/0_misty_train.jpg ADDED Viewed

Git LFS Details

SHA256: d3ef62338ccf42776aa5a3b6412fabef4380e2b8e44cea4a6ef9f6fe5433c94c
Pointer size: 130 Bytes
Size of remote file: 97.9 kB

example_images/0_museum.jpg ADDED Viewed

Git LFS Details

SHA256: 8096a520cf18314b3532bbc7ddfc95742791c01645c7825e3e616582eff0d8d4
Pointer size: 130 Bytes
Size of remote file: 69.6 kB

example_images/0_park_cart.jpg ADDED Viewed

Git LFS Details

SHA256: 069fa2d7d89f311f24f131eafbda5316b650716c831a69838bb687a0c1fde771
Pointer size: 132 Bytes
Size of remote file: 3.23 MB

example_images/0_pharaoh.jpg ADDED Viewed

Git LFS Details

SHA256: 920f0a030a742668b98a97dcc6abd140e1beb6360a11a9c287b40c39a414e514
Pointer size: 130 Bytes
Size of remote file: 87.1 kB

example_images/0_phone_booth.jpg ADDED Viewed

Git LFS Details

SHA256: c43c6265905cf9ea3bb8be7f3ed7670fd4532cbaf50ef2c5c2b25913e6db990d
Pointer size: 132 Bytes
Size of remote file: 1.02 MB

example_images/0_store_front.jpg ADDED Viewed

Git LFS Details

SHA256: fb657590961e92a3ad6fcda26024f7a649ef6c6ee1527d4b1afee24bfe28c5ff
Pointer size: 130 Bytes
Size of remote file: 54.1 kB

example_images/0_uniqlo.jpg ADDED Viewed

Git LFS Details

SHA256: eaee88a3387cba03dfac9b35a43f4c48156f553112be400bdacd03c4073a6661
Pointer size: 131 Bytes
Size of remote file: 125 kB

example_images/0_wolf.jpg ADDED Viewed

Git LFS Details

SHA256: f90ea67fb0da7c58f9ee88cd5caf6326e6758a30a50432a5c1758ce141c39418
Pointer size: 130 Bytes
Size of remote file: 54.4 kB

example_images/0_zoo.jpg ADDED Viewed

Git LFS Details

SHA256: c68c29774f0414daa22f76074d9276b889509db46ffe114ecd43cadcba3b24df
Pointer size: 130 Bytes
Size of remote file: 92.7 kB

example_images/1_window_airplane.png ADDED Viewed

Git LFS Details

SHA256: 9edda131ec22d7ad6d587f310009d55c62be9b514678da17fb07596660adf7c9
Pointer size: 131 Bytes
Size of remote file: 605 kB

example_images/1_window_airport.jpg ADDED Viewed

Git LFS Details

SHA256: 7b6b9c651bf7fb59c86aa3d46e0b0cb68ad2ad6739674cb42f6fd4189a476baa
Pointer size: 132 Bytes
Size of remote file: 1.02 MB

example_images/2_postcards_008.png ADDED Viewed

Git LFS Details

SHA256: 15388ad5e182ecced2b025f5b7e1069b6712b5ab1576a9fb1a4e86725ed085f2
Pointer size: 131 Bytes
Size of remote file: 327 kB

example_images/2_postcards_050.png ADDED Viewed

Git LFS Details

SHA256: 95da1c6345b5b7461595972bad7b48dbdfe90a03112d2a98a6a2a0b05c2f8d59
Pointer size: 131 Bytes
Size of remote file: 288 kB

example_images/2_real_110.jpg ADDED Viewed

Git LFS Details

SHA256: 991989f0369307af5633d3c3e370c2ea204d0214b1287a1fc78a64c686eef2c7
Pointer size: 130 Bytes
Size of remote file: 38 kB

example_images/2_wild_026.jpg ADDED Viewed

Git LFS Details

SHA256: d1f4865dd3104931aa0ae2f7ba4346dd2b29614cf5633c39b903fbaf345c0ae6
Pointer size: 130 Bytes
Size of remote file: 25.3 kB

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+diffusers>=0.33.0
+# gradio-dualvision @ git+https://github.com/toshas/gradio-dualvision.git@gradio-5.29.0
+gradio_dualvision @ git+https://github.com/toshas/gradio-dualvision.git@59d338e9
+accelerate
+bitsandbytes
+huggingface_hub
+imageio
+imageio-ffmpeg
+peft
+Pillow
+safetensors
+scipy
+torch
+torchvision
+tqdm
+transformers
+# --only-binary=xformers
+# xformers

windowseat_inference.py ADDED Viewed

	@@ -0,0 +1,871 @@

+import argparse
+import functools
+import json
+import math
+import os
+import sys
+import warnings
+import imageio.v2 as imageio
+import numpy as np
+import safetensors
+import torch
+import torchvision
+from diffusers import (
+    AutoencoderKLQwenImage,
+    BitsAndBytesConfig,
+    QwenImageEditPipeline,
+    QwenImageTransformer2DModel,
+)
+from huggingface_hub import hf_hub_download
+from peft import LoraConfig
+from PIL import Image
+from torch.utils.data import DataLoader, Dataset
+from tqdm import tqdm
+# Base-model identifiers accepted by load_network(); the "base_model" entry of the
+# LoRA repository's model_index.json must be one of these.
+SUPPORTED_MODEL_URIS = [
+    "Qwen/Qwen-Image-Edit-2509",
+]
+# NOTE(review): not referenced in the visible part of this file; presumably the
+# default LoRA repository for a command-line entry point -- TODO confirm.
+LORA_MODEL_URI = "toshas/WindowSeat-Qwen-Image-Edit-2509"
+def fetch_state_dict(
+    pretrained_model_name_or_path_or_dict: str,
+    weight_name: str,
+    use_safetensors: bool = True,
+    subfolder: str | None = None,
+):
+    """Download one weight file from the Hugging Face Hub and load it.
+
+    Args:
+        pretrained_model_name_or_path_or_dict: Hub repository id passed to
+            ``hf_hub_download``.
+        weight_name: File name inside the repository.
+        use_safetensors: Load with the safetensors reader when true, otherwise
+            with ``torch.load(..., weights_only=True)``.
+        subfolder: Optional subfolder of the repository containing the file.
+
+    Returns:
+        The loaded state dict.
+    """
+    file_path = hf_hub_download(pretrained_model_name_or_path_or_dict, weight_name, subfolder=subfolder)
+    if use_safetensors:
+        # A bare `import safetensors` does not guarantee that the
+        # `safetensors.torch` submodule has been loaded; import the loader
+        # explicitly instead of relying on another package having done so.
+        from safetensors.torch import load_file
+
+        return load_file(file_path)
+    return torch.load(file_path, weights_only=True)
+def load_qwen_vae(uri: str, device: torch.device):
+    """Load the autoencoder from the ``vae`` subfolder of ``uri`` in bfloat16 on ``device``."""
+    weights_dtype = torch.bfloat16
+    autoencoder = AutoencoderKLQwenImage.from_pretrained(
+        uri,
+        subfolder="vae",
+        torch_dtype=weights_dtype,
+        device_map=device,
+        low_cpu_mem_usage=True,
+        use_safetensors=True,
+    )
+    # Explicit move/cast after loading; the loaded object itself is returned.
+    autoencoder.to(device, dtype=weights_dtype)
+    return autoencoder
+def load_qwen_transformer(uri: str, device: torch.device):
+    """Load the transformer from the ``transformer`` subfolder of ``uri`` with 4-bit NF4 quantization.
+
+    bfloat16 is used both as the load dtype and as the 4-bit compute dtype.
+    ``transformer_blocks.0.img_mod`` is listed in ``llm_int8_skip_modules``
+    (presumably so that it is left unquantized -- TODO confirm).
+    """
+    compute_dtype = torch.bfloat16
+    quantization = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=compute_dtype,
+        llm_int8_skip_modules=["transformer_blocks.0.img_mod"],
+    )
+    return QwenImageTransformer2DModel.from_pretrained(
+        uri,
+        subfolder="transformer",
+        torch_dtype=compute_dtype,
+        quantization_config=quantization,
+        device_map=device,
+    )
+def load_lora_into_transformer(uri: str, transformer: QwenImageTransformer2DModel):
+    """Attach the LoRA adapter stored under ``transformer_lora`` in ``uri`` and load its weights.
+
+    Raises:
+        ValueError: If the downloaded state dict contains keys the transformer does not have.
+
+    Returns:
+        The same ``transformer`` object, modified in place.
+    """
+    lora_subfolder = "transformer_lora"
+    adapter_config = LoraConfig.from_pretrained(uri, subfolder=lora_subfolder)
+    transformer.add_adapter(adapter_config)
+    lora_weights = fetch_state_dict(uri, "pytorch_lora_weights.safetensors", subfolder=lora_subfolder)
+    # strict=False: the file only carries adapter weights, so missing keys are
+    # expected and ignored; unexpected keys are treated as an error below.
+    _, unexpected = transformer.load_state_dict(lora_weights, strict=False)
+    if unexpected:
+        raise ValueError(f"Unexpected keys in transformer state dict: {unexpected}")
+    return transformer
+def load_embeds_dict(uri: str):
+    """Fetch ``text_embeddings/state_dict.safetensors`` from ``uri`` and return the loaded dict."""
+    return fetch_state_dict(uri, "state_dict.safetensors", subfolder="text_embeddings")
+def load_network(uri_base: str, uri_lora: str, device: torch.device):
+    """Load every component needed for inference.
+
+    Reads ``model_index.json`` from the LoRA repository, checks that its
+    ``base_model`` entry is supported, then loads the VAE and the quantized
+    transformer from ``uri_base`` and the LoRA weights and text embeddings
+    from ``uri_lora``.
+
+    Note that the support check is made on the ``base_model`` value from the
+    config, while the weights are loaded from ``uri_base`` as passed in.
+
+    Args:
+        uri_base: Hub repository id of the base model.
+        uri_lora: Hub repository id of the LoRA / embeddings repository.
+        device: Device to place the VAE and transformer on.
+
+    Returns:
+        A tuple ``(vae, transformer, embeds_dict, processing_resolution)``.
+
+    Raises:
+        ValueError: If the config's ``base_model`` is not in ``SUPPORTED_MODEL_URIS``.
+    """
+    config_file = hf_hub_download(uri_lora, "model_index.json")
+    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
+    with open(config_file, "r", encoding="utf-8") as f:
+        config_dict = json.load(f)
+    base_model_uri = config_dict["base_model"]
+    processing_resolution = config_dict["processing_resolution"]
+    if base_model_uri not in SUPPORTED_MODEL_URIS:
+        raise ValueError(f"Unsupported base model URI: {base_model_uri}")
+    vae = load_qwen_vae(uri_base, device)
+    transformer = load_qwen_transformer(uri_base, device)
+    # Modifies `transformer` in place; the return value is the same object.
+    load_lora_into_transformer(uri_lora, transformer)
+    embeds_dict = load_embeds_dict(uri_lora)
+    return vae, transformer, embeds_dict, processing_resolution
+def encode(image: torch.Tensor, vae: AutoencoderKLQwenImage) -> torch.Tensor:
+    """Encode ``image`` with ``vae`` and normalize the sampled latents.
+
+    The latents are shifted by ``vae.config.latents_mean`` and scaled by the
+    reciprocal of ``vae.config.latents_std``, both reshaped to
+    ``(1, z_dim, 1, 1, 1)`` for broadcasting.
+    """
+    pixels = image.to(device=vae.device, dtype=vae.dtype)
+    # A singleton axis is inserted at position 2 before the VAE is called.
+    sampled = vae.encode(pixels.unsqueeze(2)).latent_dist.sample()
+    stats_shape = (1, vae.config.z_dim, 1, 1, 1)
+    mean = torch.tensor(vae.config.latents_mean, device=sampled.device, dtype=sampled.dtype)
+    mean = mean.view(stats_shape)
+    inv_std = 1.0 / torch.tensor(vae.config.latents_std, device=sampled.device, dtype=sampled.dtype)
+    inv_std = inv_std.view(stats_shape)
+    return (sampled - mean) * inv_std
+def decode(latents: torch.Tensor, vae: AutoencoderKLQwenImage) -> torch.Tensor:
+    """Undo the latent normalization applied by ``encode`` and decode with ``vae``.
+
+    Returns the decoded sample with index 0 taken along axis 2.
+    """
+    stats_shape = (1, vae.config.z_dim, 1, 1, 1)
+    mean = torch.tensor(vae.config.latents_mean, device=latents.device, dtype=latents.dtype)
+    mean = mean.view(stats_shape)
+    inv_std = 1.0 / torch.tensor(vae.config.latents_std, device=latents.device, dtype=latents.dtype)
+    inv_std = inv_std.view(stats_shape)
+    # Dividing by the reciprocal std (rather than multiplying by std) keeps the
+    # arithmetic identical to the forward normalization's inverse as written.
+    denormalized = latents / inv_std + mean
+    decoded = vae.decode(denormalized)
+    return decoded.sample[:, :, 0]
+def _match_batch(t: torch.Tensor, B: int) -> torch.Tensor:
+    """Return ``t`` with its leading dimension made equal to ``B``.
+
+    A matching tensor is returned unchanged, a single row is expanded, a
+    longer tensor is truncated, and a shorter one is tiled and then truncated.
+    """
+    rows = t.size(0)
+    if rows == B:
+        return t
+    if rows == 1 and B > 1:
+        return t.expand(B, *t.shape[1:])
+    if rows > B:
+        return t[:B]
+    # Fewer rows than requested: tile enough whole copies, then cut to size.
+    copies, remainder = divmod(B, rows)
+    if remainder:
+        copies += 1
+    tiled = t.repeat(copies, *([1] * (t.ndim - 1)))
+    return tiled[:B]
+def flow_step(
+    model_input: torch.Tensor,
+    transformer: QwenImageTransformer2DModel,
+    vae: AutoencoderKLQwenImage,
+    embeds_dict: dict[str, torch.Tensor],
+) -> torch.Tensor:
+    """Run the transformer once at a fixed timestep and subtract its prediction from the input.
+
+    Args:
+        model_input: Latents shaped ``[B, C, 1, H, W]`` or ``[B, C, H, W]``.
+        transformer: Model called on the packed latents.
+        vae: Used only for its config (``temperal_downsample``) and its dtype.
+        embeds_dict: Must contain ``"prompt_embeds"`` and ``"prompt_mask"``.
+
+    Returns:
+        ``model_input - model_pred``, both cast to ``vae.dtype``.
+
+    Raises:
+        ValueError: If ``model_input`` has neither of the accepted shapes.
+    """
+    prompt_embeds = embeds_dict["prompt_embeds"]  # [N_ctx, L, D]
+    prompt_mask = embeds_dict["prompt_mask"]  # [N_ctx, L]
+    # Non-boolean masks are binarized: any positive entry counts as "keep".
+    if prompt_mask.dtype != torch.bool:
+        prompt_mask = prompt_mask > 0
+    # Accept [B, C, 1, H, W] or [B, C, H, W]
+    if model_input.ndim == 5 and model_input.shape[2] == 1:
+        model_input_4d = model_input[:, :, 0]  # [B, C, H, W]
+    elif model_input.ndim == 4:
+        model_input_4d = model_input
+    else:
+        raise ValueError(f"Unexpected lat_encoding shape: {model_input.shape}")
+    B, C, H, W = model_input_4d.shape
+    # The transformer's own device decides where the prompt tensors and timestep live.
+    device = next(transformer.parameters()).device
+    # Bring the stored prompt tensors to the batch size of the latents.
+    prompt_embeds = _match_batch(prompt_embeds, B).to(
+        device=device, dtype=torch.bfloat16, non_blocking=True
+    )  # [B, L, D]
+    prompt_mask = _match_batch(prompt_mask, B).to(
+        device=device, dtype=torch.bool, non_blocking=True
+    )  # [B, L]
+    num_channels_latents = C
+    packed_model_input = QwenImageEditPipeline._pack_latents(
+        model_input_4d,
+        batch_size=B,
+        num_channels_latents=num_channels_latents,
+        height=H,
+        width=W,
+    )  # [B, N_patches, C * 4], where N_patches = (H // 2) * (W // 2)
+    packed_model_input = packed_model_input.to(torch.bfloat16)
+    # Fixed timestep: 499, divided by 1000 below before being passed to the model.
+    t_const = 499
+    timestep = torch.full(
+        (B,),
+        float(t_const),
+        device=device,
+        dtype=torch.bfloat16,
+    )
+    timestep = timestep / 1000.0
+    h_img = H // 2
+    w_img = W // 2
+    # The same (1, h, w) entry is repeated for every batch element.
+    img_shapes = [[(1, h_img, w_img)]] * B
+    # prompt_mask was already dereferenced above, so it cannot be None here and
+    # the `else None` branch is effectively unused.
+    txt_seq_lens = prompt_mask.sum(dim=1).tolist() if prompt_mask is not None else None
+    if getattr(transformer, "attention_kwargs", None) is None:
+        attention_kwargs = {}
+    else:
+        attention_kwargs = transformer.attention_kwargs
+    # NOTE(review): the autocast device type is hard-coded to "cuda" even though
+    # `device` above may be a CPU device -- confirm this is intended.
+    with torch.amp.autocast("cuda", dtype=torch.bfloat16):
+        model_pred = transformer(
+            hidden_states=packed_model_input,  # [B, N_patches, C*4]
+            timestep=timestep,  # [B], float / 1000
+            encoder_hidden_states=prompt_embeds,  # [B, L, D]
+            encoder_hidden_states_mask=prompt_mask,  # [B, L]
+            img_shapes=img_shapes,  # single stream per batch
+            txt_seq_lens=txt_seq_lens,
+            guidance=None,
+            attention_kwargs=attention_kwargs,
+            return_dict=False,
+        )[0]  # [B, N_patches, C*4]
+    # Scale factor is 2 ** len(temperal_downsample) when the VAE config provides
+    # that entry, otherwise 8.
+    temperal_downsample = vae.config.get("temperal_downsample", None)
+    if temperal_downsample is not None:
+        vae_scale_factor = 2 ** len(temperal_downsample)
+    else:
+        vae_scale_factor = 8
+    model_pred = QwenImageEditPipeline._unpack_latents(
+        model_pred,
+        height=H * vae_scale_factor,  # H, W here are latent H,W from encode
+        width=W * vae_scale_factor,
+        vae_scale_factor=vae_scale_factor,
+    )  # [B, C, 1, H_lat, W_lat]
+    # NOTE(review): the original `model_input` (not the 4-D view) is used here; if a
+    # 4-D input is passed while the prediction is 5-D, this subtraction relies on
+    # broadcasting -- confirm that 4-D inputs are actually exercised.
+    latent_output = model_input.to(vae.dtype) - model_pred.to(vae.dtype)
+    return latent_output
+def _supports_color() -> bool:
+    """Report whether standard output is attached to a terminal."""
+    stream = sys.stdout
+    return stream.isatty()
+def _style(text: str, *, color: str | None = None, bold: bool = False) -> str:
+    """Wrap ``text`` in ANSI escape codes for the requested color and weight.
+
+    ``text`` is returned unchanged when stdout is not a terminal, or when
+    neither ``bold`` nor a recognized ``color`` is requested.
+    """
+    if not _supports_color():
+        return text
+    # Foreground color codes for the names this module understands.
+    color_codes = {
+        "red": "31",
+        "green": "32",
+        "yellow": "33",
+        "blue": "34",
+        "magenta": "35",
+        "cyan": "36",
+    }
+    codes = ["1"] if bold else []
+    if color in color_codes:
+        codes.append(color_codes[color])
+    if not codes:
+        return text
+    return f"\033[{';'.join(codes)}m{text}\033[0m"
+def print_banner(title: str):
+    """Print ``title`` inside a three-line box drawn in bold cyan."""
+    padded = f" {title} "
+    rule = "═" * len(padded)
+    for line in (f"╔{rule}╗", f"║{padded}║", f"╚{rule}╝"):
+        print(_style(line, color="cyan", bold=True))
+def print_step(step: str, msg: str):
+    """Print ``msg`` preceded by a bold yellow ``[step] `` tag."""
+    tag = _style(f"[{step}] ", color="yellow", bold=True)
+    line = tag + msg
+    print(line)
+def print_ok(msg: str):
+    """Print ``msg`` preceded by a bold green check mark."""
+    marker = _style("✔ ", color="green", bold=True)
+    print(marker + msg)
+def print_info(msg: str):
+    """Print ``msg`` preceded by a bold blue information sign."""
+    marker = _style("ℹ ", color="blue", bold=True)
+    print(marker + msg)
+def print_error(msg: str):
+    """Print ``msg`` preceded by a bold red cross."""
+    marker = _style("✖ ", color="red", bold=True)
+    print(marker + msg)
+def print_final_success(output_dir: str):
+    """Print the closing messages, including the highlighted ``output_dir``."""
+    print_ok("Inference finished successfully!")
+    print_info("Predictions have been written to:")
+    highlighted_dir = _style(output_dir, color="cyan", bold=True)
+    print("   " + highlighted_dir)
+    farewell = _style("Thank you for trying out WindowSeat! 🪟", color="green")
+    print(farewell)
+def _required_side_for_axis(size: int, nmax: int, min_overlap: int) -> int:
+    """Return the smallest 1D tile side that covers ``size`` with at most ``nmax`` tiles.
+
+    Neighbouring tiles are assumed to overlap by at least ``min_overlap``.
+    ``nmax`` is converted to ``int`` and clamped to at least 1; with a single
+    tile the side is simply ``size``.
+    """
+    tile_budget = max(1, int(nmax))
+    if tile_budget == 1:
+        return size
+    # n tiles of side T with (n - 1) overlaps span n*T - (n - 1)*overlap >= size.
+    span_needed = size + (tile_budget - 1) * min_overlap
+    return math.ceil(span_needed / tile_budget)
+def _starts(size: int, T: int, min_overlap: int):
+    """Return the start offsets of tiles of side ``T`` along an axis of length ``size``.
+
+    Offsets advance by ``T - min_overlap`` (at least 1), and a final offset is
+    added when needed so that the last tile ends exactly at the edge. A single
+    offset ``0`` is returned when one tile already covers the axis.
+    """
+    final_start = size - T
+    if final_start <= 0:
+        return [0]
+    step = max(1, T - min_overlap)
+    offsets = list(range(0, final_start + 1, step))
+    # range() is strictly increasing and never exceeds final_start, so appending
+    # final_start keeps the list sorted and free of duplicates.
+    if offsets[-1] != final_start:
+        offsets.append(final_start)
+    return offsets
class TilingDataset(Dataset):
    """Dataset that expands every file of a folder into overlapping square tiles.

    Each entry of ``self.filenames`` is a list ``[path, (x0, y0, x1, y1), is_last]``.
    The box is expressed on the (possibly upscaled) image grid computed in
    ``__init__`` and ``is_last`` is True only for the final tile of an image.
    """

    def __init__(
        self,
        transform_graph,
        input_folder,
        tiling_w=768,
        tiling_h=768,
        processing_resolution=768,
        max_num_tiles_w=4,
        max_num_tiles_h=4,
        min_overlap_w=64,
        min_overlap_h=64,
        use_short_edge_tile=False,
        **kwargs,
    ) -> None:
        """Scan ``input_folder`` and precompute the tile layout of every file in it.

        Args:
            transform_graph: Callable applied in place to each sample dict in
                ``__getitem__``.
            input_folder: Directory whose regular files are opened as images,
                in sorted path order.
            tiling_w, tiling_h: Preferred tile size when ``use_short_edge_tile``
                is false.
            processing_resolution: Lower bound for the preferred tile side when
                ``use_short_edge_tile`` is true.
            max_num_tiles_w, max_num_tiles_h: Maximum tile count per axis.
            min_overlap_w, min_overlap_h: Minimum overlap between neighbouring
                tiles per axis.
            use_short_edge_tile: Prefer a tile as large as the image's short
                edge (but not below ``processing_resolution``).
            **kwargs: Stored on ``self.kwargs``; ``disp_name`` is read from it.

        Raises:
            ValueError: If no square tile side satisfies the tile-count and
                overlap constraints for an image.
        """
        super().__init__()
        self.transform_graph = transform_graph
        self.kwargs = kwargs
        self.disp_name = kwargs.get("disp_name", "tiling_dataset")
        candidates = (
            os.path.join(input_folder, entry) for entry in os.listdir(input_folder)
        )
        image_paths = sorted(path for path in candidates if os.path.isfile(path))
        self.filenames = []
        max_tiles_w = int(max_num_tiles_w)
        max_tiles_h = int(max_num_tiles_h)
        overlap_w = int(min_overlap_w)
        overlap_h = int(min_overlap_h)
        for path in image_paths:
            # Only the size is needed here; pixels are loaded later per sample.
            with Image.open(path) as image:
                width, height = image.size
            # Preferred tile size for this image.
            if use_short_edge_tile:
                pref_w = pref_h = max(min(width, height), processing_resolution)
            else:
                pref_w, pref_h = tiling_w, tiling_h
            # Virtually upscale images that are smaller than the preferred tile.
            if width < pref_w or height < pref_h:
                upscale = pref_w / min(width, height)
                width, height = round(upscale * width), round(upscale * height)
            preferred_side = max(int(pref_w), int(pref_h))
            # Feasible interval [side_min, side_max] for the square tile side.
            side_min = max(
                _required_side_for_axis(width, max_tiles_w, overlap_w),
                _required_side_for_axis(height, max_tiles_h, overlap_h),
                overlap_w + 1,
                overlap_h + 1,
            )
            side_max = min(width, height)
            if side_min > side_max:
                raise ValueError(
                    f"Infeasible square constraints for {os.path.basename(path)}: "
                    f"need T >= {side_min}, but max square inside is {side_max}. "
                    f"Relax max_num_tiles_w/h or overlaps, allow non-square tiles, or pad."
                )
            side = max(side_min, min(preferred_side, side_max))
            x_starts = _starts(width, side, overlap_w)
            y_starts = _starts(height, side, overlap_h)
            # Row-major tile order: all x offsets for the first y offset, and so on.
            self.filenames.extend(
                [str(path), (left, top, left + side, top + side), False]
                for top in y_starts
                for left in x_starts
            )
            # Flag the last tile of this image.
            if self.filenames:
                self.filenames[-1][-1] = True

    def __len__(self):
        """Return the total number of tiles over all images."""
        return len(self.filenames)

    def __getitem__(self, index):
        """Build the sample dict for tile ``index`` and run ``transform_graph`` on it."""
        sample = {"line": self.filenames[index], "idx": index}
        self.transform_graph(sample)
        return sample
def read_scalars(sample):
    """Copy the tile box and last-tile flag out of ``sample["line"]`` into named keys.

    Reads columns 1 and 2 of ``sample["line"]`` and stores them in place as
    ``sample["tile_info"]`` and ``sample["is_last_tile"]``.
    """
    record = sample["line"]
    sample["tile_info"] = record[1]
    sample["is_last_tile"] = record[2]
def load_rgb_data(rgb_path, key_prefix="input"):
    """Read an image and return it as integer and normalized tensors.

    Args:
        rgb_path: Path handed to ``read_rgb_file``.
        key_prefix: Prefix of the two keys in the returned dict.

    Returns:
        A dict with ``<key_prefix>_int`` (the pixel values cast to int32) and
        ``<key_prefix>_norm`` (the same values mapped from [0, 255] to [-1, 1]).
    """
    pixels = read_rgb_file(rgb_path)
    normalized = pixels / 255.0 * 2.0 - 1.0
    return {
        f"{key_prefix}_int": torch.from_numpy(pixels).int(),
        f"{key_prefix}_norm": torch.from_numpy(normalized),
    }
def read_rgb_file(rgb_path) -> np.ndarray:
    """Load an image file as a uint8 RGB array in channel-first layout.

    Args:
        rgb_path: Path of the image to open with PIL.

    Returns:
        Array of shape ``[3, H, W]`` and dtype ``uint8``.
    """
    # Image.open keeps the file handle open until the image is closed; the
    # context manager releases it as soon as the pixels have been copied out.
    with Image.open(rgb_path) as img:
        arr = np.array(img.convert("RGB"), dtype=np.uint8)  # [H, W, 3]
    return arr.transpose(2, 0, 1)  # [3, H, W]
def read_rgb_image(sample):
    """Load the image referenced by ``sample["line"][0]`` into ``sample``.

    Adds the ``input_int`` and ``input_norm`` tensors produced by
    ``load_rgb_data`` and records the loaded height and width under
    ``sample["meta"]["orig_res"]``, creating ``sample["meta"]`` if missing.
    """
    prefix = "input"
    source_path = sample["line"][0]
    sample.update(load_rgb_data(source_path, prefix))
    normalized = sample[prefix + "_norm"]
    meta = sample.setdefault("meta", {})
    meta["orig_res"] = [normalized.shape[-2], normalized.shape[-1]]
def _lanczos_resize_chw(x, out_hw):
    """Resize a channel-first image to ``out_hw`` with PIL's Lanczos filter.

    Args:
        x: 3-D ``torch.Tensor`` or ``np.ndarray`` in CHW layout.
        out_hw: Target ``(height, width)``; both values are cast to int.

    Returns:
        A float32 result of shape ``[C, H_out, W_out]``. Tensor inputs come back
        as tensors on the input's device, array inputs as arrays.
    """
    target_h, target_w = map(int, out_hw)
    from_tensor = isinstance(x, torch.Tensor)
    data = x.detach().cpu().numpy() if from_tensor else x
    assert isinstance(data, np.ndarray) and data.ndim == 3, "expect CHW"
    planes = data.astype(np.float32, copy=False)
    resized = np.empty((planes.shape[0], target_h, target_w), dtype=np.float32)
    # PIL resizes one float plane at a time (mode "F").
    for idx, plane in enumerate(planes):
        plane_img = Image.fromarray(plane).convert("F")
        plane_img = plane_img.resize((target_w, target_h), resample=Image.LANCZOS)
        resized[idx] = np.asarray(plane_img, dtype=np.float32)
    if from_tensor:
        return torch.from_numpy(resized).to(x.device)
    return resized
def reshape(sample, height, width):
    """Resize every image-like tensor in ``sample`` to ``(height, width)`` in place.

    Entries that are not tensors, have fewer than two dimensions, or whose key
    contains ``"orig"`` are left untouched. Boolean tensors are re-thresholded
    at 0.5 and other non-floating tensors are rounded back to their dtype;
    floating tensors come back as float32.

    Returns:
        The same ``sample`` dict.
    """
    target = (height, width)
    for key, value in list(sample.items()):
        if not torch.is_tensor(value) or value.ndim < 2 or "orig" in key:
            continue
        resized = _lanczos_resize_chw(value.to(torch.float32), target)
        if value.dtype == torch.bool:
            resized = resized > 0.5
        elif not torch.is_floating_point(value):
            resized = resized.round().to(value.dtype)
        sample[key] = resized
    return sample
def tile(sample, processing_resolution: int):
    """Crop the tile described by ``sample["tile_info"]`` and resize it for the model.

    The image is first resampled to the grid the tile box refers to (upscaled,
    keeping aspect ratio, when it is smaller than the tile), then
    ``input_int`` and ``input_norm`` are cropped to the box, and finally the
    sample is resized to ``processing_resolution`` x ``processing_resolution``.
    All changes are made in place.
    """
    left, top, right, bottom = map(int, sample["tile_info"])
    tile_w = right - left
    tile_h = bottom - top
    img_h, img_w = sample["input_norm"].shape[-2:]
    # Upscale so that the short side reaches the tile width.
    if img_w < tile_w or img_h < tile_h:
        factor = tile_w / min(img_w, img_h)
        img_w, img_h = round(factor * img_w), round(factor * img_h)
    reshape(sample, height=img_h, width=img_w)
    for key in ("input_int", "input_norm"):
        sample[key] = sample[key][:, top:bottom, left:right]
    reshape(sample, height=processing_resolution, width=processing_resolution)
@torch.no_grad()
def validate_single_dataset(
    vae: AutoencoderKLQwenImage,
    transformer: QwenImageTransformer2DModel,
    embeds_dict: dict[str, torch.Tensor],
    data_loader: DataLoader,
    save_to_dir: str = None,
    save_comparison: bool = True,
    save_alternating: bool = True,
):
    """Run the model on every tile, stitch tiles per image and save the results.

    Tile predictions are buffered until a sample flagged ``is_last_tile`` is
    seen; the buffered tiles are then blended into one image, resized to the
    resolution stored in ``meta["orig_res"]`` and handed to ``visualize``.
    This relies on the loader yielding all tiles of an image consecutively
    (no shuffling).

    Args:
        vae: Autoencoder passed to ``encode`` / ``flow_step`` / ``decode``.
        transformer: Transformer passed to ``flow_step``.
        embeds_dict: Embeddings passed to ``flow_step``.
        data_loader: Loader yielding batches with ``input_norm``, ``idx``,
            ``line``, ``tile_info``, ``is_last_tile`` and ``meta`` entries.
        save_to_dir: Output directory forwarded to ``visualize`` unchanged.
        save_comparison: Forwarded to ``visualize``.
        save_alternating: Forwarded to ``visualize``.

    Returns:
        None. Results are written by ``visualize``.
    """
    preds = []
    for batch in tqdm(data_loader, desc="Reflection Removal Progress"):
        batch["out"] = {}
        latents = encode(batch["input_norm"], vae)
        latents = flow_step(latents, transformer, vae, embeds_dict)
        batch["out"]["pixel_pred"] = decode(latents, vae)
        for b in range(len(batch["idx"])):
            preds.append(
                {
                    "file": batch["line"][0][b],
                    # [x0, y0, x1, y1] of the tile
                    "tile_info": [batch["tile_info"][k][b] for k in range(4)],
                    # Prediction for this tile in [-1, 1]; presumably shaped
                    # [1, 3, H, W] (squeezed below) - TODO confirm against decode
                    "pred": batch["out"]["pixel_pred"][b].to("cpu"),
                }
            )
            if not batch["is_last_tile"][b]:
                continue

            # --- Stitch all buffered tiles of this image ---
            canvas_w = max(int(t["tile_info"][2]) for t in preds)
            canvas_h = max(int(t["tile_info"][3]) for t in preds)
            acc = torch.zeros(3, canvas_h, canvas_w, dtype=torch.float32)
            wsum = torch.zeros(canvas_h, canvas_w, dtype=torch.float32)
            for t in preds:
                x0, y0, x1, y1 = map(int, t["tile_info"])
                tile_pred = t["pred"].squeeze(0).float()  # [3, h, w], [-1,1]
                h, w = tile_pred.shape[-2:]
                tH, tW = (y1 - y0), (x1 - x0)
                # Bring the model-resolution tile back to the size of its box.
                if (h != tH) or (w != tW):
                    tile_pred = _lanczos_resize_chw(tile_pred, (tH, tW))
                    h, w = tH, tW
                # Triangular window: full weight at the tile centre, fading
                # towards the borders so overlapping tiles blend smoothly.
                # fmt: off
                wx = 1 - (2 * torch.arange(w, dtype=torch.float32) / (max(w - 1, 1)) - 1).abs()
                wy = 1 - (2 * torch.arange(h, dtype=torch.float32) / (max(h - 1, 1)) - 1).abs()
                # fmt: on
                # Floor the weight so border pixels covered by a single tile
                # do not end up with a zero denominator.
                w2 = (wy[:, None] * wx[None, :]).clamp_min(1e-3)
                acc[:, y0:y1, x0:x1] += tile_pred * w2
                wsum[y0:y1, x0:x1] += w2
            stitched = acc / wsum.clamp_min(1e-6)  # [3,H,W], [-1,1]
            preds = []  # start buffering the next image

            # --- Lanczos resize to the resolution recorded at load time ---
            orig_H, orig_W = (
                batch["meta"]["orig_res"][0][b].item(),
                batch["meta"]["orig_res"][1][b].item(),
            )
            x01 = ((stitched + 1.0) / 2.0).clamp(0.0, 1.0)
            pil = torchvision.transforms.functional.to_pil_image(x01.cpu())
            pil_resized = pil.resize((orig_W, orig_H), resample=Image.LANCZOS)
            pred = torchvision.transforms.functional.to_tensor(pil_resized).cpu().numpy()  # [3,H,W], [0,1]

            scene_path = batch["line"][0][b]
            # Strip directory and extension robustly: slicing off the last four
            # characters mangled names such as "photo.jpeg" or "scan.webp".
            scene_name = os.path.splitext(os.path.basename(scene_path))[0]
            # Load original input image (CHW, uint8 in [0,255])
            input_chw = read_rgb_file(scene_path)
            input_hwc = (
                np.transpose(input_chw, (1, 2, 0)).astype(np.float32) / 255.0
            )  # [H,W,3], [0,1]
            pred_hwc = np.transpose(pred, (1, 2, 0))
            # Safety net: make the prediction match the input resolution.
            if input_hwc.shape[:2] != pred_hwc.shape[:2]:
                pil_pred = Image.fromarray(
                    (pred_hwc.clip(0, 1) * 255).round().astype(np.uint8)
                )
                H_in, W_in = input_hwc.shape[:2]
                pil_pred = pil_pred.resize((W_in, H_in), resample=Image.LANCZOS)
                pred_hwc = (np.array(pil_pred, dtype=np.uint8) / 255.0).clip(0, 1)
            visualize(
                file_prefix=scene_name,
                input_hwc=input_hwc,
                pred_hwc=pred_hwc,
                output_dir=save_to_dir,
                save_comparison=save_comparison,
                save_alternating=save_alternating,
            )
    return
def save_prediction_only(
    file_prefix: str,
    pred_uint8: np.ndarray,
    output_dir: str,
) -> None:
    """Write the prediction to ``<file_prefix>_windowseat_output.png`` in ``output_dir``."""
    target_path = os.path.join(output_dir, f"{file_prefix}_windowseat_output.png")
    imageio.imwrite(target_path, pred_uint8, plugin="pillow")
def save_comparison_image(
    file_prefix: str,
    pred_uint8: np.ndarray,
    input_uint8: np.ndarray,
    output_dir: str,
    margin_width: int = 10,
) -> None:
    """Save input and prediction side by side, separated by a white margin.

    The prediction is Lanczos-resized to the input's height and width when the
    two differ. The result is written to
    ``<file_prefix>_windowseat_side_by_side.png`` in ``output_dir``.
    """
    in_h, in_w, _channels = input_uint8.shape
    if pred_uint8.shape[:2] != (in_h, in_w):
        resized = Image.fromarray(pred_uint8).resize(
            (in_w, in_h), resample=Image.LANCZOS
        )
        pred_uint8 = np.asarray(resized, dtype=np.uint8)
    spacer = np.full((in_h, margin_width, 3), 255, dtype=np.uint8)
    side_by_side = np.concatenate((input_uint8, spacer, pred_uint8), axis=1)
    target_path = os.path.join(
        output_dir, f"{file_prefix}_windowseat_side_by_side.png"
    )
    imageio.imwrite(target_path, side_by_side, plugin="pillow")
def save_alternating_video(
    file_prefix: str,
    input_uint8: np.ndarray,
    pred_uint8: np.ndarray,
    output_dir: str,
    fps: float = 1.0,
    total_frames: int = 20,
) -> None:
    """Write a video that flips between the input and the prediction.

    Frames alternate starting with the input, for ``total_frames`` frames at
    ``fps``. Odd heights or widths are padded by one edge-replicated row or
    column so both dimensions are even. The file is written to
    ``<file_prefix>_windowseat_alternating.mp4`` in ``output_dir``.
    """
    video_path = os.path.join(output_dir, f"{file_prefix}_windowseat_alternating.mp4")
    height, width = input_uint8.shape[:2]
    extra_row, extra_col = height % 2, width % 2
    if extra_row or extra_col:
        padding = ((0, extra_row), (0, extra_col), (0, 0))
        input_uint8 = np.pad(input_uint8, padding, mode="edge")
        pred_uint8 = np.pad(pred_uint8, padding, mode="edge")
    frames = (input_uint8, pred_uint8)
    with imageio.get_writer(
        video_path, fps=fps, macro_block_size=1, ffmpeg_params=["-loglevel", "quiet"]
    ) as writer:
        for frame_idx in range(total_frames):
            writer.append_data(frames[frame_idx % 2])
def visualize(
    file_prefix: str,
    input_hwc: np.ndarray,
    pred_hwc: np.ndarray,
    output_dir: str,
    save_comparison: bool = True,
    save_alternating: bool = True,
) -> None:
    """Convert input and prediction to uint8 and write the requested outputs.

    Args:
        file_prefix: Base name used for every output file.
        input_hwc: Input image; values above 1.0 are taken to mean a [0, 255]
            range and are rescaled to [0, 1].
        pred_hwc: Prediction, clipped to [0, 1] before conversion.
        output_dir: Directory the files are written to.
        save_comparison: Also write the side-by-side image.
        save_alternating: Also write the alternating video.
    """
    pred_uint8 = (pred_hwc.clip(0, 1) * 255).round().astype(np.uint8)
    input_float = np.asarray(input_hwc, dtype=np.float32)
    if input_float.max() > 1.0:
        input_float = input_float / 255.0
    input_uint8 = (input_float.clip(0, 1) * 255).round().astype(np.uint8)
    shared = {
        "file_prefix": file_prefix,
        "pred_uint8": pred_uint8,
        "output_dir": output_dir,
    }
    # The plain prediction is always written.
    save_prediction_only(**shared)
    if save_comparison:
        save_comparison_image(input_uint8=input_uint8, **shared)
    if save_alternating:
        save_alternating_video(input_uint8=input_uint8, **shared)
def data_transform(sample, processing_resolution=None):
    """Turn a raw dataset sample into a model-ready tile, in place.

    Runs, in order: ``read_scalars``, ``read_rgb_image`` and ``tile`` (the
    latter with ``processing_resolution``).
    """
    for stage in (read_scalars, read_rgb_image):
        stage(sample)
    tile(sample, processing_resolution)
def run_inference(
    vae: AutoencoderKLQwenImage,
    transformer: QwenImageTransformer2DModel,
    embeds_dict: dict[str, torch.Tensor],
    processing_resolution: int,
    image_dir: str,
    output_dir: str,
    use_short_edge_tile=True,
    save_comparison=True,
    save_alternating=True,
    batch_size: int = 2,
):
    """Tile every image in ``image_dir``, run the model and write results.

    Args:
        vae: Autoencoder forwarded to ``validate_single_dataset``.
        transformer: Transformer forwarded to ``validate_single_dataset``.
        embeds_dict: Embeddings forwarded to ``validate_single_dataset``.
        processing_resolution: Side length tiles are resized to; also used as
            the preferred tile size of the dataset.
        image_dir: Folder with the input images.
        output_dir: Folder for the outputs; created if missing.
        use_short_edge_tile: Forwarded to ``TilingDataset``.
        save_comparison: Also write side-by-side images.
        save_alternating: Also write alternating videos.
        batch_size: Number of tiles per forward pass. Defaults to 2, the value
            that used to be hard-coded.
    """
    dataset = TilingDataset(
        transform_graph=functools.partial(data_transform, processing_resolution=processing_resolution),
        input_folder=image_dir,
        gt_folder=image_dir,
        use_short_edge_tile=use_short_edge_tile,
        tiling_w=processing_resolution,
        tiling_h=processing_resolution,
        processing_resolution=processing_resolution,
    )
    # shuffle must stay off: stitching needs the tiles of one image to arrive
    # consecutively and in order.
    data_loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )
    os.makedirs(output_dir, exist_ok=True)
    validate_single_dataset(
        vae,
        transformer,
        embeds_dict,
        data_loader=data_loader,
        save_to_dir=output_dir,
        save_comparison=save_comparison,
        save_alternating=save_alternating,
    )
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Default input and output directories are ``example_images`` and
    ``outputs`` next to this script.

    Returns:
        The ``argparse.Namespace`` with ``input_dir``, ``output_dir``,
        ``uri_base``, ``uri_lora``, ``more_tiles``, ``save_comparison``,
        ``save_alternating`` and ``device``.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    parser = argparse.ArgumentParser(
        description="WindowSeat: reflection removal inference"
    )
    # (flag, default, help) for the plain string options, in display order.
    string_options = (
        (
            "--input-dir",
            os.path.join(script_dir, "example_images"),
            "Directory with input images (default: %(default)s)",
        ),
        (
            "--output-dir",
            os.path.join(script_dir, "outputs"),
            "Directory to write predictions (default: %(default)s)",
        ),
        (
            "--uri-base",
            SUPPORTED_MODEL_URIS[0],
            "URI of the base model (default: %(default)s)",
        ),
        (
            "--uri-lora",
            LORA_MODEL_URI,
            "URI of the LoRA model (default: %(default)s)",
        ),
    )
    for flag, default, help_text in string_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)
    parser.add_argument(
        "--more-tiles",
        action="store_true",
        help="Use more tiles for processing.",
    )
    # (flag, dest, help) for the switches that turn an output off.
    disable_switches = (
        (
            "--no-save-comparison",
            "save_comparison",
            "Do NOT save comparison image between input and prediction.",
        ),
        (
            "--no-save-alternating",
            "save_alternating",
            "Do NOT save alternating video.",
        ),
    )
    for flag, dest, help_text in disable_switches:
        parser.add_argument(flag, dest=dest, action="store_false", help=help_text)
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device used for inference.",
    )
    return parser.parse_args()
def main():
    """Command-line entry point: parse arguments, load the network, run inference.

    Exits with status 1 when the input directory does not exist. Errors raised
    while loading the network are reported and re-raised.
    """
    args = parse_args()
    image_dir = args.input_dir
    output_dir = args.output_dir
    uri_base = args.uri_base
    uri_lora = args.uri_lora
    # --more-tiles switches from one short-edge tile size to the fixed tiling.
    use_short_edge_tile = not args.more_tiles
    save_comparison = args.save_comparison
    save_alternating = args.save_alternating
    device = torch.device(args.device)
    # Compare the device *type*: torch.device("cuda:0") != torch.device("cuda"),
    # so an equality check would warn for explicitly indexed CUDA devices.
    if device.type != "cuda":
        warnings.warn(
            f"WindowSeat inference was only tested with 'cuda'. "
            f"Device {device} is not officially supported and may be slow or fail."
        )
    if not os.path.isdir(image_dir):
        print_error(f"Input image directory does not exist: {image_dir}")
        sys.exit(1)
    os.makedirs(output_dir, exist_ok=True)
    print_banner("WindowSeat: Reflection Removal")
    print_step("1/2", "Loading network components:")
    print_info(f"Base:        {uri_base}")
    print_info(f"WindowSeat:  {uri_lora}")
    try:
        vae, transformer, embeds_dict, processing_resolution = load_network(uri_base, uri_lora, device)
    except Exception as e:
        print_error(f"Failed to load network: {e}")
        raise
    print_step("2/2", f"Running reflection removal inference on: {image_dir}")
    run_inference(
        vae, transformer, embeds_dict, processing_resolution, image_dir, output_dir, use_short_edge_tile, save_comparison, save_alternating
    )
    print_final_success(output_dir)
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()