Spaces:

OK-AI
/

ViT-Patch-PCA-Visualisation

Running

App Files Files Community

Tenbatsu24 committed 7 days ago

Commit

69eec85

1 Parent(s): 3cd97ec

add: vitb16 support and more user inputs.

Browse files

Files changed (1) hide show

app.py +143 -63

app.py CHANGED Viewed

@@ -14,42 +14,51 @@ from sklearn.decomposition import PCA
 IMAGENET_MEAN = [0.485, 0.456, 0.406]
 IMAGENET_STD = [0.229, 0.224, 0.225]
-IMAGE_SIZE = 672
 PATCH_SIZE = 16
 PCA_COMPONENTS = 3
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_IDS = {
-    "DiNO": "OK-AI/dino-vits16-pretrain-in1k",
-    "iBOT": "OK-AI/ibot-vits16-pretrain-in1k",
-    "LeJEPA": "OK-AI/lejepa-vits16-pretrain-in1k",
 }
-MODEL_NAMES = list(MODEL_IDS.keys())  # fixed order
 # ── model loading (cached) ────────────────────────────────────────────────────
 _model_cache: dict[str, torch.nn.Module] = {}
-def get_model(name: str) -> torch.nn.Module:
-    if name not in _model_cache:
         model = AutoModel.from_pretrained(
-            MODEL_IDS[name],
             trust_remote_code=True,
         )
         model.eval().to(DEVICE)
-        _model_cache[name] = model
-    return _model_cache[name]
 # ── image helpers ─────────────────────────────────────────────────────────────
 def resize_image_for_patches(
-    image: Image.Image,
-    image_size: int = IMAGE_SIZE,
-    patch_size: int = PATCH_SIZE,
 ) -> torch.Tensor:
     """Resize so height = image_size and width is patch-aligned,
     preserving aspect ratio. Returns (1, 3, H, W) float tensor."""
@@ -71,12 +80,12 @@ def preprocess(image_tensor: torch.Tensor) -> torch.Tensor:
     ).unsqueeze(0)
-def pad_to_square(img: Image.Image) -> Image.Image:
     """Letterbox/pillarbox img onto a square canvas with a dark background.
     Ensures all output images share the same dimensions so the Gradio row
     never reflows or stretches when aspect ratios differ."""
     w, h = img.size
-    size = max(w, h)
     canvas = Image.new("RGB", (size, size), color=(18, 18, 18))
     canvas.paste(img, ((size - w) // 2, (size - h) // 2))
     return canvas
@@ -85,7 +94,7 @@ def pad_to_square(img: Image.Image) -> Image.Image:
 # ── PCA visualisation ─────────────────────────────────────────────────────────
-def pca_vis(model: torch.nn.Module, image_tensor: torch.Tensor) -> Image.Image:
     """Run image through model, PCA patch features → square-padded RGB PIL image."""
     model_input = preprocess(image_tensor).to(DEVICE)
@@ -107,30 +116,49 @@ def pca_vis(model: torch.nn.Module, image_tensor: torch.Tensor) -> Image.Image:
     # nearest-neighbour upscale → pad to square so all outputs are the same size
     upscaled = Image.fromarray(pca_array, mode="RGB").resize((W, H), Image.NEAREST)
-    return pad_to_square(upscaled)
 # ── streaming inference ───────────────────────────────────────────────────────
-PENDING = Image.new("RGB", (IMAGE_SIZE, IMAGE_SIZE), color=(18, 18, 18))
-def run(pil_image: Image.Image):
     """
-    Generator: yields (dino_out, ibot_out, lejepa_out) after each model
-    finishes, so the UI updates one image at a time.
     """
     if pil_image is None:
         raise gr.Error("Please upload an image.")
     pil_image = pil_image.convert("RGB")
-    image_tensor = resize_image_for_patches(pil_image)
-    results = [PENDING, PENDING, PENDING]
-    for i, name in enumerate(MODEL_NAMES):
-        model = get_model(name)
-        results[i] = pca_vis(model, image_tensor)
-        yield tuple(results)
 # ── UI ────────────────────────────────────────────────────────────────────────
@@ -146,6 +174,14 @@ CSS = """
     font-size: 0.9rem;
     padding-bottom: 1rem;
 }
 .model-label {
     text-align: center;
     font-weight: 600;
@@ -153,11 +189,20 @@ CSS = """
     color: #374151;
     padding: 0.25rem 0;
 }
 .output-col {
-    display: flex;
-    flex-direction: column;
-    align-items: center;
-    gap: 0.25rem;
 }
 .subtitle-row a, .model-label a {
     color: inherit;
@@ -171,7 +216,6 @@ footer { display: none !important; }
 """
 with gr.Blocks(css=CSS, title="SSL ViT PCA Visualiser") as demo:
     gr.HTML("""
         <div class="title-row">
             <h1 style="font-size:1.6rem; font-weight:700; margin:0;">
@@ -179,10 +223,8 @@ with gr.Blocks(css=CSS, title="SSL ViT PCA Visualiser") as demo:
             </h1>
         </div>
         <div class="subtitle-row">
-            ViT-S/16 &nbsp;·&nbsp; ImageNet-1K pre-training &nbsp;·&nbsp;
-            <a href="https://huggingface.co/OK-AI/dino-vits16-pretrain-in1k" target="_blank">DiNO</a> &nbsp;·&nbsp;
-            <a href="https://huggingface.co/OK-AI/ibot-vits16-pretrain-in1k" target="_blank">iBOT</a> &nbsp;·&nbsp;
-            <a href="https://huggingface.co/OK-AI/lejepa-vits16-pretrain-in1k" target="_blank">LeJEPA</a>
         </div>
     """)
@@ -193,49 +235,87 @@ with gr.Blocks(css=CSS, title="SSL ViT PCA Visualiser") as demo:
                 label="Input image",
                 show_label=True,
             )
             run_btn = gr.Button("Visualise", variant="primary")
             gr.HTML("""
                 <p style="font-size:0.8rem; color:#9ca3af; margin-top:0.5rem; line-height:1.5;">
-                    Image is resized to 672 px tall (patch-aligned, aspect preserved)
-                    before inference. PCA is fit on all patch tokens and projected to
                     3 components, then scaled with sigmoid for colour display.
-                    Results appear as each model finishes.
-                </p>
-                <p style="font-size:0.75rem; color:#9ca3af; margin-top:0.25rem;">
-                    Models: <a href="https://huggingface.co/OK-AI" target="_blank">OK-AI on HuggingFace</a>
-                    &nbsp;·&nbsp;
-                    Code: <a href="https://github.com/Open-Knowledge-AI/lite_ssl" target="_blank">lite_ssl</a>
                 </p>
             """)
         with gr.Column(scale=3):
             with gr.Row(equal_height=True):
                 with gr.Column(elem_classes="output-col"):
-                    gr.HTML('<div class="model-label"><a href="https://huggingface.co/OK-AI/dino-vits16-pretrain-in1k" target="_blank">DiNO</a></div>')
-                    out_dino = gr.Image(show_label=False, interactive=False)
                 with gr.Column(elem_classes="output-col"):
-                    gr.HTML('<div class="model-label"><a href="https://huggingface.co/OK-AI/ibot-vits16-pretrain-in1k" target="_blank">iBOT</a></div>')
-                    out_ibot = gr.Image(show_label=False, interactive=False)
                 with gr.Column(elem_classes="output-col"):
-                    gr.HTML('<div class="model-label"><a href="https://huggingface.co/OK-AI/lejepa-vits16-pretrain-in1k" target="_blank">LeJEPA</a></div>')
-                    out_lejepa = gr.Image(show_label=False, interactive=False)
     run_btn.click(
         fn=run,
-        inputs=[input_image],
-        outputs=[out_dino, out_ibot, out_lejepa],
-    )
-    gr.Examples(
-        examples=[
-            [f"examples/{f}"]
-            for f in sorted(os.listdir("examples"))
-            if f.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))
-        ],
-        inputs=[input_image],
     )
 if __name__ == "__main__":
     demo.launch()

 IMAGENET_MEAN = [0.485, 0.456, 0.406]
 IMAGENET_STD = [0.229, 0.224, 0.225]
 PATCH_SIZE = 16
 PCA_COMPONENTS = 3
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_IDS = {
+    "ViT-S/16": {
+        "DiNO": "OK-AI/dino-vits16-pretrain-in1k",
+        "iBOT": "OK-AI/ibot-vits16-pretrain-in1k",
+        "LeJEPA": "OK-AI/lejepa-vits16-pretrain-in1k",
+    },
+    "ViT-B/16": {
+        "DiNO": "OK-AI/dino-vitb16-pretrain-in1k",
+        "iBOT": "OK-AI/ibot-vitb16-pretrain-in1k",
+        "LeJEPA": "OK-AI/lejepa-vitb16-pretrain-in1k",
+    }
 }
+MODEL_KEYS = ["DiNO", "iBOT", "LeJEPA"]
 # ── model loading (cached) ────────────────────────────────────────────────────
 _model_cache: dict[str, torch.nn.Module] = {}
+def get_model(repo_id: str, revision: str) -> torch.nn.Module:
+    cache_key = f"{repo_id}@{revision}"
+    if cache_key not in _model_cache:
         model = AutoModel.from_pretrained(
+            repo_id,
+            revision=revision,
             trust_remote_code=True,
         )
         model.eval().to(DEVICE)
+        _model_cache[cache_key] = model
+    return _model_cache[cache_key]
 # ── image helpers ─────────────────────────────────────────────────────────────
 def resize_image_for_patches(
+        image: Image.Image,
+        image_size: int,
+        patch_size: int = PATCH_SIZE,
 ) -> torch.Tensor:
     """Resize so height = image_size and width is patch-aligned,
     preserving aspect ratio. Returns (1, 3, H, W) float tensor."""
     ).unsqueeze(0)
+def pad_to_square(img: Image.Image, canvas_size: int) -> Image.Image:
     """Letterbox/pillarbox img onto a square canvas with a dark background.
     Ensures all output images share the same dimensions so the Gradio row
     never reflows or stretches when aspect ratios differ."""
     w, h = img.size
+    size = max(w, h, canvas_size)
     canvas = Image.new("RGB", (size, size), color=(18, 18, 18))
     canvas.paste(img, ((size - w) // 2, (size - h) // 2))
     return canvas
 # ── PCA visualisation ─────────────────────────────────────────────────────────
+def pca_vis(model: torch.nn.Module, image_tensor: torch.Tensor, canvas_size: int) -> Image.Image:
     """Run image through model, PCA patch features → square-padded RGB PIL image."""
     model_input = preprocess(image_tensor).to(DEVICE)
     # nearest-neighbour upscale → pad to square so all outputs are the same size
     upscaled = Image.fromarray(pca_array, mode="RGB").resize((W, H), Image.NEAREST)
+    return pad_to_square(upscaled, canvas_size)
 # ── streaming inference ───────────────────────────────────────────────────────
+def run(pil_image: Image.Image, epoch: str, weight_type: str, image_size: int):
     """
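+    Generator: yields the tuple of six output images, first filled with
+    placeholders and then again after each model finishes (ViT-S/16 models
+    before ViT-B/16), so the UI updates one image at a time.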
     """
     if pil_image is None:
         raise gr.Error("Please upload an image.")
+    image_size = int(image_size)
+    pending_img = Image.new("RGB", (image_size, image_size), color=(18, 18, 18))
+    # 6 total positions: ViT-S [dino, ibot, lejepa], ViT-B [dino, ibot, lejepa]
+    results = [pending_img] * 6
+    yield tuple(results)
     pil_image = pil_image.convert("RGB")
+    image_tensor = resize_image_for_patches(pil_image, image_size)
+    idx = 0
+    for arch in ["ViT-S/16", "ViT-B/16"]:
+        for model_key in MODEL_KEYS:
+            repo_id = MODEL_IDS[arch][model_key]
+            # LeJEPA only supports student weights
+            current_weight = "student" if model_key == "LeJEPA" else weight_type
+            revision = f"{epoch}/{current_weight}"
+            try:
+                model = get_model(repo_id, revision)
+                results[idx] = pca_vis(model, image_tensor, image_size)
+            except Exception as e:
+                print(f"Error processing {repo_id} ({revision}): {e}")
+                # Show a dark-red placeholder if loading or running this model/revision fails
+                error_canvas = Image.new("RGB", (image_size, image_size), color=(40, 20, 20))
+                results[idx] = error_canvas
+            yield tuple(results)
+            idx += 1
 # ── UI ────────────────────────────────────────────────────────────────────────
     font-size: 0.9rem;
     padding-bottom: 1rem;
 }
+.arch-header {
+    font-size: 1.2rem;
+    font-weight: 700;
+    margin-top: 1rem;
+    padding-left: 0.5rem;
+    border-left: 4px solid #3b82f6;
+    color: #1f2937;
+}
 .model-label {
     text-align: center;
     font-weight: 600;
     color: #374151;
     padding: 0.25rem 0;
 }
+/* Ensure strict rigid layouts for outputs to avoid layout shifting */
 .output-col {
+    display: flex !important;
+    flex-direction: column !important;
+    align-items: center !important;
+    gap: 0.25rem !important;
+    flex: 1 1 0% !important;
+    min-width: 150px !important;
+}
+.output-col img {
+    aspect-ratio: 1 / 1 !important;
+    object-fit: contain !important;
+    max-height: 350px !important;
+    width: 100% !important;
 }
 .subtitle-row a, .model-label a {
     color: inherit;
 """
 with gr.Blocks(css=CSS, title="SSL ViT PCA Visualiser") as demo:
     gr.HTML("""
         <div class="title-row">
             <h1 style="font-size:1.6rem; font-weight:700; margin:0;">
             </h1>
         </div>
         <div class="subtitle-row">
+            ImageNet-1K pre-training &nbsp;·&nbsp;
+            <a href="https://huggingface.co/OK-AI" target="_blank">OK-AI Models</a>
         </div>
     """)
                 label="Input image",
                 show_label=True,
             )
+            with gr.Row():
+                opt_epoch = gr.Dropdown(
+                    choices=["ep100", "ep300"],
+                    value="ep300",
+                    label="Epochs",
+                    interactive=True
+                )
+                opt_weight = gr.Dropdown(
+                    choices=["student", "teacher"],
+                    value="teacher",
+                    label="Weight Type",
+                    info="LeJEPA always uses student",
+                    interactive=True
+                )
+            opt_size = gr.Dropdown(
+                choices=["224", "448", "672", "1280"],
+                value="672",
+                label="Image Target Resolution",
+                interactive=True
+            )
             run_btn = gr.Button("Visualise", variant="primary")
             gr.HTML("""
                 <p style="font-size:0.8rem; color:#9ca3af; margin-top:0.5rem; line-height:1.5;">
+                    PCA is fit on all patch tokens and projected to
                     3 components, then scaled with sigmoid for colour display.
+                    Results stream seamlessly into view as individual variants complete.
                 </p>
             """)
         with gr.Column(scale=3):
+            # ── ViT-S/16 Row ──
+            gr.HTML('<div class="arch-header">ViT-S/16 Grid</div>')
+            with gr.Row(equal_height=True):
+                with gr.Column(elem_classes="output-col"):
+                    gr.HTML('<div class="model-label">DiNO (S/16)</div>')
+                    out_dino_s = gr.Image(show_label=False, interactive=False)
+                with gr.Column(elem_classes="output-col"):
+                    gr.HTML('<div class="model-label">iBOT (S/16)</div>')
+                    out_ibot_s = gr.Image(show_label=False, interactive=False)
+                with gr.Column(elem_classes="output-col"):
+                    gr.HTML('<div class="model-label">LeJEPA (S/16)</div>')
+                    out_lejepa_s = gr.Image(show_label=False, interactive=False)
+            # ── ViT-B/16 Row ──
+            gr.HTML('<div class="arch-header">ViT-B/16 Grid</div>')
             with gr.Row(equal_height=True):
                 with gr.Column(elem_classes="output-col"):
+                    gr.HTML('<div class="model-label">DiNO (B/16)</div>')
+                    out_dino_b = gr.Image(show_label=False, interactive=False)
                 with gr.Column(elem_classes="output-col"):
+                    gr.HTML('<div class="model-label">iBOT (B/16)</div>')
+                    out_ibot_b = gr.Image(show_label=False, interactive=False)
                 with gr.Column(elem_classes="output-col"):
+                    gr.HTML('<div class="model-label">LeJEPA (B/16)</div>')
+                    out_lejepa_b = gr.Image(show_label=False, interactive=False)
+    # Wire the outputs in the same order the `run` loop fills its results: ViT-S/16 then ViT-B/16, each as DiNO, iBOT, LeJEPA
+    output_targets = [
+        out_dino_s, out_ibot_s, out_lejepa_s,
+        out_dino_b, out_ibot_b, out_lejepa_b
+    ]
     run_btn.click(
         fn=run,
+        inputs=[input_image, opt_epoch, opt_weight, opt_size],
+        outputs=output_targets,
     )
+    if os.path.exists("examples"):
+        gr.Examples(
+            examples=[
+                [f"examples/{f}", "ep300", "teacher", "672"]
+                for f in sorted(os.listdir("examples"))
+                if f.lower().endswith((".jpg", ".jpeg", ".png", ".webp"))
+            ],
+            inputs=[input_image, opt_epoch, opt_weight, opt_size],
+        )
 if __name__ == "__main__":
     demo.launch()