init MetaUAS

- app.py +160 -0
- demo_metauas.py +90 -0
- metauas.py +293 -0
- requirements.txt +14 -0
app.py
ADDED
@@ -0,0 +1,160 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@File    : app.py
@Time    : 2025/03/26 23:48:24
@Author  : Bin-Bin Gao
@Email   : csgaobb@gmail.com
@Homepage: https://csgaobb.github.io/
@Version : 1.0
@Desc    : MetaUAS Demo with Gradio
'''


import os
import cv2
import torch
import json
import shutil
import kornia as K
import numpy as np
import gradio as gr

from easydict import EasyDict
from argparse import ArgumentParser
from torchvision.transforms.functional import pil_to_tensor

from metauas import MetaUAS, set_random_seed, normalize, apply_ad_scoremap, safely_load_state_dict

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# configurations
random_seed = 1
encoder_name = 'efficientnet-b4'
decoder_name = 'unet'
encoder_depth = 5
decoder_depth = 5
num_alignment_layers = 3
alignment_type = 'sa'
fusion_policy = 'cat'


# build model
set_random_seed(random_seed)
metauas_model = MetaUAS(encoder_name,
                        decoder_name,
                        encoder_depth,
                        decoder_depth,
                        num_alignment_layers,
                        alignment_type,
                        fusion_policy
                        )

def process_image(prompt_img, query_img, options):
    # Load the checkpoint matching the selected model size
    if 'model-512' in options:
        ckt_path = "weights/metauas-512.ckpt"
        img_size = 512
    else:
        ckt_path = 'weights/metauas-256.ckpt'
        img_size = 256
    model = safely_load_state_dict(metauas_model, ckt_path)

    model.to(device)
    model.eval()

    # Ensure images are in RGB mode
    prompt_img = prompt_img.convert('RGB')
    query_img = query_img.convert('RGB')

    query_img = pil_to_tensor(query_img).float() / 255.0
    prompt_img = pil_to_tensor(prompt_img).float() / 255.0

    if query_img.shape[1] != img_size:
        resize_trans = K.augmentation.Resize([img_size, img_size], return_transform=True)
        query_img = resize_trans(query_img)[0]
        prompt_img = resize_trans(prompt_img)[0]

    test_data = {
        "query_image": query_img.to(device),
        "prompt_image": prompt_img.to(device),
    }

    # Forward
    with torch.no_grad():
        predicted_masks = model(test_data)
        anomaly_score = predicted_masks.max()

    # Process anomaly map
    query_img = test_data["query_image"][0] * 255
    query_img = query_img.permute(1, 2, 0)

    anomaly_map = predicted_masks.squeeze().detach()[:, :, None].cpu().numpy().repeat(3, 2)

    anomaly_map_vis = apply_ad_scoremap(query_img.cpu(), normalize(anomaly_map))

    anomaly_map = (anomaly_map * 255).astype(np.uint8)
    anomaly_map = cv2.applyColorMap(anomaly_map, cv2.COLORMAP_JET)
    anomaly_map = cv2.cvtColor(anomaly_map, cv2.COLOR_BGR2RGB)

    return anomaly_map_vis, anomaly_map, f'{anomaly_score:.3f}'

# Define examples
examples = [
    ["images/134.png", "images/000.png", "model-256"],
    ["images/036.png", "images/024.png", "model-256"],
    ["images/178.png", "images/003.png", "model-256"],
]

# Gradio interface layout
with gr.Blocks() as demo:
    gr.HTML("""<h1 align="center" style="margin-top: 30px;">MetaUAS: Universal Anomaly Segmentation</h1>""")
    gr.HTML("""<h1 align="center" style="font-size: 15px; margin-top: 40px;">just given ONE normal image prompt</h1>""")

    with gr.Row():
        with gr.Column():
            with gr.Row():
                prompt_image = gr.Image(type="pil", label="Prompt Image")
                query_image = gr.Image(type="pil", label="Query Image")
            model_selector = gr.Radio(["model-256", "model-512"], label="Pretrained Models")

        with gr.Column():
            with gr.Row():
                anomaly_map_vis = gr.Image(type="pil", label="Anomaly Results")
                anomaly_map = gr.Image(type="pil", label="Anomaly Maps")
            anomaly_score = gr.Textbox(label="Anomaly Score")

    with gr.Row():
        submit_button = gr.Button("Submit", elem_id="submit-button")
        clear_button = gr.Button("Clear")

    # Set up the event handlers
    submit_button.click(process_image, inputs=[prompt_image, query_image, model_selector], outputs=[anomaly_map_vis, anomaly_map, anomaly_score])
    clear_button.click(lambda: (None, None, None), outputs=[anomaly_map_vis, anomaly_map, anomaly_score])

    # Add examples directly to the Blocks interface
    gr.Examples(examples, inputs=[prompt_image, query_image, model_selector])

# Add custom CSS to control the submit button style
demo.css = """
#submit-button {
    color: red !important;               /* Font color */
    background-color: orange !important; /* Background color */
    border: none !important;             /* Remove border */
    padding: 10px 20px !important;       /* Add padding */
    border-radius: 5px !important;       /* Rounded corners */
    font-size: 16px !important;          /* Font size */
    cursor: pointer !important;          /* Pointer cursor on hover */
}

#submit-button:hover {
    background-color: darkorange !important; /* Darker orange on hover */
}
"""

# Launch the demo
demo.launch()
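Note: process_image above reloads the selected checkpoint from disk on every click. A minimal caching sketch (the get_model helper and _model_cache dict are hypothetical, not part of this commit) that keeps one ready model per checkpoint:

_model_cache = {}  # checkpoint path -> ready-to-use MetaUAS instance

def get_model(ckt_path):
    # build, load, and cache a model the first time a checkpoint is requested
    if ckt_path not in _model_cache:
        model = MetaUAS(encoder_name, decoder_name, encoder_depth, decoder_depth,
                        num_alignment_layers, alignment_type, fusion_policy)
        model = safely_load_state_dict(model, ckt_path)
        model.to(device)
        model.eval()
        _model_cache[ckt_path] = model
    return _model_cache[ckt_path]

process_image could then start with model = get_model(ckt_path) instead of loading the state dict inline.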
demo_metauas.py
ADDED
@@ -0,0 +1,90 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@File    : demo_metauas.py
@Time    : 2025/03/26 23:49:14
@Author  : Bin-Bin Gao
@Email   : csgaobb@gmail.com
@Homepage: https://csgaobb.github.io/
@Version : 1.0
@Desc    : MetaUAS Demo
'''


import os
import cv2
import torch
import json
import shutil
import kornia as K
import numpy as np

from easydict import EasyDict
from argparse import ArgumentParser
from metauas import MetaUAS, set_random_seed, normalize, apply_ad_scoremap, read_image_as_tensor, safely_load_state_dict

if __name__ == "__main__":
    random_seed = 1

    set_random_seed(random_seed)

    ckt_path = 'weights/metauas-256.ckpt'
    img_size = 256
    #ckt_path = "weights/metauas-512.ckpt"
    #img_size = 512

    # load model
    encoder = 'efficientnet-b4'
    decoder = 'unet'
    encoder_depth = 5
    decoder_depth = 5
    num_crossfa_layers = 3
    alignment_type = 'sa'
    fusion_policy = 'cat'

    model = MetaUAS(encoder,
                    decoder,
                    encoder_depth,
                    decoder_depth,
                    num_crossfa_layers,
                    alignment_type,
                    fusion_policy
                    )

    model = safely_load_state_dict(model, ckt_path)
    model.cuda()
    model.eval()

    # load test images
    path_root = "./images/"
    path_to_prompt = path_root + "036.png"
    path_to_query = path_root + "024.png"

    query = read_image_as_tensor(path_to_query)
    prompt = read_image_as_tensor(path_to_prompt)

    if query.shape[1] != img_size:
        resize_trans = K.augmentation.Resize([img_size, img_size], return_transform=True)
        query = resize_trans(query)[0]
        prompt = resize_trans(prompt)[0]

    test_data = {
        "query_image": query.cuda(),
        "prompt_image": prompt.cuda(),
    }

    # forward
    with torch.no_grad():
        predicted_masks = model(test_data)

    # visualization
    query_img = test_data["query_image"][0] * 255
    query_img = query_img.permute(1, 2, 0)

    pred = (1 - predicted_masks.squeeze().detach())[:, :, None].cpu().numpy().repeat(3, 2)
    # normalize just for visualization
    scoremap_self = apply_ad_scoremap(query_img.cpu(), normalize(pred))
    # apply_ad_scoremap returns an RGB image; convert to BGR for cv2.imwrite
    cv2.imwrite('./anomaly_map.jpg', cv2.cvtColor(scoremap_self, cv2.COLOR_RGB2BGR))
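To also get an image-level decision from the same forward pass, app.py simply takes the peak of the pixel-level mask; the same one-liner works on the variables above:

    # image-level anomaly score: maximum of the predicted mask (as in app.py)
    anomaly_score = predicted_masks.max().item()
    print(f"anomaly score: {anomaly_score:.3f}")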
metauas.py
ADDED
@@ -0,0 +1,293 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@File    : metauas.py
@Time    : 2025/03/26 23:46:12
@Author  : Bin-Bin Gao
@Email   : csgaobb@gmail.com
@Homepage: https://csgaobb.github.io/
@Version : 1.0
@Desc    : some classes and functions for MetaUAS
'''


import os
import random
import kornia as K
import matplotlib.pyplot as plt
import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn as nn
import tqdm
import time
import cv2

from PIL import Image
from einops import rearrange
from torch.nn import functional as F
from torchvision import transforms
from torchvision.transforms.functional import pil_to_tensor
from segmentation_models_pytorch.unet.model import UnetDecoder
from segmentation_models_pytorch.fpn.decoder import FPNDecoder
from segmentation_models_pytorch.encoders import get_encoder, get_preprocessing_params

def set_random_seed(seed=233, reproduce=False):
    np.random.seed(seed)
    torch.manual_seed(seed ** 2)
    torch.cuda.manual_seed(seed ** 3)
    random.seed(seed ** 4)

    if reproduce:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    else:
        torch.backends.cudnn.benchmark = True

def normalize(pred, max_value=None, min_value=None):
    if max_value is None or min_value is None:
        return (pred - pred.min()) / (pred.max() - pred.min())
    else:
        return (pred - min_value) / (max_value - min_value)


def apply_ad_scoremap(image, scoremap, alpha=0.5):
    np_image = np.asarray(image, dtype=np.float32)
    scoremap = (scoremap * 255).astype(np.uint8)
    scoremap = cv2.applyColorMap(scoremap, cv2.COLORMAP_JET)
    scoremap = cv2.cvtColor(scoremap, cv2.COLOR_BGR2RGB)
    return (alpha * np_image + (1 - alpha) * scoremap).astype(np.uint8)


def read_image_as_tensor(path_to_image):
    pil_image = Image.open(path_to_image).convert("RGB")
    image_as_tensor = pil_to_tensor(pil_image).float() / 255.0
    return image_as_tensor

def safely_load_state_dict(model, checkpoint):
    # map to CPU first; the caller moves the model to the target device
    model.load_state_dict(torch.load(checkpoint, map_location="cpu"), strict=True)
    return model


class AlignmentModule(nn.Module):
    def __init__(self, input_channels=2048, hidden_channels=256, alignment_type="sa", fusion_policy='cat'):
        super().__init__()
        self.fusion_policy = fusion_policy
        self.alignment_layer = AlignmentLayer(input_channels, hidden_channels, alignment_type=alignment_type)

    def forward(self, query_features, prompt_features):
        if isinstance(prompt_features, list):
            # align each prompt and average the aligned features
            aligned_prompt = []
            for i in range(len(prompt_features)):
                aligned_prompt.append(self.alignment_layer(query_features, prompt_features[i]))
            aligned_prompt = torch.mean(torch.stack(aligned_prompt), 0)
        else:
            aligned_prompt = self.alignment_layer(query_features, prompt_features)

        if self.fusion_policy == 'cat':
            query_features = rearrange(
                [query_features, aligned_prompt], "two b c h w -> b (two c) h w"
            )
        elif self.fusion_policy == 'add':
            query_features = query_features + aligned_prompt
        elif self.fusion_policy == 'absdiff':
            query_features = (query_features - aligned_prompt).abs()

        return query_features

class AlignmentLayer(nn.Module):
    def __init__(self, input_channels=2048, hidden_channels=256, alignment_type="sa"):
        super().__init__()
        self.alignment_type = alignment_type
        if alignment_type != "na":
            self.dimensionality_reduction = nn.Conv2d(
                input_channels, hidden_channels, kernel_size=1, stride=1, padding=0, bias=True
            )

    def forward(self, query_features, prompt_features):
        # no-alignment
        if self.alignment_type == 'na':
            return prompt_features
        else:
            # NB: Q/K/V are local tensors; K shadows the kornia alias only inside this method
            Q = self.dimensionality_reduction(query_features)
            K = self.dimensionality_reduction(prompt_features)
            V = rearrange(prompt_features, "b c h w -> b c (h w)")

            soft_attention_map = torch.einsum("bcij,bckl->bijkl", Q, K)
            soft_attention_map = rearrange(soft_attention_map, "b h1 w1 h2 w2 -> b h1 w1 (h2 w2)")
            soft_attention_map = nn.Softmax(dim=3)(soft_attention_map)

            # soft-alignment
            if self.alignment_type == 'sa':
                aligned_features = torch.einsum("bijp,bcp->bcij", soft_attention_map, V)
            # hard-alignment
            if self.alignment_type == 'ha':
                max_v, _ = soft_attention_map.max(dim=-1, keepdim=True)
                hard_attention_map = (soft_attention_map == max_v).float()
                aligned_features = torch.einsum("bijp,bcp->bcij", hard_attention_map, V)

            return aligned_features


class MetaUAS(pl.LightningModule):
    def __init__(self, encoder_name, decoder_name, encoder_depth, decoder_depth, num_alignment_layers, alignment_type, fusion_policy):
        super().__init__()

        self.encoder_name = encoder_name
        self.decoder_name = decoder_name
        self.encoder_depth = encoder_depth
        self.decoder_depth = decoder_depth

        self.num_alignment_layers = num_alignment_layers
        self.alignment_type = alignment_type
        self.fusion_policy = fusion_policy

        # channel configurations for the efficientnet-b4 encoder
        align_input_channels = [448, 160, 56]
        align_hidden_channels = [224, 80, 28]
        encoder_channels = [3, 48, 32, 56, 160, 448]
        decoder_channels = [256, 128, 64, 64, 48]

        self.encoder = get_encoder(
            self.encoder_name,
            in_channels=3,
            depth=self.encoder_depth,
            weights="imagenet",
        )

        preparams = get_preprocessing_params(
            self.encoder_name,
            pretrained="imagenet"
        )

        self.preprocess = transforms.Normalize(preparams['mean'], preparams['std'])

        # freeze the encoder
        self.encoder.eval()
        for param in self.encoder.parameters():
            param.requires_grad = False

        if self.decoder_name == "unet":
            encoder_out_channels = encoder_channels[self.encoder_depth - self.decoder_depth:]
            if self.fusion_policy == 'cat':
                num_alignment_layers = self.num_alignment_layers
            elif self.fusion_policy == 'add' or self.fusion_policy == 'absdiff':
                num_alignment_layers = 0

            self.decoder = UnetDecoder(
                encoder_channels=encoder_out_channels,
                decoder_channels=decoder_channels,
                n_blocks=self.decoder_depth,
                attention_type="scse",
                num_coam_layers=num_alignment_layers,
            )

        elif self.decoder_name == "fpn":
            encoder_out_channels = encoder_channels
            if self.fusion_policy == 'cat':
                for i in range(self.num_alignment_layers):
                    encoder_out_channels[-(i + 1)] = 2 * encoder_out_channels[-(i + 1)]

            self.decoder = FPNDecoder(
                encoder_channels=encoder_out_channels,
                encoder_depth=self.encoder_depth,
                pyramid_channels=256,
                segmentation_channels=decoder_channels[-1],
                dropout=0.2,
                merge_policy="add",
            )

        elif self.decoder_name == "fpnadd":
            segmentation_channels = 256  # 128
            encoder_out_channels = encoder_channels
            if self.fusion_policy == 'cat':
                for i in range(self.num_alignment_layers):
                    encoder_out_channels[-(i + 1)] = 2 * encoder_out_channels[-(i + 1)]

            self.decoder = FPNDecoder(
                encoder_channels=encoder_out_channels,
                encoder_depth=self.encoder_depth,
                pyramid_channels=256,
                segmentation_channels=segmentation_channels,
                dropout=0.2,
                merge_policy="add",
            )

        elif self.decoder_name == "fpncat":
            encoder_out_channels = encoder_channels
            segmentation_channels = 256  # 128
            if self.fusion_policy == 'cat':
                for i in range(self.num_alignment_layers):
                    encoder_out_channels[-(i + 1)] = 2 * encoder_out_channels[-(i + 1)]

            self.decoder = FPNDecoder(
                encoder_channels=encoder_out_channels,
                encoder_depth=self.encoder_depth,
                pyramid_channels=256,
                segmentation_channels=segmentation_channels,
                dropout=0.2,
                merge_policy="cat",
            )

        if self.alignment_type == "sa" or self.alignment_type == "na" or self.alignment_type == "ha":
            self.alignment = nn.ModuleList(
                [
                    AlignmentModule(
                        input_channels=align_input_channels[i],
                        hidden_channels=align_hidden_channels[i],
                        alignment_type=self.alignment_type,
                        fusion_policy=self.fusion_policy,
                    )
                    for i in range(self.num_alignment_layers)
                ]
            )

        if self.decoder_name == "fpncat":
            self.mask_head = nn.Conv2d(
                segmentation_channels * 4,
                1,
                kernel_size=1,
                stride=1,
                padding=0,
            )
        elif self.decoder_name == "fpnadd":
            self.mask_head = nn.Conv2d(
                segmentation_channels,
                1,
                kernel_size=1,
                stride=1,
                padding=0,
            )
        else:
            self.mask_head = nn.Conv2d(
                decoder_channels[-1],
                1,
                kernel_size=1,
                stride=1,
                padding=0,
            )

    def forward(self, batch):
        query_input = self.preprocess(batch["query_image"])
        prompt_input = self.preprocess(batch["prompt_image"])

        with torch.no_grad():
            query_encoded_features = self.encoder(query_input)
            prompt_encoded_features = self.encoder(prompt_input)

        # fuse aligned prompt features into the deepest query feature levels
        for i in range(len(self.alignment)):
            query_encoded_features[-(i + 1)] = self.alignment[i](query_encoded_features[-(i + 1)], prompt_encoded_features[-(i + 1)])

        query_decoded_features = self.decoder(*query_encoded_features[self.encoder_depth - self.decoder_depth:])

        if self.decoder_name == "fpn" or self.decoder_name == "fpncat" or self.decoder_name == "fpnadd":
            output = F.interpolate(self.mask_head(query_decoded_features), scale_factor=4, mode="bilinear", align_corners=False)

        elif self.decoder_name == "unet":
            if self.decoder_depth == 4:
                output = F.interpolate(self.mask_head(query_decoded_features), scale_factor=2, mode="bilinear", align_corners=False)
            if self.decoder_depth == 5:
                output = self.mask_head(query_decoded_features)

        return output.sigmoid()
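A quick shape sanity check for the configuration the demos use (a sketch, assuming the patched segmentation_models_pytorch build this Space pins, whose UnetDecoder accepts num_coam_layers; the random inputs make the mask values meaningless):

    model = MetaUAS('efficientnet-b4', 'unet', 5, 5, 3, 'sa', 'cat').eval()
    batch = {
        "query_image": torch.rand(1, 3, 256, 256),   # (B, C, H, W) in [0, 1]
        "prompt_image": torch.rand(1, 3, 256, 256),
    }
    with torch.no_grad():
        masks = model(batch)
    print(masks.shape)  # expected: torch.Size([1, 1, 256, 256]), one mask per query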
requirements.txt
ADDED
@@ -0,0 +1,14 @@
easydict==1.11
einops==0.8.1
gradio==4.0.0
kornia==0.6.3
matplotlib==3.5.0
numpy==1.24.4
opencv_python==4.6.0.66
opencv_python_headless==4.7.0.72
Pillow==8.4.0
pytorch_lightning==1.9.0
segmentation_models_pytorch==0.2.1
torch==1.12.1+cu113
torchvision==0.13.1+cu113
tqdm==4.62.3
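Note: the +cu113 builds of torch and torchvision are not hosted on PyPI; they install from the PyTorch wheel index, e.g.

    pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu113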