Spaces:

huzey
/

ncut-pytorch

Running on Zero

App Files Files Community

huzey committed on Nov 6, 2024

Commit

d48a41d

1 Parent(s): 19c8c49

add fg

Browse files

Files changed (1) hide show

app.py +131 -30

app.py CHANGED Viewed

@@ -409,34 +409,40 @@ def blend_image_with_heatmap(image, heatmap, opacity1=0.5, opacity2=0.5):
     return blended.astype(np.uint8)
-def segment_fg_bg(images):
-    images = F.interpolate(images, (224, 224), mode="bilinear")
     # model = load_alignedthreemodel()
     model = load_model("CLIP(ViT-B-16/openai)")
     from ncut_pytorch.backbone import resample_position_embeddings
     pos_embed = model.model.visual.positional_embedding
-    pos_embed = resample_position_embeddings(pos_embed, 14, 14)
     model.model.visual.positional_embedding = torch.nn.Parameter(pos_embed)
-    batch_size = 4
     chunk_idxs = torch.split(torch.arange(images.shape[0]), batch_size)
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     model.to(device)
-    means = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(device)
-    stds = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(device)
     fg_acts, bg_acts = [], []
     for chunk_idx in chunk_idxs:
         with torch.no_grad():
             input_images = images[chunk_idx].to(device)
             # transform the input images
-            input_images = (input_images - means) / stds
             # output = model(input_images)[:, 5]
-            output = model(input_images)['attn'][6]  # [B, H=14, W=14, C]
-            fg_act = output[:, 6, 6].mean(0)
             bg_act = output[:, 0, 0].mean(0)
             fg_acts.append(fg_act)
             bg_acts.append(bg_act)
@@ -445,21 +451,6 @@ def segment_fg_bg(images):
     fg_act = F.normalize(fg_act, dim=-1)
     bg_act = F.normalize(bg_act, dim=-1)
-    # ref_image = default_images[0]
-    # image = Image.open(ref_image).convert("RGB").resize((224, 224), Image.Resampling.BILINEAR)
-    # image = torch.tensor(np.array(image)).permute(2, 0, 1).float().to(device)
-    # image = (image / 255.0 - means) / stds
-    # output = model(image)['attn'][6][0]
-    # # print(output.shape)
-    # # bg on the center
-    # fg_act = output[5, 5]
-    # # bg on the bottom left
-    # bg_act = output[0, 0]
-    # fg_act = F.normalize(fg_act, dim=-1)
-    # bg_act = F.normalize(bg_act, dim=-1)
-    # print(images.mean(), images.std())
     fg_act, bg_act = fg_act.to(device), bg_act.to(device)
     chunk_idxs = torch.split(torch.arange(images.shape[0]), batch_size)
     heatmap_fgs, heatmap_bgs = [], []
@@ -467,9 +458,10 @@ def segment_fg_bg(images):
         with torch.no_grad():
             input_images = images[chunk_idx].to(device)
             # transform the input images
-            input_images = (input_images - means) / stds
             # output = model(input_images)[:, 5]
-            output = model(input_images)['attn'][6]
             output = F.normalize(output, dim=-1)
             heatmap_fg = output @ fg_act[:, None]  # [B, H, W, 1]
             heatmap_bg = output @ bg_act[:, None]  # [B, H, W, 1]
@@ -868,6 +860,71 @@ def ncut_run(
         return to_pil_images(rgb), logging_str
     # alignedcut
     if not directed:
@@ -1037,9 +1094,9 @@ def _ncut_run(*args, **kwargs):
             torch.cuda.empty_cache()
         return *(None for _ in range(n_ret)), "Error: " + str(e)
-    # ret = ncut_run(*args, **kwargs)
-    # ret = list(ret)[:n_ret] + [ret[-1]]
-    # return ret
 if USE_HUGGINGFACE_ZEROGPU:
     @spaces.GPU(duration=30)
@@ -1250,6 +1307,7 @@ def run_fn(
     node_type2="k",
     head_index_text='all',
     make_symmetric=False,
     n_ret=1,
     plot_clusters=False,
     alignedcut_eig_norm_plot=False,
@@ -1258,6 +1316,7 @@ def run_fn(
     only_eigvecs=False,
     return_eigvec_and_rgb=False,
     normalize_eigvec_return=False,
 ):
     # print(node_type2, head_index_text, make_symmetric)
     progress=gr.Progress()
@@ -1390,6 +1449,7 @@ def run_fn(
         "lisa_prompt2": lisa_prompt2,
         "lisa_prompt3": lisa_prompt3,
         "is_lisa": is_lisa,
         "n_ret": n_ret,
         "plot_clusters": plot_clusters,
         "alignedcut_eig_norm_plot": alignedcut_eig_norm_plot,
@@ -1401,6 +1461,7 @@ def run_fn(
         "only_eigvecs": only_eigvecs,
         "return_eigvec_and_rgb": return_eigvec_and_rgb,
         "normalize_eigvec_return": normalize_eigvec_return,
     }
     # print(kwargs)
@@ -2196,7 +2257,8 @@ demo = gr.Blocks(
     css=custom_css,
 )
 with demo:
     with gr.Tab('PlayGround'):
         eigvecs = gr.State(np.array([]))
         tsne3d_rgb = gr.State(np.array([]))
@@ -4247,6 +4309,45 @@ with demo:
                     outputs=[mask_gallery, crop_gallery])
     with gr.Tab('Sub-cluster (dev)', visible=False) as sub_cluster_tab:
         with gr.Row():
             image_cluster_plot = gr.Image(value=None, label="Image-level clustering", elem_id="image_cluster_plot", interactive=False)

     return blended.astype(np.uint8)
+def segment_fg_bg(images, hw=224, i_layer=6, batch_size=4, transform_images=True):
+    assert hw % 16 == 0, "The height and width of the image must be divisible by 16."
+    psz = hw // 16
+    center_xy = (psz-1) // 2
+    images = F.interpolate(images, (hw, hw), mode="bilinear")
     # model = load_alignedthreemodel()
     model = load_model("CLIP(ViT-B-16/openai)")
     from ncut_pytorch.backbone import resample_position_embeddings
     pos_embed = model.model.visual.positional_embedding
+    pos_embed = resample_position_embeddings(pos_embed, psz, psz)
     model.model.visual.positional_embedding = torch.nn.Parameter(pos_embed)
     chunk_idxs = torch.split(torch.arange(images.shape[0]), batch_size)
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     model.to(device)
+    if transform_images:
+        means = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(device)
+        stds = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(device)
     fg_acts, bg_acts = [], []
     for chunk_idx in chunk_idxs:
         with torch.no_grad():
             input_images = images[chunk_idx].to(device)
             # transform the input images
+            if transform_images:
+                input_images = (input_images - means) / stds
             # output = model(input_images)[:, 5]
+            output = model(input_images)['attn'][i_layer]  # [B, H=14, W=14, C]
+            fg_act = output[:, center_xy, center_xy].mean(0)
             bg_act = output[:, 0, 0].mean(0)
             fg_acts.append(fg_act)
             bg_acts.append(bg_act)
     fg_act = F.normalize(fg_act, dim=-1)
     bg_act = F.normalize(bg_act, dim=-1)
     fg_act, bg_act = fg_act.to(device), bg_act.to(device)
     chunk_idxs = torch.split(torch.arange(images.shape[0]), batch_size)
     heatmap_fgs, heatmap_bgs = [], []
         with torch.no_grad():
             input_images = images[chunk_idx].to(device)
             # transform the input images
+            if transform_images:
+                input_images = (input_images - means) / stds
             # output = model(input_images)[:, 5]
+            output = model(input_images)['attn'][i_layer]
             output = F.normalize(output, dim=-1)
             heatmap_fg = output @ fg_act[:, None]  # [B, H, W, 1]
             heatmap_bg = output @ bg_act[:, None]  # [B, H, W, 1]
         return to_pil_images(rgb), logging_str
+    # fg-bg separated
+    separate_fg_bg = kwargs.get("separate_fg_bg", False)
+    if separate_fg_bg:
+        fg_threshold = kwargs.get("fg_threshold", 0.5)
+        feature_hw = features.shape[1]
+        progress(0.4, desc="Segmenting FG-BG")
+        heatmap_fg, heatmap_bg = segment_fg_bg(images, hw=448, transform_images=False, i_layer=4)
+        heatmap_fg = 1 - heatmap_fg
+        heatmap_bg = 1 - heatmap_bg
+        b, h, w, c = features.shape
+        heatmap_bg = rearrange(heatmap_bg, 'b h w c -> b c h w')
+        heatmap_fg = rearrange(heatmap_fg, 'b h w c -> b c h w')
+        is_cuda = torch.cuda.is_available()
+        heatmap_fg = F.interpolate(heatmap_fg, (h, w), mode="bicubic")
+        heatmap_bg = F.interpolate(heatmap_bg, (h, w), mode="bicubic")
+        heatmap_fg = heatmap_fg.flatten()
+        heatmap_bg = heatmap_bg.flatten()
+        fg_minus_bg = heatmap_fg - heatmap_bg
+        def _to_mask(heatmap, threshold, gamma=0.5):
+            heatmap = (heatmap - heatmap.mean()) / heatmap.std()
+            heatmap = heatmap.double()
+            heatmap = torch.exp(heatmap)
+            heatmap = 1 / heatmap ** gamma
+            if heatmap.shape[0] > 10000:
+                np.random.seed(0)
+                random_idx = np.random.choice(heatmap.shape[0], 10000, replace=False)
+                vmin, vmax = heatmap[random_idx].quantile(0.01), heatmap[random_idx].quantile(0.99)
+            else:
+                vmin, vmax = heatmap.quantile(0.01), heatmap.quantile(0.99)
+            heatmap = (heatmap - vmin) / (vmax - vmin)
+            heatmap = heatmap.reshape(b, h, w)
+            mask = heatmap > threshold
+            return mask
+        fg_mask = _to_mask(fg_minus_bg, fg_threshold)
+        features_fg = features.flatten(0, 2)[fg_mask.flatten()]
+        progress(0.4, desc="NCut FG")
+        rgb, _logging_str, eigvecs = compute_ncut(
+            features_fg,
+            num_eig=num_eig,
+            num_sample_ncut=num_sample_ncut,
+            affinity_focal_gamma=affinity_focal_gamma,
+            knn_ncut=knn_ncut,
+            knn_tsne=knn_tsne,
+            num_sample_tsne=num_sample_tsne,
+            embedding_method=embedding_method,
+            embedding_metric=embedding_metric,
+            perplexity=perplexity,
+            n_neighbors=n_neighbors,
+            min_dist=min_dist,
+            sampling_method=sampling_method,
+            indirect_connection=indirect_connection,
+            make_orthogonal=make_orthogonal,
+            metric=ncut_metric,
+            only_eigvecs=False,
+        )
+        rgb_all = torch.zeros(b, h, w, 3)
+        rgb_all_flat = rgb_all.flatten(0, 2)
+        rgb_all_flat[fg_mask.flatten()] = rgb
+        rgb_all = rgb_all_flat.reshape(b, h, w, 3)
+        return to_pil_images(rgb_all), logging_str
     # alignedcut
     if not directed:
             torch.cuda.empty_cache()
         return *(None for _ in range(n_ret)), "Error: " + str(e)
+    ret = ncut_run(*args, **kwargs)
+    ret = list(ret)[:n_ret] + [ret[-1]]
+    return ret
 if USE_HUGGINGFACE_ZEROGPU:
     @spaces.GPU(duration=30)
     node_type2="k",
     head_index_text='all',
     make_symmetric=False,
+    fg_threshold=0.5,
     n_ret=1,
     plot_clusters=False,
     alignedcut_eig_norm_plot=False,
     only_eigvecs=False,
     return_eigvec_and_rgb=False,
     normalize_eigvec_return=False,
+    separate_fg_bg=False,
 ):
     # print(node_type2, head_index_text, make_symmetric)
     progress=gr.Progress()
         "lisa_prompt2": lisa_prompt2,
         "lisa_prompt3": lisa_prompt3,
         "is_lisa": is_lisa,
+        "fg_threshold": fg_threshold,
         "n_ret": n_ret,
         "plot_clusters": plot_clusters,
         "alignedcut_eig_norm_plot": alignedcut_eig_norm_plot,
         "only_eigvecs": only_eigvecs,
         "return_eigvec_and_rgb": return_eigvec_and_rgb,
         "normalize_eigvec_return": normalize_eigvec_return,
+        "separate_fg_bg": separate_fg_bg,
     }
     # print(kwargs)
     css=custom_css,
 )
 with demo:
     with gr.Tab('PlayGround'):
         eigvecs = gr.State(np.array([]))
         tsne3d_rgb = gr.State(np.array([]))
                     outputs=[mask_gallery, crop_gallery])
+    with gr.Tab('FG'):
+        with gr.Row():
+            with gr.Column(scale=5, min_width=200):
+                input_gallery, submit_button, clear_images_button, dataset_dropdown, num_images_slider, random_seed_slider, load_images_button = make_input_images_section()
+                num_images_slider.value = 30
+                logging_text = gr.Textbox("Logging information", label="Logging", elem_id="logging", type="text", placeholder="Logging information", autofocus=False, autoscroll=False)
+            with gr.Column(scale=5, min_width=200):
+                output_gallery = make_output_images_section()
+                fg_threshold_slider = gr.Slider(0.01, 1, step=0.01, label="Foreground threshold", value=0.5, elem_id="fg_threshold", info="increase for more foreground")
+                # cluster_gallery = gr.Gallery(value=[], label="Clusters", show_label=True, elem_id="clusters", columns=[2], rows=[2], object_fit="contain", height="auto", show_share_button=True, preview=False, interactive=False)
+                [
+                    model_dropdown, layer_slider, node_type_dropdown, num_eig_slider,
+                    affinity_focal_gamma_slider, num_sample_ncut_slider, ncut_knn_slider, ncut_indirect_connection, ncut_make_orthogonal,
+                    embedding_method_dropdown, embedding_metric_dropdown, num_sample_tsne_slider, knn_tsne_slider,
+                    perplexity_slider, n_neighbors_slider, min_dist_slider,
+                    sampling_method_dropdown, ncut_metric_dropdown, positive_prompt, negative_prompt
+                ] = make_parameters_section()
+        false_placeholder = gr.Checkbox(label="False", value=False, elem_id="false_placeholder", visible=False)
+        no_prompt = gr.Textbox("", label="", elem_id="empty_placeholder", type="text", placeholder="", visible=False)
+        submit_button.click(
+            partial(run_fn, n_ret=1, plot_clusters=False, separate_fg_bg=True),
+            inputs=[
+                input_gallery, model_dropdown, layer_slider, num_eig_slider, node_type_dropdown,
+                positive_prompt, negative_prompt,
+                false_placeholder, no_prompt, no_prompt, no_prompt,
+                affinity_focal_gamma_slider, num_sample_ncut_slider, ncut_knn_slider, ncut_indirect_connection, ncut_make_orthogonal,
+                embedding_method_dropdown, embedding_metric_dropdown, num_sample_tsne_slider, knn_tsne_slider,
+                perplexity_slider, n_neighbors_slider, min_dist_slider, sampling_method_dropdown, ncut_metric_dropdown,
+                *[false_placeholder]*12,
+                fg_threshold_slider
+            ],
+            outputs=[output_gallery, logging_text],
+        )
     with gr.Tab('Sub-cluster (dev)', visible=False) as sub_cluster_tab:
         with gr.Row():
             image_cluster_plot = gr.Image(value=None, label="Image-level clustering", elem_id="image_cluster_plot", interactive=False)