Spaces:
Sleeping
Sleeping
Tony Lian
committed on
Commit
·
93de48e
1
Parent(s):
0cbad80
Apply batching to SAM to reduce the memory cost with many objects
Browse files- generation.py +13 -7
generation.py
CHANGED
|
@@ -53,7 +53,7 @@ def generate_single_object_with_box_batch(prompts, bboxes, phrases, words, input
|
|
| 53 |
batch_size = input_len
|
| 54 |
|
| 55 |
run_times = int(np.ceil(input_len / batch_size))
|
| 56 |
-
|
| 57 |
for batch_idx in range(run_times):
|
| 58 |
input_latents_batch, bboxes_batch, phrases_batch = input_latents[batch_idx * batch_size:(batch_idx + 1) * batch_size], \
|
| 59 |
bboxes[batch_idx * batch_size:(batch_idx + 1) * batch_size], phrases[batch_idx * batch_size:(batch_idx + 1) * batch_size]
|
|
@@ -68,17 +68,23 @@ def generate_single_object_with_box_batch(prompts, bboxes, phrases, words, input
|
|
| 68 |
gc.collect()
|
| 69 |
torch.cuda.empty_cache()
|
| 70 |
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
| 72 |
single_object_pil_images_box_ann.append(single_object_pil_images_box_ann_batch)
|
| 73 |
latents_all.append(latents_all_batch)
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
-
|
| 80 |
|
| 81 |
-
mask_selected
|
| 82 |
|
| 83 |
mask_selected_tensor = torch.tensor(mask_selected)
|
| 84 |
|
|
|
|
| 53 |
batch_size = input_len
|
| 54 |
|
| 55 |
run_times = int(np.ceil(input_len / batch_size))
|
| 56 |
+
mask_selected_list, single_object_pil_images_box_ann, latents_all = [], [], []
|
| 57 |
for batch_idx in range(run_times):
|
| 58 |
input_latents_batch, bboxes_batch, phrases_batch = input_latents[batch_idx * batch_size:(batch_idx + 1) * batch_size], \
|
| 59 |
bboxes[batch_idx * batch_size:(batch_idx + 1) * batch_size], phrases[batch_idx * batch_size:(batch_idx + 1) * batch_size]
|
|
|
|
| 68 |
gc.collect()
|
| 69 |
torch.cuda.empty_cache()
|
| 70 |
|
| 71 |
+
# `sam_refine_boxes` also calls `empty_cache` so we don't need to explicitly empty the cache again.
|
| 72 |
+
mask_selected, _ = sam.sam_refine_boxes(sam_input_images=single_object_images_batch, boxes=bboxes_batch, model_dict=model_dict, verbose=verbose, **sam_refine_kwargs)
|
| 73 |
+
|
| 74 |
+
mask_selected_list.append(np.array(mask_selected)[:, 0])
|
| 75 |
single_object_pil_images_box_ann.append(single_object_pil_images_box_ann_batch)
|
| 76 |
latents_all.append(latents_all_batch)
|
| 77 |
|
| 78 |
+
single_object_pil_images_box_ann, latents_all = sum(single_object_pil_images_box_ann, []), torch.cat(latents_all, dim=1)
|
| 79 |
+
|
| 80 |
+
# mask_selected_list: List(batch)[List(image)[List(box)[Array of shape (64, 64)]]]
|
| 81 |
+
|
| 82 |
+
mask_selected = np.concatenate(mask_selected_list, axis=0)
|
| 83 |
+
mask_selected = mask_selected.reshape((-1, *mask_selected.shape[-2:]))
|
| 84 |
|
| 85 |
+
assert mask_selected.shape[0] == input_latents.shape[0], f"{mask_selected.shape[0]} != {input_latents.shape[0]}"
|
| 86 |
|
| 87 |
+
print(mask_selected.shape)
|
| 88 |
|
| 89 |
mask_selected_tensor = torch.tensor(mask_selected)
|
| 90 |
|