Spaces:

sayehghp
/

vicca

Sleeping

App Files Files Community

sayehghp committed on Dec 9, 2025

Commit

e09b1c8

1 Parent(s): ecf378d

Visualization

Browse files

Files changed (3) hide show

CXRGen/sample_generation.py +142 -36
inference.py +5 -0
vg_token_attention.py +11 -2

CXRGen/sample_generation.py CHANGED Viewed

@@ -56,21 +56,81 @@ def get_args_parser():
 apply_uniformer = UniformerDetector()
 apply_canny = CannyDetector()
-def process(input_image, prompt, model, num_samples, image_resolution=512, ddim_steps=10, guess_mode=False, strength=1, scale=9, seed=-1, eta=0):
     with torch.no_grad():
         ddim_sampler = DDIMSampler(model)
         img = resize_image(HWC3(input_image), image_resolution)
-        # detected_map = apply_uniformer(resize_image(input_image, image_resolution))
         H, W, C = img.shape
         detected_map = apply_canny(img, 100, 200)
         detected_map = HWC3(detected_map)
-        # detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
-        # control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
-        control = torch.from_numpy(detected_map.copy()).float().cpu() / 255.0
         control = torch.stack([control for _ in range(num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
         if seed == -1:
             seed = random.randint(0, 65535)
@@ -79,29 +139,45 @@ def process(input_image, prompt, model, num_samples, image_resolution=512, ddim_
         if config.save_memory:
             model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt] * num_samples)]}
-        #cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
-        #un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
         shape = (4, H // 8, W // 8)
         if config.save_memory:
             model.low_vram_shift(is_diffusing=True)
-        model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)  # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
-        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
-                                                     shape, cond, verbose=False, eta=eta,
-                                                     unconditional_guidance_scale=scale)
         if config.save_memory:
             model.low_vram_shift(is_diffusing=False)
         x_samples = model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
         results = [x_samples[i] for i in range(num_samples)]
     return [255 - detected_map] + results
 def imageEncoder(img):
     image_source, image = load_image(img)
     return image
@@ -121,42 +197,72 @@ def main(args):
     # if args.device == 'cuda':
     #     model = model.cuda()
-    # respect the passed device, but fall back safely
     if getattr(args, "device", "cpu") == "cuda" and torch.cuda.is_available():
         device = torch.device("cuda")
     else:
         device = torch.device("cpu")
-    model = create_model('./CXRGen/models/cldm_v15_biovlp.yaml').cpu()
     state = load_state_dict(args.weight_path, location="cpu")
     model.load_state_dict(state, strict=False)
-    # only move to GPU if we really decided to
-    if device.type == "cuda":
-        model = model.to(device)
-    # # Decide device once
-    # device = "cuda" if torch.cuda.is_available() else "cpu"
-    # print(f"[VICCA] Using device: {device}", flush=True)
-    # # Make sure the rest of the code sees the same device
-    # args.device = device
-    # # Create model on CPU then move to device
-    # model = create_model("./CXRGen/models/cldm_v15_biovlp.yaml")
-    # # Load weights with correct map_location
-    # state_dict = load_state_dict(args.weight_path, location=device)
-    # model.load_state_dict(state_dict, strict=False)
-    # model = model.to(device)
     model.eval()
     prompt = args.text_prompt
     img_org = cv2.imread(args.image_path)
     img_w, img_h, c = img_org.shape
     input_img = lungsegment(args.image_path)
-    gen_img = process(input_img, prompt, model, args.num_samples)
     if args.plot_gen_image:
         for i in range(1,len(gen_img)):

 apply_uniformer = UniformerDetector()
 apply_canny = CannyDetector()
+# def process(input_image, prompt, model, num_samples, image_resolution=512, ddim_steps=10, guess_mode=False, strength=1, scale=9, seed=-1, eta=0):
+#     with torch.no_grad():
+#         ddim_sampler = DDIMSampler(model)
+#         img = resize_image(HWC3(input_image), image_resolution)
+#         # detected_map = apply_uniformer(resize_image(input_image, image_resolution))
+#         H, W, C = img.shape
+#         detected_map = apply_canny(img, 100, 200)
+#         detected_map = HWC3(detected_map)
+#         # detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+#         # control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
+#         control = torch.from_numpy(detected_map.copy()).float().cpu() / 255.0
+#         control = torch.stack([control for _ in range(num_samples)], dim=0)
+#         control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+#         if seed == -1:
+#             seed = random.randint(0, 65535)
+#         seed_everything(seed)
+#         if config.save_memory:
+#             model.low_vram_shift(is_diffusing=False)
+#         cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt] * num_samples)]}
+#         #cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+#         #un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+#         shape = (4, H // 8, W // 8)
+#         if config.save_memory:
+#             model.low_vram_shift(is_diffusing=True)
+#         model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)  # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
+#         samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+#                                                      shape, cond, verbose=False, eta=eta,
+#                                                      unconditional_guidance_scale=scale)
+#         if config.save_memory:
+#             model.low_vram_shift(is_diffusing=False)
+#         x_samples = model.decode_first_stage(samples)
+#         x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+#         results = [x_samples[i] for i in range(num_samples)]
+#     return [255 - detected_map] + results
+def process(
+    input_image,
+    prompt,
+    model,
+    num_samples,
+    device,
+    image_resolution=512,
+    ddim_steps=10,
+    guess_mode=False,
+    strength=1,
+    scale=9,
+    seed=-1,
+    eta=0,
+):
+    model = model.to(device)
     with torch.no_grad():
         ddim_sampler = DDIMSampler(model)
         img = resize_image(HWC3(input_image), image_resolution)
         H, W, C = img.shape
         detected_map = apply_canny(img, 100, 200)
         detected_map = HWC3(detected_map)
+        control = torch.from_numpy(detected_map.copy()).float() / 255.0
         control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, "b h w c -> b c h w").clone()
+        control = control.to(device)
         if seed == -1:
             seed = random.randint(0, 65535)
         if config.save_memory:
             model.low_vram_shift(is_diffusing=False)
+        cond = {
+            "c_concat": [control],
+            "c_crossattn": [model.get_learned_conditioning([prompt] * num_samples)],
+        }
         shape = (4, H // 8, W // 8)
         if config.save_memory:
             model.low_vram_shift(is_diffusing=True)
+        model.control_scales = (
+            [strength * (0.825 ** float(12 - i)) for i in range(13)]
+            if guess_mode
+            else ([strength] * 13)
+        )
+        samples, intermediates = ddim_sampler.sample(
+            ddim_steps,
+            num_samples,
+            shape,
+            cond,
+            verbose=False,
+            eta=eta,
+            unconditional_guidance_scale=scale,
+        )
         if config.save_memory:
             model.low_vram_shift(is_diffusing=False)
         x_samples = model.decode_first_stage(samples)
+        x_samples = (
+            einops.rearrange(x_samples, "b c h w -> b h w c") * 127.5 + 127.5
+        ).cpu().numpy().clip(0, 255).astype(np.uint8)
         results = [x_samples[i] for i in range(num_samples)]
     return [255 - detected_map] + results
 def imageEncoder(img):
     image_source, image = load_image(img)
     return image
     # if args.device == 'cuda':
     #     model = model.cuda()
     if getattr(args, "device", "cpu") == "cuda" and torch.cuda.is_available():
         device = torch.device("cuda")
     else:
         device = torch.device("cpu")
+    print(f"[CXRGen] Using device: {device}", flush=True)
+    model = create_model("./CXRGen/models/cldm_v15_biovlp.yaml").cpu()
     state = load_state_dict(args.weight_path, location="cpu")
     model.load_state_dict(state, strict=False)
+    model = model.to(device)
     model.eval()
     prompt = args.text_prompt
     img_org = cv2.imread(args.image_path)
     img_w, img_h, c = img_org.shape
     input_img = lungsegment(args.image_path)
+    gen_img = process(
+        input_img,
+        prompt,
+        model,
+        args.num_samples,
+        device=device,
+    )
+    # # respect the passed device, but fall back safely
+    # if getattr(args, "device", "cpu") == "cuda" and torch.cuda.is_available():
+    #     device = torch.device("cuda")
+    # else:
+    #     device = torch.device("cpu")
+    # model = create_model('./CXRGen/models/cldm_v15_biovlp.yaml').cpu()
+    # state = load_state_dict(args.weight_path, location="cpu")
+    # model.load_state_dict(state, strict=False)
+    # # only move to GPU if we really decided to
+    # if device.type == "cuda":
+    #     model = model.to(device)
+    # # # Decide device once
+    # # device = "cuda" if torch.cuda.is_available() else "cpu"
+    # # print(f"[VICCA] Using device: {device}", flush=True)
+    # # # Make sure the rest of the code sees the same device
+    # # args.device = device
+    # # # Create model on CPU then move to device
+    # # model = create_model("./CXRGen/models/cldm_v15_biovlp.yaml")
+    # # # Load weights with correct map_location
+    # # state_dict = load_state_dict(args.weight_path, location=device)
+    # # model.load_state_dict(state_dict, strict=False)
+    # # model = model.to(device)
+    # model.eval()
+    # prompt = args.text_prompt
+    # img_org = cv2.imread(args.image_path)
+    # img_w, img_h, c = img_org.shape
+    # input_img = lungsegment(args.image_path)
+    # gen_img = process(input_img, prompt, model, args.num_samples)
     if args.plot_gen_image:
         for i in range(1,len(gen_img)):

inference.py CHANGED Viewed

@@ -115,6 +115,7 @@ from DETR import svc
 from DETR.arguments import get_args_parser as get_detr_args_parser
 from VG import localization
 from ssim import ssim
 from CheXbert.src.label import label
@@ -214,6 +215,10 @@ def gen_cxr(weight_path, image_path, text_prompt, num_samples, output_path, devi
     args.num_samples = num_samples
     args.output_path = output_path
     args.weight_path = get_weight(weight_path)
     args.device = device
     sample_generation.main(args)

 from DETR.arguments import get_args_parser as get_detr_args_parser
 from VG import localization
 from ssim import ssim
+import torch
 from CheXbert.src.label import label
     args.num_samples = num_samples
     args.output_path = output_path
     args.weight_path = get_weight(weight_path)
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+    else:
+        device = torch.device("cpu")
     args.device = device
     sample_generation.main(args)

vg_token_attention.py CHANGED Viewed

@@ -269,9 +269,18 @@ def run_token_ca_visualization(
     """
     if isinstance(terms, str):
         terms = [terms]
-    terms = [t.strip() for t in terms if t and t.strip()]
     if not terms:
-        raise ValueError("No terms provided for attention visualization.")
     device = device or DEVICE_DEFAULT
     model = load_model(cfg_path, ckpt_path).to(device).eval()

     """
     if isinstance(terms, str):
         terms = [terms]
+    prompt_lower = prompt.lower()
+    # Keep only terms that actually appear in the prompt (case-insensitive)
+    terms = [t for t in terms if t.lower() in prompt_lower]
     if not terms:
+        print(f"[TokenCA] No configured terms found in prompt: {prompt!r}")
+        return {}  # or an empty dict / list, whatever you expect upstream
+    # terms = [t.strip() for t in terms if t and t.strip()]
+    # if not terms:
+    #     raise ValueError("No terms provided for attention visualization.")
     device = device or DEVICE_DEFAULT
     model = load_model(cfg_path, ckpt_path).to(device).eval()