IDM-VTON

Paused

App Files Files Community

venbab committed on Nov 3, 2025

Commit

481de3e

verified ·

1 Parent(s): b4d02b2

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -110

app.py CHANGED Viewed

@@ -26,9 +26,6 @@ from preprocess.openpose.run_openpose import OpenPose
 from detectron2.data.detection_utils import convert_PIL_to_numpy, _apply_exif_orientation
 from torchvision.transforms.functional import to_pil_image
-# --- FastAPI for the /tryon REST route ---
-from fastapi import FastAPI, UploadFile, File, Response
 # ------------------------------------------------------------------------------------
 # Helpers
 # ------------------------------------------------------------------------------------
@@ -39,11 +36,10 @@ def pil_to_binary_mask(pil_image, threshold=0):
     mask = np.zeros(binary_mask.shape, dtype=np.uint8)
     for i in range(binary_mask.shape[0]):
         for j in range(binary_mask.shape[1]):
-            if binary_mask[i, j] is True:
                 mask[i, j] = 1
     mask = (mask * 255).astype(np.uint8)
-    output_mask = Image.fromarray(mask)
-    return output_mask
 # ------------------------------------------------------------------------------------
 # Load models / pipeline
@@ -100,12 +96,8 @@ UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
 parsing_model = Parsing(0)
 openpose_model = OpenPose(0)
-UNet_Encoder.requires_grad_(False)
-image_encoder.requires_grad_(False)
-vae.requires_grad_(False)
-unet.requires_grad_(False)
-text_encoder_one.requires_grad_(False)
-text_encoder_two.requires_grad_(False)
 tensor_transfrom = transforms.Compose(
     [
@@ -130,7 +122,7 @@ pipe = TryonPipeline.from_pretrained(
 pipe.unet_encoder = UNet_Encoder
 # ------------------------------------------------------------------------------------
-# Core try-on function used by both Gradio UI and REST
 # ------------------------------------------------------------------------------------
 def _tryon_core(
     human_img: Image.Image,
@@ -167,15 +159,11 @@ def _tryon_core(
     if auto_mask:
         keypoints = openpose_model(human_img_used.resize((384, 512)))
         model_parse, _ = parsing_model(human_img_used.resize((384, 512)))
-        mask, mask_gray = get_mask_location("hd", "upper_body", model_parse, keypoints)
         mask = mask.resize((768, 1024))
     else:
-        # fallback: no-draw mask (full body) – rarely used in REST path
         mask = pil_to_binary_mask(Image.new("L", (768, 1024), 255))
-    mask_gray = (1 - transforms.ToTensor()(mask)) * tensor_transfrom(human_img_used)
-    mask_gray = to_pil_image((mask_gray + 1.0) / 2.0)
     # DensePose
     human_img_arg = _apply_exif_orientation(human_img_used.resize((384, 512)))
     human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")
@@ -195,60 +183,54 @@ def _tryon_core(
     pose_img = pose_img[:, :, ::-1]
     pose_img = Image.fromarray(pose_img).resize((768, 1024))
-    with torch.no_grad():
-        with torch.cuda.amp.autocast():
-            with torch.no_grad():
-                prompt = "model is wearing " + (garment_des or "a garment")
-                negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
-                with torch.inference_mode():
-                    (
-                        prompt_embeds,
-                        negative_prompt_embeds,
-                        pooled_prompt_embeds,
-                        negative_pooled_prompt_embeds,
-                    ) = pipe.encode_prompt(
-                        prompt,
-                        num_images_per_prompt=1,
-                        do_classifier_free_guidance=True,
-                        negative_prompt=negative_prompt,
-                    )
-                    prompt_c = "a photo of " + (garment_des or "a garment")
-                    negative_prompt_c = negative_prompt
-                    if not isinstance(prompt_c, List):
-                        prompt_c = [prompt_c] * 1
-                    if not isinstance(negative_prompt_c, List):
-                        negative_prompt_c = [negative_prompt_c] * 1
-                    with torch.inference_mode():
-                        (prompt_embeds_c, _, _, _,) = pipe.encode_prompt(
-                            prompt_c,
-                            num_images_per_prompt=1,
-                            do_classifier_free_guidance=False,
-                            negative_prompt=negative_prompt_c,
-                        )
-                    pose_tensor = tensor_transfrom(pose_img).unsqueeze(0).to(device, torch.float16)
-                    garm_tensor = tensor_transfrom(garm_img).unsqueeze(0).to(device, torch.float16)
-                    generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
-                    images = pipe(
-                        prompt_embeds=prompt_embeds.to(device, torch.float16),
-                        negative_prompt_embeds=negative_prompt_embeds.to(device, torch.float16),
-                        pooled_prompt_embeds=pooled_prompt_embeds.to(device, torch.float16),
-                        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device, torch.float16),
-                        num_inference_steps=int(denoise_steps),
-                        generator=generator,
-                        strength=1.0,
-                        pose_img=pose_tensor,
-                        text_embeds_cloth=prompt_embeds_c.to(device, torch.float16),
-                        cloth=garm_tensor,
-                        mask_image=mask,
-                        image=human_img_used,
-                        height=1024,
-                        width=768,
-                        ip_adapter_image=garm_img.resize((768, 1024)),
-                        guidance_scale=2.0,
-                    )[0]
     if auto_crop:
         out_img = images[0].resize(crop_size)
@@ -258,7 +240,7 @@ def _tryon_core(
         return images[0]
 # ------------------------------------------------------------------------------------
-# Gradio UI (original) – unchanged logic except we call the same core function
 # ------------------------------------------------------------------------------------
 garm_list = os.listdir(os.path.join(example_path, "cloth"))
 garm_list_path = [os.path.join(example_path, "cloth", garm) for garm in garm_list]
@@ -268,16 +250,12 @@ human_list_path = [os.path.join(example_path, "human", human) for human in human
 human_ex_list = []
 for ex_human in human_list_path:
-    ex_dict = {}
-    ex_dict["background"] = ex_human
-    ex_dict["layers"] = None
-    ex_dict["composite"] = None
     human_ex_list.append(ex_dict)
 @spaces.GPU
-def start_tryon(dict, garm_img, garment_des, is_checked, is_checked_crop, denoise_steps, seed):
-    # Keep compatibility with the existing Gradio workflow
-    human_img = dict["background"].convert("RGB")
     out_img = _tryon_core(
         human_img=human_img,
         garm_img=garm_img,
@@ -287,12 +265,10 @@ def start_tryon(dict, garm_img, garment_des, is_checked, is_checked_crop, denois
         denoise_steps=int(denoise_steps),
         seed=int(seed) if seed is not None else None,
     )
-    # Also return the mask preview (approx) by recomputing lightweight gray
     mask_gray = pil_to_binary_mask(out_img.convert("L"))
     return out_img, mask_gray
-image_blocks = gr.Blocks().queue()
-with image_blocks as demo:
     gr.Markdown("## IDM-VTON 👕👔👚")
     gr.Markdown(
         "Virtual Try-on with your image and garment image. Check out the "
@@ -306,7 +282,7 @@ with image_blocks as demo:
                 is_checked = gr.Checkbox(label="Yes", info="Use auto-generated mask (Takes 5 seconds)", value=True)
             with gr.Row():
                 is_checked_crop = gr.Checkbox(label="Yes", info="Use auto-crop & resizing", value=False)
-            _ = gr.Examples(inputs=imgs, examples_per_page=10, examples=human_ex_list)
         with gr.Column():
             garm_img = gr.Image(label="Garment", sources="upload", type="pil")
@@ -317,7 +293,7 @@ with image_blocks as demo:
                         show_label=False,
                         elem_id="prompt",
                     )
-            _ = gr.Examples(inputs=garm_img, examples_per_page=8, examples=garm_list_path)
         with gr.Column():
             masked_img = gr.Image(label="Masked image output", elem_id="masked-img", show_share_button=False)
@@ -335,32 +311,8 @@ with image_blocks as demo:
         fn=start_tryon,
         inputs=[imgs, garm_img, prompt, is_checked, is_checked_crop, denoise_steps, seed],
         outputs=[image_out, masked_img],
-        api_name="tryon",
-    )
-# ------------------------------------------------------------------------------------
-# FastAPI route and mount
-# ------------------------------------------------------------------------------------
-app = FastAPI()
-@app.post("/tryon")
-async def tryon(person: UploadFile = File(...), garment: UploadFile = File(...)):
-    p_bytes = await person.read()
-    g_bytes = await garment.read()
-    human_img = Image.open(io.BytesIO(p_bytes)).convert("RGB")
-    garment_img = Image.open(io.BytesIO(g_bytes)).convert("RGBA")
-    out = _tryon_core(
-        human_img=human_img,
-        garm_img=garment_img,
-        garment_des="",          # optional: you can add a text box in Flutter later
-        auto_mask=True,
-        auto_crop=False,
-        denoise_steps=30,
-        seed=42,
     )
-    buf = io.BytesIO()
-    out.save(buf, format="JPEG", quality=92)
-    return Response(content=buf.getvalue(), media_type="image/jpeg")
-# Mount Gradio UI on root path
-app = gr.mount_gradio_app(app, image_blocks, path="/")

 from detectron2.data.detection_utils import convert_PIL_to_numpy, _apply_exif_orientation
 from torchvision.transforms.functional import to_pil_image
 # ------------------------------------------------------------------------------------
 # Helpers
 # ------------------------------------------------------------------------------------
     mask = np.zeros(binary_mask.shape, dtype=np.uint8)
     for i in range(binary_mask.shape[0]):
         for j in range(binary_mask.shape[1]):
+            if binary_mask[i, j]:
                 mask[i, j] = 1
     mask = (mask * 255).astype(np.uint8)
+    return Image.fromarray(mask)
 # ------------------------------------------------------------------------------------
 # Load models / pipeline
 parsing_model = Parsing(0)
 openpose_model = OpenPose(0)
+for m in (UNet_Encoder, image_encoder, vae, unet, text_encoder_one, text_encoder_two):
+    m.requires_grad_(False)
 tensor_transfrom = transforms.Compose(
     [
 pipe.unet_encoder = UNet_Encoder
 # ------------------------------------------------------------------------------------
+# Core try-on function
 # ------------------------------------------------------------------------------------
 def _tryon_core(
     human_img: Image.Image,
     if auto_mask:
         keypoints = openpose_model(human_img_used.resize((384, 512)))
         model_parse, _ = parsing_model(human_img_used.resize((384, 512)))
+        mask, _ = get_mask_location("hd", "upper_body", model_parse, keypoints)
         mask = mask.resize((768, 1024))
     else:
         mask = pil_to_binary_mask(Image.new("L", (768, 1024), 255))
     # DensePose
     human_img_arg = _apply_exif_orientation(human_img_used.resize((384, 512)))
     human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")
     pose_img = pose_img[:, :, ::-1]
     pose_img = Image.fromarray(pose_img).resize((768, 1024))
+    # Run pipeline
+    with torch.no_grad(), torch.cuda.amp.autocast():
+        prompt = "model is wearing " + (garment_des or "a garment")
+        negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = pipe.encode_prompt(
+            prompt,
+            num_images_per_prompt=1,
+            do_classifier_free_guidance=True,
+            negative_prompt=negative_prompt,
+        )
+        prompt_c = "a photo of " + (garment_des or "a garment")
+        if not isinstance(prompt_c, List):
+            prompt_c = [prompt_c]
+        (prompt_embeds_c, _, _, _,) = pipe.encode_prompt(
+            prompt_c,
+            num_images_per_prompt=1,
+            do_classifier_free_guidance=False,
+            negative_prompt=negative_prompt,
+        )
+        pose_tensor = tensor_transfrom(pose_img).unsqueeze(0).to(device, torch.float16)
+        garm_tensor = tensor_transfrom(garm_img).unsqueeze(0).to(device, torch.float16)
+        generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
+        images = pipe(
+            prompt_embeds=prompt_embeds.to(device, torch.float16),
+            negative_prompt_embeds=negative_prompt_embeds.to(device, torch.float16),
+            pooled_prompt_embeds=pooled_prompt_embeds.to(device, torch.float16),
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device, torch.float16),
+            num_inference_steps=int(denoise_steps),
+            generator=generator,
+            strength=1.0,
+            pose_img=pose_tensor,
+            text_embeds_cloth=prompt_embeds_c.to(device, torch.float16),
+            cloth=garm_tensor,
+            mask_image=mask,
+            image=human_img_used,
+            height=1024,
+            width=768,
+            ip_adapter_image=garm_img.resize((768, 1024)),
+            guidance_scale=2.0,
+        )[0]
     if auto_crop:
         out_img = images[0].resize(crop_size)
         return images[0]
 # ------------------------------------------------------------------------------------
+# Gradio UI (and HTTP function endpoint via /run/tryon)
 # ------------------------------------------------------------------------------------
 garm_list = os.listdir(os.path.join(example_path, "cloth"))
 garm_list_path = [os.path.join(example_path, "cloth", garm) for garm in garm_list]
 human_ex_list = []
 for ex_human in human_list_path:
+    ex_dict = {"background": ex_human, "layers": None, "composite": None}
     human_ex_list.append(ex_dict)
 @spaces.GPU
+def start_tryon(dict_img, garm_img, garment_des, is_checked, is_checked_crop, denoise_steps, seed):
+    human_img = dict_img["background"].convert("RGB")
     out_img = _tryon_core(
         human_img=human_img,
         garm_img=garm_img,
         denoise_steps=int(denoise_steps),
         seed=int(seed) if seed is not None else None,
     )
     mask_gray = pil_to_binary_mask(out_img.convert("L"))
     return out_img, mask_gray
+with gr.Blocks() as image_blocks:
     gr.Markdown("## IDM-VTON 👕👔👚")
     gr.Markdown(
         "Virtual Try-on with your image and garment image. Check out the "
                 is_checked = gr.Checkbox(label="Yes", info="Use auto-generated mask (Takes 5 seconds)", value=True)
             with gr.Row():
                 is_checked_crop = gr.Checkbox(label="Yes", info="Use auto-crop & resizing", value=False)
+            gr.Examples(inputs=imgs, examples_per_page=10, examples=human_ex_list)
         with gr.Column():
             garm_img = gr.Image(label="Garment", sources="upload", type="pil")
                         show_label=False,
                         elem_id="prompt",
                     )
+            gr.Examples(inputs=garm_img, examples_per_page=8, examples=garm_list_path)
         with gr.Column():
             masked_img = gr.Image(label="Masked image output", elem_id="masked-img", show_share_button=False)
         fn=start_tryon,
         inputs=[imgs, garm_img, prompt, is_checked, is_checked_crop, denoise_steps, seed],
         outputs=[image_out, masked_img],
+        api_name="tryon",   # <-- HTTP: POST /run/tryon
     )
+# IMPORTANT: expose a top-level `demo` for Gradio Spaces
+demo = image_blocks