venbab committed
Commit 5422abd · verified · 1 Parent(s): 7095797

Update app.py

Files changed (1)
  1. app.py +93 -123
app.py CHANGED
@@ -1,3 +1,4 @@
+# app.py
 import spaces
 import gradio as gr
 from PIL import Image
@@ -13,10 +14,7 @@ from transformers import (
 )
 from diffusers import DDPMScheduler, AutoencoderKL
 from typing import List
-
-import torch
-import os
-import numpy as np
+import torch, os, io, base64, json, numpy as np
 from utils_mask import get_mask_location
 from torchvision import transforms
 import apply_net
@@ -25,52 +23,47 @@ from preprocess.openpose.run_openpose import OpenPose
 from detectron2.data.detection_utils import convert_PIL_to_numpy, _apply_exif_orientation
 from torchvision.transforms.functional import to_pil_image
 
-# ---------------- Helpers ----------------
+# FastAPI REST
+from fastapi import FastAPI, Response
+from pydantic import BaseModel
+
+# -------------------- helpers --------------------
 def pil_to_binary_mask(pil_image, threshold=0):
     np_image = np.array(pil_image)
     grayscale_image = Image.fromarray(np_image).convert("L")
     binary_mask = np.array(grayscale_image) > threshold
     mask = np.zeros(binary_mask.shape, dtype=np.uint8)
-    for i in range(binary_mask.shape[0]):
-        for j in range(binary_mask.shape[1]):
-            if binary_mask[i, j]:
-                mask[i, j] = 1
-    mask = (mask * 255).astype(np.uint8)
-    return Image.fromarray(mask)
-
-# ---------------- Load models / pipeline ----------------
+    mask[binary_mask] = 1
+    return Image.fromarray((mask * 255).astype(np.uint8))
+
+def _b64_to_pil(data_uri_or_b64: str) -> Image.Image:
+    # Accept both data: URI and raw base64
+    if data_uri_or_b64.startswith("data:"):
+        comma = data_uri_or_b64.find(",")
+        b64 = data_uri_or_b64[comma + 1:]
+    else:
+        b64 = data_uri_or_b64
+    return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")
+
+def _pil_to_b64_jpeg(img: Image.Image) -> str:
+    buf = io.BytesIO()
+    img.save(buf, format="JPEG", quality=92)
+    return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+# -------------------- load models --------------------
 base_path = "yisol/IDM-VTON"
 example_path = os.path.join(os.path.dirname(__file__), "example")
 
-unet = UNet2DConditionModel.from_pretrained(
-    base_path, subfolder="unet", torch_dtype=torch.float16
-)
-unet.requires_grad_(False)
-
-tokenizer_one = AutoTokenizer.from_pretrained(
-    base_path, subfolder="tokenizer", revision=None, use_fast=False
-)
-tokenizer_two = AutoTokenizer.from_pretrained(
-    base_path, subfolder="tokenizer_2", revision=None, use_fast=False
-)
+unet = UNet2DConditionModel.from_pretrained(base_path, subfolder="unet", torch_dtype=torch.float16)
+tokenizer_one = AutoTokenizer.from_pretrained(base_path, subfolder="tokenizer", use_fast=False)
+tokenizer_two = AutoTokenizer.from_pretrained(base_path, subfolder="tokenizer_2", use_fast=False)
 noise_scheduler = DDPMScheduler.from_pretrained(base_path, subfolder="scheduler")
 
-text_encoder_one = CLIPTextModel.from_pretrained(
-    base_path, subfolder="text_encoder", torch_dtype=torch.float16
-)
-text_encoder_two = CLIPTextModelWithProjection.from_pretrained(
-    base_path, subfolder="text_encoder_2", torch_dtype=torch.float16
-)
-image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-    base_path, subfolder="image_encoder", torch_dtype=torch.float16
-)
-vae = AutoencoderKL.from_pretrained(
-    base_path, subfolder="vae", torch_dtype=torch.float16
-)
-
-UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
-    base_path, subfolder="unet_encoder", torch_dtype=torch.float16
-)
+text_encoder_one = CLIPTextModel.from_pretrained(base_path, subfolder="text_encoder", torch_dtype=torch.float16)
+text_encoder_two = CLIPTextModelWithProjection.from_pretrained(base_path, subfolder="text_encoder_2", torch_dtype=torch.float16)
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(base_path, subfolder="image_encoder", torch_dtype=torch.float16)
+vae = AutoencoderKL.from_pretrained(base_path, subfolder="vae", torch_dtype=torch.float16)
+UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(base_path, subfolder="unet_encoder", torch_dtype=torch.float16)
 
 parsing_model = Parsing(0)
 openpose_model = OpenPose(0)
@@ -78,9 +71,7 @@ openpose_model = OpenPose(0)
 for m in (UNet_Encoder, image_encoder, vae, unet, text_encoder_one, text_encoder_two):
     m.requires_grad_(False)
 
-tensor_transfrom = transforms.Compose(
-    [transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
-)
+tensor_transfrom = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
 
 pipe = TryonPipeline.from_pretrained(
     base_path,
@@ -97,54 +88,32 @@ pipe = TryonPipeline.from_pretrained(
 )
 pipe.unet_encoder = UNet_Encoder
 
-progress = gr.Progress()
-
-# ---------------- Inference ----------------
-@spaces.GPU
-def infer(person, garment, denoise_steps, seed):
-    print(f"[infer] steps={denoise_steps}, seed={seed}", flush=True)
-    progress(0, desc="Starting")
+# -------------------- core inference --------------------
+def _infer_core(person_img: Image.Image, garment_img: Image.Image, denoise_steps: int, seed: int) -> Image.Image:
     device = "cuda"
-
     openpose_model.preprocessor.body_estimation.model.to(device)
     pipe.to(device)
     pipe.unet_encoder.to(device)
 
-    personRGB = person.convert("RGB")
+    personRGB = person_img.convert("RGB")
     crop_size = personRGB.size
     human_img = personRGB.resize((768, 1024))
-    garm_img = garment.convert("RGB").resize((768, 1024))
+    garm_img = garment_img.convert("RGB").resize((768, 1024))
 
-    progress(0.1, desc="Mask generating")
     keypoints = openpose_model(human_img.resize((384, 512)))
     model_parse, _ = parsing_model(human_img.resize((384, 512)))
-    mask, _mask_gray = get_mask_location("hd", "upper_body", model_parse, keypoints)
+    mask, _ = get_mask_location("hd", "upper_body", model_parse, keypoints)
     mask = mask.resize((768, 1024))
 
-    progress(0.3, desc="DensePose processing")
     human_img_arg = _apply_exif_orientation(human_img.resize((384, 512)))
     human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")
-    args = apply_net.create_argument_parser().parse_args(
-        (
-            "show",
-            "./configs/densepose_rcnn_R_50_FPN_s1x.yaml",
-            "./ckpt/densepose/model_final_162be9.pkl",
-            "dp_segm",
-            "-v",
-            "--opts",
-            "MODEL.DEVICE",
-            "cuda",
-        )
-    )
-    pose_img = args.func(args, human_img_arg)
-    pose_img = Image.fromarray(pose_img[:, :, ::-1]).resize((768, 1024))
-
-    progress(0.5, desc="Image generating")
-
-    def callback(pipe_, step, timestep, callback_kwargs):
-        progress_value = 0.5 + ((step + 1.0) / int(denoise_steps)) * 0.5
-        progress(progress_value, desc=f"Image generating, {step + 1}/{int(denoise_steps)} steps")
-        return callback_kwargs
+    args = apply_net.create_argument_parser().parse_args((
+        "show", "./configs/densepose_rcnn_R_50_FPN_s1x.yaml",
+        "./ckpt/densepose/model_final_162be9.pkl", "dp_segm", "-v",
+        "--opts", "MODEL.DEVICE", "cuda"
+    ))
+    pose_img = args.func(args, human_img_arg)[:, :, ::-1]
+    pose_img = Image.fromarray(pose_img).resize((768, 1024))
 
     with torch.no_grad(), torch.cuda.amp.autocast():
        prompt = "model is wearing clothing"
@@ -162,22 +131,19 @@ def infer(person, garment, denoise_steps, seed):
        )
 
        prompt_c = "a photo of clothing"
-        if not isinstance(prompt_c, List):
-            prompt_c = [prompt_c]
-        if not isinstance(negative_prompt, List):
-            negative_prompt_c = [negative_prompt]
-        else:
-            negative_prompt_c = negative_prompt
-        (prompt_embeds_c, _, _, _,) = pipe.encode_prompt(
+        if not isinstance(prompt_c, List): prompt_c = [prompt_c]
+        (
+            prompt_embeds_c, _, _, _
+        ) = pipe.encode_prompt(
            prompt_c,
            num_images_per_prompt=1,
            do_classifier_free_guidance=False,
-            negative_prompt=negative_prompt_c,
+            negative_prompt=[negative_prompt],
        )
 
        pose_tensor = tensor_transfrom(pose_img).unsqueeze(0).to(device, torch.float16)
        garm_tensor = tensor_transfrom(garm_img).unsqueeze(0).to(device, torch.float16)
-        generator = torch.Generator(device).manual_seed(int(seed)) if seed is not None else None
+        generator = torch.Generator(device).manual_seed(int(seed))
 
        images = pipe(
            prompt_embeds=prompt_embeds.to(device, torch.float16),
@@ -196,63 +162,67 @@ def infer(person, garment, denoise_steps, seed):
            width=768,
            ip_adapter_image=garm_img.resize((768, 1024)),
            guidance_scale=2.0,
-            callback_on_step_end=callback,
        )[0]
 
    out_img = images[0].resize(crop_size)
-    progress(1, desc="Complete")
    return out_img
 
-# ---------------- UI (no queue) ----------------
+# -------------------- Gradio UI --------------------
+progress = gr.Progress()
+
+@spaces.GPU
+def infer(person, garment, denoise_steps, seed):
+    progress(0.05, desc="Starting")
+    out = _infer_core(person, garment, int(denoise_steps), int(seed))
+    progress(1.0, desc="Done")
+    return out
+
 title = "## AI Clothes Changer"
 description = "Step into the world of AI clothes swap and unlock style possibilities."
 
-person_list = os.listdir(os.path.join(example_path, "human"))
-person_images = [os.path.join(example_path, "human", p) for p in person_list]
-
-garment_list = os.listdir(os.path.join(example_path, "cloth"))
-garment_images = [os.path.join(example_path, "cloth", g) for g in garment_list]
+person_images = [os.path.join(example_path, "human", f) for f in os.listdir(os.path.join(example_path, "human"))]
+garment_images = [os.path.join(example_path, "cloth", f) for f in os.listdir(os.path.join(example_path, "cloth"))]
 
-with gr.Blocks() as demo:  # ← NO .queue()
+with gr.Blocks().queue() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Person Image")
-            person_image = gr.Image(
-                sources=["upload"], type="pil", label="Person Image",
-                width=512, height=512, show_download_button=False, show_share_button=False
-            )
+            person_image = gr.Image(sources=["upload"], type="pil", label="Person Image", width=512, height=512,
+                                    show_download_button=False, show_share_button=False)
            gr.Examples(inputs=person_image, examples_per_page=20, examples=person_images)
-
        with gr.Column():
            gr.Markdown("#### Garment Image")
-            garment_image = gr.Image(
-                sources=["upload"], type="pil", label="Garment Image",
-                width=512, height=512, show_download_button=False, show_share_button=False
-            )
+            garment_image = gr.Image(sources=["upload"], type="pil", label="Garment Image", width=512, height=512,
+                                     show_download_button=False, show_share_button=False)
            gr.Examples(inputs=garment_image, examples_per_page=20, examples=garment_images)
-
        with gr.Column():
            gr.Markdown("#### Generated Image")
-            gen_image = gr.Image(label="Generated Image", width=512, height=512,
-                                 show_download_button=True, show_share_button=False)
-
-    with gr.Row():
-        gen_button = gr.Button("Generate")
-
+            gen_image = gr.Image(label="Generated Image", width=512, height=512, show_download_button=True, show_share_button=False)
+    with gr.Row(): gen_button = gr.Button("Generate")
    with gr.Accordion("Advanced Options", open=False):
        denoise_steps = gr.Number(label="Denoising Steps", minimum=20, maximum=40, value=30, step=1)
        seed = gr.Number(label="Seed", minimum=-1, maximum=2147483647, step=1, value=42)
-
-    gen_button.click(
-        fn=infer,
-        inputs=[person_image, garment_image, denoise_steps, seed],
-        outputs=[gen_image],
-        api_name="predict",  # provides /run/predict
-        queue=False          # accept direct POSTs (no queue)
-    )
-
-# For local dev only. On Spaces, Gradio auto-launches.
-if __name__ == "__main__":
-    demo.launch(show_error=True)
+    gen_button.click(fn=infer, inputs=[person_image, garment_image, denoise_steps, seed], outputs=[gen_image], api_name="predict")
+
+# -------------------- FastAPI REST (JSON, base64) --------------------
+class TryOnPayload(BaseModel):
+    person_b64: str   # data URI or raw base64
+    garment_b64: str  # data URI or raw base64
+    denoise_steps: int = 30
+    seed: int = 42
+
+fastapi_app = FastAPI()
+
+@fastapi_app.post("/tryon")
+def tryon_endpoint(payload: TryOnPayload):
+    person = _b64_to_pil(payload.person_b64)
+    garment = _b64_to_pil(payload.garment_b64)
+    out_img = _infer_core(person, garment, payload.denoise_steps, payload.seed)
+    b64 = _pil_to_b64_jpeg(out_img)
+    # return a data URI so clients can use it directly if they want
+    return {"image_data_uri": f"data:image/jpeg;base64,{b64}", "base64": b64}
+
+# Mount Gradio at root, REST at same server
+app = gr.mount_gradio_app(fastapi_app, demo, path="/")
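For API consumers, a minimal client sketch for the new `/tryon` route follows. The Space URL and image file names are placeholders, and the field names simply mirror the `TryOnPayload` model in the diff; treat this as an illustration under those assumptions, not a guaranteed contract.

```python
# Hypothetical client for the /tryon endpoint; adjust the URL to the actual Space.
import base64
import requests

def to_b64(path: str) -> str:
    """Read a local image file and return its raw base64 encoding."""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

payload = {
    "person_b64": to_b64("person.jpg"),    # raw base64; a data: URI also works
    "garment_b64": to_b64("garment.jpg"),
    "denoise_steps": 30,
    "seed": 42,
}

resp = requests.post("https://your-space.hf.space/tryon", json=payload, timeout=600)
resp.raise_for_status()

# The endpoint returns both a bare base64 string and a ready-to-use data URI.
with open("result.jpg", "wb") as f:
    f.write(base64.b64decode(resp.json()["base64"]))
```

Because the response also carries `image_data_uri`, a browser client can assign it straight to an `img` element's `src` without decoding; the Gradio UI itself remains reachable at the root path, with `api_name="predict"` naming the queued UI endpoint.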