# Install the required libraries (list them in your requirements.txt):
#   pillow opencv-python transformers mediapipe diffusers accelerate
# Example install command:
#   pip install pillow opencv-python transformers mediapipe diffusers accelerate
# The code below additionally imports torch, torchvision, detectron2, spaces
# (the Hugging Face Spaces helper), and local modules from the IDM-VTON repo.

import os

import cv2
import mediapipe as mp
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import to_pil_image
from transformers import (
    AutoTokenizer,
    CLIPImageProcessor,
    CLIPTextModel,
    CLIPTextModelWithProjection,
    CLIPVisionModelWithProjection,
)
from diffusers import AutoencoderKL, DDPMScheduler

import spaces  # provides the @spaces.GPU decorator on Hugging Face Spaces
import apply_net
from detectron2.data.detection_utils import _apply_exif_orientation, convert_PIL_to_numpy

# The customized UNets and the try-on pipeline are not part of stock diffusers;
# the import paths below assume the IDM-VTON repository layout.
from src.unet_hacked_tryon import UNet2DConditionModel
from src.unet_hacked_garmnet import UNet2DConditionModel_ref
from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline

# from utils_mask import get_mask_location
# from preprocess.humanparsing.run_parsing import Parsing
# from preprocess.openpose.run_openpose import OpenPose


def pil_to_binary_mask(pil_image, threshold=0):
    """Convert a PIL image to a hard black/white mask (mode L, values 0 or 255)."""
    np_image = np.array(pil_image)
    grayscale_image = Image.fromarray(np_image).convert("L")
    binary_mask = np.array(grayscale_image) > threshold
    # Vectorized replacement for the original per-pixel loop: True -> 255, False -> 0.
    mask = binary_mask.astype(np.uint8) * 255
    return Image.fromarray(mask)


base_path = 'yisol/IDM-VTON'
# example_path = os.path.join(os.path.dirname(__file__), 'example')

unet = UNet2DConditionModel.from_pretrained(
    base_path,
    subfolder="unet",
    torch_dtype=torch.float16,
)
unet.requires_grad_(False)

tokenizer_one = AutoTokenizer.from_pretrained(
    base_path,
    subfolder="tokenizer",
    revision=None,
    use_fast=False,
)
tokenizer_two = AutoTokenizer.from_pretrained(
    base_path,
    subfolder="tokenizer_2",
    revision=None,
    use_fast=False,
)

noise_scheduler = DDPMScheduler.from_pretrained(base_path, subfolder="scheduler")

text_encoder_one = CLIPTextModel.from_pretrained(
    base_path,
    subfolder="text_encoder",
    torch_dtype=torch.float16,
)
text_encoder_two = CLIPTextModelWithProjection.from_pretrained(
    base_path,
    subfolder="text_encoder_2",
    torch_dtype=torch.float16,
)
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    base_path,
    subfolder="image_encoder",
    torch_dtype=torch.float16,
)
vae = AutoencoderKL.from_pretrained(
    base_path,
    subfolder="vae",
    torch_dtype=torch.float16,
)

# "stabilityai/stable-diffusion-xl-base-1.0",
UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
    base_path,
    subfolder="unet_encoder",
    torch_dtype=torch.float16,
)

# parsing_model = Parsing(0)
# openpose_model = OpenPose(0)

UNet_Encoder.requires_grad_(False)
image_encoder.requires_grad_(False)
vae.requires_grad_(False)
unet.requires_grad_(False)
text_encoder_one.requires_grad_(False)
text_encoder_two.requires_grad_(False)

tensor_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),  # map [0, 1] -> [-1, 1]
    ]
)

pipe = TryonPipeline.from_pretrained(
    base_path,
    unet=unet,
    vae=vae,
    feature_extractor=CLIPImageProcessor(),
    text_encoder=text_encoder_one,
    text_encoder_2=text_encoder_two,
    tokenizer=tokenizer_one,
    tokenizer_2=tokenizer_two,
    scheduler=noise_scheduler,
    image_encoder=image_encoder,
    torch_dtype=torch.float16,
)
pipe.unet_encoder = UNet_Encoder
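# Optional memory savings (an assumption, not part of the original listing):
# if TryonPipeline inherits the standard diffusers pipeline API, attention
# slicing and CPU offload can trade speed for lower peak VRAM.
# pipe.enable_attention_slicing()
# pipe.enable_model_cpu_offload()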

@spaces.GPU
def start_tryon(person_dict, garm_img, garment_des, is_checked, is_checked_crop, denoise_steps, seed):
    device = "cuda"

    # openpose_model.preprocessor.body_estimation.model.to(device)
    pipe.to(device)
    pipe.unet_encoder.to(device)

    garm_img = garm_img.convert("RGB").resize((768, 1024))
    human_img_orig = person_dict["background"].convert("RGB")

    if is_checked_crop:
        # Center-crop the photo to a 3:4 aspect ratio before resizing.
        width, height = human_img_orig.size
        target_width = int(min(width, height * (3 / 4)))
        target_height = int(min(height, width * (4 / 3)))
        left = (width - target_width) / 2
        top = (height - target_height) / 2
        right = (width + target_width) / 2
        bottom = (height + target_height) / 2
        cropped_img = human_img_orig.crop((left, top, right, bottom))
        crop_size = cropped_img.size
        human_img = cropped_img.resize((384, 512))  # reduced size for efficiency
    else:
        human_img = human_img_orig.resize((384, 512))  # reduced size for efficiency

    if is_checked:
        # keypoints = openpose_model(human_img.resize((384, 512)))
        # model_parse, _ = parsing_model(human_img.resize((384, 512)))
        # mask, mask_gray = get_mask_location('hd', "upper_body", model_parse, keypoints)
        # mask = mask.resize((768, 1024))
        # Placeholder for automatic mask generation (replace with your mask logic).
        mask = Image.new("L", (768, 1024), color="white")  # example: a white mask
        mask_gray = Image.new("RGB", (768, 1024), color="gray")  # example: a gray image
    else:
        # Use the mask the user drew in the editor's first layer.
        mask = pil_to_binary_mask(person_dict["layers"][0].convert("RGB").resize((768, 1024)))
        # Resize the person image to 768x1024 so it matches the mask's shape
        # before the element-wise product.
        mask_gray = (1 - transforms.ToTensor()(mask)) * tensor_transform(human_img.resize((768, 1024)))
        mask_gray = to_pil_image((mask_gray + 1.0) / 2.0)

    # Render a DensePose segmentation of the person to use as pose conditioning.
    human_img_arg = _apply_exif_orientation(human_img.resize((384, 512)))
    human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")

    args = apply_net.create_argument_parser().parse_args(
        ('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml',
         './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v',
         '--opts', 'MODEL.DEVICE', 'cuda')
    )
    # verbosity = getattr(args, "verbosity", None)
    pose_img = args.func(args, human_img_arg)
    pose_img = pose_img[:, :, ::-1]  # BGR -> RGB
    pose_img = Image.fromarray(pose_img).resize((768, 1024))

    with torch.no_grad(), torch.cuda.amp.autocast():
        prompt = "model is wearing " + garment_des
        negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
        with torch.inference_mode():
            # Encode the try-on prompt, with classifier-free guidance embeddings.
            (
                prompt_embeds,
                negative_prompt_embeds,
                pooled_prompt_embeds,
                negative_pooled_prompt_embeds,
            ) = pipe.encode_prompt(
                prompt,
                num_images_per_prompt=1,
                do_classifier_free_guidance=True,
                negative_prompt=negative_prompt,
            )

            # Encode a second, garment-only prompt for the garment encoder branch.
            prompt = "a photo of " + garment_des
            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
            if not isinstance(prompt, list):
                prompt = [prompt]
            if not isinstance(negative_prompt, list):
                negative_prompt = [negative_prompt]
            (
                prompt_embeds_c,
                _,
                _,
                _,
            ) = pipe.encode_prompt(
                prompt,
                num_images_per_prompt=1,
                do_classifier_free_guidance=False,
                negative_prompt=negative_prompt,
            )

            pose_img = tensor_transform(pose_img).unsqueeze(0).to(device, torch.float16)
            garm_tensor = tensor_transform(garm_img).unsqueeze(0).to(device, torch.float16)
            generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
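            # IDM-VTON's customized pipeline extends the standard SDXL inpainting
            # call with try-on-specific conditioning: the DensePose rendering
            # (pose_img), the garment image tensor (cloth), and the garment-only
            # text embeddings (text_embeds_cloth), alongside the usual inpainting
            # inputs (image, mask_image).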
            images = pipe(
                prompt_embeds=prompt_embeds.to(device, torch.float16),
                negative_prompt_embeds=negative_prompt_embeds.to(device, torch.float16),
                pooled_prompt_embeds=pooled_prompt_embeds.to(device, torch.float16),
                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device, torch.float16),
                num_inference_steps=denoise_steps,
                generator=generator,
                strength=1.0,
                pose_img=pose_img.to(device, torch.float16),
                text_embeds_cloth=prompt_embeds_c.to(device, torch.float16),
                cloth=garm_tensor.to(device, torch.float16),
                mask_image=mask,
                image=human_img,
            )[0]

    # The original listing breaks off at `image=human_`; closing the call and
    # returning the first generated image plus the mask visualization is an
    # assumed, minimal completion.
    if is_checked_crop:
        out_img = images[0].resize(crop_size)
        human_img_orig.paste(out_img, (int(left), int(top)))
        return human_img_orig, mask_gray
    return images[0], mask_gray
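
# A hypothetical invocation sketch (file names and values are illustrative,
# not from the original listing). The first argument mirrors a Gradio
# ImageEditor payload: a "background" photo plus a hand-drawn mask layer.
if __name__ == "__main__":
    person = {
        "background": Image.open("person.jpg"),
        "layers": [Image.open("drawn_mask.png")],
    }
    result, mask_vis = start_tryon(
        person,
        Image.open("garment.jpg"),
        "a red cotton t-shirt",
        is_checked=False,       # False: use the hand-drawn mask layer
        is_checked_crop=False,  # True would center-crop the photo to 3:4 first
        denoise_steps=30,
        seed=42,
    )
    result.save("tryon_result.png")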