Update app.py
app.py
CHANGED
@@ -1,8 +1,7 @@
 # -*- coding: utf-8 -*-
 import os
-import re
 import time
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 import spaces
 import gradio as gr
@@ -94,7 +93,7 @@ DEMO_PASS = os.getenv("DEMO_PASS", "").strip()
 APP_AUTH = (DEMO_USER, DEMO_PASS) if (DEMO_USER and DEMO_PASS) else None
 
 # =========================
-# Garments dataset autoload
+# Garments dataset autoload (UX only, doesn't affect quality)
 # =========================
 GARMENT_DIR = "garments"
 ALLOWED_EXTS = (".png", ".jpg", ".jpeg", ".webp")
@@ -162,9 +161,6 @@ def build_gallery_items(files: List[str]):
     return [(garment_path(f), "") for f in files]
 
 
-# =========================
-# Small helpers
-# =========================
 def clamp_int(x, lo, hi):
     try:
         x = int(x)
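
For readability, here is the whole helper the hunk truncates at `try:`; the `except` fallback to `lo` is an assumption, mirroring the removed `clamp_float` shown below:

def clamp_int(x, lo, hi):
    try:
        x = int(x)
    except Exception:
        x = lo  # assumed fallback, mirroring the removed clamp_float
    return max(lo, min(hi, x))

# clamp_int("25", 20, 40) -> 25; clamp_int(None, 20, 40) -> 20; clamp_int(99, 20, 40) -> 40
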
@@ -173,18 +169,10 @@ def clamp_int(x, lo, hi):
     return max(lo, min(hi, x))
 
 
-def clamp_float(x, lo, hi):
-    try:
-        x = float(x)
-    except Exception:
-        x = lo
-    return max(lo, min(hi, x))
-
-
 _last_call_ts = 0.0
 
 
-def allow_call(min_interval_sec: float = 2.0) -> Tuple[bool, str]:
+def allow_call(min_interval_sec: float = 2.5) -> Tuple[bool, str]:
     global _last_call_ts
     now = time.time()
     if now - _last_call_ts < min_interval_sec:
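
The `allow_call` throttle is a module-level timestamp guard; a minimal self-contained sketch of the pattern (the refusal message is an assumption, since the diff elides the body between the context lines above):

import time
from typing import Tuple

_last_call_ts = 0.0

def allow_call(min_interval_sec: float = 2.5) -> Tuple[bool, str]:
    # Allow at most one call per min_interval_sec across the whole process.
    global _last_call_ts
    now = time.time()
    if now - _last_call_ts < min_interval_sec:
        return False, "Too many requests, try again shortly."  # assumed message
    _last_call_ts = now
    return True, ""
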
@@ -194,150 +182,8 @@ def allow_call(min_interval_sec: float = 2.0) -> Tuple[bool, str]:
     return True, ""
 
 
-def round_to_multiple(x: int, m: int = 8) -> int:
-    return max(m, int(round(x / m) * m))
-
-
-def pick_target_size_keep_aspect(w: int, h: int, max_side: int) -> Tuple[int, int]:
-    """
-    (tw, th) <= max_side on the longer side, multiples of 8
-    """
-    if w <= 0 or h <= 0:
-        return 768, 1024
-    scale = min(max_side / float(max(w, h)), 1.0)
-    tw = round_to_multiple(int(w * scale), 8)
-    th = round_to_multiple(int(h * scale), 8)
-    tw = max(512, tw)
-    th = max(512, th)
-    if max(tw, th) > max_side:
-        scale2 = max_side / float(max(tw, th))
-        tw = round_to_multiple(int(tw * scale2), 8)
-        th = round_to_multiple(int(th * scale2), 8)
-    return tw, th
-
-
-def letterbox(img: Image.Image, target_w: int, target_h: int, fill=(127, 127, 127)) -> Tuple[Image.Image, Dict[str, int]]:
-    """
-    Resize with aspect + padding to (target_w, target_h).
-    meta: x, y, w, h for the core region inside the padded canvas
-    """
-    src_w, src_h = img.size
-    if src_w <= 0 or src_h <= 0:
-        out = img.resize((target_w, target_h))
-        return out, {"x": 0, "y": 0, "w": target_w, "h": target_h, "src_w": src_w, "src_h": src_h}
-
-    scale = min(target_w / src_w, target_h / src_h)
-    new_w = max(1, int(src_w * scale))
-    new_h = max(1, int(src_h * scale))
-
-    img_rs = img.resize((new_w, new_h), Image.LANCZOS)
-    canvas = Image.new("RGB", (target_w, target_h), fill)
-    x = (target_w - new_w) // 2
-    y = (target_h - new_h) // 2
-    canvas.paste(img_rs, (x, y))
-    return canvas, {"x": x, "y": y, "w": new_w, "h": new_h, "src_w": src_w, "src_h": src_h}
-
-
-def unletterbox(img_lb: Image.Image, meta: Dict[str, int]) -> Image.Image:
-    x, y, w, h = meta["x"], meta["y"], meta["w"], meta["h"]
-    return img_lb.crop((x, y, x + w, y + h))
-
-
-def paste_into_canvas(canvas_mode: str, canvas_size: Tuple[int, int], core_img: Image.Image, meta: Dict[str, int], fill):
-    """
-    Pastes core_img into a (target_w, target_h) canvas at meta x, y.
-    """
-    x, y, w, h = meta["x"], meta["y"], meta["w"], meta["h"]
-    canvas = Image.new(canvas_mode, canvas_size, fill)
-    if core_img.size != (w, h):
-        core_img = core_img.resize((w, h), Image.BILINEAR)
-    canvas.paste(core_img, (x, y))
-    return canvas
-
-
-def infer_garment_class_from_path(relpath: str) -> str:
-    """
-    'upper_body' | 'lower_body' | 'dresses'
-    """
-    s = (relpath or "").lower().replace("\\", "/")
-    if any(k in s for k in ["dress", "dresses", "sarafan", "plate", "плать", "сараф"]):
-        return "dresses"
-    if any(k in s for k in ["pants", "trouser", "jeans", "skirt", "short", "брюк", "джин", "юбк", "шорт"]):
-        return "lower_body"
-    return "upper_body"
-
-
-def guess_garment_description(relpath: str) -> str:
-    s = (relpath or "").lower().replace("\\", "/")
-    mapping = [
-        (["shearling", "дублен", "sheepskin"], "a shearling jacket"),
-        (["coat", "пальт", "overcoat"], "a coat"),
-        (["jacket", "куртк", "парка", "parka", "bomber"], "a jacket"),
-        (["blazer", "пидж", "suit"], "a blazer"),
-        (["hoodie", "худи"], "a hoodie"),
-        (["sweater", "свит", "jumper"], "a sweater"),
-        (["shirt", "рубаш"], "a shirt"),
-        (["tshirt", "tee", "футбол"], "a t-shirt"),
-        (["dress", "плать", "sarafan"], "a dress"),
-        (["pants", "jeans", "брюк", "джин"], "pants"),
-        (["skirt", "юбк"], "a skirt"),
-    ]
-    for keys, desc in mapping:
-        if any(k in s for k in keys):
-            return desc
-
-    base = os.path.splitext(os.path.basename(s))[0]
-    base = re.sub(r"[_\-]+", " ", base)
-    base = re.sub(r"\d+", " ", base)
-    base = re.sub(r"\s+", " ", base).strip()
-    if len(base) >= 3:
-        return "a " + " ".join(base.split()[:4])
-    return "a piece of clothing"
-
-
-def apply_safety_clamp(mask_full: Image.Image, meta: Dict[str, int], garment_class: str, clamp_strength: float) -> Image.Image:
-    """
-    Universal safeguard against the edit region drifting up or down:
-    - upper_body: keep the mask above the hip line (larger clamp_strength moves the cutoff higher)
-    - lower_body: keep the mask below the waist/hip line (larger clamp_strength moves the cutoff lower)
-    - dresses: leave untouched
-
-    clamp_strength: 0..1 (0 = almost no effect, 1 = stronger)
-    """
-    if garment_class == "dresses":
-        return mask_full
-
-    tw, th = mask_full.size
-    x, y, w, h = meta["x"], meta["y"], meta["w"], meta["h"]
-
-    # base cut lines (fractions of core height), empirical for full-body shots
-    # upper_body: cutoff around 0.60..0.72 of core height
-    # lower_body: cutoff around 0.34..0.48 of core height
-    clamp_strength = clamp_float(clamp_strength, 0.0, 1.0)
-
-    if garment_class == "upper_body":
-        lo, hi = 0.60, 0.72
-        frac = lo + (hi - lo) * (1.0 - clamp_strength)  # clamp_strength up => cutoff closer to lo (higher)
-        cut_y = y + int(frac * h)
-        keep = mask_full.crop((0, 0, tw, max(0, min(th, cut_y))))
-        out = Image.new("L", (tw, th), 0)
-        out.paste(keep, (0, 0))
-        return out
-
-    if garment_class == "lower_body":
-        lo, hi = 0.34, 0.48
-        frac = lo + (hi - lo) * clamp_strength  # clamp_strength up => cutoff closer to hi (lower)
-        cut_y = y + int(frac * h)
-        keep = mask_full.crop((0, max(0, min(th, cut_y)), tw, th))
-        out = Image.new("L", (tw, th), 0)
-        out.paste(keep, (0, max(0, min(th, cut_y))))
-        return out
-
-    return mask_full
-
-
 # =========================
-# Model init (
+# Model init (baseline IDM-VTON)
 # =========================
 base_path = "yisol/IDM-VTON"
 
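
The deleted helpers implemented an aspect-preserving letterbox round trip: pad the photo to the working size, remember the placement, run the model, then crop the content region back out. A small sketch of how they composed, reusing the `letterbox`/`unletterbox` definitions removed above (the input size is illustrative):

from PIL import Image

src = Image.new("RGB", (900, 1600))            # stand-in for a person photo
canvas, meta = letterbox(src, 768, 1024)       # pad to 768x1024 with grey borders
assert canvas.size == (768, 1024)
core = unletterbox(canvas, meta)               # crop the content region back out
assert core.size == (meta["w"], meta["h"])     # (576, 1024): source aspect kept
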
@@ -387,75 +233,58 @@ pipe.unet_encoder = UNet_Encoder
 
 
 # =========================
-# Inference
+# Inference (baseline like your original)
 # =========================
 @spaces.GPU
 def start_tryon(
     human_pil: Image.Image,
     garm_img: Image.Image,
-    garm_relpath: str = "",
-    garment_type_override: str = "auto",  # auto | upper_body | lower_body | dresses
     auto_mask: bool = True,
-
-
-
-    guidance_scale: float = 3.8,
-    strength: float = 0.90,
-    seed: int = -1,
-    max_side: int = 1024,
-    prompt_override: str = "",
-    negative_prompt: str = "monochrome, lowres, bad anatomy, worst quality, low quality",
+    crop_center: bool = True,
+    denoise_steps: int = 25,
+    seed: int = 42,
 ) -> Image.Image:
     device = "cuda" if torch.cuda.is_available() else "cpu"
     dtype = torch.float16 if device == "cuda" else torch.float32
 
     if device == "cuda":
         openpose_model.preprocessor.body_estimation.model.to(device)
-
     pipe.to(device)
     pipe.unet_encoder.to(device)
 
+    # fixed resolution baseline
+    garm_img = garm_img.convert("RGB").resize((768, 1024))
     human_img_orig = human_pil.convert("RGB")
-    src_w, src_h = human_img_orig.size
 
-
-
-
-
-
-
-
-
-
-
-
-
-    if garment_type_override and garment_type_override != "auto":
-        cloth_class = garment_type_override
+    # crop center baseline
+    if crop_center:
+        width, height = human_img_orig.size
+        target_width = int(min(width, height * (3 / 4)))
+        target_height = int(min(height, width * (4 / 3)))
+        left = (width - target_width) / 2
+        top = (height - target_height) / 2
+        right = (width + target_width) / 2
+        bottom = (height + target_height) / 2
+        cropped_img = human_img_orig.crop((left, top, right, bottom))
+        crop_size = cropped_img.size
+        human_img = cropped_img.resize((768, 1024))
     else:
-
+        human_img = human_img_orig.resize((768, 1024))
+        crop_size = None
+        left = top = 0
 
-    #
+    # mask baseline (upper_body)
     if auto_mask:
-
-
-
-
-        mask_core_384, _ = get_mask_location("hd", cloth_class, model_parse, keypoints)
-        mask_core = mask_core_384.resize((w, h), Image.BILINEAR)
-
-        mask_full = Image.new("L", (target_w, target_h), 0)
-        mask_full.paste(mask_core, (x, y))
-
-        if safety_clamp:
-            mask_full = apply_safety_clamp(mask_full, lb_meta, cloth_class, clamp_strength)
-        mask = mask_full
+        keypoints = openpose_model(human_img.resize((384, 512)))
+        model_parse, _ = parsing_model(human_img.resize((384, 512)))
+        mask, _ = get_mask_location("hd", "upper_body", model_parse, keypoints)
+        mask = mask.resize((768, 1024))
     else:
-        mask = Image.new("L", (
+        mask = Image.new("L", (768, 1024), 0)
 
-    #
-
-
+    # DensePose baseline
+    human_img_arg = _apply_exif_orientation(human_img.resize((384, 512)))
+    human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")
 
     args = apply_net.create_argument_parser().parse_args(
         (
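
The new crop_center branch above commits to a fixed 3:4 window: the largest centered 3:4 crop that fits the photo, later resized to 768x1024. The box math in isolation:

from PIL import Image

def center_crop_3x4(img: Image.Image) -> Image.Image:
    # Largest centered box with width:height = 3:4 that fits inside img.
    width, height = img.size
    target_width = int(min(width, height * (3 / 4)))
    target_height = int(min(height, width * (4 / 3)))
    left = (width - target_width) / 2
    top = (height - target_height) / 2
    return img.crop((left, top, left + target_width, top + target_height))

print(center_crop_3x4(Image.new("RGB", (1200, 1200))).size)  # (900, 1200)
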
@@ -469,28 +298,18 @@ def start_tryon(
             "cuda" if device == "cuda" else "cpu",
         )
     )
-
-
-
-    pose_img = paste_into_canvas("RGB", (target_w, target_h), pose_core, lb_meta, (127, 127, 127))
+    pose_img = args.func(args, human_img_arg)
+    pose_img = pose_img[:, :, ::-1]
+    pose_img = Image.fromarray(pose_img).resize((768, 1024))
 
-    #
-
-
-
+    # fixed prompts baseline
+    garment_des = "a garment"
+    prompt_main = "model is wearing " + garment_des
+    prompt_cloth = "a photo of " + garment_des
+    negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
 
-
-
-
-    # ---- params ----
-    denoise_steps = clamp_int(denoise_steps, 15, 60)
-    guidance_scale = clamp_float(guidance_scale, 0.0, 12.0)
-    strength = clamp_float(strength, 0.50, 1.00)
-    max_side = clamp_int(max_side, 640, 2048)
-
-    seed = int(seed) if seed is not None else -1
-    if seed < 0:
-        seed = int.from_bytes(os.urandom(2), "big") + int(time.time() * 1000) % 1000000
+    denoise_steps = clamp_int(denoise_steps, 20, 40)
+    seed = clamp_int(seed, 0, 999999)
 
     with torch.no_grad():
         if device == "cuda":
@@ -527,7 +346,8 @@ def start_tryon(
             )
 
             pose_t = tensor_transfrom(pose_img).unsqueeze(0).to(device=device, dtype=dtype)
-            garm_t = tensor_transfrom(
+            garm_t = tensor_transfrom(garm_img).unsqueeze(0).to(device=device, dtype=dtype)
+
             generator = torch.Generator(device).manual_seed(seed)
 
             images = pipe(
@@ -537,28 +357,28 @@ def start_tryon(
                 negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device=device, dtype=dtype),
                 num_inference_steps=denoise_steps,
                 generator=generator,
-                strength=
+                strength=1.0,
                 pose_img=pose_t,
                 text_embeds_cloth=prompt_embeds_c.to(device=device, dtype=dtype),
                 cloth=garm_t,
                 mask_image=mask,
-                image=
-                height=
-                width=
-                ip_adapter_image=
-                guidance_scale=
+                image=human_img,
+                height=1024,
+                width=768,
+                ip_adapter_image=garm_img.resize((768, 1024)),
+                guidance_scale=2.0,
             )[0]
 
-
-
-
-
-
-    return
+    out_img = images[0]
+    if crop_center and crop_size is not None:
+        out_img_rs = out_img.resize(crop_size)
+        human_img_orig.paste(out_img_rs, (int(left), int(top)))
+        return human_img_orig
+    return out_img
 
 
 # =========================
-# UI
+# UI (simple baseline)
 # =========================
 CUSTOM_CSS = """
 footer {display:none !important;}
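
The paste-back added at the end of `start_tryon` keeps every pixel outside the crop window untouched: the 768x1024 result is resized back to the crop's native size and pasted over the original at the crop offset. A standalone sketch with illustrative sizes:

from PIL import Image

human_img_orig = Image.new("RGB", (1200, 1200), (230, 230, 230))  # original photo
out_img = Image.new("RGB", (768, 1024), (90, 90, 90))             # stand-in pipe output
crop_size, left, top = (900, 1200), 150, 0                        # from the crop step

out_img_rs = out_img.resize(crop_size)                   # back to the crop's native size
human_img_orig.paste(out_img_rs, (int(left), int(top)))  # overwrite only the window
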
@@ -583,23 +403,10 @@ def on_gallery_select(files_list: List[str], evt: gr.SelectData):
     return files_list[idx], f"👕 Выбрано: {files_list[idx]}"
 
 
-def tryon_ui(
-    person_pil,
-    selected_filename,
-    garment_type_override,
-    auto_mask,
-    safety_clamp,
-    clamp_strength,
-    steps,
-    cfg,
-    strength,
-    seed,
-    max_side,
-    prompt_override,
-):
+def tryon_ui(person_pil, selected_filename):
     yield None, "⏳ Обработка... (первый запуск может быть дольше)"
 
-    ok, msg = allow_call(2.0)
+    ok, msg = allow_call(2.5)
     if not ok:
         yield None, msg
         return
@@ -620,24 +427,16 @@ def tryon_ui(
         out_img = start_tryon(
             human_pil=person_pil,
             garm_img=garm,
-
-
-
-
-            clamp_strength=float(clamp_strength),
-            denoise_steps=int(steps),
-            guidance_scale=float(cfg),
-            strength=float(strength),
-            seed=int(seed),
-            max_side=int(max_side),
-            prompt_override=str(prompt_override or "").strip(),
+            auto_mask=True,
+            crop_center=True,
+            denoise_steps=25,
+            seed=42,
         )
         yield out_img, "✅ Готово"
     except Exception as e:
         yield None, f"❌ Ошибка: {type(e).__name__}: {str(e)[:220]}"
 
 
-# preload garments
 ensure_garments_downloaded()
 _initial_files = list_garments()
 _initial_items = build_gallery_items(_initial_files)
@@ -664,36 +463,6 @@ with gr.Blocks(title="Virtual Try-On Rendez-vous", css=CUSTOM_CSS) as demo:
                 allow_preview=True,
             )
 
-            with gr.Accordion("⚙️ Настройки", open=False):
-                garment_type_override = gr.Dropdown(
-                    choices=["auto", "upper_body", "lower_body", "dresses"],
-                    value="auto",
-                    label="Тип одежды (override)",
-                )
-                auto_mask = gr.Checkbox(value=True, label="Auto mask (parsing + openpose)")
-
-                safety_clamp = gr.Checkbox(
-                    value=True,
-                    label="Safety clamp (защита от съезда зоны редактирования)",
-                )
-                clamp_strength = gr.Slider(
-                    0.0, 1.0, value=0.55, step=0.01,
-                    label="Clamp strength (0 = мягко, 1 = сильнее)",
-                )
-
-                steps = gr.Slider(15, 60, value=34, step=1, label="Шаги (num_inference_steps)")
-                cfg = gr.Slider(0.0, 12.0, value=3.8, step=0.1, label="Guidance scale (CFG)")
-                strength = gr.Slider(0.50, 1.00, value=0.90, step=0.01, label="Strength")
-
-                seed = gr.Number(value=-1, precision=0, label="Seed (-1 = случайный)")
-                max_side = gr.Slider(768, 1536, value=1024, step=64, label="Макс. сторона (динамический размер)")
-
-                prompt_override = gr.Textbox(
-                    value="",
-                    label="Описание одежды (опц.)",
-                    placeholder="Напр.: a blazer / a dress / a t-shirt ... (если пусто — авто по имени файла)",
-                )
-
             run = gr.Button("Примерить", variant="primary")
             status = gr.Textbox(value="Ожидание...", interactive=False)
@@ -714,20 +483,7 @@
 
     run.click(
         fn=tryon_ui,
-        inputs=[
-            person,
-            selected_garment_state,
-            garment_type_override,
-            auto_mask,
-            safety_clamp,
-            clamp_strength,
-            steps,
-            cfg,
-            strength,
-            seed,
-            max_side,
-            prompt_override,
-        ],
+        inputs=[person, selected_garment_state],
         outputs=[out, status],
         concurrency_limit=1,
     )
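
After the change, the click wiring reduces to two inputs over a generator callback. A minimal runnable sketch of the same Gradio pattern (component labels and the garment path are illustrative):

import gradio as gr

def tryon_ui(person_pil, selected_filename):
    # Generator callback: the first yield shows progress, the second delivers the result.
    yield None, "⏳ Обработка..."
    yield person_pil, f"✅ Готово ({selected_filename})"

with gr.Blocks() as demo:
    person = gr.Image(type="pil", label="Person")
    selected_garment_state = gr.State("garments/example.jpg")
    out = gr.Image(label="Result")
    status = gr.Textbox(value="Ожидание...", interactive=False)
    run = gr.Button("Примерить", variant="primary")
    run.click(fn=tryon_ui, inputs=[person, selected_garment_state],
              outputs=[out, status], concurrency_limit=1)

# demo.launch()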