Spaces:

adowu
/

Qwen-Image-Edit-Optimized

Runtime error

File size: 21,689 Bytes

65a007f

import glob
import math
import os
import random

# Redirect the Hugging Face caches to /tmp. These assignments must run before
# gradio / diffusers / the pipeline modules are imported: huggingface_hub reads
# its cache location from the environment when it is first imported, so setting
# the variables afterwards (as this file used to do) has no effect.
os.environ['HF_HOME'] = '/tmp'
os.environ['TRANSFORMERS_CACHE'] = '/tmp'

import cv2  # noqa: E402
import gradio as gr  # noqa: E402
import numpy as np  # noqa: E402
import spaces  # noqa: E402
import torch  # noqa: E402
from PIL import Image  # noqa: E402
from diffusers import FlowMatchEulerDiscreteScheduler  # noqa: E402
import mediapipe as mp  # noqa: E402

from optimization import optimize_pipeline_  # noqa: E402
from qwenimage.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline  # noqa: E402
from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3  # noqa: E402
from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel  # noqa: E402


HAIR_IMAGE_DIR = "hair_png"

def list_hair_images():
    """Return the file names of the PNG hair templates in ``HAIR_IMAGE_DIR``.

    Only base names (no directory part) are returned. The list is sorted so
    that the dropdown built from it has a stable order; ``glob`` itself gives
    no ordering guarantee. A missing or empty directory yields ``[]``.
    """
    pattern = os.path.join(HAIR_IMAGE_DIR, "*.png")
    return sorted(os.path.basename(path) for path in glob.glob(pattern))

def load_hair_image(filename):
    """Load one hair template from ``HAIR_IMAGE_DIR`` as an RGB PIL image.

    Args:
        filename: Base name of a file inside ``HAIR_IMAGE_DIR`` (the value
            chosen in the hair dropdown). ``None`` or an empty string means
            "nothing selected".

    Returns:
        A new RGB image, or ``None`` when nothing is selected.

    Errors raised by ``Image.open`` (for example when the file does not
    exist) propagate to the caller.
    """
    if not filename:
        return None
    path = os.path.join(HAIR_IMAGE_DIR, filename)
    # convert() hands back an independent, fully decoded copy, so the source
    # file can be closed as soon as the conversion is done.
    with Image.open(path) as source:
        return source.convert("RGB")

# ===============================
# --- Model Loading ---
# ===============================

# Weights are requested in bfloat16; inference runs on CUDA when torch reports
# it as available and on the CPU otherwise.
dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"

# Configuration for the flow-matching Euler scheduler handed to the pipeline.
# base_shift and max_shift are both log(3).
# NOTE(review): with use_dynamic_shifting enabled this presumably makes the
# shift independent of the image sequence length -- confirm against the
# scheduler implementation.
scheduler_config = {
    "base_image_seq_len": 256,
    "base_shift": math.log(3),
    "invert_sigmas": False,
    "max_image_seq_len": 8192,
    "max_shift": math.log(3),
    "num_train_timesteps": 1000,
    "shift": 1.0,
    "shift_terminal": None,
    "stochastic_sampling": False,
    "time_shift_type": "exponential",
    "use_beta_sigmas": False,
    "use_dynamic_shifting": True,
    "use_exponential_sigmas": False,
    "use_karras_sigmas": False,
}
scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)

# Load the pipeline with the custom scheduler and move it to the chosen device
pipe = QwenImageEditPlusPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit-2509",
    scheduler=scheduler,
    torch_dtype=dtype,
).to(device)

# Apply the LoRA (Lightning, 4 steps) and fuse it with scale 1.0
pipe.load_lora_weights(
    "lightx2v/Qwen-Image-Lightning",
    weight_name="Qwen-Image-Edit-2509/Qwen-Image-Edit-2509-Lightning-4steps-V1.0-bf16.safetensors",
    weight_dtype=torch.bfloat16,
)
pipe.fuse_lora(lora_scale=1.0)

# Re-type the already loaded transformer as the project-local
# QwenImageTransformer2DModel, then install the FA3 attention processor on it.
# NOTE(review): optimize_pipeline_ is imported at the top of the file but is
# not called anywhere in this section -- confirm whether that is intentional.
pipe.transformer.__class__ = QwenImageTransformer2DModel
pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())

# ===============================
# --- Constants & Prompts ---
# ===============================

# Largest seed offered by the UI / drawn at random (int32 maximum).
MAX_SEED = np.iinfo(np.int32).max
DEFAULT_SEED = 0
DEFAULT_RANDOMIZE = True
DEFAULT_TRUE_GUIDANCE_SCALE = 1.0
DEFAULT_NUM_INFERENCE_STEPS = 4

# picture1 = hair / base image, picture2 = face image.
# Adjacent string literals are concatenated verbatim, so every sentence except
# the last one carries its own trailing space; without it the sentences run
# together ("picture2.Use only ...").
FIXED_PROMPT = (
    "Replace the blurred face in picture1 with the face from picture2. "
    "Use only the visible face from picture2, and ignore all the surrounding white areas. "
    "Preserve picture1’s hairstyle, head shape, lighting, shadows, and background exactly as they are. "
    "Place the new face in the correct position and adjust its size, rotation angle, and perspective so that it matches the original head orientation naturally. "
    "Match the inserted face to picture2's skin tone, lighting direction, and contrast. "
    "Also adjust the skin tone of the neck and body in picture1 so that it matches the skin tone of the face from picture2 naturally. "
    "Blend edges smoothly so the result looks like a single realistic person with no visible editing artifacts."
)

FIXED_NEGATIVE_PROMPT = "blurry, extra lines, color bleeding"

# ===============================
# --- I18N Dictionary ---
# ===============================

# UI string table: I18N[key][lang] -> text, where lang is one of the codes
# "en", "ja" or "zh". Every key provides all three languages.
I18N = {
    "title": {
        "en": "Hairstyle Transformer",
        "ja": "髪型変換",
        "zh": "发型变换",
        },
    "notice": {
        "en": (
            "Note: When using this software, please comply with applicable laws and ensure that you do not infringe on the rights of others. "
            "The software developer assumes no responsibility for how users utilize this software. "
            "When posting images online (SNS, etc.), be sure to use source photos of fictional people created with image-generation tools, "
            "and never engage in activities such as deepfakes that impersonate or mislead others."
        ),
        "ja": (
            "注意：本ソフトウェアを利用する際は、関連する法規制を遵守し、他者の権利を侵害しないよう十分ご注意ください。"
            "また、ソフトウェア開発者は、ユーザーによる利用方法について一切の責任を負いません。"
            "SNS等で公開する際は、画像生成アプリなどで作成した実在しない人物の画像を入力素材としてご使用ください。"
            "他者を不当に模倣・誤認させるディープフェイクなどの行為は絶対に行わないでください。"
        ),
        "zh": (
            "注意：使用本软件时，请遵守相关法律法规，并注意不要侵犯他人的权利。"
            "软件开发者对用户的使用方式不承担任何责任。"
            "在社交平台（SNS等）公开发布时，请使用通过图像生成工具创建的虚构人物图片作为输入，"
            "绝不可从事深度伪造等不正当模仿或误导他人的行为。"
        )
    },
    "face_input": {
        "en": "Face image (picture2)",
        "ja": "Face入力画像（picture2）",
        "zh": "人脸图像（picture2）",
    },
    "hair_input": {
        "en": "Hair image (picture1)",
        "ja": "Hair 画像（picture1）",
        "zh": "头发图像（picture1）",
    },
    "accordion": {"en": "Advanced settings", "ja": "詳細設定", "zh": "高级设置"},
    "seed": {"en": "Seed", "ja": "Seed", "zh": "Seed"},
    "rand": {"en": "Randomize seed", "ja": "ランダムシード", "zh": "随机种子"},
    "tgs": {"en": "True guidance scale", "ja": "True guidance scale", "zh": "True guidance scale"},
    "steps": {"en": "Steps", "ja": "生成ステップ数", "zh": "生成步数"},
    "run": {"en": "Generate", "ja": "生成", "zh": "生成"},
    "output": {"en": "Output image", "ja": "出力画像", "zh": "输出图像"},
    "status": {"en": "Status", "ja": "ステータス", "zh": "状态"},
    "status_ok": {
        "en": "Generated 1 image (PNG).",
        "ja": "1枚生成しました（PNG）。",
        "zh": "已生成 1 张图片（PNG）。",
    },
    "err_no_img": {
        "en": "Error: Please upload both Face and Hair images.",
        "ja": "エラー: Face画像とHair画像の両方をアップロードしてください。",
        "zh": "错误：请先上传 Face 和 Hair 两张图片。",
    },
    "lang_label": {"en": "UI Language", "ja": "UI言語", "zh": "界面语言"},
}


def t(key, lang):
    """Return the UI string registered under *key* for language *lang*.

    A language code that has no translation for *key* falls back to the
    English text instead of raising, so a caller passing an unsupported
    code still gets a readable message.

    Raises:
        KeyError: If *key* is not present in ``I18N``.
    """
    translations = I18N[key]
    if lang in translations:
        return translations[lang]
    return translations["en"]


# MediaPipe Face Mesh solution module; the landmark-based masking helpers below
# open a FaceMesh instance from it.
mp_face = mp.solutions.face_mesh

# Face Mesh landmark indices tracing the face outline. The masking helpers
# convert them to pixel coordinates and fill them as a polygon.
FACE_OVAL = [
    10,338,297,332,284,251,389,356,454,323,361,288,
    397,365,379,378,400,377,152,148,176,149,150,136,
    172,58,132,93,234,127,162,21,54,103,67,109
]

# Representative eyebrow points (lower side)
LEFT_BROW = [105, 66, 107]       # around the middle of the left eyebrow
RIGHT_BROW = [334, 293, 300]     # around the middle of the right eyebrow


# ===============================
# --- Face Preprocess with MediaPipe + OpenCV ---
# ===============================

def _center_square_resize(img_rgb, target_size):
    """Center-crop *img_rgb* to a square and resize it to *target_size* px."""
    h, w = img_rgb.shape[:2]
    side = min(w, h)
    x1 = (w - side) // 2
    y1 = (h - side) // 2
    crop = img_rgb[y1:y1 + side, x1:x1 + side]
    return Image.fromarray(crop).resize((target_size, target_size), Image.LANCZOS)


def preprocess_face(image: Image.Image, target_size: int = 1024) -> Image.Image:
    """Produce a square image in which the detected face is centered.

    The largest face found by MediaPipe face detection is scaled so that the
    longer side of its bounding box covers 60% of *target_size*. The image is
    then placed on a white ``target_size`` x ``target_size`` canvas with the
    face center (shifted upwards by 15% of the face height) at the canvas
    center. Areas not covered by the source image stay white.

    When no face is detected, or the detected box is degenerate, the image is
    simply center-cropped to a square and resized (no white padding).

    Args:
        image: Input image; converted to RGB internally.
        target_size: Side length of the returned square image in pixels.

    Returns:
        An RGB image of size ``target_size`` x ``target_size``.
    """
    face_ratio = 0.6  # share of the canvas side taken by the face box

    img_rgb = np.array(image.convert("RGB"))
    h, w, _ = img_rgb.shape

    # Use a local name that does not shadow the module-level ``mp_face``
    # (which refers to the Face Mesh solution, not face detection).
    face_detection = mp.solutions.face_detection
    with face_detection.FaceDetection(model_selection=1, min_detection_confidence=0.5) as fd:
        results = fd.process(img_rgb)

    if not results.detections:
        return _center_square_resize(img_rgb, target_size)

    # Pick the detection with the largest (normalized) bounding box.
    def bbox_area(det):
        rel = det.location_data.relative_bounding_box
        return rel.width * rel.height

    box = max(results.detections, key=bbox_area).location_data.relative_bounding_box

    # MediaPipe boxes are normalized to 0..1 -> convert to pixel coordinates.
    x = int(box.xmin * w)
    y = int(box.ymin * h)
    fw = int(box.width * w)
    fh = int(box.height * h)

    # A box that collapses to zero pixels cannot define a scale factor.
    if max(fw, fh) <= 0:
        return _center_square_resize(img_rgb, target_size)

    # Face center, nudged upwards a little.
    cx = x + fw // 2
    cy = y + fh // 2
    cy_adjusted = cy - int(fh * 0.15)

    desired_face_size = int(target_size * face_ratio)
    scale = desired_face_size / max(fw, fh)

    # Translation that puts the scaled face center at the canvas center.
    offset_x = target_size // 2 - int(cx * scale)
    offset_y = target_size // 2 - int(cy_adjusted * scale)

    # Scale and translate in a single warp straight into the target canvas.
    # Resizing the whole photo first would allocate an image of
    # (w * scale) x (h * scale) pixels, which becomes enormous when the face is
    # small in a large photo. The half-pixel term keeps the pixel-center
    # convention of cv2.resize, so placement matches a resize-then-paste.
    half_pixel = 0.5 * (scale - 1.0)
    matrix = np.array(
        [
            [scale, 0.0, offset_x + half_pixel],
            [0.0, scale, offset_y + half_pixel],
        ],
        dtype=np.float64,
    )
    canvas = cv2.warpAffine(
        img_rgb,
        matrix,
        (target_size, target_size),
        flags=cv2.INTER_LANCZOS4,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=(255, 255, 255),  # white background outside the photo
    )

    return Image.fromarray(canvas)


# ===============================
# --- Face Masking with MediaPipe Face Mesh (blur / white-out) ---
# ===============================

def blur_face_with_landmarks(image_pil):
    """Blur every detected face in *image_pil* and leave the rest untouched.

    For each face found by MediaPipe Face Mesh the blurred area is the
    ``FACE_OVAL`` polygon plus a polygon made of the eyebrow points shifted
    upwards. If no face is found, the whole image is blurred.

    The mask is built from filled polygons only. An earlier version also ran
    ``cv2.floodFill`` seeded at the image center; on an already filled polygon
    that is a no-op when the center lies inside the face and floods the entire
    background (blurring the whole picture) when it does not, so it is gone.

    Args:
        image_pil: Input PIL image, or ``None``.

    Returns:
        A new RGB PIL image, or ``None`` if *image_pil* is ``None``.
    """
    if image_pil is None:
        return None

    rgb = np.array(image_pil.convert("RGB"))
    h, w = rgb.shape[:2]

    with mp_face.FaceMesh(
        static_image_mode=True,
        max_num_faces=5,
        refine_landmarks=True,
    ) as face:
        res = face.process(rgb)

    # Gaussian blur works per channel, so it is applied to the RGB array
    # directly; no RGB<->BGR round trip is needed.
    blurred = cv2.GaussianBlur(rgb, (301, 301), 0)

    # No face -> blur the whole image.
    if not res.multi_face_landmarks:
        return Image.fromarray(blurred)

    face_mask = np.zeros((h, w), dtype=np.uint8)

    for lm in res.multi_face_landmarks:
        pts = lm.landmark

        oval = [(int(pts[i].x * w), int(pts[i].y * h)) for i in FACE_OVAL]
        brows = [(int(pts[i].x * w), int(pts[i].y * h)) for i in LEFT_BROW + RIGHT_BROW]

        # Reference length used to size the upward shift of the brow polygon.
        # NOTE(review): the original called the second point "chin_center",
        # but it averages the FIRST eight FACE_OVAL entries -- confirm which
        # part of the outline those landmarks cover before relying on the name.
        brow_center = np.mean(brows, axis=0)
        outline_ref = np.mean(oval[:8], axis=0)
        ref_len = np.linalg.norm(brow_center - outline_ref)

        offset = int(ref_len * 0.12)
        forehead = [(px, py - offset) for (px, py) in brows]

        cv2.fillPoly(face_mask, [np.array(oval, np.int32)], 255)
        cv2.fillPoly(face_mask, [np.array(forehead, np.int32)], 255)

    # One blur of the source image, composited through the union of all faces.
    output = np.where(face_mask[:, :, None] == 255, blurred, rgb)

    return Image.fromarray(output)


def whiteout_except_face(image_pil):
    """Fade everything except the detected face area(s) to white.

    For each face found by MediaPipe Face Mesh the kept area is the
    ``FACE_OVAL`` polygon with all rows above ``brow_y - margin`` removed.
    The union of these areas is feathered with a Gaussian (sigma 25) and used
    as an alpha mask over a white background. If no face is found the result
    is a completely white image of the same size.

    The mask is built from the filled polygon only. An earlier version also
    ran ``cv2.floodFill`` seeded at the image center, which is a no-op when
    the center lies inside the face and otherwise floods the whole background
    into the mask, so nothing below the cut line was whitened.

    Args:
        image_pil: Input PIL image, or ``None``.

    Returns:
        A new RGB PIL image, or ``None`` if *image_pil* is ``None``.
    """
    if image_pil is None:
        return None

    rgb = np.array(image_pil.convert("RGB"))
    h, w = rgb.shape[:2]

    with mp_face.FaceMesh(
        static_image_mode=True,
        max_num_faces=5,
        refine_landmarks=True,
    ) as face:
        res = face.process(rgb)

    # No face -> everything white.
    if not res.multi_face_landmarks:
        return Image.fromarray(np.full_like(rgb, 255))

    mask_total = np.zeros((h, w), dtype=np.uint8)

    for lm in res.multi_face_landmarks:
        pts = lm.landmark

        oval = [(int(pts[i].x * w), int(pts[i].y * h)) for i in FACE_OVAL]
        brows = [(int(pts[i].x * w), int(pts[i].y * h)) for i in LEFT_BROW + RIGHT_BROW]

        brow_y = int(np.mean([p[1] for p in brows]))
        # NOTE(review): the original named this "chin_y", but it averages the
        # FIRST eight FACE_OVAL entries -- confirm which part of the outline
        # those landmarks cover. The arithmetic is kept as it was.
        outline_ref_y = int(np.mean([p[1] for p in oval[:8]]))
        ref_len = abs(outline_ref_y - brow_y)

        margin = int(ref_len * 0.50)

        mask = np.zeros((h, w), dtype=np.uint8)
        cv2.fillPoly(mask, [np.array(oval, np.int32)], 255)

        # Drop everything more than `margin` pixels above the brow line.
        cut_y = max(brow_y - margin, 0)
        mask[:cut_y, :] = 0

        mask_total = cv2.bitwise_or(mask_total, mask)

    # Feather the mask edge so the face fades into the white background.
    soft_mask = cv2.GaussianBlur(mask_total, (0, 0), 25)
    alpha = (soft_mask.astype(np.float32) / 255.0)[:, :, None]

    output = rgb * alpha + 255.0 * (1.0 - alpha)

    return Image.fromarray(output.astype(np.uint8))


# ===============================
# --- Unified Inference Function ---
# ===============================

@spaces.GPU()
def infer(
    face_image,
    hair_image,
    seed=DEFAULT_SEED,
    randomize_seed=DEFAULT_RANDOMIZE,
    true_guidance_scale=DEFAULT_TRUE_GUIDANCE_SCALE,
    num_inference_steps=DEFAULT_NUM_INFERENCE_STEPS,
    lang="en",
    progress=gr.Progress(track_tqdm=True),
):
    """Run one face-into-hairstyle edit and return ``(image, status_text)``.

    Args:
        face_image: Face picture (picture2) as a PIL image or a path/file
            object accepted by ``Image.open``.
        hair_image: File name of the hair template (picture1) as chosen in
            the dropdown; resolved through ``load_hair_image``.
        seed: Seed for the torch generator; ignored when *randomize_seed*.
        randomize_seed: Draw a random seed in ``[0, MAX_SEED]`` instead.
        true_guidance_scale: Passed to the pipeline as ``true_cfg_scale``.
        num_inference_steps: Number of denoising steps.
        lang: UI language code used for the status and progress messages.
        progress: Gradio progress tracker.

    Returns:
        ``(result_image, status_message)``; ``(None, error_message)`` when
        either input image is missing.
    """
    # Slider values can arrive as floats; the generator needs an int seed.
    seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)

    if face_image is None or hair_image is None:
        return None, t("err_no_img", lang)

    # --- Convert inputs to PIL ---
    if isinstance(face_image, Image.Image):
        face_pil = face_image.convert("RGB")
    else:
        face_pil = Image.open(face_image).convert("RGB")

    # hair_image is the dropdown value (a file name string);
    # load_hair_image already returns an RGB image.
    hair_pil = load_hair_image(hair_image)
    if hair_pil is None:
        return None, t("err_no_img", lang)

    # --- Crop and resize so the face is centered (1024x1024) ---
    face_pil = preprocess_face(face_pil, target_size=1024)

    # --- Blur the face region, on the hair image only ---
    hair_pil = blur_face_with_landmarks(hair_pil)

    # --- White out everything but the face, on the face image only ---
    face_pil = whiteout_except_face(face_pil)

    # picture1 = hair (blurred base image), picture2 = face
    pil_images = [hair_pil, face_pil]

    generating_text = {"en": "Generating...", "ja": "生成中..."}.get(lang, "生成中...")
    progress(0.4, desc=generating_text)
    generator = torch.Generator(device=device).manual_seed(seed)

    try:
        result = pipe(
            image=pil_images,
            prompt=FIXED_PROMPT,
            negative_prompt=FIXED_NEGATIVE_PROMPT,
            num_inference_steps=int(num_inference_steps),
            generator=generator,
            true_cfg_scale=true_guidance_scale,
            num_images_per_prompt=1,
        ).images
    finally:
        # Release cached VRAM after inference, whether it succeeded or not.
        # (This call used to sit inside the pipe(...) argument list, which
        # made the whole module a SyntaxError.)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    done_text = {"en": "Done", "ja": "完了"}.get(lang, "完成")
    progress(1.0, desc=done_text)
    return result[0], t("status_ok", lang)
    #return result[0], t("status_ok", lang), hair_pil, face_pil

# ===============================
# --- Gradio UI Section ---
# ===============================

# Custom stylesheet passed to gr.Blocks: centers the "app-wrap" column with a
# maximum width and styles the "notice" banner and the "card" panel.
css = """
#app-wrap {margin: 0 auto; max-width: 1200px;}
.notice {
  background: #fff8e1;
  border: 1px solid #facc15;
  color: #713f12;
  padding: 12px 14px;
  border-radius: 12px;
  font-weight: 600;
  margin-bottom: 10px;
}
.card {
  background: white;
  border: 1px solid #e5e7eb;
  border-radius: 14px;
  padding: 14px;
  box-shadow: 0 1px 2px rgba(0,0,0,0.04);
}
"""

# Build the Gradio UI. All labels start in English; changing the language
# selector relabels the components through _switch_lang.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    lang_selector = gr.Radio(
        label=I18N["lang_label"]["en"],
        choices=[("English", "en"), ("日本語", "ja"), ("中文", "zh")],
        value="en",
        interactive=True,
    )

    title_md = gr.Markdown(I18N["title"]["en"])
    notice_html = gr.HTML(f"<div class='notice'>{I18N['notice']['en']}</div>")

    with gr.Column(elem_id="app-wrap"):
        with gr.Row():
            with gr.Column(scale=1):
                hair_dropdown = gr.Dropdown(
                    label=I18N["hair_input"]["en"],
                    choices=list_hair_images(),
                    value=None,
                )

                hair_preview = gr.Image(
                    label="Hair Preview",
                    type="pil",
                    height=320,
                    interactive=False,
                )

                face_image = gr.Image(
                    label=I18N["face_input"]["en"],
                    type="pil",
                    height=320,
                )

            with gr.Column(scale=1, elem_classes=["card"]):
                # Bound to a name so its label can follow the UI language.
                with gr.Accordion(I18N["accordion"]["en"], open=False) as advanced_settings:
                    seed = gr.Slider(
                        label=I18N["seed"]["en"],
                        minimum=0,
                        maximum=MAX_SEED,
                        step=1,
                        value=DEFAULT_SEED,
                    )
                    randomize_seed = gr.Checkbox(
                        label=I18N["rand"]["en"],
                        value=DEFAULT_RANDOMIZE,
                    )
                    true_guidance_scale = gr.Slider(
                        label=I18N["tgs"]["en"],
                        minimum=1.0,
                        maximum=10.0,
                        step=0.1,
                        value=DEFAULT_TRUE_GUIDANCE_SCALE,
                    )
                    num_inference_steps = gr.Slider(
                        label=I18N["steps"]["en"],
                        minimum=1,
                        maximum=40,
                        step=1,
                        value=DEFAULT_NUM_INFERENCE_STEPS,
                    )
                run_button = gr.Button(I18N["run"]["en"], variant="primary")

        result_image = gr.Image(
            label=I18N["output"]["en"],
            type="pil",
            format="png",
            height=520,
            show_download_button=True,
        )

        status_text = gr.Textbox(label=I18N["status"]["en"], interactive=False)

    def _switch_lang(lang):
        """Return one update per component, in the order of the outputs list below."""
        return (
            gr.update(label=I18N["lang_label"][lang]),  # lang_selector label
            I18N["title"][lang],                        # title_md markdown text
            gr.update(value=f"<div class='notice'>{I18N['notice'][lang]}</div>"),  # notice_html
            gr.update(label=I18N["hair_input"][lang]),  # hair_dropdown label
            gr.update(label=I18N["face_input"][lang]),  # face_image label
            gr.update(label=I18N["accordion"][lang]),   # advanced_settings label
            gr.update(label=I18N["seed"][lang]),
            gr.update(label=I18N["rand"][lang]),
            gr.update(label=I18N["tgs"][lang]),
            gr.update(label=I18N["steps"][lang]),
            gr.update(value=I18N["run"][lang]),
            gr.update(label=I18N["output"][lang]),
            gr.update(label=I18N["status"][lang]),
        )

    def update_hair_preview(selected):
        """Show the selected hair template, or clear the preview when nothing is selected."""
        if selected is None:
            return None
        return load_hair_image(selected)

    hair_dropdown.change(
        fn=update_hair_preview,
        inputs=[hair_dropdown],
        outputs=[hair_preview],
    )

    lang_selector.change(
        fn=_switch_lang,
        inputs=[lang_selector],
        outputs=[
            lang_selector,
            title_md,
            notice_html,
            # The hair-input label belongs to the dropdown; it used to be sent
            # to hair_preview, overwriting its "Hair Preview" label.
            hair_dropdown,
            face_image,
            advanced_settings,
            seed,
            randomize_seed,
            true_guidance_scale,
            num_inference_steps,
            run_button,
            result_image,
            status_text,
        ],
    )

    run_button.click(
        fn=infer,
        inputs=[
            face_image,
            hair_dropdown,
            seed,
            randomize_seed,
            true_guidance_scale,
            num_inference_steps,
            lang_selector,
        ],
        outputs=[result_image, status_text],
    )

if __name__ == "__main__":
    demo.launch()