Spaces:

adowu
/

Qwen-Image-Edit-Optimizedd

Runtime error

App Files Files Community

Qwen-Image-Edit-Optimizedd / app.py

adowu

Upload folder using huggingface_hub

18e7576 verified about 2 months ago

raw

history blame contribute delete

21.6 kB

	import glob
	import math
	import os
	import random

	# The Hugging Face libraries read their cache-location environment variables
	# when they are first imported, so the cache directory has to be configured
	# BEFORE gradio / spaces / diffusers / the qwenimage modules are imported.
	# (Previously these assignments ran after all of those imports.)
	os.environ['HF_HOME'] = '/tmp'
	os.environ['TRANSFORMERS_CACHE'] = '/tmp'

	# Third-party imports; their relative order is unchanged from the original.
	import cv2  # noqa: E402
	import gradio as gr  # noqa: E402
	import numpy as np  # noqa: E402
	import spaces  # noqa: E402
	import torch  # noqa: E402
	from PIL import Image  # noqa: E402
	from diffusers import FlowMatchEulerDiscreteScheduler  # noqa: E402
	import mediapipe as mp  # noqa: E402

	# Project-local modules.
	from optimization import optimize_pipeline_  # noqa: E402
	from qwenimage.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline  # noqa: E402
	from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3  # noqa: E402
	from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel  # noqa: E402


	HAIR_IMAGE_DIR = "hair_png"

	def list_hair_images():
	    """Return the base names of all ``*.png`` files in ``HAIR_IMAGE_DIR``.

	    The result is sorted: ``glob.glob`` yields entries in arbitrary,
	    filesystem-dependent order, which would otherwise make the order of
	    the choices differ between runs or machines.

	    Returns:
	        A sorted list of file names (without the directory part). The list
	        is empty when the directory is missing or contains no PNG files.
	    """
	    pattern = os.path.join(HAIR_IMAGE_DIR, "*.png")
	    return sorted(os.path.basename(path) for path in glob.glob(pattern))

	def load_hair_image(filename):
	    """Load a hair template from ``HAIR_IMAGE_DIR`` as an RGB PIL image.

	    Args:
	        filename: Name of a file inside ``HAIR_IMAGE_DIR``, or ``None`` /
	            an empty string when nothing is selected.

	    Returns:
	        The image converted to RGB, or ``None`` when no file name was given
	        or the file does not exist (callers already treat ``None`` as
	        "no hair image available").
	    """
	    if not filename:
	        return None

	    # Only the base name is used, so a value such as "../secret.png" cannot
	    # make the app read files outside the hair-image directory.
	    path = os.path.join(HAIR_IMAGE_DIR, os.path.basename(filename))
	    if not os.path.isfile(path):
	        return None

	    # convert() returns an independent image, so the source file can be
	    # closed as soon as the conversion is done.
	    with Image.open(path) as source:
	        return source.convert("RGB")

	# ===============================
	# --- Model Loading ---
	# ===============================

	# Precision passed as torch_dtype when the pipeline is loaded below.
	dtype = torch.bfloat16
	# Run on the GPU when torch reports one, otherwise fall back to the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Settings for the flow-matching Euler scheduler. Note that base_shift and
	# max_shift are both log(3) while use_dynamic_shifting is enabled.
	scheduler_config = {
	"base_image_seq_len": 256,
	"base_shift": math.log(3),
	"invert_sigmas": False,
	"max_image_seq_len": 8192,
	"max_shift": math.log(3),
	"num_train_timesteps": 1000,
	"shift": 1.0,
	"shift_terminal": None,
	"stochastic_sampling": False,
	"time_shift_type": "exponential",
	"use_beta_sigmas": False,
	"use_dynamic_shifting": True,
	"use_exponential_sigmas": False,
	"use_karras_sigmas": False,
	}
	scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)

	# Load the pipeline with the custom scheduler and move it to the device.
	pipe = QwenImageEditPlusPipeline.from_pretrained(
	"Qwen/Qwen-Image-Edit-2509",
	scheduler=scheduler,
	torch_dtype=dtype,
	).to(device)

	# Apply the LoRA (Lightning, 4 steps) and fuse it into the base weights.
	pipe.load_lora_weights(
	"lightx2v/Qwen-Image-Lightning",
	weight_name="Qwen-Image-Edit-2509/Qwen-Image-Edit-2509-Lightning-4steps-V1.0-bf16.safetensors",
	weight_dtype=torch.bfloat16,
	)
	pipe.fuse_lora(lora_scale=1.0)

	# Rebind the loaded transformer instance to the project-local transformer
	# class, then install the project's attention processor on it.
	# NOTE(review): "FA3" presumably stands for FlashAttention 3 - confirm
	# against qwenimage.qwen_fa3_processor.
	pipe.transformer.__class__ = QwenImageTransformer2DModel
	pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())

	# ===============================
	# --- Constants & Prompts ---
	# ===============================

	# Largest seed offered in the UI / drawn when the seed is randomized.
	MAX_SEED = np.iinfo(np.int32).max
	DEFAULT_SEED = 0
	DEFAULT_RANDOMIZE = True
	DEFAULT_TRUE_GUIDANCE_SCALE = 1.0
	DEFAULT_NUM_INFERENCE_STEPS = 4

	# picture1 = hair / base image, picture2 = face image.
	# The sentences are joined with a single space. They used to be adjacent
	# string literals, which Python concatenates with nothing in between, so the
	# prompt read "...from picture2.Use only the visible face...".
	FIXED_PROMPT = " ".join((
	    "Replace the blurred face in picture1 with the face from picture2.",
	    "Use only the visible face from picture2, and ignore all the surrounding white areas.",
	    "Preserve picture1’s hairstyle, head shape, lighting, shadows, and background exactly as they are.",
	    "Place the new face in the correct position and adjust its size, rotation angle, and perspective so that it matches the original head orientation naturally.",
	    "Match the inserted face to picture2's skin tone, lighting direction, and contrast.",
	    "Also adjust the skin tone of the neck and body in picture1 so that it matches the skin tone of the face from picture2 naturally.",
	    "Blend edges smoothly so the result looks like a single realistic person with no visible editing artifacts.",
	))

	FIXED_NEGATIVE_PROMPT = "blurry, extra lines, color bleeding"

	# ===============================
	# --- I18N Dictionary ---
	# ===============================

	# UI strings: first key is the message id, second key is the language code
	# ("en", "ja" or "zh"). Every message id provides all three languages.
	I18N = {
	"title": {
	"en": "Hairstyle Transformer",
	"ja": "髪型変換",
	"zh": "发型变换",
	},
	# Legal / responsible-use notice shown above the inputs.
	"notice": {
	"en": (
	"Note: When using this software, please comply with applicable laws and ensure that you do not infringe on the rights of others. "
	"The software developer assumes no responsibility for how users utilize this software. "
	"When posting images online (SNS, etc.), be sure to use source photos of fictional people created with image-generation tools, "
	"and never engage in activities such as deepfakes that impersonate or mislead others."
	),
	"ja": (
	"注意：本ソフトウェアを利用する際は、関連する法規制を遵守し、他者の権利を侵害しないよう十分ご注意ください。"
	"また、ソフトウェア開発者は、ユーザーによる利用方法について一切の責任を負いません。"
	"SNS等で公開する際は、画像生成アプリなどで作成した実在しない人物の画像を入力素材としてご使用ください。"
	"他者を不当に模倣・誤認させるディープフェイクなどの行為は絶対に行わないでください。"
	),
	"zh": (
	"注意：使用本软件时，请遵守相关法律法规，并注意不要侵犯他人的权利。"
	"软件开发者对用户的使用方式不承担任何责任。"
	"在社交平台（SNS等）公开发布时，请使用通过图像生成工具创建的虚构人物图片作为输入，"
	"绝不可从事深度伪造等不正当模仿或误导他人的行为。"
	)
	},
	# Labels of the two image inputs.
	"face_input": {
	"en": "Face image (picture2)",
	"ja": "Face入力画像（picture2）",
	"zh": "人脸图像（picture2）",
	},
	"hair_input": {
	"en": "Hair image (picture1)",
	"ja": "Hair 画像（picture1）",
	"zh": "头发图像（picture1）",
	},
	# Labels of the advanced-settings controls, the run button and the outputs.
	"accordion": {"en": "Advanced settings", "ja": "詳細設定", "zh": "高级设置"},
	"seed": {"en": "Seed", "ja": "Seed", "zh": "Seed"},
	"rand": {"en": "Randomize seed", "ja": "ランダムシード", "zh": "随机种子"},
	"tgs": {"en": "True guidance scale", "ja": "True guidance scale", "zh": "True guidance scale"},
	"steps": {"en": "Steps", "ja": "生成ステップ数", "zh": "生成步数"},
	"run": {"en": "Generate", "ja": "生成", "zh": "生成"},
	"output": {"en": "Output image", "ja": "出力画像", "zh": "输出图像"},
	"status": {"en": "Status", "ja": "ステータス", "zh": "状态"},
	# Status messages returned by infer().
	"status_ok": {
	"en": "Generated 1 image (PNG).",
	"ja": "1枚生成しました（PNG）。",
	"zh": "已生成 1 张图片（PNG）。",
	},
	"err_no_img": {
	"en": "Error: Please upload both Face and Hair images.",
	"ja": "エラー: Face画像とHair画像の両方をアップロードしてください。",
	"zh": "错误：请先上传 Face 和 Hair 两张图片。",
	},
	"lang_label": {"en": "UI Language", "ja": "UI言語", "zh": "界面语言"},
	}


	def t(key, lang):
	    """Return the UI string for message ``key`` in language ``lang``.

	    Args:
	        key: Message id, i.e. a top-level key of ``I18N``.
	        lang: Language code such as ``"en"``, ``"ja"`` or ``"zh"``.

	    Returns:
	        The translated string. When ``lang`` has no entry (unknown code or
	        ``None``) the English text is returned instead of raising.

	    Raises:
	        KeyError: If ``key`` is not a known message id.
	    """
	    translations = I18N[key]
	    return translations.get(lang, translations["en"])


	# MediaPipe FaceMesh solution module, used by the landmark-based helpers
	# further down in this file.
	mp_face = mp.solutions.face_mesh

	# FaceMesh landmark indices that are connected, in this order, into the
	# polygon used as the face outline.
	FACE_OVAL = [
	10,338,297,332,284,251,389,356,454,323,361,288,
	397,365,379,378,400,377,152,148,176,149,150,136,
	172,58,132,93,234,127,162,21,54,103,67,109
	]

	# Representative eyebrow points (lower side)
	LEFT_BROW = [105, 66, 107] # near the center of the left eyebrow
	RIGHT_BROW = [334, 293, 300] # near the center of the right eyebrow


	# ===============================
	# --- Face Preprocess with MediaPipe (detection) + OpenCV (resize) ---
	# ===============================

	def preprocess_face(image: Image.Image, target_size: int = 1024) -> Image.Image:
	    """Build a square image in which the largest detected face is centered.

	    The face is located with MediaPipe face detection. The whole picture is
	    scaled so that the longer side of the face box spans ``face_ratio`` of
	    the canvas, then pasted onto a white ``target_size`` x ``target_size``
	    canvas so that a point slightly above the face-box center lands in the
	    middle of the canvas. Areas not covered by the picture stay white.

	    Args:
	        image: Input picture (any PIL mode; it is converted to RGB).
	        target_size: Side length in pixels of the returned square image.

	    Returns:
	        An RGB PIL image of size ``(target_size, target_size)``. When no
	        usable face is found, the central square of the input is returned,
	        resized to the target size.
	    """
	    face_ratio = 0.6  # share of the canvas side the face should occupy (60%)

	    img_rgb = np.array(image.convert("RGB"))
	    h, w, _ = img_rgb.shape

	    def center_crop() -> Image.Image:
	        # Fallback: cut the central square out of the picture and resize it.
	        side = min(w, h)
	        left = (w - side) // 2
	        top = (h - side) // 2
	        crop = img_rgb[top:top + side, left:left + side]
	        return Image.fromarray(crop).resize((target_size, target_size), Image.LANCZOS)

	    # Local name deliberately differs from the module-level ``mp_face``
	    # (the FaceMesh module) to avoid shadowing it.
	    face_detection = mp.solutions.face_detection
	    with face_detection.FaceDetection(model_selection=1, min_detection_confidence=0.5) as detector:
	        results = detector.process(img_rgb)

	    if not results.detections:
	        return center_crop()

	    # Pick the detection with the largest bounding box.
	    def bbox_area(det):
	        rel = det.location_data.relative_bounding_box
	        return rel.width * rel.height

	    best = max(results.detections, key=bbox_area)
	    box = best.location_data.relative_bounding_box

	    # MediaPipe reports coordinates normalized to 0..1; convert to pixels.
	    x = int(box.xmin * w)
	    y = int(box.ymin * h)
	    fw = int(box.width * w)
	    fh = int(box.height * h)

	    # A box that collapses to zero pixels cannot define a scale factor
	    # (this used to end in a division by zero).
	    if max(fw, fh) <= 0:
	        return center_crop()

	    # Center of the face box, shifted up by 15% of the box height.
	    cx = x + fw // 2
	    cy_adjusted = (y + fh // 2) - int(fh * 0.15)

	    # Scale the whole picture so the face reaches the desired size.
	    desired_face_size = int(target_size * face_ratio)
	    scale = desired_face_size / max(fw, fh)
	    scaled_w = int(w * scale)
	    scaled_h = int(h * scale)
	    if scaled_w <= 0 or scaled_h <= 0:
	        return center_crop()
	    scaled_img = cv2.resize(img_rgb, (scaled_w, scaled_h), interpolation=cv2.INTER_LANCZOS4)

	    # Offset that moves the (scaled) face center onto the canvas center.
	    offset_x = target_size // 2 - int(cx * scale)
	    offset_y = target_size // 2 - int(cy_adjusted * scale)

	    # White canvas.
	    canvas = np.full((target_size, target_size, 3), 255, dtype=np.uint8)

	    # Destination rectangle on the canvas.
	    x_start = max(0, offset_x)
	    y_start = max(0, offset_y)
	    x_end = min(target_size, offset_x + scaled_w)
	    y_end = min(target_size, offset_y + scaled_h)

	    # Matching source rectangle inside the scaled picture.
	    src_x1 = max(0, -offset_x)
	    src_y1 = max(0, -offset_y)
	    src_x2 = min(src_x1 + (x_end - x_start), scaled_w)
	    src_y2 = min(src_y1 + (y_end - y_start), scaled_h)

	    # Paste only when the two rectangles actually overlap; otherwise the
	    # slice shapes would not match and the assignment would fail.
	    if x_end > x_start and y_end > y_start:
	        canvas[y_start:y_end, x_start:x_end] = scaled_img[src_y1:src_y2, src_x1:src_x2]

	    return Image.fromarray(canvas)


	# ===============================
	# --- Blur Hair Image with MediaPipe FaceMesh landmarks ---
	# ===============================

	def blur_face_with_landmarks(image_pil):
	    """Blur the face region(s) of an image, leaving the rest untouched.

	    Faces (up to five) are located with MediaPipe FaceMesh. For each face,
	    the ``FACE_OVAL`` outline polygon plus a polygon built from the eyebrow
	    points shifted upwards are blurred with a 301x301 Gaussian kernel.

	    Args:
	        image_pil: Input PIL image, or ``None``.

	    Returns:
	        An RGB PIL image with the face regions blurred; the whole image is
	        blurred when no face is found. ``None`` when the input is ``None``.
	    """
	    if image_pil is None:
	        return None

	    # Everything is done in RGB: the blur and the masking treat the channels
	    # independently, so no RGB<->BGR round trip is needed.
	    rgb = np.array(image_pil.convert("RGB"))
	    h, w = rgb.shape[:2]

	    with mp_face.FaceMesh(
	        static_image_mode=True,
	        max_num_faces=5,
	        refine_landmarks=True,
	    ) as face:
	        res = face.process(rgb)

	    # No face found -> blur the whole image.
	    if not res.multi_face_landmarks:
	        return Image.fromarray(cv2.GaussianBlur(rgb, (301, 301), 0))

	    def to_pixels(landmarks, indices):
	        # Landmarks are normalized to 0..1; scale them to pixel coordinates.
	        # (The previous brow code read non-existent ``.xw`` / ``.yh``
	        # attributes, i.e. the multiplication had been lost.)
	        return [(int(landmarks[i].x * w), int(landmarks[i].y * h)) for i in indices]

	    output = rgb.copy()

	    for lm in res.multi_face_landmarks:
	        points = lm.landmark

	        oval = to_pixels(points, FACE_OVAL)
	        brows = to_pixels(points, LEFT_BROW) + to_pixels(points, RIGHT_BROW)

	        # Reference length: distance between the brow center and the mean of
	        # the first eight outline points.
	        # NOTE(review): the original code called that mean the "chin"
	        # center; confirm which part of the outline FACE_OVAL[:8] covers.
	        brow_center = np.mean(brows, axis=0)
	        ref_center = np.mean(oval[:8], axis=0)
	        face_h = np.linalg.norm(brow_center - ref_center)

	        # Brow points moved up by 12% of the reference length.
	        offset = int(face_h * 0.12)
	        forehead = [(x, y - offset) for (x, y) in brows]

	        # fillPoly already fills the polygon interiors. The former
	        # cv2.floodFill seeded at the image center was a no-op when the
	        # center lay inside the face and flooded the entire background
	        # (blurring the whole picture) when it did not, so it is gone.
	        mask = np.zeros((h, w), dtype=np.uint8)
	        cv2.fillPoly(mask, [np.array(oval, np.int32)], 255)
	        cv2.fillPoly(mask, [np.array(forehead, np.int32)], 255)

	        blurred = cv2.GaussianBlur(output, (301, 301), 0)
	        output = np.where(mask[:, :, None] == 255, blurred, output)

	    return Image.fromarray(output)


	def whiteout_except_face(image_pil):
	    """Keep only the face region(s) of an image and fade the rest to white.

	    Faces (up to five) are located with MediaPipe FaceMesh. For each face,
	    the ``FACE_OVAL`` outline polygon is kept, except for the rows above a
	    cut line placed half a reference length above the eyebrows. The combined
	    mask is softened with a Gaussian blur (sigma 25) and used to blend the
	    picture with a white background.

	    Args:
	        image_pil: Input PIL image, or ``None``.

	    Returns:
	        An RGB PIL image; entirely white when no face is found. ``None``
	        when the input is ``None``.
	    """
	    if image_pil is None:
	        return None

	    # The blend is applied per channel, so the work stays in RGB and no
	    # RGB<->BGR round trip is needed.
	    rgb = np.array(image_pil.convert("RGB"))
	    h, w = rgb.shape[:2]
	    white_bg = np.full_like(rgb, 255)

	    with mp_face.FaceMesh(
	        static_image_mode=True,
	        max_num_faces=5,
	        refine_landmarks=True,
	    ) as face:
	        res = face.process(rgb)

	    # No face found -> return an all-white image.
	    if not res.multi_face_landmarks:
	        return Image.fromarray(white_bg)

	    def to_pixels(landmarks, indices):
	        # Landmarks are normalized to 0..1; scale them to pixel coordinates.
	        return [(int(landmarks[i].x * w), int(landmarks[i].y * h)) for i in indices]

	    mask_total = np.zeros((h, w), dtype=np.uint8)

	    for lm in res.multi_face_landmarks:
	        points = lm.landmark

	        oval = to_pixels(points, FACE_OVAL)
	        brows = to_pixels(points, LEFT_BROW) + to_pixels(points, RIGHT_BROW)
	        brow_y = int(np.mean([p[1] for p in brows]))

	        # Reference length: vertical distance between the brows and the mean
	        # of the first eight outline points.
	        # NOTE(review): the original code called that mean "chin_y"; confirm
	        # which part of the outline FACE_OVAL[:8] covers.
	        ref_y = int(np.mean([p[1] for p in oval[:8]]))
	        face_h = abs(ref_y - brow_y)
	        margin = int(face_h * 0.50)

	        # fillPoly already fills the polygon interior. The former
	        # cv2.floodFill seeded at the image center was a no-op when the
	        # center lay inside the face and flooded the entire background
	        # (keeping everything below the cut line) when it did not.
	        mask = np.zeros((h, w), dtype=np.uint8)
	        cv2.fillPoly(mask, [np.array(oval, np.int32)], 255)

	        # Drop everything above the cut line.
	        cut_y = max(brow_y - margin, 0)
	        mask[:cut_y, :] = 0

	        mask_total = cv2.bitwise_or(mask_total, mask)

	    # Soften the mask edge, then alpha-blend picture and white background.
	    soft_mask = cv2.GaussianBlur(mask_total, (0, 0), 25)
	    alpha = (soft_mask.astype(np.float32) / 255.0)[:, :, None]

	    output = rgb * alpha + white_bg * (1.0 - alpha)
	    return Image.fromarray(output.astype(np.uint8))


	# ===============================
	# --- Unified Inference Function ---
	# ===============================

	@spaces.GPU()
	def infer(
	    face_image,
	    hair_image,
	    seed=DEFAULT_SEED,
	    randomize_seed=DEFAULT_RANDOMIZE,
	    true_guidance_scale=DEFAULT_TRUE_GUIDANCE_SCALE,
	    num_inference_steps=DEFAULT_NUM_INFERENCE_STEPS,
	    lang="en",
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Run one face-into-hairstyle edit and return ``(image, status_text)``.

	    Args:
	        face_image: Face picture, either a PIL image or something
	            ``Image.open`` accepts.
	        hair_image: File name of the hair template (the dropdown value).
	        seed: Seed for the torch generator; replaced by a random value when
	            ``randomize_seed`` is true.
	        randomize_seed: Whether to draw a fresh seed in ``[0, MAX_SEED]``.
	        true_guidance_scale: Passed to the pipeline as ``true_cfg_scale``.
	        num_inference_steps: Number of denoising steps.
	        lang: Language code used for status and progress messages.
	        progress: Gradio progress tracker.

	    Returns:
	        ``(result_image, status_message)``; the image is ``None`` when an
	        input is missing.
	    """
	    if randomize_seed:
	        seed = random.randint(0, MAX_SEED)

	    inputs_missing = face_image is None or hair_image is None
	    if inputs_missing:
	        return None, t("err_no_img", lang)

	    # --- Convert the face input to a PIL RGB image ---
	    face_source = face_image if isinstance(face_image, Image.Image) else Image.open(face_image)
	    face_pil = face_source.convert("RGB")

	    # hair_image is the dropdown value (a file name string).
	    hair_pil = load_hair_image(hair_image)
	    if hair_pil is None:
	        return None, t("err_no_img", lang)
	    hair_pil = hair_pil.convert("RGB")

	    # --- Face picture: center/resize to 1024x1024, then white out everything
	    # except the face. Hair picture: blur the face region only. ---
	    face_pil = preprocess_face(face_pil, target_size=1024)
	    hair_pil = blur_face_with_landmarks(hair_pil)
	    face_pil = whiteout_except_face(face_pil)

	    # picture1 = hair (blurred base image), picture2 = face.
	    pil_images = [hair_pil, face_pil]

	    if lang == "en":
	        generating_desc = "Generating..."
	        done_desc = "Done"
	    elif lang == "ja":
	        generating_desc = "生成中..."
	        done_desc = "完了"
	    else:
	        generating_desc = "生成中..."
	        done_desc = "完成"

	    progress(0.4, desc=generating_desc)
	    generator = torch.Generator(device=device).manual_seed(seed)

	    output = pipe(
	        image=pil_images,
	        prompt=FIXED_PROMPT,
	        negative_prompt=FIXED_NEGATIVE_PROMPT,
	        num_inference_steps=num_inference_steps,
	        generator=generator,
	        true_cfg_scale=true_guidance_scale,
	        num_images_per_prompt=1,
	    )
	    images = output.images

	    progress(1.0, desc=done_desc)
	    return images[0], t("status_ok", lang)

	# ===============================
	# --- Gradio UI Section ---
	# ===============================

	# Custom stylesheet handed to gr.Blocks: centers the "#app-wrap" container
	# (max width 1200px) and styles the ".notice" banner and ".card" panels.
	css = """
	#app-wrap {margin: 0 auto; max-width: 1200px;}
	.notice {
	background: #fff8e1;
	border: 1px solid #facc15;
	color: #713f12;
	padding: 12px 14px;
	border-radius: 12px;
	font-weight: 600;
	margin-bottom: 10px;
	}
	.card {
	background: white;
	border: 1px solid #e5e7eb;
	border-radius: 14px;
	padding: 14px;
	box-shadow: 0 1px 2px rgba(0,0,0,0.04);
	}
	"""

	# Build the Gradio UI. All labels start in English and are re-labelled by
	# _switch_lang() when the language selector changes.
	# NOTE(review): the nesting of run_button / result_image / status_text was
	# reconstructed (the original indentation was lost) - confirm the layout.
	with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
	    lang_selector = gr.Radio(
	        label=I18N["lang_label"]["en"],
	        choices=[("English", "en"), ("日本語", "ja"), ("中文", "zh")],
	        value="en",
	        interactive=True,
	    )

	    title_md = gr.Markdown(I18N["title"]["en"])
	    notice_html = gr.HTML(f"<div class='notice'>{I18N['notice']['en']}</div>")

	    with gr.Column(elem_id="app-wrap"):
	        with gr.Row():
	            # Left column: hair template selection, its preview, face upload.
	            with gr.Column(scale=1):
	                hair_dropdown = gr.Dropdown(
	                    label=I18N["hair_input"]["en"],
	                    choices=list_hair_images(),
	                    value=None,
	                )

	                hair_preview = gr.Image(
	                    label="Hair Preview",
	                    type="pil",
	                    height=320,
	                    interactive=False,
	                )

	                face_image = gr.Image(
	                    label=I18N["face_input"]["en"],
	                    type="pil",
	                    height=320,
	                )

	            # Right column: advanced settings, run button and results.
	            with gr.Column(scale=1, elem_classes=["card"]):
	                with gr.Accordion(I18N["accordion"]["en"], open=False) as advanced_accordion:
	                    seed = gr.Slider(
	                        label=I18N["seed"]["en"],
	                        minimum=0,
	                        maximum=MAX_SEED,
	                        step=1,
	                        value=DEFAULT_SEED,
	                    )
	                    randomize_seed = gr.Checkbox(
	                        label=I18N["rand"]["en"],
	                        value=DEFAULT_RANDOMIZE,
	                    )
	                    true_guidance_scale = gr.Slider(
	                        label=I18N["tgs"]["en"],
	                        minimum=1.0,
	                        maximum=10.0,
	                        step=0.1,
	                        value=DEFAULT_TRUE_GUIDANCE_SCALE,
	                    )
	                    num_inference_steps = gr.Slider(
	                        label=I18N["steps"]["en"],
	                        minimum=1,
	                        maximum=40,
	                        step=1,
	                        value=DEFAULT_NUM_INFERENCE_STEPS,
	                    )
	                run_button = gr.Button(I18N["run"]["en"], variant="primary")

	                result_image = gr.Image(
	                    label=I18N["output"]["en"],
	                    type="pil",
	                    format="png",
	                    height=520,
	                    show_download_button=True,
	                )

	                status_text = gr.Textbox(label=I18N["status"]["en"], interactive=False)

	    def _switch_lang(lang):
	        """Return one update per component listed in lang_selector.change().

	        The order of the tuple must match the ``outputs`` list below.
	        """
	        return (
	            gr.update(label=I18N["lang_label"][lang]),  # lang_selector
	            I18N["title"][lang],  # title_md (markdown text)
	            gr.update(value=f"<div class='notice'>{I18N['notice'][lang]}</div>"),  # notice_html
	            gr.update(label=I18N["hair_input"][lang]),  # hair_dropdown
	            gr.update(label=I18N["face_input"][lang]),  # face_image
	            gr.update(label=I18N["accordion"][lang]),  # advanced_accordion
	            gr.update(label=I18N["seed"][lang]),  # seed
	            gr.update(label=I18N["rand"][lang]),  # randomize_seed
	            gr.update(label=I18N["tgs"][lang]),  # true_guidance_scale
	            gr.update(label=I18N["steps"][lang]),  # num_inference_steps
	            gr.update(value=I18N["run"][lang]),  # run_button
	            gr.update(label=I18N["output"][lang]),  # result_image
	            gr.update(label=I18N["status"][lang]),  # status_text
	        )

	    def update_hair_preview(selected):
	        """Load the selected hair template for the preview (None clears it)."""
	        if selected is None:
	            return None
	        return load_hair_image(selected)

	    hair_dropdown.change(
	        fn=update_hair_preview,
	        inputs=[hair_dropdown],
	        outputs=[hair_preview],
	    )

	    lang_selector.change(
	        fn=_switch_lang,
	        inputs=[lang_selector],
	        outputs=[
	            lang_selector,
	            title_md,
	            notice_html,
	            # The hair-input label belongs to the dropdown. It used to be
	            # sent to hair_preview, which renamed the preview and left the
	            # dropdown label in English.
	            hair_dropdown,
	            face_image,
	            advanced_accordion,
	            seed,
	            randomize_seed,
	            true_guidance_scale,
	            num_inference_steps,
	            run_button,
	            result_image,
	            status_text,
	        ],
	    )

	    run_button.click(
	        fn=infer,
	        inputs=[
	            face_image,
	            hair_dropdown,
	            seed,
	            randomize_seed,
	            true_guidance_scale,
	            num_inference_steps,
	            lang_selector,
	        ],
	        outputs=[result_image, status_text],
	    )

	if __name__ == "__main__":
	    demo.launch()