# Spaces: Running on Zero (Hugging Face Spaces ZeroGPU banner)
import spaces  # must be imported before any CUDA initialization (ZeroGPU requirement)

import glob
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

import gradio as gr
import torch
from PIL import Image
# Default negative prompts offered in the UI (Chinese and English variants).
NEGATIVE_PROMPT_CN = "泛黄,发绿,模糊,低分辨率,低质量图像,扭曲的肢体,诡异的外观,丑陋,AI感,噪点,网格感,JPEG压缩条纹,异常的肢体,水印,乱码,意义不明的字符"
NEGATIVE_PROMPT_EN = "Yellowed, green-tinted, blurry, low-resolution, low-quality image, distorted limbs, eerie appearance, ugly, AI-looking, noise, grid-like artifacts, JPEG compression artifacts, abnormal limbs, watermark, garbled text, meaningless characters"

# Root directory for downloaded model weights; override with ZIMAGE_MODELS_DIR.
MODELS_DIR = Path(os.getenv("ZIMAGE_MODELS_DIR", "./models"))
| # ============================================================================= | |
| # Model Download Functions | |
| # ============================================================================= | |
def download_hf_models(output_dir: Path) -> dict:
    """Download the model repos required by the demo from Hugging Face.

    Downloads (skipping any repo that already has *.safetensors locally):
      - DiffSynth-Studio/General-Image-Encoders
      - Tongyi-MAI/Z-Image-Turbo (text encoder, VAE, tokenizer only)
      - Tongyi-MAI/Z-Image (transformer only)
      - DiffSynth-Studio/Z-Image-i2L

    Args:
        output_dir: Directory under which each repo is mirrored
            (one subdirectory per repo_id).

    Returns:
        dict mapping repo_id -> local Path of the downloaded snapshot.

    Raises:
        Exception: re-raises any snapshot_download failure after logging it.
    """
    from huggingface_hub import snapshot_download

    output_dir.mkdir(parents=True, exist_ok=True)
    models = [
        {
            "repo_id": "DiffSynth-Studio/General-Image-Encoders",
            "description": "General Image Encoders (SigLIP2-G384, DINOv3-7B)",
            "allow_patterns": None,  # full repo
        },
        {
            "repo_id": "Tongyi-MAI/Z-Image-Turbo",
            "description": "Z-Image Turbo (text encoder, VAE, tokenizer)",
            "allow_patterns": [
                "text_encoder/*.safetensors",
                "vae/*.safetensors",
                "tokenizer/*",
            ],
        },
        {
            "repo_id": "Tongyi-MAI/Z-Image",
            "description": "Z-Image base model (transformer)",
            "allow_patterns": ["transformer/*.safetensors"],
        },
        {
            "repo_id": "DiffSynth-Studio/Z-Image-i2L",
            "description": "Z-Image-i2L (Image to LoRA model)",
            "allow_patterns": ["*.safetensors"],
        },
    ]
    downloaded_paths = {}
    for model in models:
        repo_id = model["repo_id"]
        local_dir = output_dir / repo_id
        # Cheap resume check: skip repos that already contain weight files.
        if local_dir.exists() and any(local_dir.rglob("*.safetensors")):
            print(f" ✓ {repo_id} (already downloaded)")
            downloaded_paths[repo_id] = local_dir
            continue
        print(f" 📥 Downloading {repo_id}...")
        print(f" {model['description']}")
        try:
            # NOTE: `local_dir_use_symlinks` and `resume_download` are
            # deprecated in huggingface_hub (downloads resume by default and
            # local_dir no longer uses symlinks), so they are not passed.
            result_path = snapshot_download(
                repo_id=repo_id,
                local_dir=str(local_dir),
                allow_patterns=model["allow_patterns"],
            )
            downloaded_paths[repo_id] = Path(result_path)
            print(f" ✓ {repo_id}")
        except Exception as e:
            print(f" ❌ Error downloading {repo_id}: {e}")
            raise
    return downloaded_paths
def get_model_files(base_path: Path, pattern: str) -> list:
    """Return the sorted list of file paths under *base_path* matching *pattern*."""
    return sorted(glob.glob(str(base_path / pattern)))
def install_diffsynth_studio():
    """Ensure DiffSynth-Studio is importable, cloning and pip-installing if needed.

    Returns:
        Tuple of (success: bool, message: str); message is user-facing status text.
    """
    # Fast path: already installed.
    try:
        from diffsynth.pipelines.z_image import ZImagePipeline  # noqa: F401
        return True, "✅ DiffSynth-Studio is already installed."
    except ImportError:
        pass

    # Fail early with a clear message if git is unavailable on this host.
    if shutil.which("git") is None:
        error_msg = "❌ Installation failed: `git` is not available on PATH."
        print(error_msg)
        return False, error_msg

    repo_dir = Path(__file__).parent / "DiffSynth-Studio"
    try:
        if not repo_dir.exists():
            print("📥 Cloning DiffSynth-Studio repository...")
            subprocess.run(
                ["git", "clone", "https://github.com/modelscope/DiffSynth-Studio.git", str(repo_dir)],
                capture_output=True,
                text=True,
                check=True
            )
            print("✅ Repository cloned successfully.")
        else:
            print("📁 DiffSynth-Studio directory already exists, pulling latest...")
            # Best-effort update: a failed pull must not abort installation,
            # but it should be visible in the logs rather than silently ignored.
            pull = subprocess.run(
                ["git", "-C", str(repo_dir), "pull"],
                capture_output=True,
                text=True
            )
            if pull.returncode != 0:
                print(f"⚠️ git pull failed (continuing with existing checkout): {pull.stderr}")
        print("📦 Installing DiffSynth-Studio...")
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-e", str(repo_dir)],
            capture_output=True,
            text=True,
            check=True
        )
        print("✅ DiffSynth-Studio installed successfully.")
        sys.path.insert(0, str(repo_dir))
        # Verify the editable install is actually importable before reporting success.
        from diffsynth.pipelines.z_image import ZImagePipeline  # noqa: F401
        return True, "✅ DiffSynth-Studio installed successfully!"
    except subprocess.CalledProcessError as e:
        error_msg = f"❌ Installation failed: {e.stderr}"
        print(error_msg)
        return False, error_msg
    except Exception as e:
        error_msg = f"❌ Error during installation: {str(e)}"
        print(error_msg)
        return False, error_msg
| # ============================================================================= | |
| # Pipeline Initialization | |
| # ============================================================================= | |
| print("=" * 60) | |
| print(" Z-Image-i2L Gradio Demo - Initializing") | |
| print("=" * 60) | |
| print() | |
| # Step 1: Install DiffSynth-Studio | |
| print("🔍 Step 1: Checking DiffSynth-Studio installation...") | |
| success, message = install_diffsynth_studio() | |
| print(message) | |
| if not success: | |
| raise RuntimeError("Failed to install DiffSynth-Studio. Cannot continue.") | |
| # Step 2: Download HuggingFace models | |
| print() | |
| print("🔍 Step 2: Downloading models from HuggingFace...") | |
| print(f" Models directory: {MODELS_DIR.absolute()}") | |
| downloaded_paths = download_hf_models(MODELS_DIR) | |
# Import required modules — deferred until after install_diffsynth_studio()
# has made the diffsynth package importable.
from diffsynth.pipelines.z_image import (
    ZImagePipeline, ModelConfig,
    ZImageUnit_Image2LoRAEncode, ZImageUnit_Image2LoRADecode
)
from safetensors.torch import save_file, load_file
# Step 3: Configure VRAM settings
print()
print("⚙️ Step 3: Configuring VRAM settings...")
# Everything stays on CUDA in bfloat16 (no CPU offload); unpacked into the
# transformer's ModelConfig in Step 5 below.
vram_config = {
    "offload_dtype": torch.bfloat16,
    "offload_device": "cuda",
    "onload_dtype": torch.bfloat16,
    "onload_device": "cuda",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}
# Step 4: Resolve local model paths (all under MODELS_DIR, mirroring repo_ids).
print()
print("📂 Step 4: Resolving model paths...")
# Z-Image transformer (base model weights)
zimage_path = MODELS_DIR / "Tongyi-MAI" / "Z-Image"
zimage_transformer_files = get_model_files(zimage_path, "transformer/*.safetensors")
# Z-Image-Turbo supplies the text encoder, VAE, and tokenizer
zimage_turbo_path = MODELS_DIR / "Tongyi-MAI" / "Z-Image-Turbo"
text_encoder_files = get_model_files(zimage_turbo_path, "text_encoder/*.safetensors")
vae_file = get_model_files(zimage_turbo_path, "vae/diffusion_pytorch_model.safetensors")  # list with 0 or 1 entries
tokenizer_path = zimage_turbo_path / "tokenizer"
# General Image Encoders (SigLIP2 + DINOv3), used by the image-to-LoRA encoder
encoders_path = MODELS_DIR / "DiffSynth-Studio" / "General-Image-Encoders"
siglip_file = get_model_files(encoders_path, "SigLIP2-G384/model.safetensors")
dino_file = get_model_files(encoders_path, "DINOv3-7B/model.safetensors")
# Z-Image-i2L from HuggingFace (the image-to-LoRA model itself)
zimage_i2l_path = MODELS_DIR / "DiffSynth-Studio" / "Z-Image-i2L"
zimage_i2l_file = get_model_files(zimage_i2l_path, "model.safetensors")
print(f" Z-Image transformer: {len(zimage_transformer_files)} file(s)")
print(f" Text encoder: {len(text_encoder_files)} file(s)")
print(f" VAE: {len(vae_file)} file(s)")
print(f" Tokenizer: {tokenizer_path}")
print(f" SigLIP2: {len(siglip_file)} file(s)")
print(f" DINOv3: {len(dino_file)} file(s)")
print(f" Z-Image-i2L: {len(zimage_i2l_file)} file(s)")
# Validate files — fail fast with one clear message listing everything missing.
missing = []
if not zimage_transformer_files: missing.append("Z-Image transformer")
if not text_encoder_files: missing.append("Text encoder")
if not vae_file: missing.append("VAE")
if not tokenizer_path.exists(): missing.append("Tokenizer")
if not siglip_file: missing.append("SigLIP2")
if not dino_file: missing.append("DINOv3")
if not zimage_i2l_file: missing.append("Z-Image-i2L")
if missing:
    raise FileNotFoundError(f"Missing model files: {', '.join(missing)}")
# Step 5: Load pipeline
print()
print("🚀 Step 5: Loading Z-Image pipeline...")
print(" All models loaded from HuggingFace local paths")
model_configs = [
    # All models from HuggingFace - use path= for local files.
    # Only the transformer receives the explicit bf16/CUDA placement config.
    ModelConfig(path=zimage_transformer_files, **vram_config),
    ModelConfig(path=text_encoder_files),
    ModelConfig(path=vae_file),
    ModelConfig(path=siglip_file),
    ModelConfig(path=dino_file),
    ModelConfig(path=zimage_i2l_file),
]
# Global pipeline instance shared by the Gradio handlers below.
pipe = ZImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=model_configs,
    tokenizer_config=ModelConfig(path=str(tokenizer_path)),
)
print()
print("✅ Pipeline loaded successfully!")
print("=" * 60)
print()
| # ============================================================================= | |
| # Gradio Functions | |
| # ============================================================================= | |
@spaces.GPU  # required on ZeroGPU Spaces: CUDA is only available inside decorated calls
def image_to_lora(images, progress=gr.Progress()):
    """Encode the uploaded style images into a LoRA saved as a .safetensors file.

    Args:
        images: Gallery value — a list whose items may be file paths,
            (filepath, caption) tuples, or numpy arrays, depending on the
            Gradio version.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Tuple of (path to the saved LoRA file or None, status message).
    """
    if images is None or len(images) == 0:
        return None, "❌ Please upload at least one image!"
    try:
        progress(0.1, desc="Processing images...")
        pil_images = []
        for img in images:
            # Normalize every gallery item shape to an RGB PIL image.
            if isinstance(img, str):
                pil_images.append(Image.open(img).convert("RGB"))
            elif isinstance(img, tuple):
                # (filepath, caption) tuple — open the file part.
                pil_images.append(Image.open(img[0]).convert("RGB"))
            else:
                pil_images.append(Image.fromarray(img).convert("RGB"))
        with torch.no_grad():  # inference only — no autograd graph needed
            progress(0.3, desc="Encoding images to LoRA...")
            embs = ZImageUnit_Image2LoRAEncode().process(pipe, image2lora_images=pil_images)
            progress(0.7, desc="Decoding LoRA weights...")
            lora = ZImageUnit_Image2LoRADecode().process(pipe, **embs)["lora"]
        progress(0.9, desc="Saving LoRA file...")
        # Saved to a fresh temp dir so concurrent requests never collide.
        temp_dir = tempfile.mkdtemp()
        lora_path = os.path.join(temp_dir, "generated_lora.safetensors")
        save_file(lora, lora_path)
        progress(1.0, desc="Done!")
        return lora_path, f"✅ LoRA generated successfully from {len(pil_images)} image(s)!"
    except Exception as e:
        # Surface the error in the UI status box instead of crashing the app.
        return None, f"❌ Error generating LoRA: {str(e)}"
@spaces.GPU  # required on ZeroGPU Spaces: CUDA is only available inside decorated calls
def generate_image(
    lora_file,
    prompt,
    negative_prompt,
    seed,
    cfg_scale,
    sigma_shift,
    num_steps,
    progress=gr.Progress()
):
    """Generate an image with the pipeline, applying the given LoRA.

    Args:
        lora_file: Path to a .safetensors LoRA file (from Step 1 or uploaded).
        prompt: Positive text prompt.
        negative_prompt: Negative text prompt.
        seed: RNG seed (coerced to int).
        cfg_scale: Classifier-free guidance scale.
        sigma_shift: Sampler sigma-shift parameter.
        num_steps: Number of inference steps (coerced to int).
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Tuple of (generated image or None, status message).
    """
    if lora_file is None:
        return None, "❌ Please generate or upload a LoRA file first!"
    try:
        progress(0.1, desc="Loading LoRA...")
        lora = load_file(lora_file)
        # Move LoRA tensors to CUDA with the pipeline's compute dtype.
        lora = {k: v.to(device="cuda", dtype=torch.bfloat16) for k, v in lora.items()}
        progress(0.3, desc="Generating image...")
        image = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            seed=int(seed),
            cfg_scale=cfg_scale,
            num_inference_steps=int(num_steps),
            positive_only_lora=lora,  # LoRA applied to the positive branch only
            sigma_shift=sigma_shift
        )
        progress(1.0, desc="Done!")
        return image, "✅ Image generated successfully!"
    except Exception as e:
        # Surface the error in the UI status box instead of crashing the app.
        return None, f"❌ Error generating image: {str(e)}"
| def create_demo(): | |
| """Create the Gradio interface.""" | |
| with gr.Blocks( | |
| title="Z-Image-i2L Demo", | |
| theme=gr.themes.Soft(), | |
| css=".gradio-container { max-width: 1200px !important; margin: 0 auto}" | |
| ) as demo: | |
| gr.Markdown(""" | |
| # 🎨 Z-Image-i2L: Image to LoRA Demo | |
| > 💡 **Tip**: For best results, use 4-6 images with a consistent artistic style. | |
| """) | |
| with gr.Tabs(): | |
| with gr.TabItem("📸 Step 1: Image to LoRA"): | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| input_gallery = gr.Gallery( | |
| label="Upload Style Images (1-6 images)", | |
| file_types=["image"], | |
| columns=3, | |
| height=300, | |
| interactive=True | |
| ) | |
| gr.Markdown(""" | |
| **Guidelines:** | |
| - Upload 1-6 images with a consistent style | |
| - Higher quality images produce better results | |
| - Mix of subjects helps generalization | |
| """) | |
| generate_lora_btn = gr.Button("🎯 Generate LoRA", variant="primary") | |
| with gr.Column(scale=1): | |
| lora_output = gr.File( | |
| label="Generated LoRA File", | |
| file_types=[".safetensors"], | |
| interactive=False | |
| ) | |
| lora_status = gr.Textbox( | |
| label="Status", | |
| interactive=False, | |
| lines=2 | |
| ) | |
| with gr.TabItem("🖼️ Step 2: Generate Images"): | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| lora_input = gr.File( | |
| label="LoRA File (from Step 1 or upload)", | |
| file_types=[".safetensors"] | |
| ) | |
| prompt = gr.Textbox( | |
| label="Prompt", | |
| placeholder="Describe what you want to generate...", | |
| value="a cat", | |
| lines=2 | |
| ) | |
| with gr.Accordion("Negative Prompt", open=False): | |
| negative_prompt = gr.Textbox( | |
| label="Negative Prompt", | |
| value=NEGATIVE_PROMPT_CN, | |
| lines=3 | |
| ) | |
| with gr.Row(): | |
| use_cn_neg = gr.Button("Use Chinese", size="sm") | |
| use_en_neg = gr.Button("Use English", size="sm") | |
| with gr.Accordion("Advanced Settings", open=False): | |
| seed = gr.Number(label="Seed", value=0, precision=0) | |
| cfg_scale = gr.Slider(label="CFG Scale", minimum=1, maximum=10, value=4, step=0.5) | |
| sigma_shift = gr.Slider(label="Sigma Shift", minimum=1, maximum=15, value=8, step=1) | |
| num_steps = gr.Slider(label="Steps", minimum=20, maximum=100, value=50, step=5) | |
| generate_btn = gr.Button("✨ Generate Image", variant="primary") | |
| with gr.Column(scale=1): | |
| output_image = gr.Image(label="Generated Image", type="pil", height=512) | |
| gen_status = gr.Textbox(label="Status", interactive=False, lines=2) | |
| gr.Markdown(""" | |
| --- | |
| **Resources:** [Z-Image-i2L (HuggingFace)](https://huggingface.co/DiffSynth-Studio/Z-Image-i2L) | | |
| [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) | | |
| **Settings:** CFG=4, Sigma Shift=8, Steps=50 | |
| """) | |
| # Event handlers | |
| generate_lora_btn.click( | |
| fn=image_to_lora, | |
| inputs=[input_gallery], | |
| outputs=[lora_output, lora_status] | |
| ) | |
| lora_output.change(fn=lambda x: x, inputs=[lora_output], outputs=[lora_input]) | |
| generate_btn.click( | |
| fn=generate_image, | |
| inputs=[lora_input, prompt, negative_prompt, seed, cfg_scale, sigma_shift, num_steps], | |
| outputs=[output_image, gen_status] | |
| ) | |
| use_cn_neg.click(fn=lambda: NEGATIVE_PROMPT_CN, outputs=[negative_prompt]) | |
| use_en_neg.click(fn=lambda: NEGATIVE_PROMPT_EN, outputs=[negative_prompt]) | |
| return demo | |
| if __name__ == "__main__": | |
| print("Starting Gradio server...") | |
| demo = create_demo() | |
| demo.launch(server_name="0.0.0.0", server_port=7860, share=False) |