import os
import uuid
import numpy as np
import random
import tempfile
import zipfile
import spaces
import torch
import gradio as gr
from PIL import Image
from diffusers import QwenImageLayeredPipeline
from pptx import Presentation
LOG_DIR = "/tmp/local"
MAX_SEED = np.iinfo(np.int32).max
# Optional HF login (works in Spaces if you set an HF token as the secret env var "hf")
from huggingface_hub import login

hf_token = os.environ.get("hf")
if hf_token:
    login(token=hf_token)
dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"
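
# Load the layered-decomposition pipeline once at startup so every request
# reuses the same weights instead of reloading the model per call.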
pipeline = QwenImageLayeredPipeline.from_pretrained(
"Qwen/Qwen-Image-Layered", torch_dtype=dtype
).to(device)
def ensure_dirname(path: str):
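    """Create `path` (including parents) if it does not already exist."""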
if path and not os.path.exists(path):
os.makedirs(path, exist_ok=True)
def random_str(length=8):
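    """Return a short random hex string (default 8 chars), handy for unique filenames."""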
return uuid.uuid4().hex[:length]
def imagelist_to_pptx(img_files):
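    """Build a one-slide PPTX sized to the first image and stack every image on it full-bleed.

    All pictures are placed at the first image's dimensions, so the inputs are
    assumed to share one size. Returns the path to a temporary .pptx file.
    """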
with Image.open(img_files[0]) as img:
img_width_px, img_height_px = img.size
def px_to_emu(px, dpi=96):
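        """Convert pixels to EMU (English Metric Units, 914400 per inch), assuming `dpi` pixels per inch."""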
inch = px / dpi
emu = inch * 914400
return int(emu)
prs = Presentation()
prs.slide_width = px_to_emu(img_width_px)
prs.slide_height = px_to_emu(img_height_px)
slide = prs.slides.add_slide(prs.slide_layouts[6])
left = top = 0
for img_path in img_files:
slide.shapes.add_picture(
img_path,
left,
top,
width=px_to_emu(img_width_px),
height=px_to_emu(img_height_px),
)
with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as tmp:
prs.save(tmp.name)
return tmp.name
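
# Usage sketch (file names are hypothetical):
#     pptx_path = imagelist_to_pptx(["layer_1.png", "layer_2.png"])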
def _clamp_int(x, default: int, lo: int, hi: int) -> int:
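    """Coerce `x` to an int (falling back to `default`), then clamp it to [lo, hi]."""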
try:
v = int(x)
except Exception:
v = default
return max(lo, min(hi, v))
# Dynamic duration callable: must accept the same args as infer(). It returns seconds.
def get_duration(
input_image,
seed=777,
randomize_seed=False,
prompt=None,
neg_prompt=" ",
true_guidance_scale=4.0,
num_inference_steps=50,
layer=4,
cfg_norm=True,
use_en_prompt=True,
resolution=640,
gpu_duration=1000, # <-- NEW
):
# Allow user override via UI (text field), but keep it sane
return _clamp_int(gpu_duration, default=1000, lo=20, hi=1500)
@spaces.GPU(duration=get_duration)
def infer(
input_image,
seed=777,
randomize_seed=False,
prompt=None,
neg_prompt=" ",
true_guidance_scale=4.0,
num_inference_steps=50,
layer=4,
cfg_norm=True,
use_en_prompt=True,
resolution=640,
gpu_duration=1000, # <-- NEW (must match get_duration signature)
):
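    """Decompose `input_image` into transparent layers with Qwen-Image-Layered.

    Returns (layer images for the gallery, path to a stacked-slide PPTX,
    path to a ZIP of per-layer PNGs).
    """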
# Seed
if randomize_seed:
seed = random.randint(0, MAX_SEED)
# Normalize resolution input
resolution = _clamp_int(resolution, default=640, lo=640, hi=1024)
if resolution not in (640, 1024):
resolution = 640
# Normalize image input
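    # (round-trip through RGB to flatten any source alpha, then re-attach an opaque alpha channel)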
if isinstance(input_image, list):
input_image = input_image[0]
if isinstance(input_image, str):
pil_image = Image.open(input_image).convert("RGB").convert("RGBA")
elif isinstance(input_image, Image.Image):
pil_image = input_image.convert("RGB").convert("RGBA")
elif isinstance(input_image, np.ndarray):
pil_image = Image.fromarray(input_image).convert("RGB").convert("RGBA")
else:
raise ValueError(f"Unsupported input_image type: {type(input_image)}")
gen_device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = {
"image": pil_image,
"generator": torch.Generator(device=gen_device).manual_seed(seed),
"true_cfg_scale": true_guidance_scale,
"prompt": prompt,
"negative_prompt": neg_prompt,
"num_inference_steps": num_inference_steps,
"num_images_per_prompt": 1,
"layers": layer,
"resolution": resolution, # 640 or 1024
"cfg_normalize": cfg_norm,
"use_en_prompt": use_en_prompt,
}
print("INFER INPUTS:", inputs)
print("REQUESTED GPU DURATION:", gpu_duration)
with torch.inference_mode():
out = pipeline(**inputs)
output_images = out.images[0] # list of PIL images (layers)
# Prepare gallery + export files
gallery_out = []
temp_files = []
for img in output_images:
gallery_out.append(img)
        # Use a with-block so the temp file handle is closed after writing
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            img.save(tmp.name)
        temp_files.append(tmp.name)
pptx_path = imagelist_to_pptx(temp_files)
with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmpzip:
with zipfile.ZipFile(tmpzip.name, "w", zipfile.ZIP_DEFLATED) as zipf:
for i, img_path in enumerate(temp_files):
zipf.write(img_path, f"layer_{i+1}.png")
zip_path = tmpzip.name
return gallery_out, pptx_path, zip_path
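
# Minimal local sketch (bypasses the UI; the path below is one of the bundled
# example assets listed further down):
#     layers, pptx_path, zip_path = infer("assets/test_images/1.png", randomize_seed=True)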
ensure_dirname(LOG_DIR)
examples = [
"assets/test_images/1.png",
"assets/test_images/2.png",
"assets/test_images/3.png",
"assets/test_images/4.png",
"assets/test_images/5.png",
"assets/test_images/6.png",
"assets/test_images/7.png",
"assets/test_images/8.png",
"assets/test_images/9.png",
"assets/test_images/10.png",
"assets/test_images/11.png",
"assets/test_images/12.png",
"assets/test_images/13.png",
]
with gr.Blocks() as demo:
with gr.Column(elem_id="col-container"):
gr.HTML(
'<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/layered/qwen-image-layered-logo.png" '
'alt="Qwen-Image-Layered Logo" width="600" style="display: block; margin: 0 auto;">'
)
gr.Markdown(
"""
            The text prompt should describe the overall content of the input image, including elements that may be partially occluded (e.g., you can specify text hidden behind a foreground object). It is not designed to explicitly control the semantic content of individual layers.
"""
)
with gr.Row():
with gr.Column(scale=1):
input_image = gr.Image(label="Input Image", image_mode="RGBA")
with gr.Accordion("Advanced Settings", open=False):
prompt = gr.Textbox(
label="Prompt (Optional)",
placeholder="Please enter the prompt to descibe the image. (Optional)",
value="",
lines=2,
)
neg_prompt = gr.Textbox(
label="Negative Prompt (Optional)",
placeholder="Please enter the negative prompt",
value=" ",
lines=2,
)
seed = gr.Slider(
label="Seed",
minimum=0,
maximum=MAX_SEED,
step=1,
value=0,
)
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
true_guidance_scale = gr.Slider(
label="True guidance scale",
minimum=1.0,
maximum=10.0,
step=0.1,
value=4.0,
)
num_inference_steps = gr.Slider(
label="Number of inference steps",
minimum=1,
maximum=50,
step=1,
value=50,
)
layer = gr.Slider(
label="Layers",
minimum=2,
maximum=10,
step=1,
value=4,
)
resolution = gr.Radio(
label="Processing resolution",
choices=[640, 1024],
value=640,
)
                    cfg_norm = gr.Checkbox(
                        label="Enable CFG normalization", value=True
                    )
                    use_en_prompt = gr.Checkbox(
                        label="Auto-caption in English when no prompt is given (uncheck for Chinese)",
                        value=True,
                    )
# NEW: text field for GPU duration override (seconds)
gpu_duration = gr.Textbox(
label="GPU duration override (seconds, 20..1500)",
value="1000",
lines=1,
placeholder="e.g. 60, 120, 300, 1000, 1500",
)
run_button = gr.Button("Decompose!", variant="primary")
with gr.Column(scale=2):
gallery = gr.Gallery(label="Layers", columns=4, rows=1, format="png")
with gr.Row():
export_file = gr.File(label="Download PPTX")
export_zip_file = gr.File(label="Download ZIP")
gr.Examples(
examples=examples,
inputs=[input_image],
outputs=[gallery, export_file, export_zip_file],
fn=infer,
examples_per_page=14,
cache_examples=False,
run_on_click=True,
)
run_button.click(
fn=infer,
inputs=[
input_image,
seed,
randomize_seed,
prompt,
neg_prompt,
true_guidance_scale,
num_inference_steps,
layer,
cfg_norm,
use_en_prompt,
resolution,
gpu_duration, # <-- NEW
],
outputs=[gallery, export_file, export_zip_file],
)
if __name__ == "__main__":
demo.launch()