import gradio as gr
import math
import numpy as np
import random
import torch
import spaces

from PIL import Image
from diffusers import QwenImageEditPlusPipeline
from typing import Optional, Tuple

MAX_SEED = np.iinfo(np.int32).max

# --- Model Loading ---
dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"

pipe = QwenImageEditPlusPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit-2511",
    torch_dtype=dtype
).to(device)

# Load the lightning LoRA for fast inference
pipe.load_lora_weights(
    "lightx2v/Qwen-Image-Edit-2511-Lightning",
    weight_name="Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors",
    adapter_name="lightning"
)

# Load the color grade transfer LoRA
pipe.load_lora_weights(
    "ovi054/QIE-2511-Color-Grade-Transfer-LoRA",
    weight_name="QIE-2511-Color-Grade-Transfer-LoRA.safetensors",
    adapter_name="color"
)

# Activate both adapters at full strength.
pipe.set_adapters(["lightning", "color"], adapter_weights=[1.0, 1.0])

# VAE_IMAGE_SIZE must match the pipeline constant (pipeline_qwenimage_edit_plus.py line 67)
_VAE_IMAGE_SIZE = 1024 * 1024


def calculate_vae_gen_size(image: Image.Image) -> Tuple[int, int]:
    """
    Return (gen_w, gen_h) that matches the pipeline's internal VAE conditioning
    scale for this image.

    The pipeline always resizes every input image to VAE_IMAGE_SIZE (~1MP) before
    VAE-encoding it into image_latents, using:
        vae_width, vae_height = calculate_dimensions(VAE_IMAGE_SIZE, w / h)

    img_shapes (used for 2-D RoPE) is built from BOTH the output size (height/width)
    AND the conditioning sizes (vae_width, vae_height). When they differ, the RoPE
    coordinate systems are misaligned, which shows up as a large pixel shift.

    Passing gen_h/gen_w = the same 1MP-equivalent makes the output tokens and the
    Image 1 conditioning tokens share an identical coordinate system, so there is
    no shift. This is what ComfyUI's ImageScaleToTotalPixels (megapixels=1.0)
    achieves.

    Args:
        image: Image whose aspect ratio determines the generation size.

    Returns:
        (width, height) in pixels, each a multiple of 32, covering roughly
        _VAE_IMAGE_SIZE pixels at the aspect ratio of `image`.
    """
    W, H = image.size
    ratio = W / H

    # Solve w * h = _VAE_IMAGE_SIZE with w / h = ratio.
    gen_w = math.sqrt(_VAE_IMAGE_SIZE * ratio)
    gen_h = gen_w / ratio

    # pipeline rounds to multiples of 32 (also satisfies the ÷16 divisibility requirement)
    gen_w = round(gen_w / 32) * 32
    gen_h = round(gen_h / 32) * 32
    return int(gen_w), int(gen_h)


def update_dimensions_on_upload(image: Optional[Image.Image]) -> Optional[Image.Image]:
    """
    Cap the longest side of an uploaded image to 1328px and snap both sides down
    to multiples of 16.

    Pipeline requires divisibility by vae_scale_factor * 2 = 8 * 2 = 16.
    The image is never upscaled, with one exception: a side that would snap to
    0px (inputs smaller than 16px, or extreme aspect ratios) is raised to 16px,
    because resizing to a zero-sized dimension fails.

    Args:
        image: The uploaded image, or None when the input was cleared.

    Returns:
        The same object when it is None or already has the target size,
        otherwise a LANCZOS-resampled copy.
    """
    if image is None:
        return image

    MAX_SIDE = 1328
    SNAP = 16  # vae_scale_factor * 2

    original_width, original_height = image.size
    # The 1.0 term keeps the scale from ever exceeding 1 (no upscaling).
    scale = min(MAX_SIDE / original_width, MAX_SIDE / original_height, 1.0)

    # Floor to a multiple of SNAP, but never below SNAP itself: flooring a side
    # shorter than SNAP would otherwise yield 0 and make resize() raise.
    new_width = max(SNAP, (int(original_width * scale) // SNAP) * SNAP)
    new_height = max(SNAP, (int(original_height * scale) // SNAP) * SNAP)

    if (new_width, new_height) == (original_width, original_height):
        return image
    return image.resize((new_width, new_height), Image.LANCZOS)


@spaces.GPU
def infer(
    source_image: Optional[Image.Image] = None,
    reference_image: Optional[Image.Image] = None,
    seed: int = 0,
    randomize_seed: bool = True,
    true_guidance_scale: float = 1.0,
    num_inference_steps: int = 4,
    progress=gr.Progress(track_tqdm=True)
) -> Tuple[Tuple[Image.Image, Image.Image], int]:
    """
    Transfer color grading from a reference image onto a source image.

    Args:
        source_image: Image 1, the image whose content is preserved.
        reference_image: Image 2, the image whose color grade is applied.
        seed: Seed for the random generator; ignored when randomize_seed is True.
        randomize_seed: When True, draw a fresh seed in [0, MAX_SEED].
        true_guidance_scale: Value passed to the pipeline as true_cfg_scale.
        num_inference_steps: Number of denoising steps.
        progress: Gradio progress tracker (mirrors tqdm progress bars).

    Returns:
        ((source_rgb, result), seed): the RGB-converted source paired with the
        generated image (for the before/after slider), and the seed that was used.

    Raises:
        gr.Error: If either input image is missing.
    """
    if source_image is None:
        raise gr.Error("Please upload a source image (Image 1).")
    if reference_image is None:
        raise gr.Error("Please upload a color grade reference image (Image 2).")

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    # UI sliders and API/MCP clients may deliver numbers as floats, while
    # manual_seed() and the step count need real integers.
    seed = int(seed)
    num_inference_steps = int(num_inference_steps)

    generator = torch.Generator(device=device).manual_seed(seed)

    src_img = source_image.convert("RGB")
    ref_img = reference_image.convert("RGB")

    # Generate at the 1MP-equivalent of Image 1's aspect ratio.
    # The pipeline internally scales ALL input images to VAE_IMAGE_SIZE (~1MP) before
    # VAE-encoding them as conditioning latents. img_shapes (for 2-D RoPE) combines
    # the output size (height/width) with those conditioning sizes. If they differ,
    # the RoPE coordinate systems are misaligned, causing a large pixel shift.
    # Using the same 1MP formula as the pipeline eliminates the mismatch.
    # (ComfyUI achieves this via ImageScaleToTotalPixels at megapixels=1.0.)
    gen_w, gen_h = calculate_vae_gen_size(src_img)

    result = pipe(
        image=[src_img, ref_img],
        prompt="Transfer ONLY the color grading from Image 2 onto Image 1",
        height=gen_h,
        width=gen_w,
        num_inference_steps=num_inference_steps,
        generator=generator,
        true_cfg_scale=true_guidance_scale,
        num_images_per_prompt=1,
    ).images[0]

    # NOTE: the result is returned at the generation resolution (gen_w x gen_h);
    # it is not resized back to the source image's original dimensions.
    return (src_img, result), seed


# --- UI ---
css = '''
#col-container { max-width: 1000px; margin: 0 auto; }
.dark .progress-text { color: white !important }
#examples { max-width: 1000px; margin: 0 auto; }
.image-container { min-height: 300px; }
'''

with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("## 🎨 Color Grade Transfer - Qwen Image Edit + LoRA")
        gr.Markdown("""
        Transfer color grading and tones from a reference image onto your source image ✨
        Using my [ovi054/Color-Grade-Transfer-LoRA](https://huggingface.co/ovi054/QIE-2511-Color-Grade-Transfer-LoRA) and 4 step inference
        """)

        with gr.Row():
            # Left column: the two inputs, the run button and advanced settings.
            with gr.Column():
                with gr.Row():
                    source_image = gr.Image(
                        label="Image 1 (Source — content to preserve)",
                        type="pil",
                        elem_classes="image-container"
                    )
                    reference_image = gr.Image(
                        label="Image 2 (Color Grade Reference)",
                        type="pil",
                        elem_classes="image-container"
                    )

                run_btn = gr.Button("🎨 Transfer Color Grade", variant="primary", size="lg")

                with gr.Accordion("Advanced Settings", open=False):
                    seed = gr.Slider(
                        label="Seed",
                        minimum=0,
                        maximum=MAX_SEED,
                        step=1,
                        value=0
                    )
                    randomize_seed = gr.Checkbox(
                        label="Randomize Seed",
                        value=True
                    )
                    true_guidance_scale = gr.Slider(
                        label="True Guidance Scale",
                        minimum=1.0,
                        maximum=10.0,
                        step=0.1,
                        value=1.0
                    )
                    num_inference_steps = gr.Slider(
                        label="Inference Steps",
                        minimum=1,
                        maximum=40,
                        step=1,
                        value=4
                    )

            # Right column: before/after comparison of source and result.
            with gr.Column():
                result = gr.ImageSlider(label="Color Graded Output", interactive=False)

        # Examples supply only the two images; infer() uses its defaults for the rest.
        gr.Examples(
            examples=[
                ["images/image1.jpg", "images/image2.jpeg"],
                ["images/image2.jpeg", "images/image1.jpg"],
            ],
            inputs=[source_image, reference_image],
            outputs=[result, seed],
            fn=infer,
            cache_examples=True,
            cache_mode="lazy",
            elem_id="examples"
        )

    inputs = [
        source_image,
        reference_image,
        seed,
        randomize_seed,
        true_guidance_scale,
        num_inference_steps,
    ]
    outputs = [result, seed]

    run_btn.click(fn=infer, inputs=inputs, outputs=outputs)

    # Normalise uploaded images in place (size cap + multiple-of-16 snap).
    source_image.upload(
        fn=update_dimensions_on_upload,
        inputs=[source_image],
        outputs=[source_image]
    )
    reference_image.upload(
        fn=update_dimensions_on_upload,
        inputs=[reference_image],
        outputs=[reference_image]
    )

demo.launch(mcp_server=True, theme=gr.themes.Citrus(), css=css, footer_links=["api", "gradio", "settings"])