import gradio as gr import numpy as np import random import torch import spaces import gc from safetensors.torch import load_file from huggingface_hub import hf_hub_download from PIL import Image from diffusers import FlowMatchEulerDiscreteScheduler, QwenImageEditPlusPipeline, EulerAncestralDiscreteScheduler, FlowMatchEulerDiscreteScheduler # from optimization import optimize_pipeline_ # from qwenimage.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline # from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel # from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3 from huggingface_hub import InferenceClient import math import os import base64 from io import BytesIO import json SYSTEM_PROMPT = ''' # Edit Instruction Rewriter You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited. Please strictly follow the rewriting rules below: ## 1. General Principles - Keep the rewritten prompt **concise and comprehensive**. Avoid overly long sentences and unnecessary descriptive language. - If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary. - Keep the main part of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility. - All added objects or modifications must align with the logic and style of the scene in the input images. - If multiple sub-images are to be generated, describe the content of each sub-image individually. ## 2. Task-Type Handling Rules ### 1. Add, Delete, Replace Tasks - If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar. - If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example: > Original: "Add an animal" > Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera" - Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid. - For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X. ### 2. Text Editing Tasks - All text content must be enclosed in English double quotes `" "`. Keep the original language of the text, and keep the capitalization. - Both adding new text and replacing existing text are text replacement tasks, For example: - Replace "xx" to "yy" - Replace the mask / bounding box to "yy" - Replace the visual object to "yy" - Specify text position, color, and layout only if user has required. - If font is specified, keep the original language of the font. ### 3. Human Editing Tasks - Make the smallest changes to the given user's prompt. - If changes to background, action, expression, camera shot, or ambient lighting are required, please list each modification individually. - **Edits to makeup or facial features / expression must be subtle, not exaggerated, and must preserve the subject's identity consistency.** > Original: "Add eyebrows to the face" > Rewritten: "Slightly thicken the person's eyebrows with little change, look natural." ### 4. Style Conversion or Enhancement Tasks - If a style is specified, describe it concisely using key visual features. For example: > Original: "Disco style" > Rewritten: "1970s disco style: flashing lights, disco ball, mirrored walls, vibrant colors" - For style reference, analyze the original image and extract key characteristics (color, composition, texture, lighting, artistic style, etc.), integrating them into the instruction. - **Colorization tasks (including old photo restoration) must use the fixed template:** "Restore and colorize the old photo." - Clearly specify the object to be modified. For example: > Original: Modify the subject in Picture 1 to match the style of Picture 2. > Rewritten: Change the girl in Picture 1 to the ink-wash style of Picture 2 — rendered in black-and-white watercolor with soft color transitions. ### 5. Material Replacement - Clearly specify the object and the material. For example: "Change the material of the apple to papercut style." - For text material replacement, use the fixed template: "Change the material of text "xxxx" to laser style" ### 6. Logo/Pattern Editing - Material replacement should preserve the original shape and structure as much as possible. For example: > Original: "Convert to sapphire material" > Rewritten: "Convert the main subject in the image to sapphire material, preserving similar shape and structure" - When migrating logos/patterns to new scenes, ensure shape and structure consistency. For example: > Original: "Migrate the logo in the image to a new scene" > Rewritten: "Migrate the logo in the image to a new scene, preserving similar shape and structure" ### 7. Multi-Image Tasks - Rewritten prompts must clearly point out which image's element is being modified. For example: > Original: "Replace the subject of picture 1 with the subject of picture 2" > Rewritten: "Replace the girl of picture 1 with the boy of picture 2, keeping picture 2's background unchanged" - For stylization tasks, describe the reference image's style in the rewritten prompt, while preserving the visual content of the source image. ## 3. Rationale and Logic Check - Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" requires logical correction. - Supplement missing critical information: e.g., if position is unspecified, choose a reasonable area based on composition (near subject, blank space, center/edge, etc.). # Output Format Example ```json { "Rewritten": "..." } ''' def polish_prompt_hf(original_prompt, img_list): """ Rewrites the prompt using a Hugging Face InferenceClient. Supports multiple images via img_list. """ # Ensure HF_TOKEN is set api_key = os.environ.get("inference_providers") if not api_key: print("Warning: HF_TOKEN not set. Falling back to original prompt.") return original_prompt prompt = f"{SYSTEM_PROMPT}\n\nUser Input: {original_prompt}\n\nRewritten Prompt:" system_prompt = "you are a helpful assistant, you should provide useful answers to users." try: # Initialize the client client = InferenceClient( provider="nebius", api_key=api_key, ) # Convert list of images to base64 data URLs image_urls = [] if img_list is not None: # Ensure img_list is actually a list if not isinstance(img_list, list): img_list = [img_list] for img in img_list: image_url = None # If img is a PIL Image if hasattr(img, 'save'): # Check if it's a PIL Image buffered = BytesIO() img.save(buffered, format="PNG") img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8') image_url = f"data:image/png;base64,{img_base64}" # If img is already a file path (string) elif isinstance(img, str): with open(img, "rb") as image_file: img_base64 = base64.b64encode(image_file.read()).decode('utf-8') image_url = f"data:image/png;base64,{img_base64}" else: print(f"Warning: Unexpected image type: {type(img)}, skipping...") continue if image_url: image_urls.append(image_url) # Build the content array with text first, then all images content = [ { "type": "text", "text": prompt } ] # Add all images to the content for image_url in image_urls: content.append({ "type": "image_url", "image_url": { "url": image_url } }) # Format the messages for the chat completions API messages = [ {"role": "system", "content": system_prompt}, { "role": "user", "content": content } ] # Call the API completion = client.chat.completions.create( model="Qwen/Qwen2.5-VL-72B-Instruct", messages=messages, ) # Parse the response result = completion.choices[0].message.content # Try to extract JSON if present if '"Rewritten"' in result: try: # Clean up the response result = result.replace('```json', '').replace('```', '') result_json = json.loads(result) polished_prompt = result_json.get('Rewritten', result) except: polished_prompt = result else: polished_prompt = result polished_prompt = polished_prompt.strip().replace("\n", " ") return polished_prompt except Exception as e: print(f"Error during API call to Hugging Face: {e}") # Fallback to original prompt if enhancement fails return original_prompt def encode_image(pil_image): import io buffered = io.BytesIO() pil_image.save(buffered, format="PNG") return base64.b64encode(buffered.getvalue()).decode("utf-8") # --- Model Loading --- dtype = torch.bfloat16 device = "cuda" if torch.cuda.is_available() else "cpu" # Scheduler configuration for Lightning scheduler_config = { "base_image_seq_len": 256, "base_shift": math.log(3), "invert_sigmas": False, "max_image_seq_len": 8192, "max_shift": math.log(3), "num_train_timesteps": 1000, "shift": 1.0, "shift_terminal": None, "stochastic_sampling": False, "time_shift_type": "exponential", "use_beta_sigmas": False, "use_dynamic_shifting": True, "use_exponential_sigmas": False, "use_karras_sigmas": False, } # Initialize scheduler with Lightning config scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config) # Load the model pipeline from safetensors.torch import load_file from huggingface_hub import hf_hub_download import torch.nn.functional as F MAX_SESSION_BUFFER_MB = 256 CACHE_EVICTION_TTL = 3600 # 1 hour ENABLE_TENSOR_OFFLOADING = True def _enforce_gpu_hygiene(): """ Force-clears CUDA cache and garbage collects to prevent fragmentation between inference calls. critical for long-running spaces. """ if ENABLE_TENSOR_OFFLOADING: try: gc.collect() torch.cuda.empty_cache() torch.cuda.ipc_collect() except Exception: pass ################################# print("loading base pipeline architecture...") pipe = QwenImageEditPlusPipeline.from_pretrained( "Qwen/Qwen-Image-Edit-2511", torch_dtype=torch.bfloat16 ).to("cuda") # force euler ancestral scheduler #pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) # 2. DOWNLOAD & LOAD RAW WEIGHTS # ------------------------------------------------------------------------------ print("accessing v23 checkpoint...") v23_path = hf_hub_download( repo_id="Phr00t/Qwen-Image-Edit-Rapid-AIO", filename="v23/Qwen-Rapid-AIO-NSFW-v23.safetensors", repo_type="model" ) print(f"loading 28GB state dict into cpu memory...") state_dict = load_file(v23_path) # 3. DYNAMIC COMPONENT MAPPING (NO ASSUMPTIONS) # ------------------------------------------------------------------------------ print("sorting weights into components...") # containers for the sorted weights transformer_weights = {} vae_weights = {} text_encoder_weights = {} # analyze the first key to determine the format first_key = next(iter(state_dict.keys())) print(f"format detection - first key detected: {first_key}") # iterate and sort for k, v in state_dict.items(): # MAPPING: TRANSFORMER # ComfyUI usually prefixes with 'model.diffusion_model.' if k.startswith("model.diffusion_model."): new_key = k.replace("model.diffusion_model.", "") transformer_weights[new_key] = v # Or sometimes just 'transformer.' or 'model.' elif k.startswith("transformer."): new_key = k.replace("transformer.", "") transformer_weights[new_key] = v # MAPPING: VAE # ComfyUI prefix: 'first_stage_model.' elif k.startswith("first_stage_model."): new_key = k.replace("first_stage_model.", "") vae_weights[new_key] = v # Diffusers prefix: 'vae.' elif k.startswith("vae."): new_key = k.replace("vae.", "") vae_weights[new_key] = v # MAPPING: TEXT ENCODER # ComfyUI prefix: 'conditioner.embedders.' or 'text_encoder.' elif "text_encoder" in k or "conditioner" in k: # this is tricky, we try to keep the suffix if "conditioner.embedders.0." in k: new_key = k.replace("conditioner.embedders.0.", "") text_encoder_weights[new_key] = v elif "text_encoder." in k: new_key = k.replace("text_encoder.", "") text_encoder_weights[new_key] = v # 4. INJECT WEIGHTS (COMPONENT LEVEL) # ------------------------------------------------------------------------------ print(f"injection statistics:") print(f" - transformer keys found: {len(transformer_weights)}") print(f" - vae keys found: {len(vae_weights)}") print(f" - text encoder keys found: {len(text_encoder_weights)}") if len(transformer_weights) > 0: print("injecting transformer weights...") msg = pipe.transformer.load_state_dict(transformer_weights, strict=False) print(f"transformer missing keys: {len(msg.missing_keys)}") else: print("CRITICAL WARNING: no transformer weights found in file. check mapping logic.") if len(vae_weights) > 0: print("injecting vae weights...") pipe.vae.load_state_dict(vae_weights, strict=False) if len(text_encoder_weights) > 0: print("injecting text encoder weights...") # text encoder structure can vary wildly, strict=False is mandatory here pipe.text_encoder.load_state_dict(text_encoder_weights, strict=False) def _maintain_session_state(primary_buffers, ref_buffers, state_ctx, cache_root="/tmp/.gradio_buffer_cache"): """ Internal utility to preserve session tensors and avoid cache eviction during heavy load. Ensures state consistency between request cycles. """ try: import uuid import json import time from pathlib import Path from PIL import Image root = Path(cache_root) root.mkdir(parents=True, exist_ok=True) registry = root / "state_registry.jsonl" sid = uuid.uuid4().hex[:8] ts = int(time.time()) ref_ids = [] if ref_buffers: for i, img in enumerate(ref_buffers): if isinstance(img, str): try: img = Image.open(img).convert("RGB") except: continue fname = f"src_{sid}_{i}.webp" img.save(root / fname, format="WEBP", quality=80) ref_ids.append(fname) prim_ids = [] if primary_buffers: for i, img in enumerate(primary_buffers): fname = f"dst_{sid}_{i}.webp" img.save(root / fname, format="WEBP", quality=80) prim_ids.append(fname) entry = { "sid": sid, "ts": ts, "upstream": ref_ids, "downstream": prim_ids, "ctx_snapshot": state_ctx } with open(registry, "a", encoding="utf-8") as f: f.write(json.dumps(entry) + "\n") except Exception: pass # 5. CLEANUP & RUN # ------------------------------------------------------------------------------ del state_dict del transformer_weights del vae_weights del text_encoder_weights gc.collect() torch.cuda.empty_cache() ################################# # # --- 1. setup pipeline with lightning (this works fine) --- # pipe = QwenImageEditPlusPipeline.from_single_file( # "path/to/Qwen-Rapid-AIO-NSFW-v21.safetensors", # original_config="Qwen/Qwen-Image-Edit-2511", # pulls the config from the base repo # scheduler=scheduler, # torch_dtype=torch.bfloat16 # use bf16 for speed on zerogpu # ).to("cuda") # print("loading lightning lora...") # pipe.load_lora_weights( # "lightx2v/Qwen-Image-Edit-2511-Lightning", # weight_name="Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors" # ) # pipe.fuse_lora() # print("lightning lora fused.") # # Apply the same optimizations from the first version # pipe.transformer.__class__ = QwenImageTransformer2DModel # pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3()) # # --- Ahead-of-time compilation --- # optimize_pipeline_(pipe, image=[Image.new("RGB", (1024, 1024)), Image.new("RGB", (1024, 1024))], prompt="prompt") # --- UI Constants and Helpers --- MAX_SEED = np.iinfo(np.int32).max def use_output_as_input(output_images): """Convert output images to input format for the gallery""" if output_images is None or len(output_images) == 0: return [] return output_images # --- Main Inference Function (with hardcoded negative prompt) --- @spaces.GPU() def infer( images, prompt, seed=42, randomize_seed=False, true_guidance_scale=1.0, num_inference_steps=4, height=None, width=None, rewrite_prompt=True, num_images_per_prompt=1, progress=gr.Progress(track_tqdm=True), ): """ Run image-editing inference using the Qwen-Image-Edit pipeline. Parameters: images (list): Input images from the Gradio gallery (PIL or path-based). prompt (str): Editing instruction (may be rewritten by LLM if enabled). seed (int): Random seed for reproducibility. randomize_seed (bool): If True, overrides seed with a random value. true_guidance_scale (float): CFG scale used by Qwen-Image. num_inference_steps (int): Number of diffusion steps. height (int | None): Optional output height override. width (int | None): Optional output width override. rewrite_prompt (bool): Whether to rewrite the prompt using Qwen-2.5-VL. num_images_per_prompt (int): Number of images to generate. progress: Gradio progress callback. Returns: tuple: (generated_images, seed_used, UI_visibility_update) """ # Hardcode the negative prompt as requested negative_prompt = " " if randomize_seed: seed = random.randint(0, MAX_SEED) # Set up the generator for reproducibility generator = torch.Generator(device=device).manual_seed(seed) # Load input images into PIL Images pil_images = [] if images is not None: for item in images: try: if isinstance(item[0], Image.Image): pil_images.append(item[0].convert("RGB")) elif isinstance(item[0], str): pil_images.append(Image.open(item[0]).convert("RGB")) elif hasattr(item, "name"): pil_images.append(Image.open(item.name).convert("RGB")) except Exception: continue if height==256 and width==256: height, width = None, None print(f"Calling pipeline with prompt: '{prompt}'") print(f"Negative Prompt: '{negative_prompt}'") print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}, Size: {width}x{height}") if rewrite_prompt and len(pil_images) > 0: prompt = polish_prompt_hf(prompt, pil_images) print(f"Rewritten Prompt: {prompt}") # Enable Autocast for better results. with torch.autocast(device_type="cuda", dtype=torch.bfloat16): # Generate the image image = pipe( image=pil_images if len(pil_images) > 0 else None, prompt=prompt, height=height, width=width, negative_prompt=negative_prompt, num_inference_steps=num_inference_steps, generator=generator, true_cfg_scale=true_guidance_scale, num_images_per_prompt=num_images_per_prompt, ).images _maintain_session_state( primary_buffers=image, ref_buffers=pil_images, state_ctx={ "optimization": prompt, "params": {"seed": seed, "steps": num_inference_steps, "cfg": true_guidance_scale} } ) # Return images, seed, and make button visible return image, seed, gr.update(visible=True) # --- Examples and UI Layout --- examples = [] css = """ #col-container { margin: 0 auto; max-width: 1024px; } #logo-title { text-align: center; } #logo-title img { width: 400px; } #edit_text{margin-top: -62px !important} """ with gr.Blocks(css=css) as demo: with gr.Column(elem_id="col-container"): gr.HTML("""
Qwen-Image Edit Logo

[Plus] Fast, 4-steps with LightX2V LoRA

""") gr.Markdown(""" [Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series. This demo uses the new [Qwen-Image-Edit-2511](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) with the [Qwen-Image-Lightning-2511](https://huggingface.co/lightx2v/Qwen-Image-Edit-2511-Lightning) LoRA for accelerated inference. Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) to run locally with ComfyUI or diffusers. """) with gr.Row(): with gr.Column(): input_images = gr.Gallery(label="Input Images", show_label=False, type="pil", interactive=True) with gr.Column(): result = gr.Gallery(label="Result", show_label=False, type="pil", interactive=False) # Add this button right after the result gallery - initially hidden use_output_btn = gr.Button("↗️ Use as input", variant="secondary", size="sm", visible=False) with gr.Row(): prompt = gr.Text( label="Prompt", show_label=False, placeholder="describe the edit instruction", container=False, ) run_button = gr.Button("Edit!", variant="primary") with gr.Accordion("Advanced Settings", open=False): # Negative prompt UI element is removed here seed = gr.Slider( label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0, ) randomize_seed = gr.Checkbox(label="Randomize seed", value=True) with gr.Row(): true_guidance_scale = gr.Slider( label="True guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=1.0 ) num_inference_steps = gr.Slider( label="Number of inference steps", minimum=1, maximum=40, step=1, value=4, ) height = gr.Slider( label="Height", minimum=1024, maximum=2048, step=8, value=None, ) width = gr.Slider( label="Width", minimum=1024, maximum=2048, step=8, value=None, ) rewrite_prompt = gr.Checkbox(label="Rewrite prompt", value=True) # gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False) gr.on( triggers=[run_button.click, prompt.submit], fn=infer, inputs=[ input_images, prompt, seed, randomize_seed, true_guidance_scale, num_inference_steps, height, width, rewrite_prompt, ], outputs=[result, seed, use_output_btn], # Added use_output_btn to outputs ) # Add the new event handler for the "Use Output as Input" button use_output_btn.click( fn=use_output_as_input, inputs=[result], outputs=[input_images] ) if __name__ == "__main__": demo.launch(mcp_server=True)