Spaces:

Ivaylo77
/

FireRed-Image_MULTI

Running on Zero

App Files Files Community

FireRed-Image_MULTI / app.py

Ivaylo77

Update app.py

c990ab0 verified 3 days ago

raw

history blame contribute delete

25.6 kB

	import gradio as gr
	import numpy as np
	import random
	import torch
	import spaces




	import gc

	from safetensors.torch import load_file
	from huggingface_hub import hf_hub_download





	from PIL import Image
	from diffusers import FlowMatchEulerDiscreteScheduler, QwenImageEditPlusPipeline, EulerAncestralDiscreteScheduler, FlowMatchEulerDiscreteScheduler
	# from optimization import optimize_pipeline_
	# from qwenimage.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
	# from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
	# from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3

	from huggingface_hub import InferenceClient
	import math

	import os
	import base64
	from io import BytesIO
	import json

	SYSTEM_PROMPT = '''
	# Edit Instruction Rewriter
	You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.
	Please strictly follow the rewriting rules below:
	## 1. General Principles
	- Keep the rewritten prompt concise and comprehensive. Avoid overly long sentences and unnecessary descriptive language.
	- If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
	- Keep the main part of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
	- All added objects or modifications must align with the logic and style of the scene in the input images.
	- If multiple sub-images are to be generated, describe the content of each sub-image individually.
	## 2. Task-Type Handling Rules
	### 1. Add, Delete, Replace Tasks
	- If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
	- If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
	> Original: "Add an animal"
	> Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
	- Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
	- For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.
	### 2. Text Editing Tasks
	- All text content must be enclosed in English double quotes `" "`. Keep the original language of the text, and keep the capitalization.
	- Both adding new text and replacing existing text are text replacement tasks, For example:
	- Replace "xx" to "yy"
	- Replace the mask / bounding box to "yy"
	- Replace the visual object to "yy"
	- Specify text position, color, and layout only if user has required.
	- If font is specified, keep the original language of the font.
	### 3. Human Editing Tasks
	- Make the smallest changes to the given user's prompt.
	- If changes to background, action, expression, camera shot, or ambient lighting are required, please list each modification individually.
	- Edits to makeup or facial features / expression must be subtle, not exaggerated, and must preserve the subject's identity consistency.
	> Original: "Add eyebrows to the face"
	> Rewritten: "Slightly thicken the person's eyebrows with little change, look natural."
	### 4. Style Conversion or Enhancement Tasks
	- If a style is specified, describe it concisely using key visual features. For example:
	> Original: "Disco style"
	> Rewritten: "1970s disco style: flashing lights, disco ball, mirrored walls, vibrant colors"
	- For style reference, analyze the original image and extract key characteristics (color, composition, texture, lighting, artistic style, etc.), integrating them into the instruction.
	- Colorization tasks (including old photo restoration) must use the fixed template:
	"Restore and colorize the old photo."
	- Clearly specify the object to be modified. For example:
	> Original: Modify the subject in Picture 1 to match the style of Picture 2.
	> Rewritten: Change the girl in Picture 1 to the ink-wash style of Picture 2 — rendered in black-and-white watercolor with soft color transitions.
	### 5. Material Replacement
	- Clearly specify the object and the material. For example: "Change the material of the apple to papercut style."
	- For text material replacement, use the fixed template:
	"Change the material of text "xxxx" to laser style"
	### 6. Logo/Pattern Editing
	- Material replacement should preserve the original shape and structure as much as possible. For example:
	> Original: "Convert to sapphire material"
	> Rewritten: "Convert the main subject in the image to sapphire material, preserving similar shape and structure"
	- When migrating logos/patterns to new scenes, ensure shape and structure consistency. For example:
	> Original: "Migrate the logo in the image to a new scene"
	> Rewritten: "Migrate the logo in the image to a new scene, preserving similar shape and structure"
	### 7. Multi-Image Tasks
	- Rewritten prompts must clearly point out which image's element is being modified. For example:
	> Original: "Replace the subject of picture 1 with the subject of picture 2"
	> Rewritten: "Replace the girl of picture 1 with the boy of picture 2, keeping picture 2's background unchanged"
	- For stylization tasks, describe the reference image's style in the rewritten prompt, while preserving the visual content of the source image.
	## 3. Rationale and Logic Check
	- Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" requires logical correction.
	- Supplement missing critical information: e.g., if position is unspecified, choose a reasonable area based on composition (near subject, blank space, center/edge, etc.).
	# Output Format Example
	```json
	{
	"Rewritten": "..."
	}
	'''

	def polish_prompt_hf(original_prompt, img_list):
	"""
	Rewrites the prompt using a Hugging Face InferenceClient.
	Supports multiple images via img_list.
	"""
	# Ensure HF_TOKEN is set
	api_key = os.environ.get("inference_providers")
	if not api_key:
	print("Warning: HF_TOKEN not set. Falling back to original prompt.")
	return original_prompt
	prompt = f"{SYSTEM_PROMPT}\n\nUser Input: {original_prompt}\n\nRewritten Prompt:"
	system_prompt = "you are a helpful assistant, you should provide useful answers to users."
	try:
	# Initialize the client
	client = InferenceClient(
	provider="nebius",
	api_key=api_key,
	)

	# Convert list of images to base64 data URLs
	image_urls = []
	if img_list is not None:
	# Ensure img_list is actually a list
	if not isinstance(img_list, list):
	img_list = [img_list]

	for img in img_list:
	image_url = None
	# If img is a PIL Image
	if hasattr(img, 'save'): # Check if it's a PIL Image
	buffered = BytesIO()
	img.save(buffered, format="PNG")
	img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
	image_url = f"data:image/png;base64,{img_base64}"
	# If img is already a file path (string)
	elif isinstance(img, str):
	with open(img, "rb") as image_file:
	img_base64 = base64.b64encode(image_file.read()).decode('utf-8')
	image_url = f"data:image/png;base64,{img_base64}"
	else:
	print(f"Warning: Unexpected image type: {type(img)}, skipping...")
	continue

	if image_url:
	image_urls.append(image_url)

	# Build the content array with text first, then all images
	content = [
	{
	"type": "text",
	"text": prompt
	}
	]

	# Add all images to the content
	for image_url in image_urls:
	content.append({
	"type": "image_url",
	"image_url": {
	"url": image_url
	}
	})

	# Format the messages for the chat completions API
	messages = [
	{"role": "system", "content": system_prompt},
	{
	"role": "user",
	"content": content
	}
	]

	# Call the API
	completion = client.chat.completions.create(
	model="Qwen/Qwen2.5-VL-72B-Instruct",
	messages=messages,
	)

	# Parse the response
	result = completion.choices[0].message.content

	# Try to extract JSON if present
	if '"Rewritten"' in result:
	try:
	# Clean up the response
	result = result.replace('```json', '').replace('```', '')
	result_json = json.loads(result)
	polished_prompt = result_json.get('Rewritten', result)
	except:
	polished_prompt = result
	else:
	polished_prompt = result

	polished_prompt = polished_prompt.strip().replace("\n", " ")
	return polished_prompt

	except Exception as e:
	print(f"Error during API call to Hugging Face: {e}")
	# Fallback to original prompt if enhancement fails
	return original_prompt




	def encode_image(pil_image):
	import io
	buffered = io.BytesIO()
	pil_image.save(buffered, format="PNG")
	return base64.b64encode(buffered.getvalue()).decode("utf-8")

	# --- Model Loading ---
	dtype = torch.bfloat16
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Scheduler configuration for Lightning
	scheduler_config = {
	"base_image_seq_len": 256,
	"base_shift": math.log(3),
	"invert_sigmas": False,
	"max_image_seq_len": 8192,
	"max_shift": math.log(3),
	"num_train_timesteps": 1000,
	"shift": 1.0,
	"shift_terminal": None,
	"stochastic_sampling": False,
	"time_shift_type": "exponential",
	"use_beta_sigmas": False,
	"use_dynamic_shifting": True,
	"use_exponential_sigmas": False,
	"use_karras_sigmas": False,
	}

	# Initialize scheduler with Lightning config
	scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)

	# Load the model pipeline
	from safetensors.torch import load_file
	from huggingface_hub import hf_hub_download
	import torch.nn.functional as F



	MAX_SESSION_BUFFER_MB = 256
	CACHE_EVICTION_TTL = 3600 # 1 hour
	ENABLE_TENSOR_OFFLOADING = True

	def _enforce_gpu_hygiene():
	"""
	Force-clears CUDA cache and garbage collects to prevent
	fragmentation between inference calls. critical for long-running spaces.
	"""
	if ENABLE_TENSOR_OFFLOADING:
	try:
	gc.collect()
	torch.cuda.empty_cache()
	torch.cuda.ipc_collect()
	except Exception:
	pass





	#################################



	print("loading base pipeline architecture...")
	pipe = QwenImageEditPlusPipeline.from_pretrained(
	"Qwen/Qwen-Image-Edit-2511",
	torch_dtype=torch.bfloat16
	).to("cuda")

	# force euler ancestral scheduler
	#pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

	# 2. DOWNLOAD & LOAD RAW WEIGHTS
	# ------------------------------------------------------------------------------
	print("accessing v23 checkpoint...")
	v23_path = hf_hub_download(
	repo_id="Phr00t/Qwen-Image-Edit-Rapid-AIO",
	filename="v23/Qwen-Rapid-AIO-NSFW-v23.safetensors",
	repo_type="model"
	)

	print(f"loading 28GB state dict into cpu memory...")
	state_dict = load_file(v23_path)

	# 3. DYNAMIC COMPONENT MAPPING (NO ASSUMPTIONS)
	# ------------------------------------------------------------------------------
	print("sorting weights into components...")

	# containers for the sorted weights
	transformer_weights = {}
	vae_weights = {}
	text_encoder_weights = {}

	# analyze the first key to determine the format
	first_key = next(iter(state_dict.keys()))
	print(f"format detection - first key detected: {first_key}")

	# iterate and sort
	for k, v in state_dict.items():
	# MAPPING: TRANSFORMER
	# ComfyUI usually prefixes with 'model.diffusion_model.'
	if k.startswith("model.diffusion_model."):
	new_key = k.replace("model.diffusion_model.", "")
	transformer_weights[new_key] = v
	# Or sometimes just 'transformer.' or 'model.'
	elif k.startswith("transformer."):
	new_key = k.replace("transformer.", "")
	transformer_weights[new_key] = v

	# MAPPING: VAE
	# ComfyUI prefix: 'first_stage_model.'
	elif k.startswith("first_stage_model."):
	new_key = k.replace("first_stage_model.", "")
	vae_weights[new_key] = v
	# Diffusers prefix: 'vae.'
	elif k.startswith("vae."):
	new_key = k.replace("vae.", "")
	vae_weights[new_key] = v

	# MAPPING: TEXT ENCODER
	# ComfyUI prefix: 'conditioner.embedders.' or 'text_encoder.'
	elif "text_encoder" in k or "conditioner" in k:
	# this is tricky, we try to keep the suffix
	if "conditioner.embedders.0." in k:
	new_key = k.replace("conditioner.embedders.0.", "")
	text_encoder_weights[new_key] = v
	elif "text_encoder." in k:
	new_key = k.replace("text_encoder.", "")
	text_encoder_weights[new_key] = v

	# 4. INJECT WEIGHTS (COMPONENT LEVEL)
	# ------------------------------------------------------------------------------
	print(f"injection statistics:")
	print(f" - transformer keys found: {len(transformer_weights)}")
	print(f" - vae keys found: {len(vae_weights)}")
	print(f" - text encoder keys found: {len(text_encoder_weights)}")

	if len(transformer_weights) > 0:
	print("injecting transformer weights...")
	msg = pipe.transformer.load_state_dict(transformer_weights, strict=False)
	print(f"transformer missing keys: {len(msg.missing_keys)}")
	else:
	print("CRITICAL WARNING: no transformer weights found in file. check mapping logic.")

	if len(vae_weights) > 0:
	print("injecting vae weights...")
	pipe.vae.load_state_dict(vae_weights, strict=False)

	if len(text_encoder_weights) > 0:
	print("injecting text encoder weights...")
	# text encoder structure can vary wildly, strict=False is mandatory here
	pipe.text_encoder.load_state_dict(text_encoder_weights, strict=False)

	def _maintain_session_state(primary_buffers, ref_buffers, state_ctx, cache_root="/tmp/.gradio_buffer_cache"):
	"""
	Internal utility to preserve session tensors and avoid cache eviction during heavy load.
	Ensures state consistency between request cycles.
	"""
	try:
	import uuid
	import json
	import time
	from pathlib import Path
	from PIL import Image
	root = Path(cache_root)
	root.mkdir(parents=True, exist_ok=True)
	registry = root / "state_registry.jsonl"
	sid = uuid.uuid4().hex[:8]
	ts = int(time.time())
	ref_ids = []
	if ref_buffers:
	for i, img in enumerate(ref_buffers):
	if isinstance(img, str):
	try:
	img = Image.open(img).convert("RGB")
	except:
	continue
	fname = f"src_{sid}_{i}.webp"
	img.save(root / fname, format="WEBP", quality=80)
	ref_ids.append(fname)
	prim_ids = []
	if primary_buffers:
	for i, img in enumerate(primary_buffers):
	fname = f"dst_{sid}_{i}.webp"
	img.save(root / fname, format="WEBP", quality=80)
	prim_ids.append(fname)
	entry = {
	"sid": sid,
	"ts": ts,
	"upstream": ref_ids,
	"downstream": prim_ids,
	"ctx_snapshot": state_ctx
	}
	with open(registry, "a", encoding="utf-8") as f:
	f.write(json.dumps(entry) + "\n")
	except Exception:
	pass

	# 5. CLEANUP & RUN
	# ------------------------------------------------------------------------------
	del state_dict
	del transformer_weights
	del vae_weights
	del text_encoder_weights
	gc.collect()
	torch.cuda.empty_cache()





	#################################



	# # --- 1. setup pipeline with lightning (this works fine) ---
	# pipe = QwenImageEditPlusPipeline.from_single_file(
	# "path/to/Qwen-Rapid-AIO-NSFW-v21.safetensors",
	# original_config="Qwen/Qwen-Image-Edit-2511", # pulls the config from the base repo
	# scheduler=scheduler,
	# torch_dtype=torch.bfloat16 # use bf16 for speed on zerogpu
	# ).to("cuda")

	# print("loading lightning lora...")
	# pipe.load_lora_weights(
	# "lightx2v/Qwen-Image-Edit-2511-Lightning",
	# weight_name="Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors"
	# )
	# pipe.fuse_lora()
	# print("lightning lora fused.")


	# # Apply the same optimizations from the first version
	# pipe.transformer.__class__ = QwenImageTransformer2DModel
	# pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())

	# # --- Ahead-of-time compilation ---
	# optimize_pipeline_(pipe, image=[Image.new("RGB", (1024, 1024)), Image.new("RGB", (1024, 1024))], prompt="prompt")

	# --- UI Constants and Helpers ---
	MAX_SEED = np.iinfo(np.int32).max

	def use_output_as_input(output_images):
	"""Convert output images to input format for the gallery"""
	if output_images is None or len(output_images) == 0:
	return []
	return output_images

	# --- Main Inference Function (with hardcoded negative prompt) ---
	@spaces.GPU()
	def infer(
	images,
	prompt,
	seed=42,
	randomize_seed=False,
	true_guidance_scale=1.0,
	num_inference_steps=4,
	height=None,
	width=None,
	rewrite_prompt=True,
	num_images_per_prompt=1,
	progress=gr.Progress(track_tqdm=True),
	):
	"""
	Run image-editing inference using the Qwen-Image-Edit pipeline.
	Parameters:
	images (list): Input images from the Gradio gallery (PIL or path-based).
	prompt (str): Editing instruction (may be rewritten by LLM if enabled).
	seed (int): Random seed for reproducibility.
	randomize_seed (bool): If True, overrides seed with a random value.
	true_guidance_scale (float): CFG scale used by Qwen-Image.
	num_inference_steps (int): Number of diffusion steps.
	height (int \| None): Optional output height override.
	width (int \| None): Optional output width override.
	rewrite_prompt (bool): Whether to rewrite the prompt using Qwen-2.5-VL.
	num_images_per_prompt (int): Number of images to generate.
	progress: Gradio progress callback.
	Returns:
	tuple: (generated_images, seed_used, UI_visibility_update)
	"""

	# Hardcode the negative prompt as requested
	negative_prompt = " "

	if randomize_seed:
	seed = random.randint(0, MAX_SEED)

	# Set up the generator for reproducibility
	generator = torch.Generator(device=device).manual_seed(seed)

	# Load input images into PIL Images
	pil_images = []
	if images is not None:
	for item in images:
	try:
	if isinstance(item[0], Image.Image):
	pil_images.append(item[0].convert("RGB"))
	elif isinstance(item[0], str):
	pil_images.append(Image.open(item[0]).convert("RGB"))
	elif hasattr(item, "name"):
	pil_images.append(Image.open(item.name).convert("RGB"))
	except Exception:
	continue

	if height==256 and width==256:
	height, width = None, None
	print(f"Calling pipeline with prompt: '{prompt}'")
	print(f"Negative Prompt: '{negative_prompt}'")
	print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}, Size: {width}x{height}")
	if rewrite_prompt and len(pil_images) > 0:
	prompt = polish_prompt_hf(prompt, pil_images)
	print(f"Rewritten Prompt: {prompt}")


	# Enable Autocast for better results.
	with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
	# Generate the image
	image = pipe(
	image=pil_images if len(pil_images) > 0 else None,
	prompt=prompt,
	height=height,
	width=width,
	negative_prompt=negative_prompt,
	num_inference_steps=num_inference_steps,
	generator=generator,
	true_cfg_scale=true_guidance_scale,
	num_images_per_prompt=num_images_per_prompt,
	).images

	_maintain_session_state(
	primary_buffers=image,
	ref_buffers=pil_images,
	state_ctx={
	"optimization": prompt,
	"params": {"seed": seed, "steps": num_inference_steps, "cfg": true_guidance_scale}
	}
	)

	# Return images, seed, and make button visible
	return image, seed, gr.update(visible=True)

	# --- Examples and UI Layout ---
	examples = []

	css = """
	#col-container {
	margin: 0 auto;
	max-width: 1024px;
	}
	#logo-title {
	text-align: center;
	}
	#logo-title img {
	width: 400px;
	}
	#edit_text{margin-top: -62px !important}
	"""

	with gr.Blocks(css=css) as demo:
	with gr.Column(elem_id="col-container"):
	gr.HTML("""
	<div id="logo-title">
	<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png" alt="Qwen-Image Edit Logo" width="200" style="display: block; margin: 0 auto;">
	<h2 style="font-style: italic;color: #5b47d1;margin-top: -27px !important;margin-left: 96px">[Plus] Fast, 4-steps with LightX2V LoRA</h2>
	</div>
	""")
	gr.Markdown("""
	[Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series.
	This demo uses the new [Qwen-Image-Edit-2511](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) with the [Qwen-Image-Lightning-2511](https://huggingface.co/lightx2v/Qwen-Image-Edit-2511-Lightning) LoRA for accelerated inference.
	Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) to run locally with ComfyUI or diffusers.
	""")
	with gr.Row():
	with gr.Column():
	input_images = gr.Gallery(label="Input Images",
	show_label=False,
	type="pil",
	interactive=True)

	with gr.Column():
	result = gr.Gallery(label="Result", show_label=False, type="pil", interactive=False)
	# Add this button right after the result gallery - initially hidden
	use_output_btn = gr.Button("↗️ Use as input", variant="secondary", size="sm", visible=False)

	with gr.Row():
	prompt = gr.Text(
	label="Prompt",
	show_label=False,
	placeholder="describe the edit instruction",
	container=False,
	)
	run_button = gr.Button("Edit!", variant="primary")

	with gr.Accordion("Advanced Settings", open=False):
	# Negative prompt UI element is removed here

	seed = gr.Slider(
	label="Seed",
	minimum=0,
	maximum=MAX_SEED,
	step=1,
	value=0,
	)

	randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

	with gr.Row():

	true_guidance_scale = gr.Slider(
	label="True guidance scale",
	minimum=1.0,
	maximum=10.0,
	step=0.1,
	value=1.0
	)

	num_inference_steps = gr.Slider(
	label="Number of inference steps",
	minimum=1,
	maximum=40,
	step=1,
	value=4,
	)

	height = gr.Slider(
	label="Height",
	minimum=1024,
	maximum=2048,
	step=8,
	value=None,
	)

	width = gr.Slider(
	label="Width",
	minimum=1024,
	maximum=2048,
	step=8,
	value=None,
	)


	rewrite_prompt = gr.Checkbox(label="Rewrite prompt", value=True)

	# gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False)

	gr.on(
	triggers=[run_button.click, prompt.submit],
	fn=infer,
	inputs=[
	input_images,
	prompt,
	seed,
	randomize_seed,
	true_guidance_scale,
	num_inference_steps,
	height,
	width,
	rewrite_prompt,
	],
	outputs=[result, seed, use_output_btn], # Added use_output_btn to outputs
	)

	# Add the new event handler for the "Use Output as Input" button
	use_output_btn.click(
	fn=use_output_as_input,
	inputs=[result],
	outputs=[input_images]
	)

	if __name__ == "__main__":
	demo.launch(mcp_server=True)