WeVi committed on
Commit
af56fc4
·
verified ·
1 Parent(s): 3c1ca7d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +374 -0
app.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import random
4
+ import torch
5
+ import spaces
6
+ import os
7
+ import json
8
+
9
+ from PIL import Image
10
+ from diffusers import QwenImageEditPipeline, FlowMatchEulerDiscreteScheduler
11
+
12
+ from huggingface_hub import InferenceClient
13
+ import math
14
+
15
+ # Assuming optimization.py and qwenimage/ are in the same directory
16
+ from optimization import optimize_pipeline_
17
+ from qwenimage.pipeline_qwen_image_edit import QwenImageEditPipeline as QwenImageEditPipelineCustom
18
+ from qwenimage.transformer_qwen_image import QwenImageTransformer2DModel
19
+ from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
20
+
21
+ # --- prompt enhancement using hugging face inferenceclient ---
22
def polish_prompt_hf(original_prompt, system_prompt):
    """
    Ask a hosted chat model (via Hugging Face InferenceClient) to rewrite a prompt.

    `system_prompt` is sent as the system message and `original_prompt` as the
    user message. The reply is flattened to a single line before it is returned.

    Returns `original_prompt` unchanged when the HF_TOKEN environment variable
    is missing/empty or when anything in the API round-trip raises.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        print("Warning: HF_TOKEN not set. Falling back to original prompt.")
        return original_prompt

    try:
        client = InferenceClient(provider="cerebras", api_key=token)
        response = client.chat.completions.create(
            model="qwen/qwen3-235b-a22b-instruct-2507",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": original_prompt},
            ],
        )
        reply = response.choices[0].message.content

        rewritten = reply
        if '{"rewritten"' in reply:
            # The model answered with a JSON object (possibly inside a
            # Markdown code fence): drop the fences and pull out the field.
            unfenced = reply.replace('```json', '').replace('```', '')
            try:
                rewritten = json.loads(unfenced).get('rewritten', unfenced)
            except Exception:
                # Not valid JSON after all; use the fence-stripped text as-is.
                rewritten = unfenced

        return rewritten.strip().replace("\n", " ")

    except Exception as err:
        # Best effort: any failure here must not block image generation.
        print(f"Error during API call to Hugging Face: {err}")
        return original_prompt
65
+
66
+
67
def polish_prompt(prompt, img):
    """
    Rewrite a user's edit instruction into a more precise one via HF inference.

    Args:
        prompt: The raw edit instruction typed by the user.
        img: The image being edited. Accepted for interface compatibility but
            not used: only text is sent to the rewriting model.

    Returns:
        The rewritten instruction, or `prompt` itself when the rewrite did not
        happen (e.g. no HF_TOKEN or the API call failed).
    """
    system_prompt = '''
# EDIT INSTRUCTION REWRITER
You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.

Please strictly follow the rewriting rules below:

## 1. GENERAL PRINCIPLES
- Keep the rewritten prompt **concise**. Avoid overly long sentences and reduce unnecessary descriptive language.
- If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
- Keep the core intention of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
- All added objects or modifications must align with the logic and style of the edited input image's overall scene.

## 2. TASK TYPE HANDLING RULES
### 1. ADD, DELETE, REPLACE TASKS
- If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
- If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
> Original: "add an animal"
> Rewritten: "add a light-gray cat in the bottom-right corner, sitting and facing the camera"
- Remove meaningless instructions: e.g., "add 0 objects" should be ignored or flagged as invalid.
- For replacement tasks, specify "replace Y with X" and briefly describe the key visual features of X.

### 2. TEXT EDITING TASKS
- All text content must be enclosed in English double quotes " ". Do not translate or alter the original language of the text, and do not change the capitalization.
- **For text replacement tasks, always use the fixed template:**
- Replace "XX" to "YY".
- Replace the XX bounding box to "YY".
- If the user does not specify text content, infer and add concise text based on the instruction and the input image's context. For example:
> Original: "add a line of text" (poster)
> Rewritten: "add text "Limited Edition" at the top center with slight shadow"
- Specify text position, color, and layout in a concise way.

### 3. HUMAN EDITING TASKS
- Maintain the person's core visual consistency (ethnicity, gender, age, hairstyle, expression, outfit, etc.).
- If modifying appearance (e.g., clothes, hairstyle), ensure the new element is consistent with the original style.
- **For expression changes, they must be natural and subtle, never exaggerated.** - If deletion is not specifically emphasized, the most important subject in the original image (e.g., a person, an animal) should be preserved.
- For background change tasks, emphasize maintaining subject consistency at first.
- Example:
> Original: "change the person's hat"
> Rewritten: "replace the man's hat with a dark brown beret; keep smile, short hair, and gray jacket unchanged"

### 4. STYLE TRANSFORMATION OR ENHANCEMENT TASKS
- If a style is specified, describe it concisely with key visual traits. For example:
> Original: "disco style"
> Rewritten: "1970s Disco: flashing lights, disco ball, mirrored walls, colorful tones"
- If the instruction says "use reference style" or "keep current style," analyze the input image, extract main features (color, composition, texture, lighting, art style), and integrate them concisely.
- **For coloring tasks, including restoring old photos, always use the fixed template:** "restore old photograph, remove scratches, reduce noise, enhance details, high resolution, realistic, natural skin tones, clear facial features, no distortion, vintage photo restoration"
- If there are other changes, place the style description at the end.

## 3. RATIONALITY AND LOGIC CHECKS
- Resolve contradictory instructions: e.g., "remove all trees but keep all trees" should be logically corrected.
- Add missing key information: if position is unspecified, choose a reasonable area based on composition (near subject, empty space, center/edges).

# OUTPUT FORMAT
Return only the rewritten instruction text directly, without JSON formatting or any other wrapper.
'''

    # The system prompt is delivered through the system message by
    # polish_prompt_hf, so the user message carries only the instruction
    # (previously the whole system prompt was embedded here a second time).
    user_message = f"User input: {prompt}\n\nRewritten prompt:"

    polished = polish_prompt_hf(user_message, system_prompt)

    # polish_prompt_hf hands back its first argument untouched when it cannot
    # rewrite (missing token, API error). In that case the wrapper text must
    # not leak into the image pipeline: fall back to the user's own words.
    if polished == user_message:
        return prompt
    return polished
130
+
131
+
132
# --- model loading ---
# Everything below runs once, at import time: it picks a device, builds the
# scheduler, loads the edit pipeline, optionally fuses the Lightning LoRA,
# swaps in the project-local transformer/attention classes and finally runs
# the ahead-of-time optimization step.
dtype = torch.bfloat16
# Use the GPU when PyTorch can see one; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Scheduler configuration for lightning
# base_shift and max_shift are both log(3); the remaining keys are passed
# through to FlowMatchEulerDiscreteScheduler.from_config unchanged.
scheduler_config = {
    "base_image_seq_len": 256,
    "base_shift": math.log(3),
    "invert_sigmas": False,
    "max_image_seq_len": 8192,
    "max_shift": math.log(3),
    "num_train_timesteps": 1000,
    "shift": 1.0,
    "shift_terminal": None,
    "stochastic_sampling": False,
    "time_shift_type": "exponential",
    "use_beta_sigmas": False,
    "use_dynamic_shifting": True,
    "use_exponential_sigmas": False,
    "use_karras_sigmas": False,
}

# Initialize scheduler with lightning config
scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)

# Load the edit pipeline with lightning scheduler
# QwenImageEditPipelineCustom is the project-local pipeline class imported
# from qwenimage/, not the diffusers class of the same base name.
# NOTE(review): the repo id is lower-case here while the Markdown links in the
# UI use "Qwen/Qwen-Image-Edit" — confirm the Hub resolves this spelling.
pipe = QwenImageEditPipelineCustom.from_pretrained(
    "qwen/qwen-image-edit",
    scheduler=scheduler,
    torch_dtype=dtype
).to(device)

# Load lightning LoRA weights for acceleration
# Best effort: on any failure the app keeps running with the un-accelerated
# base weights instead of aborting start-up.
try:
    pipe.load_lora_weights(
        "lightx2v/qwen-image-lightning",
        weight_name="qwen-image-lightning-8steps-v1.1.safetensors"
    )
    pipe.fuse_lora()
    print("Successfully loaded lightning LoRA weights")
except Exception as e:
    print(f"Warning: Could not load lightning LoRA weights: {e}")
    print("Continuing with base model...")

# Apply the same optimizations from the first version
# Rebind the class of the already-loaded transformer instance to the
# project-local QwenImageTransformer2DModel, then install the FA3 attention
# processor on it. Order matters: the processor is set on the rebound object.
pipe.transformer.__class__ = QwenImageTransformer2DModel
pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())

# --- Ahead-of-time compilation ---
# It's important that the dummy image for optimization has the expected dimensions (e.g., 1024x1024)
# optimize_pipeline_ comes from the local optimization.py; presumably it
# mutates `pipe` in place (trailing underscore) — TODO confirm.
optimize_pipeline_(pipe, image=Image.new("RGB", (1024, 1024)), prompt="prompt")

# --- UI constants and helpers ---
# Largest signed 32-bit integer; upper bound for the seed slider and for
# randomly drawn seeds.
max_seed = np.iinfo(np.int32).max
186
+
187
+ # --- Main inference function ---
188
# Preset labels offered by the "Output Image Size" dropdown, mapped to the
# (width, height) they request.
_OUTPUT_SIZE_PRESETS = {
    "Small (512x512)": (512, 512),
    "Medium (768x768)": (768, 768),
    "Large (1536x1536)": (1536, 1536),
}


def _resolve_output_size(output_size):
    """Translate an output-size label into ``(width, height)``, or None to keep the size.

    "Original (1024x1024)" means "do not resize" and yields None. Known preset
    labels are looked up directly; any other label is parsed as
    ``"<name> (WxH)"`` and raises if it does not have that shape.
    """
    if output_size == "Original (1024x1024)":
        return None
    preset = _OUTPUT_SIZE_PRESETS.get(output_size)
    if preset is not None:
        return preset
    # Custom size, parse it from "Custom (WxH)"
    width, height = map(int, output_size.split('(')[1][:-1].split('x'))
    return (width, height)


# The original line was the bare expression `spaces.gpu(duration=60)`: wrong
# attribute case and no `@`, so `infer` was never wrapped for GPU allocation.
@spaces.GPU(duration=60)
def infer(
    image,
    prompt,
    seed=42,
    randomize_seed=False,
    true_guidance_scale=1.0,
    num_inference_steps=8,  # Default to 8 steps for fast inference
    rewrite_prompt=True,
    output_size="Original (1024x1024)",
    progress=gr.Progress(track_tqdm=True),
):
    """
    Generates an edited image using the Qwen-Image-Edit pipeline with lightning acceleration,
    and optionally resizes the output.

    Args:
        image: Input image to edit (the UI supplies a PIL image).
        prompt: Edit instruction text.
        seed: Seed for the torch generator; ignored when `randomize_seed` is set.
        randomize_seed: Draw a fresh seed in [0, max_seed] instead of using `seed`.
        true_guidance_scale: Forwarded to the pipeline as `true_cfg_scale`.
        num_inference_steps: Number of denoising steps forwarded to the pipeline.
        rewrite_prompt: When true, the prompt is first passed through `polish_prompt`.
        output_size: Size label; anything other than "Original (1024x1024)"
            triggers a post-hoc resize of the generated image.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        A tuple ``(output_image, seed)`` with the edited image and the seed
        that was actually used.

    Raises:
        gr.Error: If no input image was provided.
        Exception: Any error raised by the pipeline is logged and re-raised.
    """
    if image is None:
        # Fail early with a message the UI can show, rather than letting the
        # pipeline crash on a missing image.
        raise gr.Error("Please provide an input image.")

    negative_prompt = " "

    if randomize_seed:
        seed = random.randint(0, max_seed)

    generator = torch.Generator(device=device).manual_seed(seed)

    print(f"Original prompt: '{prompt}'")
    print(f"Negative prompt: '{negative_prompt}'")
    print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}")

    if rewrite_prompt:
        prompt = polish_prompt(prompt, image)
        print(f"Rewritten prompt: {prompt}")

    try:
        images = pipe(
            image,
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            generator=generator,
            true_cfg_scale=true_guidance_scale,
            num_images_per_prompt=1
        ).images
        output_image = images[0]
    except Exception as e:
        print(f"Error during inference: {e}")
        raise  # bare raise keeps the original traceback intact

    # Post-processing: Resize if a different output size is selected.
    # Deliberately best effort: a bad size label or a failed resize must not
    # discard an image that was generated successfully.
    try:
        target_size = _resolve_output_size(output_size)
        if target_size is not None:
            # Use LANCZOS for high quality down/upscaling
            output_image = output_image.resize(target_size, Image.LANCZOS)
            print(f"Resized output image to: {target_size[0]}x{target_size[1]}")
    except Exception as resize_e:
        print(f"Warning: Could not resize image to {output_size}: {resize_e}")
        print("Returning original size image.")

    return output_image, seed
256
+
257
# --- Examples and UI layout ---
# Each example row is [input image, edit instruction], matching the
# `inputs=[input_image, prompt]` order given to gr.Examples in the UI section.
examples = [
    # Example for demonstration, replace with actual image paths
    # Ensure these paths are valid if running locally, or adjust for Hugging Face Spaces
    # The placeholders are solid-colour 1024x1024 images built in memory.
    [Image.new("RGB", (1024, 1024), color = 'red'), "Change the color to blue"],
    [Image.new("RGB", (1024, 1024), color = 'green'), "Add a fluffy white cat sitting in the center"],
]

# Page-level CSS handed to gr.Blocks: centres the main column (max 1024px)
# and styles the logo/title header.
# NOTE(review): the `#edit_text` rule has no matching elem_id in the visible
# UI code — confirm whether it is still needed.
css = """
#col-container {
margin: 0 auto;
max-width: 1024px;
}
#logo-title {
text-align: center;
}
#logo-title img {
width: 400px;
}
#edit_text{margin-top: -62px !important}
"""
278
+
279
# Build the Gradio UI: header, input/result images, instruction box, advanced
# settings, examples, and the event wiring that calls `infer`.
# NOTE(review): the nesting of the `with` blocks was reconstructed from a
# source whose indentation was lost — verify the layout against the running app.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Logo and tagline.
        gr.HTML("""
<div id="logo-title">
<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/qwen-image/qwen_image_edit_logo.png" alt="Qwen-Image Edit Logo" width="400" style="display: block; margin: 0 auto;">
<h2 style="font-style: italic;color: #5b47d1;margin-top: -27px !important;margin-left: 96px">Fast, 8-steps with Lightning LoRA</h2>
</div>
""")
        gr.Markdown("""
[Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series.
This demo uses the [Qwen-Image-Lightning](https://huggingface.co/lightx2v/qwen-image-lightning) LoRA for accelerated inference.
Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image-Edit) to run locally with ComfyUI or Diffusers.
""")

        # Input image and result; both components exchange PIL images.
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(
                    label="Input Image",
                    show_label=True,
                    type="pil"
                )
            result = gr.Image(
                label="Result",
                show_label=True,
                type="pil"
            )

        # Instruction text box and the button that triggers an edit.
        with gr.Row():
            prompt = gr.Text(
                label="Edit Instruction",
                show_label=False,
                placeholder="Describe the edit instruction (e.g., 'replace the background with a sunset', 'add a red hat', 'remove the person')",
                container=False,
            )
            run_button = gr.Button("Edit!", variant="primary")

        with gr.Accordion("Advanced Settings", open=False):
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=max_seed,
                step=1,
                value=0,
            )

            # Checked by default, so the slider value is normally overridden
            # by a random seed inside `infer`.
            randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)

            with gr.Row():
                true_guidance_scale = gr.Slider(
                    label="True Guidance Scale",
                    minimum=1.0,
                    maximum=10.0,
                    step=0.1,
                    value=1.0
                )

                num_inference_steps = gr.Slider(
                    label="Number of Inference Steps",
                    minimum=4,
                    maximum=28,
                    step=1,
                    value=8
                )

                rewrite_prompt = gr.Checkbox(
                    label="Enhance Prompt (using HF Inference)",
                    value=True
                )

            # New dropdown for output image size
            # These labels are compared verbatim inside `infer`, so the two
            # places must be kept in sync.
            output_size = gr.Dropdown(
                label="Output Image Size",
                choices=["Original (1024x1024)", "Small (512x512)", "Medium (768x768)", "Large (1536x1536)"],
                value="Original (1024x1024)"
            )

        # Examples only supply image and prompt; `infer` then uses its own
        # defaults for the remaining parameters. Results are not cached.
        gr.Examples(examples=examples, inputs=[input_image, prompt], outputs=[result, seed], fn=infer, cache_examples=False)

    # Run `infer` on button click or when Enter is pressed in the text box.
    # The input order must match the positional parameters of `infer`.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            input_image,
            prompt,
            seed,
            randomize_seed,
            true_guidance_scale,
            num_inference_steps,
            rewrite_prompt,
            output_size,
        ],
        # The seed actually used is written back to the slider.
        outputs=[result, seed],
    )
372
+
373
# Launch the Gradio app only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    demo.launch()