SEVAQWERTY committed on
Commit
29014f3
·
verified ·
1 Parent(s): 24ba517

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -474
app.py CHANGED
@@ -1,474 +1,104 @@
1
- import gradio as gr
2
- import numpy as np
3
- import random
4
- import torch
5
- import spaces
6
-
7
- from PIL import Image
8
- from diffusers import FlowMatchEulerDiscreteScheduler, QwenImageEditPlusPipeline
9
- # from optimization import optimize_pipeline_
10
- # from qwenimage.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
11
- # from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
12
- # from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
13
-
14
- from huggingface_hub import InferenceClient
15
- import math
16
-
17
- import os
18
- import base64
19
- from io import BytesIO
20
- import json
21
-
22
- SYSTEM_PROMPT = '''
23
- # Edit Instruction Rewriter
24
- You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.
25
-
26
- Please strictly follow the rewriting rules below:
27
-
28
- ## 1. General Principles
29
- - Keep the rewritten prompt **concise and comprehensive**. Avoid overly long sentences and unnecessary descriptive language.
30
- - If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
31
- - Keep the main part of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
32
- - All added objects or modifications must align with the logic and style of the scene in the input images.
33
- - If multiple sub-images are to be generated, describe the content of each sub-image individually.
34
-
35
- ## 2. Task-Type Handling Rules
36
-
37
- ### 1. Add, Delete, Replace Tasks
38
- - If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
39
- - If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
40
- > Original: "Add an animal"
41
- > Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
42
- - Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
43
- - For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.
44
-
45
- ### 2. Text Editing Tasks
46
- - All text content must be enclosed in English double quotes `" "`. Keep the original language of the text, and keep the capitalization.
47
- - Both adding new text and replacing existing text are text replacement tasks, For example:
48
- - Replace "xx" to "yy"
49
- - Replace the mask / bounding box to "yy"
50
- - Replace the visual object to "yy"
51
- - Specify text position, color, and layout only if user has required.
52
- - If font is specified, keep the original language of the font.
53
-
54
- ### 3. Human Editing Tasks
55
- - Make the smallest changes to the given user's prompt.
56
- - If changes to background, action, expression, camera shot, or ambient lighting are required, please list each modification individually.
57
- - **Edits to makeup or facial features / expression must be subtle, not exaggerated, and must preserve the subject's identity consistency.**
58
- > Original: "Add eyebrows to the face"
59
- > Rewritten: "Slightly thicken the person's eyebrows with little change, look natural."
60
-
61
- ### 4. Style Conversion or Enhancement Tasks
62
- - If a style is specified, describe it concisely using key visual features. For example:
63
- > Original: "Disco style"
64
- > Rewritten: "1970s disco style: flashing lights, disco ball, mirrored walls, vibrant colors"
65
- - For style reference, analyze the original image and extract key characteristics (color, composition, texture, lighting, artistic style, etc.), integrating them into the instruction.
66
- - **Colorization tasks (including old photo restoration) must use the fixed template:**
67
- "Restore and colorize the old photo."
68
- - Clearly specify the object to be modified. For example:
69
- > Original: Modify the subject in Picture 1 to match the style of Picture 2.
70
- > Rewritten: Change the girl in Picture 1 to the ink-wash style of Picture 2 — rendered in black-and-white watercolor with soft color transitions.
71
-
72
- ### 5. Material Replacement
73
- - Clearly specify the object and the material. For example: "Change the material of the apple to papercut style."
74
- - For text material replacement, use the fixed template:
75
- "Change the material of text "xxxx" to laser style"
76
-
77
- ### 6. Logo/Pattern Editing
78
- - Material replacement should preserve the original shape and structure as much as possible. For example:
79
- > Original: "Convert to sapphire material"
80
- > Rewritten: "Convert the main subject in the image to sapphire material, preserving similar shape and structure"
81
- - When migrating logos/patterns to new scenes, ensure shape and structure consistency. For example:
82
- > Original: "Migrate the logo in the image to a new scene"
83
- > Rewritten: "Migrate the logo in the image to a new scene, preserving similar shape and structure"
84
-
85
- ### 7. Multi-Image Tasks
86
- - Rewritten prompts must clearly point out which image's element is being modified. For example:
87
- > Original: "Replace the subject of picture 1 with the subject of picture 2"
88
- > Rewritten: "Replace the girl of picture 1 with the boy of picture 2, keeping picture 2's background unchanged"
89
- - For stylization tasks, describe the reference image's style in the rewritten prompt, while preserving the visual content of the source image.
90
-
91
- ## 3. Rationale and Logic Check
92
- - Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" requires logical correction.
93
- - Supplement missing critical information: e.g., if position is unspecified, choose a reasonable area based on composition (near subject, blank space, center/edge, etc.).
94
-
95
- # Output Format Example
96
- ```json
97
- {
98
- "Rewritten": "..."
99
- }
100
- '''
101
-
102
- def polish_prompt_hf(original_prompt, img_list):
103
- """
104
- Rewrites the prompt using a Hugging Face InferenceClient.
105
- Supports multiple images via img_list.
106
- """
107
- # Ensure HF_TOKEN is set
108
- api_key = os.environ.get("inference_providers")
109
- if not api_key:
110
- print("Warning: HF_TOKEN not set. Falling back to original prompt.")
111
- return original_prompt
112
- prompt = f"{SYSTEM_PROMPT}\n\nUser Input: {original_prompt}\n\nRewritten Prompt:"
113
- system_prompt = "you are a helpful assistant, you should provide useful answers to users."
114
- try:
115
- # Initialize the client
116
- client = InferenceClient(
117
- provider="nebius",
118
- api_key=api_key,
119
- )
120
-
121
- # Convert list of images to base64 data URLs
122
- image_urls = []
123
- if img_list is not None:
124
- # Ensure img_list is actually a list
125
- if not isinstance(img_list, list):
126
- img_list = [img_list]
127
-
128
- for img in img_list:
129
- image_url = None
130
- # If img is a PIL Image
131
- if hasattr(img, 'save'): # Check if it's a PIL Image
132
- buffered = BytesIO()
133
- img.save(buffered, format="PNG")
134
- img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
135
- image_url = f"data:image/png;base64,{img_base64}"
136
- # If img is already a file path (string)
137
- elif isinstance(img, str):
138
- with open(img, "rb") as image_file:
139
- img_base64 = base64.b64encode(image_file.read()).decode('utf-8')
140
- image_url = f"data:image/png;base64,{img_base64}"
141
- else:
142
- print(f"Warning: Unexpected image type: {type(img)}, skipping...")
143
- continue
144
-
145
- if image_url:
146
- image_urls.append(image_url)
147
-
148
- # Build the content array with text first, then all images
149
- content = [
150
- {
151
- "type": "text",
152
- "text": prompt
153
- }
154
- ]
155
-
156
- # Add all images to the content
157
- for image_url in image_urls:
158
- content.append({
159
- "type": "image_url",
160
- "image_url": {
161
- "url": image_url
162
- }
163
- })
164
-
165
- # Format the messages for the chat completions API
166
- messages = [
167
- {"role": "system", "content": system_prompt},
168
- {
169
- "role": "user",
170
- "content": content
171
- }
172
- ]
173
-
174
- # Call the API
175
- completion = client.chat.completions.create(
176
- model="Qwen/Qwen2.5-VL-72B-Instruct",
177
- messages=messages,
178
- )
179
-
180
- # Parse the response
181
- result = completion.choices[0].message.content
182
-
183
- # Try to extract JSON if present
184
- if '"Rewritten"' in result:
185
- try:
186
- # Clean up the response
187
- result = result.replace('```json', '').replace('```', '')
188
- result_json = json.loads(result)
189
- polished_prompt = result_json.get('Rewritten', result)
190
- except:
191
- polished_prompt = result
192
- else:
193
- polished_prompt = result
194
-
195
- polished_prompt = polished_prompt.strip().replace("\n", " ")
196
- return polished_prompt
197
-
198
- except Exception as e:
199
- print(f"Error during API call to Hugging Face: {e}")
200
- # Fallback to original prompt if enhancement fails
201
- return original_prompt
202
-
203
-
204
-
205
- def encode_image(pil_image):
206
- import io
207
- buffered = io.BytesIO()
208
- pil_image.save(buffered, format="PNG")
209
- return base64.b64encode(buffered.getvalue()).decode("utf-8")
210
-
211
- # --- Model Loading ---
212
- dtype = torch.bfloat16
213
- device = "cuda" if torch.cuda.is_available() else "cpu"
214
-
215
- # Scheduler configuration for Lightning
216
- scheduler_config = {
217
- "base_image_seq_len": 256,
218
- "base_shift": math.log(3),
219
- "invert_sigmas": False,
220
- "max_image_seq_len": 8192,
221
- "max_shift": math.log(3),
222
- "num_train_timesteps": 1000,
223
- "shift": 1.0,
224
- "shift_terminal": None,
225
- "stochastic_sampling": False,
226
- "time_shift_type": "exponential",
227
- "use_beta_sigmas": False,
228
- "use_dynamic_shifting": True,
229
- "use_exponential_sigmas": False,
230
- "use_karras_sigmas": False,
231
- }
232
-
233
- # Initialize scheduler with Lightning config
234
- scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
235
-
236
- # Load the model pipeline
237
- pipe = QwenImageEditPlusPipeline.from_pretrained("Qwen/Qwen-Image-Edit-2511",
238
- scheduler=scheduler,
239
- torch_dtype=dtype).to(device)
240
- pipe.load_lora_weights(
241
- "lightx2v/Qwen-Image-Edit-2511-Lightning",
242
- weight_name="Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors"
243
- )
244
- pipe.fuse_lora()
245
-
246
- # # Apply the same optimizations from the first version
247
- # pipe.transformer.__class__ = QwenImageTransformer2DModel
248
- # pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
249
-
250
- # # --- Ahead-of-time compilation ---
251
- # optimize_pipeline_(pipe, image=[Image.new("RGB", (1024, 1024)), Image.new("RGB", (1024, 1024))], prompt="prompt")
252
-
253
- # --- UI Constants and Helpers ---
254
- MAX_SEED = np.iinfo(np.int32).max
255
-
256
- def use_output_as_input(output_images):
257
- """Convert output images to input format for the gallery"""
258
- if output_images is None or len(output_images) == 0:
259
- return []
260
- return output_images
261
-
262
- # --- Main Inference Function (with hardcoded negative prompt) ---
263
- @spaces.GPU()
264
- def infer(
265
- images,
266
- prompt,
267
- seed=42,
268
- randomize_seed=False,
269
- true_guidance_scale=1.0,
270
- num_inference_steps=4,
271
- height=None,
272
- width=None,
273
- rewrite_prompt=True,
274
- num_images_per_prompt=1,
275
- progress=gr.Progress(track_tqdm=True),
276
- ):
277
- """
278
- Run image-editing inference using the Qwen-Image-Edit pipeline.
279
-
280
- Parameters:
281
- images (list): Input images from the Gradio gallery (PIL or path-based).
282
- prompt (str): Editing instruction (may be rewritten by LLM if enabled).
283
- seed (int): Random seed for reproducibility.
284
- randomize_seed (bool): If True, overrides seed with a random value.
285
- true_guidance_scale (float): CFG scale used by Qwen-Image.
286
- num_inference_steps (int): Number of diffusion steps.
287
- height (int | None): Optional output height override.
288
- width (int | None): Optional output width override.
289
- rewrite_prompt (bool): Whether to rewrite the prompt using Qwen-2.5-VL.
290
- num_images_per_prompt (int): Number of images to generate.
291
- progress: Gradio progress callback.
292
-
293
- Returns:
294
- tuple: (generated_images, seed_used, UI_visibility_update)
295
- """
296
-
297
- # Hardcode the negative prompt as requested
298
- negative_prompt = " "
299
-
300
- if randomize_seed:
301
- seed = random.randint(0, MAX_SEED)
302
-
303
- # Set up the generator for reproducibility
304
- generator = torch.Generator(device=device).manual_seed(seed)
305
-
306
- # Load input images into PIL Images
307
- pil_images = []
308
- if images is not None:
309
- for item in images:
310
- try:
311
- if isinstance(item[0], Image.Image):
312
- pil_images.append(item[0].convert("RGB"))
313
- elif isinstance(item[0], str):
314
- pil_images.append(Image.open(item[0]).convert("RGB"))
315
- elif hasattr(item, "name"):
316
- pil_images.append(Image.open(item.name).convert("RGB"))
317
- except Exception:
318
- continue
319
-
320
- if height==256 and width==256:
321
- height, width = None, None
322
- print(f"Calling pipeline with prompt: '{prompt}'")
323
- print(f"Negative Prompt: '{negative_prompt}'")
324
- print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}, Size: {width}x{height}")
325
- if rewrite_prompt and len(pil_images) > 0:
326
- prompt = polish_prompt_hf(prompt, pil_images)
327
- print(f"Rewritten Prompt: {prompt}")
328
-
329
-
330
- # Generate the image
331
- image = pipe(
332
- image=pil_images if len(pil_images) > 0 else None,
333
- prompt=prompt,
334
- height=height,
335
- width=width,
336
- negative_prompt=negative_prompt,
337
- num_inference_steps=num_inference_steps,
338
- generator=generator,
339
- true_cfg_scale=true_guidance_scale,
340
- num_images_per_prompt=num_images_per_prompt,
341
- ).images
342
-
343
- # Return images, seed, and make button visible
344
- return image, seed, gr.update(visible=True)
345
-
346
- # --- Examples and UI Layout ---
347
- examples = []
348
-
349
- css = """
350
- #col-container {
351
- margin: 0 auto;
352
- max-width: 1024px;
353
- }
354
- #logo-title {
355
- text-align: center;
356
- }
357
- #logo-title img {
358
- width: 400px;
359
- }
360
- #edit_text{margin-top: -62px !important}
361
- """
362
-
363
- with gr.Blocks(css=css) as demo:
364
- with gr.Column(elem_id="col-container"):
365
- gr.HTML("""
366
- <div id="logo-title">
367
- <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png" alt="Qwen-Image Edit Logo" width="400" style="display: block; margin: 0 auto;">
368
- <h2 style="font-style: italic;color: #5b47d1;margin-top: -27px !important;margin-left: 96px">[Plus] Fast, 4-steps with LightX2V LoRA</h2>
369
- </div>
370
- """)
371
- gr.Markdown("""
372
- [Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series.
373
- This demo uses the new [Qwen-Image-Edit-2511](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) with the [Qwen-Image-Lightning-2511](https://huggingface.co/lightx2v/Qwen-Image-Edit-2511-Lightning) LoRA for accelerated inference.
374
- Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) to run locally with ComfyUI or diffusers.
375
- """)
376
- with gr.Row():
377
- with gr.Column():
378
- input_images = gr.Gallery(label="Input Images",
379
- show_label=False,
380
- type="pil",
381
- interactive=True)
382
-
383
- with gr.Column():
384
- result = gr.Gallery(label="Result", show_label=False, type="pil", interactive=False)
385
- # Add this button right after the result gallery - initially hidden
386
- use_output_btn = gr.Button("↗️ Use as input", variant="secondary", size="sm", visible=False)
387
-
388
- with gr.Row():
389
- prompt = gr.Text(
390
- label="Prompt",
391
- show_label=False,
392
- placeholder="describe the edit instruction",
393
- container=False,
394
- )
395
- run_button = gr.Button("Edit!", variant="primary")
396
-
397
- with gr.Accordion("Advanced Settings", open=False):
398
- # Negative prompt UI element is removed here
399
-
400
- seed = gr.Slider(
401
- label="Seed",
402
- minimum=0,
403
- maximum=MAX_SEED,
404
- step=1,
405
- value=0,
406
- )
407
-
408
- randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
409
-
410
- with gr.Row():
411
-
412
- true_guidance_scale = gr.Slider(
413
- label="True guidance scale",
414
- minimum=1.0,
415
- maximum=10.0,
416
- step=0.1,
417
- value=1.0
418
- )
419
-
420
- num_inference_steps = gr.Slider(
421
- label="Number of inference steps",
422
- minimum=1,
423
- maximum=40,
424
- step=1,
425
- value=4,
426
- )
427
-
428
- height = gr.Slider(
429
- label="Height",
430
- minimum=256,
431
- maximum=2048,
432
- step=8,
433
- value=None,
434
- )
435
-
436
- width = gr.Slider(
437
- label="Width",
438
- minimum=256,
439
- maximum=2048,
440
- step=8,
441
- value=None,
442
- )
443
-
444
-
445
- rewrite_prompt = gr.Checkbox(label="Rewrite prompt", value=True)
446
-
447
- # gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False)
448
-
449
- gr.on(
450
- triggers=[run_button.click, prompt.submit],
451
- fn=infer,
452
- inputs=[
453
- input_images,
454
- prompt,
455
- seed,
456
- randomize_seed,
457
- true_guidance_scale,
458
- num_inference_steps,
459
- height,
460
- width,
461
- rewrite_prompt,
462
- ],
463
- outputs=[result, seed, use_output_btn], # Added use_output_btn to outputs
464
- )
465
-
466
- # Add the new event handler for the "Use Output as Input" button
467
- use_output_btn.click(
468
- fn=use_output_as_input,
469
- inputs=[result],
470
- outputs=[input_images]
471
- )
472
-
473
- if __name__ == "__main__":
474
- demo.launch(mcp_server=True)
 
# --- Runtime setup: imports, device selection, and model load ---
import random

import gradio as gr
import numpy as np
import spaces
import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline

# This Space targets the free CPU tier: run everything on the CPU in full
# float32 precision (half precision is unsupported/slow on CPU).
device = "cpu"
dtype = torch.float32

print("Loading InstructPix2Pix pipeline...")

# NOTE(review): safety_checker=None disables the NSFW filter — confirm this
# is intentional for a public demo.
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix",
    torch_dtype=dtype,
    safety_checker=None,
).to(device)

# Trade a little speed for a much smaller attention memory footprint on CPU.
pipe.enable_attention_slicing()

print("Model loaded successfully.")

# Upper bound for the seed slider and for randomly drawn seeds.
MAX_SEED = np.iinfo(np.int32).max
# ==============================
# Inference Function
# ==============================
@spaces.GPU()  # No-op on CPU hardware; enables ZeroGPU when available.
def infer(
    image,
    prompt,
    seed=0,
    randomize_seed=True,
    guidance_scale=7.5,
    num_inference_steps=20,
):
    """Edit *image* according to *prompt* using InstructPix2Pix.

    Parameters:
        image (PIL.Image.Image | None): Source image; None yields no result.
        prompt (str): Natural-language edit instruction.
        seed (int): RNG seed (ignored when ``randomize_seed`` is True).
        randomize_seed (bool): If True, draw a fresh seed in [0, MAX_SEED].
        guidance_scale (float): Classifier-free guidance strength.
        num_inference_steps (int): Requested diffusion steps (capped at 30).

    Returns:
        tuple: (edited PIL image or None, seed actually used)
    """
    if image is None:
        return None, seed

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # Gradio may deliver slider values as floats (especially via the API);
    # torch.Generator.manual_seed requires a real int.
    seed = int(seed)
    generator = torch.Generator(device=device).manual_seed(seed)

    # Fixed 512x512 keeps CPU inference time bounded. Aspect ratio is not
    # preserved — acceptable for this demo.
    image = image.convert("RGB").resize((512, 512))

    result = pipe(
        prompt=prompt,
        image=image,
        guidance_scale=guidance_scale,
        # Hard cap so a maxed-out slider cannot stall the CPU queue.
        num_inference_steps=min(num_inference_steps, 30),
        generator=generator,
    ).images[0]

    return result, seed
# ==============================
# UI
# ==============================
with gr.Blocks() as demo:
    # Header
    gr.Markdown("# 🖼️ Image Edit (CPU Version)")
    gr.Markdown("Stable Diffusion InstructPix2Pix works on 16GB CPU Basic")

    # Side-by-side source / result images.
    with gr.Row():
        input_image = gr.Image(type="pil", label="Input Image")
        output_image = gr.Image(type="pil", label="Edited Image")

    # Free-text edit instruction.
    prompt = gr.Textbox(label="Edit Instruction", placeholder="e.g. make the sky pink")

    # Seed controls.
    with gr.Row():
        seed = gr.Slider(minimum=0, maximum=MAX_SEED, value=0, step=1, label="Seed")
        randomize_seed = gr.Checkbox(value=True, label="Randomize Seed")

    # Sampler controls.
    with gr.Row():
        guidance_scale = gr.Slider(
            minimum=1.0, maximum=15.0, value=7.5, step=0.5, label="Guidance Scale"
        )
        num_inference_steps = gr.Slider(
            minimum=1, maximum=40, value=20, step=1, label="Steps"
        )

    run_button = gr.Button("Edit Image")

    # Run inference on click; the seed actually used is echoed back into
    # the seed slider so results can be reproduced.
    run_button.click(
        fn=infer,
        inputs=[
            input_image,
            prompt,
            seed,
            randomize_seed,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[output_image, seed],
    )


if __name__ == "__main__":
    demo.launch()