LPX55 committed on
Commit
6bb05d6
·
verified ·
1 Parent(s): aba4931

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -264
app.py CHANGED
@@ -3,318 +3,205 @@ import numpy as np
3
  import random
4
  import torch
5
  import spaces
6
-
7
  from PIL import Image
8
  from diffusers import QwenImageEditPipeline
9
-
10
  import os
11
  import base64
12
  import json
13
  from huggingface_hub import InferenceClient
14
 
15
- SYSTEM_PROMPT = '''
16
- # Edit Instruction Rewriter
17
- You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.
18
-
19
- Please strictly follow the rewriting rules below:
20
-
21
- ## 1. General Principles
22
- - Keep the rewritten prompt **concise**. Avoid overly long sentences and reduce unnecessary descriptive language.
23
- - If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
24
- - Keep the core intention of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
25
- - All added objects or modifications must align with the logic and style of the edited input image’s overall scene.
26
-
27
- ## 2. Task Type Handling Rules
28
- ### 1. Add, Delete, Replace Tasks
29
- - If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
30
- - If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
31
- > Original: "Add an animal"
32
- > Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
33
- - Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
34
- - For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.
35
-
36
- ### 2. Text Editing Tasks
37
- - All text content must be enclosed in English double quotes `" "`. Do not translate or alter the original language of the text, and do not change the capitalization.
38
- - **For text replacement tasks, always use the fixed template:**
39
- - `Replace "xx" to "yy"`.
40
- - `Replace the xx bounding box to "yy"`.
41
- - If the user does not specify text content, infer and add concise text based on the instruction and the input image’s context. For example:
42
- > Original: "Add a line of text" (poster)
43
- > Rewritten: "Add text \"LIMITED EDITION\" at the top center with slight shadow"
44
- - Specify text position, color, and layout in a concise way.
45
-
46
- ### 3. Human Editing Tasks
47
- - Maintain the person’s core visual consistency (ethnicity, gender, age, hairstyle, expression, outfit, etc.).
48
- - If modifying appearance (e.g., clothes, hairstyle), ensure the new element is consistent with the original style.
49
- - **For expression changes, they must be natural and subtle, never exaggerated.**
50
- - If deletion is not specifically emphasized, the most important subject in the original image (e.g., a person, an animal) should be preserved.
51
- - For background change tasks, emphasize maintaining subject consistency at first.
52
- - Example:
53
- > Original: "Change the person’s hat"
54
- > Rewritten: "Replace the man’s hat with a dark brown beret; keep smile, short hair, and gray jacket unchanged"
55
-
56
- ### 4. Style Transformation or Enhancement Tasks
57
- - If a style is specified, describe it concisely with key visual traits. For example:
58
- > Original: "Disco style"
59
- > Rewritten: "1970s disco: flashing lights, disco ball, mirrored walls, colorful tones"
60
- - If the instruction says "use reference style" or "keep current style," analyze the input image, extract main features (color, composition, texture, lighting, art style), and integrate them concisely.
61
- - **For coloring tasks, including restoring old photos, always use the fixed template:** "Restore old photograph, remove scratches, reduce noise, enhance details, high resolution, realistic, natural skin tones, clear facial features, no distortion, vintage photo restoration"
62
- - If there are other changes, place the style description at the end.
63
-
64
- ## 3. Rationality and Logic Checks
65
- - Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" should be logically corrected.
66
- - Add missing key information: if position is unspecified, choose a reasonable area based on composition (near subject, empty space, center/edges).
67
-
68
- # Output Format Example
69
- ```json
70
- {
71
- "Rewritten": "..."
72
- }
73
- '''
74
-
75
- # def polish_prompt(original_prompt, system_prompt):
76
- # """
77
- # Rewrites the prompt using a Hugging Face InferenceClient.
78
- # """
79
- # # Ensure HF_TOKEN is set
80
- # api_key = os.environ.get("HF_TOKEN")
81
- # if not api_key:
82
- # raise EnvironmentError("HF_TOKEN is not set. Please set it in your environment.")
83
-
84
- # # Initialize the client
85
- # client = InferenceClient(
86
- # provider="cerebras",
87
- # api_key=api_key,
88
- # )
89
-
90
- # # Format the messages for the chat completions API
91
- # messages = [
92
- # {"role": "system", "content": system_prompt},
93
- # {"role": "user", "content": original_prompt}
94
- # ]
95
-
96
- # try:
97
- # # Call the API
98
- # completion = client.chat.completions.create(
99
- # model="Qwen/Qwen3-235B-A22B-Instruct-2507",
100
- # messages=messages,
101
- # )
102
- # polished_prompt = completion.choices[0].message.content
103
- # polished_prompt = polished_prompt.strip().replace("\n", " ")
104
- # return polished_prompt
105
- # except Exception as e:
106
- # print(f"Error during API call to Hugging Face: {e}")
107
- # # Fallback to original prompt if enhancement fails
108
- # return original_prompt
109
-
110
-
111
- def polish_prompt(prompt, system_prompt):
112
  """
113
  Rewrites the prompt using a Hugging Face InferenceClient.
 
114
  """
115
- # Ensure HF_TOKEN is set
116
- api_key = os.environ.get("HF_TOKEN")
117
- if not api_key:
118
- raise EnvironmentError("HF_TOKEN is not set. Please set it in your environment.")
119
- # Initialize the client
120
  client = InferenceClient(
121
  provider="cerebras",
122
- api_key=api_key,
123
  )
124
-
125
- prompt = f"{system_prompt}\n\nUser Input: {prompt}\n\nRewritten Prompt:"
126
- success=False
127
- while not success:
128
- try:
129
- completion = client.chat.completions.create(
130
- model="Qwen/Qwen3-235B-A22B-Instruct-2507",
131
- messages=messages,
132
- )
133
- # print(f"Result: {result}")
134
- # print(f"Polished Prompt: {polished_prompt}")
135
- if isinstance(result, str):
136
- result = result.replace('```json','')
137
- result = result.replace('```','')
138
- result = json.loads(result)
139
- else:
140
- result = json.loads(result)
141
-
142
- polished_prompt = result['Rewritten']
143
- polished_prompt = polished_prompt.strip()
144
- polished_prompt = polished_prompt.replace("\n", " ")
145
- success = True
146
- except Exception as e:
147
- print(f"Error during API call to Hugging Face: {e}")
148
- # Fallback to original prompt if enhancement fails
149
- return prompt
150
-
151
-
152
- def encode_image(pil_image):
153
- import io
154
- buffered = io.BytesIO()
155
- pil_image.save(buffered, format="PNG")
156
- return base64.b64encode(buffered.getvalue()).decode("utf-8")
157
-
158
-
159
-
160
-
161
- def api(prompt, img_list, model="qwen-vl-max-latest", kwargs={}):
162
- import dashscope
163
- api_key = os.environ.get('DASH_API_KEY')
164
- if not api_key:
165
- raise EnvironmentError("DASH_API_KEY is not set")
166
- assert model in ["qwen-vl-max-latest"], f"Not implemented model {model}"
167
- sys_promot = "you are a helpful assistant, you should provide useful answers to users."
168
  messages = [
169
- {"role": "system", "content": sys_promot},
170
- {"role": "user", "content": []}]
171
- for img in img_list:
172
- messages[1]["content"].append(
173
- {"image": f"data:image/png;base64,{encode_image(img)}"})
174
- messages[1]["content"].append({"text": f"{prompt}"})
175
-
176
- response_format = kwargs.get('response_format', None)
177
-
178
- response = dashscope.MultiModalConversation.call(
179
- api_key=api_key,
180
- model=model, # For example, use qwen-plus here. You can change the model name as needed. Model list: https://help.aliyun.com/zh/model-studio/getting-started/models
181
- messages=messages,
182
- result_format='message',
183
- response_format=response_format,
184
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
- if response.status_code == 200:
187
- return response.output.choices[0].message.content[0]['text']
188
- else:
189
- raise Exception(f'Failed to post: {response}')
190
-
191
- # --- Model Loading ---
192
  dtype = torch.bfloat16
193
  device = "cuda" if torch.cuda.is_available() else "cpu"
194
-
195
- # Load the model pipeline
196
  pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=dtype).to(device)
 
 
 
 
 
197
 
198
- # --- UI Constants and Helpers ---
199
- MAX_SEED = np.iinfo(np.int32).max
200
-
201
- # --- Main Inference Function (with hardcoded negative prompt) ---
202
- @spaces.GPU()
203
  def infer(
204
  image,
205
  prompt,
206
  seed=42,
207
  randomize_seed=False,
208
  true_guidance_scale=1.0,
209
- num_inference_steps=20,
210
  rewrite_prompt=False,
 
211
  num_images_per_prompt=1,
212
  progress=gr.Progress(track_tqdm=True),
213
  ):
214
  """
215
- Generates an image using the local Qwen-Image diffusers pipeline.
216
  """
217
- # Hardcode the negative prompt as requested
218
  negative_prompt = " "
219
-
220
  if randomize_seed:
221
  seed = random.randint(0, MAX_SEED)
222
-
223
- # Set up the generator for reproducibility
224
  generator = torch.Generator(device=device).manual_seed(seed)
225
 
226
- print(f"Calling pipeline with prompt: '{prompt}'")
227
- print(f"Negative Prompt: '{negative_prompt}'")
228
- print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}")
229
  if rewrite_prompt:
230
- prompt = polish_prompt(prompt, image)
231
- print(f"Rewritten Prompt: {prompt}")
232
-
233
- # Generate the image
234
- image = pipe(
 
 
235
  image,
236
  prompt=prompt,
237
  negative_prompt=negative_prompt,
238
  num_inference_steps=num_inference_steps,
239
  generator=generator,
240
  true_cfg_scale=true_guidance_scale,
241
- num_images_per_prompt=num_images_per_prompt
242
  ).images
 
 
243
 
244
- return image, seed
245
-
246
- # --- Examples and UI Layout ---
247
- examples = []
248
-
249
- css = """
250
- #col-container {
251
- margin: 0 auto;
252
- max-width: 1024px;
253
- }
254
- #edit_text{margin-top: -62px !important}
255
- """
256
-
257
- with gr.Blocks(css=css) as demo:
258
- with gr.Column(elem_id="col-container"):
259
- gr.HTML('<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png" alt="Qwen-Image Logo" width="400" style="display: block; margin: 0 auto;">')
260
- gr.Markdown("[Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series. Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image-Edit) to run locally with ComfyUI or diffusers.")
261
- with gr.Row():
262
- with gr.Column():
263
- input_image = gr.Image(label="Input Image", show_label=False, type="pil")
264
-
265
- # result = gr.Image(label="Result", show_label=False, type="pil")
266
- result = gr.Gallery(label="Result", show_label=False, type="pil")
 
 
 
 
 
 
267
  with gr.Row():
268
- prompt = gr.Text(
269
- label="Prompt",
270
- show_label=False,
271
- placeholder="describe the edit instruction",
272
- container=False,
 
273
  )
274
- run_button = gr.Button("Edit!", variant="primary")
275
-
276
- with gr.Accordion("Advanced Settings", open=False):
277
- # Negative prompt UI element is removed here
278
-
279
- seed = gr.Slider(
280
- label="Seed",
281
- minimum=0,
282
- maximum=MAX_SEED,
283
  step=1,
284
- value=0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  )
286
-
287
- randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
288
-
289
- with gr.Row():
290
-
291
- true_guidance_scale = gr.Slider(
292
- label="True guidance scale",
293
- minimum=1.0,
294
- maximum=10.0,
295
- step=0.1,
296
- value=4.0
297
- )
298
-
299
- num_inference_steps = gr.Slider(
300
- label="Number of inference steps",
301
- minimum=1,
302
- maximum=50,
303
- step=1,
304
- value=50,
305
- )
306
-
307
- num_images_per_prompt = gr.Slider(
308
- label="Number of images per prompt",
309
- minimum=1,
310
- maximum=4,
311
- step=1,
312
- value=1,
313
- )
314
-
315
- rewrite_prompt = gr.Checkbox(label="Rewrite prompt", value=False, visible=False)
316
-
317
- # gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False)
318
 
319
  gr.on(
320
  triggers=[run_button.click, prompt.submit],
@@ -326,8 +213,9 @@ with gr.Blocks(css=css) as demo:
326
  randomize_seed,
327
  true_guidance_scale,
328
  num_inference_steps,
329
- rewrite_prompt,
330
- num_images_per_prompt,
 
331
  ],
332
  outputs=[result, seed],
333
  )
 
3
  import random
4
  import torch
5
  import spaces
 
6
  from PIL import Image
7
  from diffusers import QwenImageEditPipeline
 
8
  import os
9
  import base64
10
  import json
11
  from huggingface_hub import InferenceClient
12
 
13
def get_caption_language(prompt):
    """Detects if the prompt contains Chinese characters.

    Args:
        prompt: The user's instruction text. ``None`` and the empty string
            are treated as containing no Chinese characters.

    Returns:
        ``'zh'`` if any character lies in one of the CJK ideograph blocks
        listed below, otherwise ``'en'``.
    """
    if not prompt:
        return 'en'

    # Inclusive code-point ranges; single characters compare by code point.
    ranges = (
        ('\u3400', '\u4dbf'),          # CJK Unified Ideographs Extension A
        ('\u4e00', '\u9fff'),          # CJK Unified Ideographs
        ('\uf900', '\ufaff'),          # CJK Compatibility Ideographs
        ('\U00020000', '\U0002a6df'),  # CJK Unified Ideographs Extension B
    )
    if any(start <= char <= end for char in prompt for start, end in ranges):
        return 'zh'
    return 'en'
22
+
23
def polish_prompt(original_prompt, system_prompt, hf_token):
    """
    Rewrites the prompt using a Hugging Face InferenceClient.
    Requires user-provided HF token for API access.

    Args:
        original_prompt: The instruction typed by the user.
        system_prompt: System message sent ahead of the user message.
        hf_token: Hugging Face token used as the client's ``api_key``.

    Returns:
        The rewritten instruction as a single-line string. Falls back to
        ``original_prompt`` (after a ``gr.Warning``) when the token is
        missing, the API call fails, or the reply is empty.
    """
    if not hf_token or not hf_token.strip():
        gr.Warning("HF Token is required for prompt rewriting but was not provided!")
        return original_prompt

    client = InferenceClient(
        provider="cerebras",
        # A pasted token often carries stray whitespace; send the same
        # stripped value that was just validated above.
        api_key=hf_token.strip(),
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": original_prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model="Qwen/Qwen3-235B-A22B-Instruct-2507",
            messages=messages,
            max_tokens=2000,
        )
        reply = completion.choices[0].message.content
    except Exception as e:
        print(f"Error during Hugging Face API call: {e}")
        gr.Warning("Failed to rewrite prompt. Using original.")
        return original_prompt

    polished_prompt = _extract_rewritten(reply)
    if not polished_prompt:
        # An empty rewrite would hand the pipeline an empty instruction.
        print("Error during Hugging Face API call: empty rewritten prompt")
        gr.Warning("Failed to rewrite prompt. Using original.")
        return original_prompt
    return polished_prompt


def _extract_rewritten(reply):
    """Return the instruction carried by a model reply as one line of text.

    The system prompt asks for a JSON object of the form
    ``{"Rewritten": "..."}``, possibly wrapped in a Markdown code fence.
    When such an object is found its ``"Rewritten"`` value is returned;
    otherwise the reply text itself is returned.
    """
    if not reply:
        return ""
    text = reply.strip()
    unfenced = text.replace("```json", "").replace("```", "").strip()

    # Try the whole reply first, then the outermost {...} span in case the
    # model surrounded the JSON object with extra commentary.
    candidates = [unfenced]
    start, end = unfenced.find("{"), unfenced.rfind("}")
    if 0 <= start < end:
        candidates.append(unfenced[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:
            continue
        if isinstance(parsed, dict):
            rewritten = parsed.get("Rewritten")
            if isinstance(rewritten, str) and rewritten.strip():
                return rewritten.strip().replace("\n", " ")

    return text.replace("\n", " ")
53
+
54
# System message for the prompt rewriter. It asks the model to reply with a
# JSON object whose "Rewritten" key holds the improved edit instruction.
SYSTEM_PROMPT_EDIT = '''
# Edit Instruction Rewriter
You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable instruction based on the user's intent and the input image.
## 1. General Principles
- Keep the rewritten instruction **concise** and clear.
- Avoid contradictions, vagueness, or unachievable instructions.
- Maintain the core logic of the original instruction; only enhance clarity and feasibility.
- Ensure new added elements or modifications align with the image's original context and art style.
## 2. Task Types
### Add, Delete, Replace:
- When the input is detailed, only refine grammar and clarity.
- For vague instructions, infer minimal but sufficient details.
- For replacement, use the format: `"Replace X with Y"`.
### Text Editing (e.g., text replacement):
- Enclose text content in quotes, e.g., `Replace "abc" with "xyz"`.
- Preserve the original structure and language—**do not translate** or alter style.
### Human Editing (e.g., change a person’s face/hair):
- Preserve core visual identity (gender, ethnic features).
- Describe expressions in subtle and natural terms.
- Maintain key clothing or styling details unless explicitly replaced.
### Style Transformation:
- If a style is specified, e.g., `Disco style`, rewrite it to encapsulate the essential visual traits.
- Use a fixed template for **coloring/restoration**:
  `"Restore old photograph, remove scratches, reduce noise, enhance details, high resolution, realistic, natural skin tones, clear facial features, no distortion, vintage photo restoration"`
  if applicable.
## 3. Output Format
Please provide the rewritten instruction in a clean `json` format as:
{
    "Rewritten": "..."
}
'''
85
 
 
 
 
 
 
 
86
  dtype = torch.bfloat16
87
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
88
  pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=dtype).to(device)
89
+ pipe.load_lora_weights(
90
+ "lightx2v/Qwen-Image-Edit-Lightning",
91
+ weight_name="Qwen-Image-Edit-Lightning-8steps-V1.1.safetensors"
92
+ )
93
+ pipe.fuse_lora()
94
 
95
@spaces.GPU(duration=60)
def infer(
    image,
    prompt,
    seed=42,
    randomize_seed=False,
    true_guidance_scale=1.0,
    num_inference_steps=8,
    rewrite_prompt=False,
    hf_token="",
    num_images_per_prompt=1,
    progress=gr.Progress(track_tqdm=True),
):
    """
    Edits ``image`` according to ``prompt`` using the module-level ``pipe``.
    Requires user-provided HF token for prompt rewriting.

    Args:
        image: Input image handed to the pipeline as its first argument.
        prompt: Edit instruction.
        seed: Seed for the torch generator; ignored when ``randomize_seed``.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` when true.
        true_guidance_scale: Forwarded to the pipeline as ``true_cfg_scale``.
        num_inference_steps: Number of denoising steps.
        rewrite_prompt: When true, pass the prompt through ``polish_prompt``
            with ``SYSTEM_PROMPT_EDIT`` before editing.
        hf_token: Token forwarded to ``polish_prompt``.
        num_images_per_prompt: Number of images requested from the pipeline.
        progress: Gradio progress tracker (mirrors tqdm progress).

    Returns:
        A ``(images, seed)`` tuple: the pipeline's ``.images`` result and the
        seed that was actually used.

    Raises:
        gr.Error: If no input image was provided.
    """
    if image is None:
        # Fail early with a readable message instead of an opaque pipeline error.
        raise gr.Error("Please provide an input image.")

    negative_prompt = " "

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(device=device).manual_seed(seed)

    if rewrite_prompt:
        polished_prompt = polish_prompt(prompt, SYSTEM_PROMPT_EDIT, hf_token)
        print(f"Rewritten Prompt: {polished_prompt}")
        prompt = polished_prompt

    edited_images = pipe(
        image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        generator=generator,
        true_cfg_scale=true_guidance_scale,
        num_images_per_prompt=num_images_per_prompt,
    ).images

    return edited_images, seed
134
 
135
+ MAX_SEED = np.iinfo(np.int32).max
136
+ examples = [
137
+ "Replace the cat with a friendly golden retriever. Make it look happier, and add more background details.",
138
+ "Add text 'Qwen - AI for image editing' in Chinese at the bottom center with a small shadow.",
139
+ "Change the style to 1970s vintage, add old photo effect, restore any scratches on the wall or window.",
140
+ "Remove the blue sky and replace it with a dark night cityscape.",
141
+ """Replace "Qwen" with "通义" in the Image. Ensure Chinese font is used for "通义" and position it to the top left with a light heading-style font."""
142
+ ]
143
+
144
+ with gr.Blocks() as demo:
145
+ gr.Markdown("# Qwen-Image-Edit with Prompt Enhancement")
146
+ gr.Markdown("⚠️ **Prompt rewriting requires your own [Hugging Face token](https://huggingface.co/settings/tokens)**")
147
+
148
+ with gr.Column():
149
+ input_image = gr.Image(label="Input Image", type="pil")
150
+ prompt = gr.Text(label="Edit Instruction", placeholder="e.g. Add a dog to the right side.")
151
+ run_button = gr.Button("Edit", variant="primary")
152
+ result = gr.Gallery(label="Output Images", show_label=False)
153
+
154
+ with gr.Accordion("Advanced Settings", open=False):
155
+ seed = gr.Slider(
156
+ label="Seed",
157
+ minimum=0,
158
+ maximum=MAX_SEED,
159
+ step=1,
160
+ value=0
161
+ )
162
+ randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
163
+
164
  with gr.Row():
165
+ true_guidance_scale = gr.Slider(
166
+ label="True Guidance Scale",
167
+ minimum=1.0,
168
+ maximum=5.0,
169
+ step=0.1,
170
+ value=4.0
171
  )
172
+ num_inference_steps = gr.Slider(
173
+ label="Inference Steps (Fast 8-step mode)",
174
+ minimum=4,
175
+ maximum=8,
 
 
 
 
 
176
  step=1,
177
+ value=8
178
+ )
179
+ num_images_per_prompt = gr.Slider(
180
+ label="Images per Prompt",
181
+ minimum=1,
182
+ maximum=4,
183
+ step=1,
184
+ value=1
185
+ )
186
+
187
+ with gr.Group():
188
+ rewrite_toggle = gr.Checkbox(label="Use Prompt Rewriter (Requires HF Token)", value=False, interactive=True)
189
+ hf_token_input = gr.Textbox(
190
+ label="Your Hugging Face Token",
191
+ type="password",
192
+ placeholder="hf_xxxxxxxxxxxxxxxx",
193
+ visible=False,
194
+ info="Required for prompt rewriting - get yours from [Hugging Face settings](https://huggingface.co/settings/tokens)"
195
+ )
196
+
197
+ def toggle_token_visibility(checked):
198
+ return gr.update(visible=checked)
199
+
200
+ rewrite_toggle.change(
201
+ toggle_token_visibility,
202
+ inputs=[rewrite_toggle],
203
+ outputs=[hf_token_input]
204
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  gr.on(
207
  triggers=[run_button.click, prompt.submit],
 
213
  randomize_seed,
214
  true_guidance_scale,
215
  num_inference_steps,
216
+ rewrite_toggle,
217
+ hf_token_input,
218
+ num_images_per_prompt
219
  ],
220
  outputs=[result, seed],
221
  )