JasperHaozhe committed on
Commit
76526f7
·
verified ·
1 Parent(s): 4d8bfaf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +279 -67
app.py CHANGED
@@ -15,9 +15,18 @@ model = AutoModelForImageTextToText.from_pretrained(
15
  torch_dtype=torch.bfloat16
16
  ).to("cuda").eval()
17
 
18
- def create_scoring_instruction(prompt, is_editing=True):
19
- if is_editing:
20
- return f"""You are an expert image editing evaluator. Your task is to evaluate the quality of an edited image based on a source image and a user instruction. Afterwards, you need to suggest how to refine the original user request to produce better image edits (if any).
 
 
 
 
 
 
 
 
 
21
 
22
  User Instruction: {prompt}
23
  You are provided with two images:
@@ -83,13 +92,11 @@ Output your evaluation in the following format:
83
  # User Request Refinement:
84
  ## Refinement Comments: [Specific suggestions for improving the user request]
85
  ## Refined Request: [The improved, more specific user request for editing like a standard user instruction]"""
86
- else:
87
- # Generation Prompt (Simplified, no Image Faithfulness)
88
- return f"""You are an expert image generation evaluator. Your task is to evaluate the quality of a generated image based on a user instruction. Afterwards, you need to suggest how to refine the original user request to produce better images (if any).
89
 
90
  User Instruction: {prompt}
91
- You are provided with one image:
92
- 1. Generated Image <image>
93
 
94
  Your task is to evaluate the Generated Image against the User Instruction.
95
  To do this, you must first assess the image on three critical aspects, provide justifications and absolute scores in 1-4 scale.
@@ -102,7 +109,7 @@ To do this, you must first assess the image on three critical aspects, provide j
102
  - **1 (Major deviations):** Key elements are completely missing, altered, or contradicted. Instruction is ignored.
103
 
104
  **2. Physical and Visual Quality** (Technical errors, composition, realism, and physics)
105
- - **4 (No noticeable flaws):** The image is physically plausible (correct lighting, shadows, geometry, anatomy). No visible artifacts (seams, blurring, noise).
106
  - **3 (Minor flaws):** Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
107
  - **2 (Some flaws):** Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
108
  - **1 (Severe flaws):** Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).
@@ -114,11 +121,12 @@ To do this, you must first assess the image on three critical aspects, provide j
114
  - **1 (Major deviations):** Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).
115
 
116
  ### Scoring Methodology (CRITICAL)
117
- During assessment for each aspect, recall the initial user request and the scoring rubrics of the aspect, provide scores with detailed justifications for each image and reflect fine-grained preferences.
118
- 1. **Anchor:** Have a global inspection based on the user request and the resulting generation. Determine the rough integer score level (1, 2, 3, or 4) according to the definitions provided.
119
  2. **Justify and Adjust:** Do careful visual analysis and identify specific flaws in generation. Justify the score with concrete evidence and scoring logic. Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
 
120
 
121
- Afterwards, try to construct a refined user request that helps the visual generation model to produce better images.
122
  Think of the weaknesses identified in the judgement, then map them to instruction details and apply specific fixes.
123
  Provide a final new user request that enrich the initial user request.
124
 
@@ -138,56 +146,219 @@ Output your evaluation in the following format:
138
  # Summary: [ Summary of the evaluation ]
139
 
140
  # User Request Refinement:
141
- ## Refinement Comments: [Specific suggestions for improving the user request]
142
- ## Refined Request: [The improved, more specific user request for editing like a standard user instruction]"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  @spaces.GPU
145
- def model_inference(instruction_text, source_image, edited_image):
146
- # Determine files list based on inputs
147
- files = []
148
- if source_image:
149
- files.append(source_image)
150
- if edited_image:
151
- files.append(edited_image)
152
-
153
- # Check if files are provided
154
- if not files or len(files) == 0:
155
- yield "Error: Please upload at least one image (Generated Image or Source Image)."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  return
157
 
158
- # Determine mode based on number of images
159
- is_editing = len(files) >= 2
160
-
161
  # Load images
162
  loaded_images = [load_image(image) for image in files]
163
-
164
- # Construct content for the model
165
- instruction = create_scoring_instruction(instruction_text, is_editing=is_editing)
166
-
167
  # Interleave images into the <image> placeholders
168
  content = []
169
  parts = instruction.split("<image>")
170
-
171
- # We expect 2 parts for 1 image (Generation), and 3 parts for 2 images (Editing)
172
- if is_editing:
173
- # Expect 2 images
174
- content.append({"type": "text", "text": parts[0]})
175
- content.append({"type": "image", "image": loaded_images[0]}) # Source
176
- content.append({"type": "text", "text": parts[1]})
177
- content.append({"type": "image", "image": loaded_images[1]}) # Edited
178
- if len(parts) > 2:
179
- content.append({"type": "text", "text": parts[2]})
180
- else:
181
- # Expect 1 image
182
- content.append({"type": "text", "text": parts[0]})
183
- content.append({"type": "image", "image": loaded_images[0]}) # Generated
184
- if len(parts) > 1:
185
- content.append({"type": "text", "text": parts[1]})
186
-
187
- messages = [
188
- {"role": "user", "content": content}
189
- ]
190
-
191
  # Generate and stream text
192
  prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
193
  inputs = processor(
@@ -196,7 +367,7 @@ def model_inference(instruction_text, source_image, edited_image):
196
  return_tensors="pt",
197
  padding=True,
198
  ).to("cuda")
199
-
200
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
201
  generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
202
 
@@ -208,37 +379,78 @@ def model_inference(instruction_text, source_image, edited_image):
208
  buffer += new_text
209
  yield buffer
210
 
 
 
 
 
211
  with gr.Blocks() as demo:
212
  gr.HTML(html_header)
213
-
 
 
 
 
 
 
 
 
214
  with gr.Row():
215
  with gr.Column(scale=1):
216
- instruction = gr.Textbox(label="User Instruction", lines=5, placeholder="Enter editing instruction here...")
 
 
 
 
217
  submit_btn = gr.Button("Evaluate", variant="primary")
218
-
 
 
 
 
 
 
 
219
  with gr.Column(scale=1):
220
- source_image = gr.Image(label="Source Image", type="filepath", sources=["upload", "clipboard"])
221
-
 
 
 
 
222
  with gr.Column(scale=1):
223
- edited_image = gr.Image(label="Edited Image", type="filepath", sources=["upload", "clipboard"])
224
-
 
 
 
 
 
225
  output = gr.Textbox(label="Evaluation Result", lines=25)
226
 
 
 
 
 
 
 
 
 
227
  submit_btn.click(
228
  fn=model_inference,
229
- inputs=[instruction, source_image, edited_image],
230
- outputs=output
231
  )
232
 
 
233
  gr.Examples(
234
  examples=[
235
- ["Remove the arrows from the blue sign and add the text of Detour ahead, no right turns.", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_source.png", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_ovis_u1_Image A.png"]
236
  ],
237
- inputs=[instruction, source_image, edited_image]
238
  )
239
-
240
  gr.Markdown(tos_markdown)
241
  gr.Markdown(learn_more_markdown)
242
  gr.Markdown(bibtext)
243
 
244
- demo.launch(debug=True)
 
15
  torch_dtype=torch.bfloat16
16
  ).to("cuda").eval()
17
 
18
# Task labels offered by the "Task Type" radio selector. The same strings are
# compared verbatim when picking a prompt template, laying out the image
# inputs, and validating uploads, so any rename here must be mirrored there.
TASK_CHOICES = [
    "Pointwise - Image Editing",
    "Pointwise - T2I Generation",
    "Pairwise - Image Editing",
    "Pairwise - T2I Generation",
]
24
+
25
+ # ============================================================
26
+ # Instruction Templates
27
+ # ============================================================
28
+
29
+ POINTWISE_EDITING_INSTRUCTION = """You are an expert image editing evaluator. Your task is to evaluate the quality of an edited image based on a source image and a user instruction. Afterwards, you need to suggest how to refine the original user request to produce better image edits (if any).
30
 
31
  User Instruction: {prompt}
32
  You are provided with two images:
 
92
  # User Request Refinement:
93
  ## Refinement Comments: [Specific suggestions for improving the user request]
94
  ## Refined Request: [The improved, more specific user request for editing like a standard user instruction]"""
95
+
96
+ POINTWISE_T2I_INSTRUCTION = """You are an expert image generation evaluator. Your task is to evaluate the quality of a generated image based on a user instruction. Afterwards, you need to suggest how to refine the original user request to produce better image generation (if any).
 
97
 
98
  User Instruction: {prompt}
99
+ Generated Image: <image>
 
100
 
101
  Your task is to evaluate the Generated Image against the User Instruction.
102
  To do this, you must first assess the image on three critical aspects, provide justifications and absolute scores in 1-4 scale.
 
109
  - **1 (Major deviations):** Key elements are completely missing, altered, or contradicted. Instruction is ignored.
110
 
111
  **2. Physical and Visual Quality** (Technical errors, composition, realism, and physics)
112
+ - **4 (No noticeable flaws):** The image is physically plausible (correct lighting, shadows, geometry, human anatomy). No visible artifacts (seams, blurring, noise). And all elements work together cohesively.
113
  - **3 (Minor flaws):** Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
114
  - **2 (Some flaws):** Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
115
  - **1 (Severe flaws):** Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).
 
121
  - **1 (Major deviations):** Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).
122
 
123
  ### Scoring Methodology (CRITICAL)
124
+ During assessment for each aspect, recall the initial user request, source image and the scoring rubrics of the aspect, provide scores with detailed justifications for each image and reflect fine-grained preferences.
125
+ 1. **Anchor:** Have a global inspection based on the user request and the resulting generation. Determine the rough integer score level (1, 2, 3, or 4) according to the definitions provided .
126
  2. **Justify and Adjust:** Do careful visual analysis and identify specific flaws in generation. Justify the score with concrete evidence and scoring logic. Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
127
+ - *Example:* deduct points from 4.0 for slight flaws if the assessed dimension is close to satisfaction. add increments from 1.0 or 2.0 based on severity of flaws.
128
 
129
+ Afterwards, try to construct a refined user request that helps the visual generation model to produce better image edits.
130
  Think of the weaknesses identified in the judgement, then map them to instruction details and apply specific fixes.
131
  Provide a final new user request that enrich the initial user request.
132
 
 
146
  # Summary: [ Summary of the evaluation ]
147
 
148
  # User Request Refinement:
149
+ ## Refinement Comments: [ Specific suggestions for improving generation quality ]
150
+ ## Refined Request: [ The improved, more specific user request ]"""
151
+
152
+ PAIRWISE_EDITING_INSTRUCTION = """You are an expert image editing evaluator. Your task is to evaluate the quality of an edited image based on a source image and a user instruction.
153
+
154
+ User Instruction: {prompt}
155
+ You are provided with three images:
156
+ 1. Source Image <image>
157
+ 2. Edited Image A <image>
158
+ 3. Edited Image B <image>
159
+
160
+ Your task is to compare the two Edited Images according to the User Instruction and source image.
161
+ To do this, you must compare the image on four critical aspects, provide absolute scores for each image and determine who wins.
162
+
163
+ ### Critical Aspects & Scoring Rubric
164
+ **1. Text Faithfulness** (How accurately does the output follow the instruction?)
165
+ - **4 (Full match):** All key elements (objects, colors, actions) are represented exactly as described. No hallucinations or unrequested changes.
166
+ - **3 (Minor mismatch):** Most key elements are present, but minor details are missing, incorrect, or slightly inaccurate.
167
+ - **2 (Some mismatch):** Some key elements are missing, altered, or interpreted incorrectly.
168
+ - **1 (Major deviations):** Key elements are completely missing, altered, or contradicted. Instruction is ignored.
169
+
170
+ **2. Image Faithfulness** (How well are the non-edited parts and key input elements preserved?)
171
+ - **4 (Uses input fully):** All relevant elements from the input (background, style, lighting, identity) are accurately preserved or transformed as instructed.
172
+ - **3 (Minor mismatch):** Most relevant elements are preserved, but a few aspects (e.g., background details, lighting consistency) are missing or incorrectly handled.
173
+ - **2 (Partial mismatch):** Some elements are carried over, but key aspects of the original image are lost or distorted.
174
+ - **1 (Fails to use input):** Key elements of the input image are ignored, misinterpreted, or destroyed.
175
+
176
+ **3. Physical and Visual Quality** (Technical errors, composition, realism, and physics)
177
+ - **4 (No noticeable flaws):** The image is physically plausible (correct lighting, shadows, geometry, anatomy). No visible artifacts (seams, blurring, noise).
178
+ - **3 (Minor flaws):** Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
179
+ - **2 (Some flaws):** Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
180
+ - **1 (Severe flaws):** Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).
181
+
182
+ **4. Text Rendering** (Only if the instruction involves generating text)
183
+ - **4 (Full match):** Text is correct, legible, and integrated well.
184
+ - **3 (Mostly match):** Minor misspellings or inconsistent capitalization.
185
+ - **2 (Partial match):** Major misspellings or distorted text.
186
+ - **1 (Major deviations):** Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).
187
+
188
+ ### Scoring Methodology (CRITICAL)
189
+ During assessment for each aspect, recall the initial user request, source image and the scoring rubrics of the aspect, provide scores with detailed justifications for each image and reflect fine-grained preferences.
190
+ 1. **Anchor:** Have a global inspection. Determine the rough integer score level (1, 2, 3, or 4) according to the definitions provided (you can also refer to the given human preference or rating).
191
+ 2. **Justify and Adjust:** Do careful visual analysis and identify specific flaws in generation. Justify the score with concrete evidence and scoring logic. Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
192
+ - *Example:* deduct points from 4.0 for slight flaws if the assessed dimension is close to satisfaction. add increments from 1.0 or 2.0 based on severity of flaws.
193
+ 3. **Compare:** Ensure the difference between Score A and Score B reflects the correct preference.
194
+
195
+ Output your evaluation in the following format:
196
+ # User Request Analysis
197
+ [ understanding the user request, and what needs to be considered during image editing ]
198
+ # Detailed Judgement
199
+ 1. Text Faithfulness:
200
+ ## Justification: [ Comparative Analysis: Given the request, source image and the scoring rubrics, which image is better in this dimension? Provide concrete evidence and scoring logic. e.g., Image A is roughly [X] score level because [reason]. Deduct/Add points for [specific details] to reach final score. ]
201
+ ## Score A: [float score for Image A]
202
+ ## Score B: [float score for Image B]
203
+ ## Winner: [Image A or Image B or It's a tie]
204
+ 2. Image Faithfulness:
205
+ ## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic for image faithfulness. ]
206
+ ## Score A: [float score for Image A]
207
+ ## Score B: [float score for Image B]
208
+ ## Winner: [Image A or Image B or It's a tie]
209
+ 3. Physical and Visual Quality:
210
+ ## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since physical/visual quality is often not perfect, give 4.0 sparingly only when it is perfectly realistic. ]
211
+ ## Score A: [float score for Image A]
212
+ ## Score B: [float score for Image B]
213
+ ## Winner: [Image A or Image B or It's a tie]
214
+ 4. Text Rendering:
215
+ ## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since text rendering is often challenging, give 4.0 sparingly only if it is perfect. ]
216
+ ## Score A: [float score for Image A]
217
+ ## Score B: [float score for Image B]
218
+ ## Winner: [N/A or Image A or Image B or It's a tie]
219
+ # Summary: [Summary of the evaluation]"""
220
+
221
+ PAIRWISE_T2I_INSTRUCTION = """You are an expert image evaluator. Your task is to evaluate the quality of two generated images based on a user instruction.
222
+
223
+ User Instruction: {prompt}
224
+ You are provided with two images:
225
+ 1. Generated Image A <image>
226
+ 2. Generated Image B <image>
227
+
228
+ Your task is to compare the two Generated Images according to the User Instruction.
229
+ To do this, you must compare the image on three critical aspects, provide absolute scores for each image and determine who wins.
230
+
231
+ ### Critical Aspects & Scoring Rubric
232
+ **1. Text Faithfulness** (How accurately does the output follow the instruction?)
233
+ - **4 (Full match):** All key elements (objects, colors, actions) are represented exactly as described. No hallucinations or unrequested changes.
234
+ - **3 (Minor mismatch):** Most key elements are present, but minor details are missing, incorrect, or slightly inaccurate.
235
+ - **2 (Some mismatch):** Some key elements are missing, altered, or interpreted incorrectly.
236
+ - **1 (Major deviations):** Key elements are completely missing, altered, or contradicted. Instruction is ignored.
237
+
238
+ **2. Physical and Visual Quality** (Technical errors, composition, realism, and physics)
239
+ - **4 (No noticeable flaws):** The image is physically plausible (correct lighting, shadows, geometry, anatomy). No visible artifacts (seams, blurring, noise).
240
+ - **3 (Minor flaws):** Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
241
+ - **2 (Some flaws):** Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
242
+ - **1 (Severe flaws):** Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).
243
+
244
+ **3. Text Rendering** (Only if the instruction involves generating text)
245
+ - **4 (Full match):** Text is correct, legible, and integrated well.
246
+ - **3 (Mostly match):** Minor misspellings or inconsistent capitalization.
247
+ - **2 (Partial match):** Major misspellings or distorted text.
248
+ - **1 (Major deviations):** Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).
249
+
250
+ ### Scoring Methodology (CRITICAL)
251
+ For every aspect, you must first recap the initial user request and the scoring rubrics of the aspect, then follow this "Anchor and Adjust" process to compare and score the two images:
252
+ 1. **Anchor:** Determine the rough integer score level (1, 2, 3, or 4) based on the definitions provided.
253
+ 2. **Adjust:** Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
254
+ - *Example:* deduct points from 4.0 for slight flaws if the assessed dimension is close to satisfaction. add increments from 1.0 or 2.0 based on severity of flaws.
255
+ 3. **Compare:** Ensure the difference between Score A and Score B reflects the magnitude of the preference. (e.g., A large gap implies one is significantly better; if one is only slightly better, the fine-grained scorings based on identified flaws help explain the preference).
256
+
257
+ Output your evaluation in the following format:
258
+ # User Request Analysis
259
+ [ understanding the user request, try to analyze or decompose the user request deeper. Think of what the request might imply or what needs to be inferred to successfully execute the request. ]
260
+ # Detailed Judgement
261
+ 1. Text Faithfulness:
262
+ ## Justification: [ Comparative Analysis: Given the request and the scoring rubrics, which image is better in this dimension? Provide concrete evidence and scoring logic. e.g., Image A is roughly [X] score level because [reason]. Deduct/Add points for [specific details] to reach final score. ]
263
+ ## Score A: [float score for Image A]
264
+ ## Score B: [float score for Image B]
265
+ ## Winner: [Image A or Image B or It's a tie]
266
+ 2. Physical and Visual Quality:
267
+ ## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since physical/visual quality is often not perfect, give 4.0 sparingly only when it is perfectly realistic. ]
268
+ ## Score A: [float score for Image A]
269
+ ## Score B: [float score for Image B]
270
+ ## Winner: [Image A or Image B or It's a tie]
271
+ 3. Text Rendering:
272
+ ## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since text rendering is often challenging, give 4.0 sparingly only if it is perfect. ]
273
+ ## Score A: [float score for Image A]
274
+ ## Score B: [float score for Image B]
275
+ ## Winner: [N/A or Image A or Image B or It's a tie]
276
+ # Summary: [Summary of the evaluation]"""
277
+
278
def create_instruction(prompt, task_type):
    """Return the evaluation prompt for ``task_type`` with the user request filled in.

    Args:
        prompt: The user's instruction text; substituted for the ``{prompt}``
            field of the selected template through ``str.format``.
        task_type: One of the four task labels handled below.

    Returns:
        The selected instruction template, formatted with ``prompt``.

    Raises:
        ValueError: If ``task_type`` matches none of the handled labels.
    """
    # Pick the template first; only the chosen module-level constant is read,
    # so an unknown task type fails before any template is touched.
    if task_type == "Pointwise - Image Editing":
        template = POINTWISE_EDITING_INSTRUCTION
    elif task_type == "Pointwise - T2I Generation":
        template = POINTWISE_T2I_INSTRUCTION
    elif task_type == "Pairwise - Image Editing":
        template = PAIRWISE_EDITING_INSTRUCTION
    elif task_type == "Pairwise - T2I Generation":
        template = PAIRWISE_T2I_INSTRUCTION
    else:
        raise ValueError(f"Unknown task type: {task_type}")

    # Single formatting site shared by every task type.
    return template.format(prompt=prompt)
290
+
291
def update_ui_for_task(task_type):
    """Describe how the three image inputs should look for ``task_type``.

    Args:
        task_type: The task label currently selected in the UI.

    Returns:
        A 3-tuple of ``gr.update(...)`` results, one per image slot in order
        (first, second, third), or ``None`` when ``task_type`` is not one of
        the four handled labels.
    """
    def shown(label):
        # Visible slot: only visibility and label are changed; no ``value``
        # argument is passed.
        return gr.update(visible=True, label=label)

    def hidden(label):
        # Hidden slot: ``value=None`` is passed explicitly alongside the
        # visibility change (presumably to drop a previously uploaded image;
        # confirm against Gradio's update semantics).
        return gr.update(visible=False, label=label, value=None)

    if task_type == "Pointwise - Image Editing":
        return (shown("Source Image"), shown("Edited Image"), hidden("Image B"))

    if task_type == "Pointwise - T2I Generation":
        return (shown("Generated Image"), hidden("(unused)"), hidden("(unused)"))

    if task_type == "Pairwise - Image Editing":
        return (shown("Source Image"), shown("Image A"), shown("Image B"))

    if task_type == "Pairwise - T2I Generation":
        return (shown("Image A"), shown("Image B"), hidden("(unused)"))

    # No branch matched: same result as the original's implicit fall-through.
    return None
317
 
318
  @spaces.GPU
319
+ def model_inference(task_type, instruction_text, image1, image2, image3):
320
+ """Run model inference based on the selected task type and uploaded images."""
321
+ # Validate inputs and collect images based on task
322
+ if task_type == "Pointwise - Image Editing":
323
+ if not image1 or not image2:
324
+ yield "Error: Please upload both Source Image and Edited Image."
325
+ return
326
+ files = [image1, image2]
327
+ elif task_type == "Pointwise - T2I Generation":
328
+ if not image1:
329
+ yield "Error: Please upload the Generated Image."
330
+ return
331
+ files = [image1]
332
+ elif task_type == "Pairwise - Image Editing":
333
+ if not image1 or not image2 or not image3:
334
+ yield "Error: Please upload Source Image, Image A, and Image B."
335
+ return
336
+ files = [image1, image2, image3]
337
+ elif task_type == "Pairwise - T2I Generation":
338
+ if not image1 or not image2:
339
+ yield "Error: Please upload both Image A and Image B."
340
+ return
341
+ files = [image1, image2]
342
+ else:
343
+ yield "Error: Unknown task type selected."
344
  return
345
 
 
 
 
346
  # Load images
347
  loaded_images = [load_image(image) for image in files]
348
+
349
+ # Build instruction with <image> placeholders
350
+ instruction = create_instruction(instruction_text, task_type)
351
+
352
  # Interleave images into the <image> placeholders
353
  content = []
354
  parts = instruction.split("<image>")
355
+ for i, part in enumerate(parts):
356
+ content.append({"type": "text", "text": part})
357
+ if i < len(loaded_images):
358
+ content.append({"type": "image", "image": loaded_images[i]})
359
+
360
+ messages = [{"role": "user", "content": content}]
361
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  # Generate and stream text
363
  prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
364
  inputs = processor(
 
367
  return_tensors="pt",
368
  padding=True,
369
  ).to("cuda")
370
+
371
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
372
  generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
373
 
 
379
  buffer += new_text
380
  yield buffer
381
 
382
+ # ============================================================
383
+ # Gradio UI
384
+ # ============================================================
385
+
386
  with gr.Blocks() as demo:
387
  gr.HTML(html_header)
388
+
389
+ with gr.Row():
390
+ task_selector = gr.Radio(
391
+ choices=TASK_CHOICES,
392
+ value="Pointwise - Image Editing",
393
+ label="Task Type",
394
+ info="Select the evaluation task",
395
+ )
396
+
397
  with gr.Row():
398
  with gr.Column(scale=1):
399
+ instruction = gr.Textbox(
400
+ label="User Instruction",
401
+ lines=5,
402
+ placeholder="Enter the user instruction / prompt here..."
403
+ )
404
  submit_btn = gr.Button("Evaluate", variant="primary")
405
+
406
+ with gr.Column(scale=1):
407
+ image1 = gr.Image(
408
+ label="Source Image",
409
+ type="filepath",
410
+ sources=["upload", "clipboard"],
411
+ )
412
+
413
  with gr.Column(scale=1):
414
+ image2 = gr.Image(
415
+ label="Edited Image",
416
+ type="filepath",
417
+ sources=["upload", "clipboard"],
418
+ )
419
+
420
  with gr.Column(scale=1):
421
+ image3 = gr.Image(
422
+ label="Image B",
423
+ type="filepath",
424
+ sources=["upload", "clipboard"],
425
+ visible=False,
426
+ )
427
+
428
  output = gr.Textbox(label="Evaluation Result", lines=25)
429
 
430
+ # Wire task selector to update image visibility/labels
431
+ task_selector.change(
432
+ fn=update_ui_for_task,
433
+ inputs=[task_selector],
434
+ outputs=[image1, image2, image3],
435
+ )
436
+
437
+ # Wire evaluate button
438
  submit_btn.click(
439
  fn=model_inference,
440
+ inputs=[task_selector, instruction, image1, image2, image3],
441
+ outputs=output,
442
  )
443
 
444
+ # Examples for different tasks
445
  gr.Examples(
446
  examples=[
447
+ ["Pointwise - Image Editing", "Remove the arrows from the blue sign and add the text of Detour ahead, no right turns.", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_source.png", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_ovis_u1_Image A.png", None],
448
  ],
449
+ inputs=[task_selector, instruction, image1, image2, image3],
450
  )
451
+
452
  gr.Markdown(tos_markdown)
453
  gr.Markdown(learn_more_markdown)
454
  gr.Markdown(bibtext)
455
 
456
+ demo.launch(debug=True)