Spaces:
Running on Zero
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,9 +15,18 @@ model = AutoModelForImageTextToText.from_pretrained(
|
|
| 15 |
torch_dtype=torch.bfloat16
|
| 16 |
).to("cuda").eval()
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
User Instruction: {prompt}
|
| 23 |
You are provided with two images:
|
|
@@ -83,13 +92,11 @@ Output your evaluation in the following format:
|
|
| 83 |
# User Request Refinement:
|
| 84 |
## Refinement Comments: [Specific suggestions for improving the user request]
|
| 85 |
## Refined Request: [The improved, more specific user request for editing like a standard user instruction]"""
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
return f"""You are an expert image generation evaluator. Your task is to evaluate the quality of a generated image based on a user instruction. Afterwards, you need to suggest how to refine the original user request to produce better images (if any).
|
| 89 |
|
| 90 |
User Instruction: {prompt}
|
| 91 |
-
|
| 92 |
-
1. Generated Image <image>
|
| 93 |
|
| 94 |
Your task is to evaluate the Generated Image against the User Instruction.
|
| 95 |
To do this, you must first assess the image on three critical aspects, provide justifications and absolute scores in 1-4 scale.
|
|
@@ -102,7 +109,7 @@ To do this, you must first assess the image on three critical aspects, provide j
|
|
| 102 |
- **1 (Major deviations):** Key elements are completely missing, altered, or contradicted. Instruction is ignored.
|
| 103 |
|
| 104 |
**2. Physical and Visual Quality** (Technical errors, composition, realism, and physics)
|
| 105 |
-
- **4 (No noticeable flaws):** The image is physically plausible (correct lighting, shadows, geometry, anatomy). No visible artifacts (seams, blurring, noise).
|
| 106 |
- **3 (Minor flaws):** Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
|
| 107 |
- **2 (Some flaws):** Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
|
| 108 |
- **1 (Severe flaws):** Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).
|
|
@@ -114,11 +121,12 @@ To do this, you must first assess the image on three critical aspects, provide j
|
|
| 114 |
- **1 (Major deviations):** Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).
|
| 115 |
|
| 116 |
### Scoring Methodology (CRITICAL)
|
| 117 |
-
During assessment for each aspect, recall the initial user request and the scoring rubrics of the aspect, provide scores with detailed justifications for each image and reflect fine-grained preferences.
|
| 118 |
-
1. **Anchor:** Have a global inspection based on the user request and the resulting generation. Determine the rough integer score level (1, 2, 3, or 4) according to the definitions provided.
|
| 119 |
2. **Justify and Adjust:** Do careful visual analysis and identify specific flaws in generation. Justify the score with concrete evidence and scoring logic. Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
|
|
|
|
| 120 |
|
| 121 |
-
Afterwards, try to construct a refined user request that helps the visual generation model to produce better
|
| 122 |
Think of the weaknesses identified in the judgement, then map them to instruction details and apply specific fixes.
|
| 123 |
Provide a final new user request that enrich the initial user request.
|
| 124 |
|
|
@@ -138,56 +146,219 @@ Output your evaluation in the following format:
|
|
| 138 |
# Summary: [ Summary of the evaluation ]
|
| 139 |
|
| 140 |
# User Request Refinement:
|
| 141 |
-
## Refinement Comments: [Specific suggestions for improving
|
| 142 |
-
## Refined Request: [The improved, more specific user request
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
@spaces.GPU
|
| 145 |
-
def model_inference(instruction_text,
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
if
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
return
|
| 157 |
|
| 158 |
-
# Determine mode based on number of images
|
| 159 |
-
is_editing = len(files) >= 2
|
| 160 |
-
|
| 161 |
# Load images
|
| 162 |
loaded_images = [load_image(image) for image in files]
|
| 163 |
-
|
| 164 |
-
#
|
| 165 |
-
instruction =
|
| 166 |
-
|
| 167 |
# Interleave images into the <image> placeholders
|
| 168 |
content = []
|
| 169 |
parts = instruction.split("<image>")
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
content.append({"type": "image", "image": loaded_images[1]}) # Edited
|
| 178 |
-
if len(parts) > 2:
|
| 179 |
-
content.append({"type": "text", "text": parts[2]})
|
| 180 |
-
else:
|
| 181 |
-
# Expect 1 image
|
| 182 |
-
content.append({"type": "text", "text": parts[0]})
|
| 183 |
-
content.append({"type": "image", "image": loaded_images[0]}) # Generated
|
| 184 |
-
if len(parts) > 1:
|
| 185 |
-
content.append({"type": "text", "text": parts[1]})
|
| 186 |
-
|
| 187 |
-
messages = [
|
| 188 |
-
{"role": "user", "content": content}
|
| 189 |
-
]
|
| 190 |
-
|
| 191 |
# Generate and stream text
|
| 192 |
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 193 |
inputs = processor(
|
|
@@ -196,7 +367,7 @@ def model_inference(instruction_text, source_image, edited_image):
|
|
| 196 |
return_tensors="pt",
|
| 197 |
padding=True,
|
| 198 |
).to("cuda")
|
| 199 |
-
|
| 200 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 201 |
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
|
| 202 |
|
|
@@ -208,37 +379,78 @@ def model_inference(instruction_text, source_image, edited_image):
|
|
| 208 |
buffer += new_text
|
| 209 |
yield buffer
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
with gr.Blocks() as demo:
|
| 212 |
gr.HTML(html_header)
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
with gr.Row():
|
| 215 |
with gr.Column(scale=1):
|
| 216 |
-
instruction = gr.Textbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
submit_btn = gr.Button("Evaluate", variant="primary")
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
with gr.Column(scale=1):
|
| 220 |
-
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
with gr.Column(scale=1):
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
output = gr.Textbox(label="Evaluation Result", lines=25)
|
| 226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
submit_btn.click(
|
| 228 |
fn=model_inference,
|
| 229 |
-
inputs=[instruction,
|
| 230 |
-
outputs=output
|
| 231 |
)
|
| 232 |
|
|
|
|
| 233 |
gr.Examples(
|
| 234 |
examples=[
|
| 235 |
-
["Remove the arrows from the blue sign and add the text of Detour ahead, no right turns.", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_source.png", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_ovis_u1_Image A.png"]
|
| 236 |
],
|
| 237 |
-
inputs=[instruction,
|
| 238 |
)
|
| 239 |
-
|
| 240 |
gr.Markdown(tos_markdown)
|
| 241 |
gr.Markdown(learn_more_markdown)
|
| 242 |
gr.Markdown(bibtext)
|
| 243 |
|
| 244 |
-
demo.launch(debug=True)
|
|
|
|
| 15 |
torch_dtype=torch.bfloat16
|
| 16 |
).to("cuda").eval()
|
| 17 |
|
| 18 |
+
TASK_CHOICES = [
|
| 19 |
+
"Pointwise - Image Editing",
|
| 20 |
+
"Pointwise - T2I Generation",
|
| 21 |
+
"Pairwise - Image Editing",
|
| 22 |
+
"Pairwise - T2I Generation",
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
# ============================================================
|
| 26 |
+
# Instruction Templates
|
| 27 |
+
# ============================================================
|
| 28 |
+
|
| 29 |
+
POINTWISE_EDITING_INSTRUCTION = """You are an expert image editing evaluator. Your task is to evaluate the quality of an edited image based on a source image and a user instruction. Afterwards, you need to suggest how to refine the original user request to produce better image edits (if any).
|
| 30 |
|
| 31 |
User Instruction: {prompt}
|
| 32 |
You are provided with two images:
|
|
|
|
| 92 |
# User Request Refinement:
|
| 93 |
## Refinement Comments: [Specific suggestions for improving the user request]
|
| 94 |
## Refined Request: [The improved, more specific user request for editing like a standard user instruction]"""
|
| 95 |
+
|
| 96 |
+
POINTWISE_T2I_INSTRUCTION = """You are an expert image generation evaluator. Your task is to evaluate the quality of a generated image based on a user instruction. Afterwards, you need to suggest how to refine the original user request to produce better image generation (if any).
|
|
|
|
| 97 |
|
| 98 |
User Instruction: {prompt}
|
| 99 |
+
Generated Image: <image>
|
|
|
|
| 100 |
|
| 101 |
Your task is to evaluate the Generated Image against the User Instruction.
|
| 102 |
To do this, you must first assess the image on three critical aspects, provide justifications and absolute scores in 1-4 scale.
|
|
|
|
| 109 |
- **1 (Major deviations):** Key elements are completely missing, altered, or contradicted. Instruction is ignored.
|
| 110 |
|
| 111 |
**2. Physical and Visual Quality** (Technical errors, composition, realism, and physics)
|
| 112 |
+
- **4 (No noticeable flaws):** The image is physically plausible (correct lighting, shadows, geometry, human anatomy). No visible artifacts (seams, blurring, noise). And all elements work together cohesively.
|
| 113 |
- **3 (Minor flaws):** Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
|
| 114 |
- **2 (Some flaws):** Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
|
| 115 |
- **1 (Severe flaws):** Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).
|
|
|
|
| 121 |
- **1 (Major deviations):** Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).
|
| 122 |
|
| 123 |
### Scoring Methodology (CRITICAL)
|
| 124 |
+
During assessment for each aspect, recall the initial user request, source image and the scoring rubrics of the aspect, provide scores with detailed justifications for each image and reflect fine-grained preferences.
|
| 125 |
+
1. **Anchor:** Have a global inspection based on the user request and the resulting generation. Determine the rough integer score level (1, 2, 3, or 4) according to the definitions provided .
|
| 126 |
2. **Justify and Adjust:** Do careful visual analysis and identify specific flaws in generation. Justify the score with concrete evidence and scoring logic. Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
|
| 127 |
+
- *Example:* deduct points from 4.0 for slight flaws if the assessed dimension is close to satisfaction. add increments from 1.0 or 2.0 based on severity of flaws.
|
| 128 |
|
| 129 |
+
Afterwards, try to construct a refined user request that helps the visual generation model to produce better image edits.
|
| 130 |
Think of the weaknesses identified in the judgement, then map them to instruction details and apply specific fixes.
|
| 131 |
Provide a final new user request that enrich the initial user request.
|
| 132 |
|
|
|
|
| 146 |
# Summary: [ Summary of the evaluation ]
|
| 147 |
|
| 148 |
# User Request Refinement:
|
| 149 |
+
## Refinement Comments: [ Specific suggestions for improving generation quality ]
|
| 150 |
+
## Refined Request: [ The improved, more specific user request ]"""
|
| 151 |
+
|
| 152 |
+
PAIRWISE_EDITING_INSTRUCTION = """You are an expert image editing evaluator. Your task is to evaluate the quality of an edited image based on a source image and a user instruction.
|
| 153 |
+
|
| 154 |
+
User Instruction: {prompt}
|
| 155 |
+
You are provided with three images:
|
| 156 |
+
1. Source Image <image>
|
| 157 |
+
2. Edited Image A <image>
|
| 158 |
+
3. Edited Image B <image>
|
| 159 |
+
|
| 160 |
+
Your task is to compare the two Edited Images according to the User Instruction and source image.
|
| 161 |
+
To do this, you must compare the image on four critical aspects, provide absolute scores for each image and determine who wins.
|
| 162 |
+
|
| 163 |
+
### Critical Aspects & Scoring Rubric
|
| 164 |
+
**1. Text Faithfulness** (How accurately does the output follow the instruction?)
|
| 165 |
+
- **4 (Full match):** All key elements (objects, colors, actions) are represented exactly as described. No hallucinations or unrequested changes.
|
| 166 |
+
- **3 (Minor mismatch):** Most key elements are present, but minor details are missing, incorrect, or slightly inaccurate.
|
| 167 |
+
- **2 (Some mismatch):** Some key elements are missing, altered, or interpreted incorrectly.
|
| 168 |
+
- **1 (Major deviations):** Key elements are completely missing, altered, or contradicted. Instruction is ignored.
|
| 169 |
+
|
| 170 |
+
**2. Image Faithfulness** (How well are the non-edited parts and key input elements preserved?)
|
| 171 |
+
- **4 (Uses input fully):** All relevant elements from the input (background, style, lighting, identity) are accurately preserved or transformed as instructed.
|
| 172 |
+
- **3 (Minor mismatch):** Most relevant elements are preserved, but a few aspects (e.g., background details, lighting consistency) are missing or incorrectly handled.
|
| 173 |
+
- **2 (Partial mismatch):** Some elements are carried over, but key aspects of the original image are lost or distorted.
|
| 174 |
+
- **1 (Fails to use input):** Key elements of the input image are ignored, misinterpreted, or destroyed.
|
| 175 |
+
|
| 176 |
+
**3. Physical and Visual Quality** (Technical errors, composition, realism, and physics)
|
| 177 |
+
- **4 (No noticeable flaws):** The image is physically plausible (correct lighting, shadows, geometry, anatomy). No visible artifacts (seams, blurring, noise).
|
| 178 |
+
- **3 (Minor flaws):** Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
|
| 179 |
+
- **2 (Some flaws):** Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
|
| 180 |
+
- **1 (Severe flaws):** Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).
|
| 181 |
+
|
| 182 |
+
**4. Text Rendering** (Only if the instruction involves generating text)
|
| 183 |
+
- **4 (Full match):** Text is correct, legible, and integrated well.
|
| 184 |
+
- **3 (Mostly match):** Minor misspellings or inconsistent capitalization.
|
| 185 |
+
- **2 (Partial match):** Major misspellings or distorted text.
|
| 186 |
+
- **1 (Major deviations):** Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).
|
| 187 |
+
|
| 188 |
+
### Scoring Methodology (CRITICAL)
|
| 189 |
+
During assessment for each aspect, recall the initial user request, source image and the scoring rubrics of the aspect, provide scores with detailed justifications for each image and reflect fine-grained preferences.
|
| 190 |
+
1. **Anchor:** Have a global inspection. Determine the rough integer score level (1, 2, 3, or 4) according to the definitions provided (you can also refer to the given human preference or rating).
|
| 191 |
+
2. **Justify and Adjust:** Do careful visual analysis and identify specific flaws in generation. Justify the score with concrete evidence and scoring logic. Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
|
| 192 |
+
- *Example:* deduct points from 4.0 for slight flaws if the assessed dimension is close to satisfaction. add increments from 1.0 or 2.0 based on severity of flaws.
|
| 193 |
+
3. **Compare:** Ensure the difference between Score A and Score B reflects the correct preference.
|
| 194 |
+
|
| 195 |
+
Output your evaluation in the following format:
|
| 196 |
+
# User Request Analysis
|
| 197 |
+
[ understanding the user request, and what needs to be considered during image editing ]
|
| 198 |
+
# Detailed Judgement
|
| 199 |
+
1. Text Faithfulness:
|
| 200 |
+
## Justification: [ Comparative Analysis: Given the request, source image and the scoring rubrics, which image is better in this dimension? Provide concrete evidence and scoring logic. e.g., Image A is roughly [X] score level because [reason]. Deduct/Add points for [specific details] to reach final score. ]
|
| 201 |
+
## Score A: [float score for Image A]
|
| 202 |
+
## Score B: [float score for Image B]
|
| 203 |
+
## Winner: [Image A or Image B or It's a tie]
|
| 204 |
+
2. Image Faithfulness:
|
| 205 |
+
## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic for image faithfulness. ]
|
| 206 |
+
## Score A: [float score for Image A]
|
| 207 |
+
## Score B: [float score for Image B]
|
| 208 |
+
## Winner: [Image A or Image B or It's a tie]
|
| 209 |
+
3. Physical and Visual Quality:
|
| 210 |
+
## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since physical/visual quality is often not perfect, give 4.0 sparingly only when it is perfectly realistic. ]
|
| 211 |
+
## Score A: [float score for Image A]
|
| 212 |
+
## Score B: [float score for Image B]
|
| 213 |
+
## Winner: [Image A or Image B or It's a tie]
|
| 214 |
+
4. Text Rendering:
|
| 215 |
+
## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since text rendering is often challenging, give 4.0 sparingly only if it is perfect. ]
|
| 216 |
+
## Score A: [float score for Image A]
|
| 217 |
+
## Score B: [float score for Image B]
|
| 218 |
+
## Winner: [N/A or Image A or Image B or It's a tie]
|
| 219 |
+
# Summary: [Summary of the evaluation]"""
|
| 220 |
+
|
| 221 |
+
PAIRWISE_T2I_INSTRUCTION = """You are an expert image evaluator. Your task is to evaluate the quality of two generated images based on a user instruction.
|
| 222 |
+
|
| 223 |
+
User Instruction: {prompt}
|
| 224 |
+
You are provided with two images:
|
| 225 |
+
1. Generated Image A <image>
|
| 226 |
+
2. Generated Image B <image>
|
| 227 |
+
|
| 228 |
+
Your task is to compare the two Generated Images according to the User Instruction.
|
| 229 |
+
To do this, you must compare the image on three critical aspects, provide absolute scores for each image and determine who wins.
|
| 230 |
+
|
| 231 |
+
### Critical Aspects & Scoring Rubric
|
| 232 |
+
**1. Text Faithfulness** (How accurately does the output follow the instruction?)
|
| 233 |
+
- **4 (Full match):** All key elements (objects, colors, actions) are represented exactly as described. No hallucinations or unrequested changes.
|
| 234 |
+
- **3 (Minor mismatch):** Most key elements are present, but minor details are missing, incorrect, or slightly inaccurate.
|
| 235 |
+
- **2 (Some mismatch):** Some key elements are missing, altered, or interpreted incorrectly.
|
| 236 |
+
- **1 (Major deviations):** Key elements are completely missing, altered, or contradicted. Instruction is ignored.
|
| 237 |
+
|
| 238 |
+
**2. Physical and Visual Quality** (Technical errors, composition, realism, and physics)
|
| 239 |
+
- **4 (No noticeable flaws):** The image is physically plausible (correct lighting, shadows, geometry, anatomy). No visible artifacts (seams, blurring, noise).
|
| 240 |
+
- **3 (Minor flaws):** Small inaccuracies that are noticeable but not strongly disruptive (e.g., slight lighting mismatch, minor texture issues).
|
| 241 |
+
- **2 (Some flaws):** Clear physical or visual errors that disrupt the image (e.g., incorrect perspective, "floating" objects, wrong shadow direction, obvious seams).
|
| 242 |
+
- **1 (Severe flaws):** Major physical/visual errors (e.g., impossible geometry, distorted anatomy, garbled objects, severe artifacts).
|
| 243 |
+
|
| 244 |
+
**3. Text Rendering** (Only if the instruction involves generating text)
|
| 245 |
+
- **4 (Full match):** Text is correct, legible, and integrated well.
|
| 246 |
+
- **3 (Mostly match):** Minor misspellings or inconsistent capitalization.
|
| 247 |
+
- **2 (Partial match):** Major misspellings or distorted text.
|
| 248 |
+
- **1 (Major deviations):** Text is unreadable, severely distorted, or missing. (Use N/A if no text generation is required).
|
| 249 |
+
|
| 250 |
+
### Scoring Methodology (CRITICAL)
|
| 251 |
+
For every aspect, you must first recap the initial user request and the scoring rubrics of the aspect, then follow this "Anchor and Adjust" process to compare and score the two images:
|
| 252 |
+
1. **Anchor:** Determine the rough integer score level (1, 2, 3, or 4) based on the definitions provided.
|
| 253 |
+
2. **Adjust:** Fine-tune this anchor score into a float value. Add small increments for exceptional execution or deduct points for specific flaws.
|
| 254 |
+
- *Example:* deduct points from 4.0 for slight flaws if the assessed dimension is close to satisfaction. add increments from 1.0 or 2.0 based on severity of flaws.
|
| 255 |
+
3. **Compare:** Ensure the difference between Score A and Score B reflects the magnitude of the preference. (e.g., A large gap implies one is significantly better; if one is only slightly better, the fine-grained scorings based on identified flaws help explain the preference).
|
| 256 |
+
|
| 257 |
+
Output your evaluation in the following format:
|
| 258 |
+
# User Request Analysis
|
| 259 |
+
[ understanding the user request, try to analyze or decompose the user request deeper. Think of what the request might imply or what needs to be inferred to successfully execute the request. ]
|
| 260 |
+
# Detailed Judgement
|
| 261 |
+
1. Text Faithfulness:
|
| 262 |
+
## Justification: [ Comparative Analysis: Given the request and the scoring rubrics, which image is better in this dimension? Provide concrete evidence and scoring logic. e.g., Image A is roughly [X] score level because [reason]. Deduct/Add points for [specific details] to reach final score. ]
|
| 263 |
+
## Score A: [float score for Image A]
|
| 264 |
+
## Score B: [float score for Image B]
|
| 265 |
+
## Winner: [Image A or Image B or It's a tie]
|
| 266 |
+
2. Physical and Visual Quality:
|
| 267 |
+
## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since physical/visual quality is often not perfect, give 4.0 sparingly only when it is perfectly realistic. ]
|
| 268 |
+
## Score A: [float score for Image A]
|
| 269 |
+
## Score B: [float score for Image B]
|
| 270 |
+
## Winner: [Image A or Image B or It's a tie]
|
| 271 |
+
3. Text Rendering:
|
| 272 |
+
## Justification: [ Similar to above. Comparative analysis with concrete evidence and scoring logic. Since text rendering is often challenging, give 4.0 sparingly only if it is perfect. ]
|
| 273 |
+
## Score A: [float score for Image A]
|
| 274 |
+
## Score B: [float score for Image B]
|
| 275 |
+
## Winner: [N/A or Image A or Image B or It's a tie]
|
| 276 |
+
# Summary: [Summary of the evaluation]"""
|
| 277 |
+
|
| 278 |
+
def create_instruction(prompt, task_type):
    """Build the evaluator instruction text for the selected task.

    Args:
        prompt: The user's instruction; substituted for the ``{prompt}``
            field of the chosen template.
        task_type: The task label selecting which template to use.

    Returns:
        The chosen template with ``prompt`` filled in.

    Raises:
        ValueError: If ``task_type`` matches none of the known task labels.
    """
    # Pair each task label with the template it selects.
    templates_by_task = (
        ("Pointwise - Image Editing", POINTWISE_EDITING_INSTRUCTION),
        ("Pointwise - T2I Generation", POINTWISE_T2I_INSTRUCTION),
        ("Pairwise - Image Editing", PAIRWISE_EDITING_INSTRUCTION),
        ("Pairwise - T2I Generation", PAIRWISE_T2I_INSTRUCTION),
    )

    for label, template in templates_by_task:
        if task_type == label:
            return template.format(prompt=prompt)

    raise ValueError(f"Unknown task type: {task_type}")
|
| 290 |
+
|
| 291 |
+
def update_ui_for_task(task_type):
    """Return visibility/label updates for the three image components.

    Args:
        task_type: The task label selected in the UI.

    Returns:
        A 3-tuple of ``gr.update(...)`` results, in the order
        (image1, image2, image3). Slots that the task does not use are
        hidden and have their value reset to ``None``.

    Raises:
        ValueError: If ``task_type`` is not one of the known task labels.
            Without this, the function would fall through and return
            ``None`` instead of the three updates.
    """
    # (visible, label) for each of image1, image2, image3, per task.
    layouts = {
        "Pointwise - Image Editing": (
            (True, "Source Image"),
            (True, "Edited Image"),
            (False, "Image B"),
        ),
        "Pointwise - T2I Generation": (
            (True, "Generated Image"),
            (False, "(unused)"),
            (False, "(unused)"),
        ),
        "Pairwise - Image Editing": (
            (True, "Source Image"),
            (True, "Image A"),
            (True, "Image B"),
        ),
        "Pairwise - T2I Generation": (
            (True, "Image A"),
            (True, "Image B"),
            (False, "(unused)"),
        ),
    }

    try:
        slots = layouts[task_type]
    except (KeyError, TypeError):
        # Same message format as create_instruction's unknown-task error.
        raise ValueError(f"Unknown task type: {task_type}") from None

    # Visible slots keep whatever value they hold; hidden slots are cleared.
    return tuple(
        gr.update(visible=True, label=label)
        if visible
        else gr.update(visible=False, label=label, value=None)
        for visible, label in slots
    )
|
| 317 |
|
| 318 |
@spaces.GPU
|
| 319 |
+
def model_inference(task_type, instruction_text, image1, image2, image3):
|
| 320 |
+
"""Run model inference based on the selected task type and uploaded images."""
|
| 321 |
+
# Validate inputs and collect images based on task
|
| 322 |
+
if task_type == "Pointwise - Image Editing":
|
| 323 |
+
if not image1 or not image2:
|
| 324 |
+
yield "Error: Please upload both Source Image and Edited Image."
|
| 325 |
+
return
|
| 326 |
+
files = [image1, image2]
|
| 327 |
+
elif task_type == "Pointwise - T2I Generation":
|
| 328 |
+
if not image1:
|
| 329 |
+
yield "Error: Please upload the Generated Image."
|
| 330 |
+
return
|
| 331 |
+
files = [image1]
|
| 332 |
+
elif task_type == "Pairwise - Image Editing":
|
| 333 |
+
if not image1 or not image2 or not image3:
|
| 334 |
+
yield "Error: Please upload Source Image, Image A, and Image B."
|
| 335 |
+
return
|
| 336 |
+
files = [image1, image2, image3]
|
| 337 |
+
elif task_type == "Pairwise - T2I Generation":
|
| 338 |
+
if not image1 or not image2:
|
| 339 |
+
yield "Error: Please upload both Image A and Image B."
|
| 340 |
+
return
|
| 341 |
+
files = [image1, image2]
|
| 342 |
+
else:
|
| 343 |
+
yield "Error: Unknown task type selected."
|
| 344 |
return
|
| 345 |
|
|
|
|
|
|
|
|
|
|
| 346 |
# Load images
|
| 347 |
loaded_images = [load_image(image) for image in files]
|
| 348 |
+
|
| 349 |
+
# Build instruction with <image> placeholders
|
| 350 |
+
instruction = create_instruction(instruction_text, task_type)
|
| 351 |
+
|
| 352 |
# Interleave images into the <image> placeholders
|
| 353 |
content = []
|
| 354 |
parts = instruction.split("<image>")
|
| 355 |
+
for i, part in enumerate(parts):
|
| 356 |
+
content.append({"type": "text", "text": part})
|
| 357 |
+
if i < len(loaded_images):
|
| 358 |
+
content.append({"type": "image", "image": loaded_images[i]})
|
| 359 |
+
|
| 360 |
+
messages = [{"role": "user", "content": content}]
|
| 361 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
# Generate and stream text
|
| 363 |
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 364 |
inputs = processor(
|
|
|
|
| 367 |
return_tensors="pt",
|
| 368 |
padding=True,
|
| 369 |
).to("cuda")
|
| 370 |
+
|
| 371 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
| 372 |
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
|
| 373 |
|
|
|
|
| 379 |
buffer += new_text
|
| 380 |
yield buffer
|
| 381 |
|
| 382 |
+
# ============================================================
|
| 383 |
+
# Gradio UI
|
| 384 |
+
# ============================================================
|
| 385 |
+
|
| 386 |
with gr.Blocks() as demo:
|
| 387 |
gr.HTML(html_header)
|
| 388 |
+
|
| 389 |
+
with gr.Row():
|
| 390 |
+
task_selector = gr.Radio(
|
| 391 |
+
choices=TASK_CHOICES,
|
| 392 |
+
value="Pointwise - Image Editing",
|
| 393 |
+
label="Task Type",
|
| 394 |
+
info="Select the evaluation task",
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
with gr.Row():
|
| 398 |
with gr.Column(scale=1):
|
| 399 |
+
instruction = gr.Textbox(
|
| 400 |
+
label="User Instruction",
|
| 401 |
+
lines=5,
|
| 402 |
+
placeholder="Enter the user instruction / prompt here..."
|
| 403 |
+
)
|
| 404 |
submit_btn = gr.Button("Evaluate", variant="primary")
|
| 405 |
+
|
| 406 |
+
with gr.Column(scale=1):
|
| 407 |
+
image1 = gr.Image(
|
| 408 |
+
label="Source Image",
|
| 409 |
+
type="filepath",
|
| 410 |
+
sources=["upload", "clipboard"],
|
| 411 |
+
)
|
| 412 |
+
|
| 413 |
with gr.Column(scale=1):
|
| 414 |
+
image2 = gr.Image(
|
| 415 |
+
label="Edited Image",
|
| 416 |
+
type="filepath",
|
| 417 |
+
sources=["upload", "clipboard"],
|
| 418 |
+
)
|
| 419 |
+
|
| 420 |
with gr.Column(scale=1):
|
| 421 |
+
image3 = gr.Image(
|
| 422 |
+
label="Image B",
|
| 423 |
+
type="filepath",
|
| 424 |
+
sources=["upload", "clipboard"],
|
| 425 |
+
visible=False,
|
| 426 |
+
)
|
| 427 |
+
|
| 428 |
output = gr.Textbox(label="Evaluation Result", lines=25)
|
| 429 |
|
| 430 |
+
# Wire task selector to update image visibility/labels
|
| 431 |
+
task_selector.change(
|
| 432 |
+
fn=update_ui_for_task,
|
| 433 |
+
inputs=[task_selector],
|
| 434 |
+
outputs=[image1, image2, image3],
|
| 435 |
+
)
|
| 436 |
+
|
| 437 |
+
# Wire evaluate button
|
| 438 |
submit_btn.click(
|
| 439 |
fn=model_inference,
|
| 440 |
+
inputs=[task_selector, instruction, image1, image2, image3],
|
| 441 |
+
outputs=output,
|
| 442 |
)
|
| 443 |
|
| 444 |
+
# Examples for different tasks
|
| 445 |
gr.Examples(
|
| 446 |
examples=[
|
| 447 |
+
["Pointwise - Image Editing", "Remove the arrows from the blue sign and add the text of Detour ahead, no right turns.", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_source.png", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_ovis_u1_Image A.png", None],
|
| 448 |
],
|
| 449 |
+
inputs=[task_selector, instruction, image1, image2, image3],
|
| 450 |
)
|
| 451 |
+
|
| 452 |
gr.Markdown(tos_markdown)
|
| 453 |
gr.Markdown(learn_more_markdown)
|
| 454 |
gr.Markdown(bibtext)
|
| 455 |
|
| 456 |
+
demo.launch(debug=True)
|