Chris Addis committed on
Commit ·
612285f
1
Parent(s): 9883bdb
openrouter
Browse files- app.py +59 -19
- library/utils_prompt.py +8 -0
app.py
CHANGED
|
@@ -36,6 +36,15 @@ PREFERENCES_FILE = "data/user_preferences.csv"
|
|
| 36 |
# Ensure directory exists
|
| 37 |
os.makedirs(os.path.dirname(PREFERENCES_FILE), exist_ok=True)
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def save_preference(image_path, model_a_text, model_b_text, preferred_model):
|
| 40 |
"""Save user preference data to a CSV file"""
|
| 41 |
# Check if file exists, create with header if not
|
|
@@ -110,14 +119,22 @@ def create_demo():
|
|
| 110 |
file_count="multiple"
|
| 111 |
)
|
| 112 |
|
| 113 |
-
# Add model selection dropdown
|
| 114 |
model_choice = gr.Dropdown(
|
| 115 |
-
choices=["
|
| 116 |
label="Select Model",
|
| 117 |
-
value="
|
| 118 |
visible=True
|
| 119 |
)
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
# Add comparison mode checkbox
|
| 122 |
comparison_mode = gr.Checkbox(
|
| 123 |
label="Enable A/B Testing Mode",
|
|
@@ -127,8 +144,8 @@ def create_demo():
|
|
| 127 |
|
| 128 |
# Label the models in comparison mode
|
| 129 |
with gr.Group(visible=False) as comparison_labels:
|
| 130 |
-
gr.Markdown("### Model A:
|
| 131 |
-
gr.Markdown("### Model B:
|
| 132 |
|
| 133 |
# Preview gallery for uploaded images
|
| 134 |
gr.Markdown("### Uploaded Images")
|
|
@@ -188,7 +205,7 @@ def create_demo():
|
|
| 188 |
with gr.Row() as model_outputs:
|
| 189 |
# Model A output
|
| 190 |
with gr.Column():
|
| 191 |
-
gr.Markdown("#### Model A (
|
| 192 |
model_a_text = gr.Textbox(
|
| 193 |
label="",
|
| 194 |
value="",
|
|
@@ -201,7 +218,7 @@ def create_demo():
|
|
| 201 |
|
| 202 |
# Model B output
|
| 203 |
with gr.Column():
|
| 204 |
-
gr.Markdown("#### Model B (
|
| 205 |
model_b_text = gr.Textbox(
|
| 206 |
label="",
|
| 207 |
value="",
|
|
@@ -225,6 +242,7 @@ def create_demo():
|
|
| 225 |
def toggle_comparison_mode(enable_comparison):
|
| 226 |
return {
|
| 227 |
model_choice: not enable_comparison,
|
|
|
|
| 228 |
single_model_view: not enable_comparison,
|
| 229 |
comparison_view: enable_comparison,
|
| 230 |
comparison_labels: enable_comparison
|
|
@@ -233,7 +251,7 @@ def create_demo():
|
|
| 233 |
comparison_mode.change(
|
| 234 |
fn=toggle_comparison_mode,
|
| 235 |
inputs=[comparison_mode],
|
| 236 |
-
outputs=[model_choice, single_model_view, comparison_view, comparison_labels]
|
| 237 |
)
|
| 238 |
|
| 239 |
# Handle file uploads - store files for use during analysis
|
|
@@ -253,10 +271,13 @@ def create_demo():
|
|
| 253 |
)
|
| 254 |
|
| 255 |
# Function to analyze images
|
| 256 |
-
def analyze_images(image_paths, model_choice, comparison_mode, filenames):
|
| 257 |
if not image_paths:
|
| 258 |
return [], [], 0, "", "No images", "", "", "", [], ""
|
| 259 |
|
|
|
|
|
|
|
|
|
|
| 260 |
image_results = []
|
| 261 |
empty_preferences = [None] * len(image_paths) # Initialize with no preferences
|
| 262 |
|
|
@@ -270,15 +291,29 @@ def create_demo():
|
|
| 270 |
try:
|
| 271 |
# Open the image file for analysis
|
| 272 |
img = Image.open(image_path)
|
| 273 |
-
prompt0 =
|
| 274 |
|
| 275 |
# In comparison mode, always generate both outputs
|
| 276 |
if comparison_mode:
|
| 277 |
-
# Generate Model A output (
|
| 278 |
-
model_a_result =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
-
# Generate Model B output (
|
| 281 |
-
model_b_result =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
# Add to results
|
| 284 |
image_results.append({
|
|
@@ -289,10 +324,14 @@ def create_demo():
|
|
| 289 |
})
|
| 290 |
else:
|
| 291 |
# Use the selected model
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
|
| 297 |
# For single mode, we still keep the structure compatible with comparison mode
|
| 298 |
image_results.append({
|
|
@@ -437,7 +476,7 @@ def create_demo():
|
|
| 437 |
# Connect the analyze button
|
| 438 |
analyze_button.click(
|
| 439 |
fn=analyze_images,
|
| 440 |
-
inputs=[image_state, model_choice, comparison_mode, filename_state],
|
| 441 |
outputs=[
|
| 442 |
all_images, all_results, current_index, current_image, image_counter,
|
| 443 |
analysis_text, model_a_text, model_b_text, preference_state,
|
|
@@ -481,6 +520,7 @@ def create_demo():
|
|
| 481 |
This demo generates alt-text for uploaded images.
|
| 482 |
|
| 483 |
- Upload one or more images using the upload button
|
|
|
|
| 484 |
- Choose between standard mode or A/B testing mode
|
| 485 |
- In standard mode, select one model to generate alt-text
|
| 486 |
- In A/B testing mode, compare outputs from two models and select your preference
|
|
|
|
| 36 |
# Ensure directory exists
|
| 37 |
os.makedirs(os.path.dirname(PREFERENCES_FILE), exist_ok=True)
|
| 38 |
|
| 39 |
+
def get_sys_prompt(length="medium"):
    """Return the system (developer) prompt for alt-text generation.

    Parameters
    ----------
    length : str
        One of "short", "medium", or "long". Any value other than
        "short" or "medium" falls through to the long variant.

    Returns
    -------
    str
        The museum-curator system prompt with the character-length
        constraint matching the requested response length.
    """
    if length == "short":
        # Short alt-text per WCAG 2.1: hard cap of 130 characters.
        dev_prompt = """You are a museum curator tasked with generating alt-text (as defined in WCAG 2.1) of museum objects for visually impaired and blind users from images. Use British English and follow museum accessibility best practices. Do not start with phrases like 'The image shows' or 'This is an image of'. Be precise, concise and avoid filler and subjective statements. Responses should be a maximum of 130 characters."""
    elif length == "medium":
        # Medium long-description: target window of 250-300 characters.
        dev_prompt = """You are a museum curator tasked with generating long descriptions (as defined in WCAG 2.1) of museum objects for visually impaired and blind users from images. Use British English and follow museum accessibility best practices. Do not start with phrases like 'The image shows' or 'This is an image of'. Be precise, concise and avoid filler and subjective statements. Responses should be between 250-300 characters in length."""
    else:
        # Long long-description: hard cap of 450 characters.
        dev_prompt = """You are a museum curator tasked with generating long descriptions (as defined in WCAG 2.1) of museum objects for visually impaired and blind users from images. Use British English and follow museum accessibility best practices. Do not start with phrases like 'The image shows' or 'This is an image of'. Be precise, concise and avoid filler and subjective statements. Responses should be a maximum of 450 characters."""
    return dev_prompt
|
| 47 |
+
|
| 48 |
def save_preference(image_path, model_a_text, model_b_text, preferred_model):
|
| 49 |
"""Save user preference data to a CSV file"""
|
| 50 |
# Check if file exists, create with header if not
|
|
|
|
| 119 |
file_count="multiple"
|
| 120 |
)
|
| 121 |
|
| 122 |
+
# Add model selection dropdown with new model choices
|
| 123 |
model_choice = gr.Dropdown(
|
| 124 |
+
choices=["google/gemini-2.0-flash-001", "anthropic/claude-3.7-sonnet", "openai/chatgpt-4o-latest"],
|
| 125 |
label="Select Model",
|
| 126 |
+
value="anthropic/claude-3.7-sonnet",
|
| 127 |
visible=True
|
| 128 |
)
|
| 129 |
|
| 130 |
+
# Add response length selection
|
| 131 |
+
length_choice = gr.Radio(
|
| 132 |
+
choices=["short", "medium", "long"],
|
| 133 |
+
label="Response Length",
|
| 134 |
+
value="medium",
|
| 135 |
+
info="Short: max 130 chars | Medium: 250-300 chars | Long: max 450 chars"
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
# Add comparison mode checkbox
|
| 139 |
comparison_mode = gr.Checkbox(
|
| 140 |
label="Enable A/B Testing Mode",
|
|
|
|
| 144 |
|
| 145 |
# Label the models in comparison mode
|
| 146 |
with gr.Group(visible=False) as comparison_labels:
|
| 147 |
+
gr.Markdown("### Model A: Claude")
|
| 148 |
+
gr.Markdown("### Model B: GPT-4o")
|
| 149 |
|
| 150 |
# Preview gallery for uploaded images
|
| 151 |
gr.Markdown("### Uploaded Images")
|
|
|
|
| 205 |
with gr.Row() as model_outputs:
|
| 206 |
# Model A output
|
| 207 |
with gr.Column():
|
| 208 |
+
gr.Markdown("#### Model A (Claude)")
|
| 209 |
model_a_text = gr.Textbox(
|
| 210 |
label="",
|
| 211 |
value="",
|
|
|
|
| 218 |
|
| 219 |
# Model B output
|
| 220 |
with gr.Column():
|
| 221 |
+
gr.Markdown("#### Model B (GPT-4o)")
|
| 222 |
model_b_text = gr.Textbox(
|
| 223 |
label="",
|
| 224 |
value="",
|
|
|
|
| 242 |
def toggle_comparison_mode(enable_comparison):
|
| 243 |
return {
|
| 244 |
model_choice: not enable_comparison,
|
| 245 |
+
length_choice: not enable_comparison,
|
| 246 |
single_model_view: not enable_comparison,
|
| 247 |
comparison_view: enable_comparison,
|
| 248 |
comparison_labels: enable_comparison
|
|
|
|
| 251 |
comparison_mode.change(
|
| 252 |
fn=toggle_comparison_mode,
|
| 253 |
inputs=[comparison_mode],
|
| 254 |
+
outputs=[model_choice, length_choice, single_model_view, comparison_view, comparison_labels]
|
| 255 |
)
|
| 256 |
|
| 257 |
# Handle file uploads - store files for use during analysis
|
|
|
|
| 271 |
)
|
| 272 |
|
| 273 |
# Function to analyze images
|
| 274 |
+
def analyze_images(image_paths, model_choice, length_choice, comparison_mode, filenames):
|
| 275 |
if not image_paths:
|
| 276 |
return [], [], 0, "", "No images", "", "", "", [], ""
|
| 277 |
|
| 278 |
+
# Get system prompt based on length selection
|
| 279 |
+
sys_prompt = get_sys_prompt(length_choice)
|
| 280 |
+
|
| 281 |
image_results = []
|
| 282 |
empty_preferences = [None] * len(image_paths) # Initialize with no preferences
|
| 283 |
|
|
|
|
| 291 |
try:
|
| 292 |
# Open the image file for analysis
|
| 293 |
img = Image.open(image_path)
|
| 294 |
+
prompt0 = prompt_new() # Using the new prompt function
|
| 295 |
|
| 296 |
# In comparison mode, always generate both outputs
|
| 297 |
if comparison_mode:
|
| 298 |
+
# Generate Model A output (Claude)
|
| 299 |
+
model_a_result = OR.generate_caption(
|
| 300 |
+
img,
|
| 301 |
+
model="anthropic/claude-3.7-sonnet",
|
| 302 |
+
max_image_size=512,
|
| 303 |
+
prompt=prompt0,
|
| 304 |
+
prompt_dev=sys_prompt,
|
| 305 |
+
temperature=1
|
| 306 |
+
)
|
| 307 |
|
| 308 |
+
# Generate Model B output (GPT-4o)
|
| 309 |
+
model_b_result = OR.generate_caption(
|
| 310 |
+
img,
|
| 311 |
+
model="openai/chatgpt-4o-latest",
|
| 312 |
+
max_image_size=512,
|
| 313 |
+
prompt=prompt0,
|
| 314 |
+
prompt_dev=sys_prompt,
|
| 315 |
+
temperature=1
|
| 316 |
+
)
|
| 317 |
|
| 318 |
# Add to results
|
| 319 |
image_results.append({
|
|
|
|
| 324 |
})
|
| 325 |
else:
|
| 326 |
# Use the selected model
|
| 327 |
+
result = OR.generate_caption(
|
| 328 |
+
img,
|
| 329 |
+
model=model_choice,
|
| 330 |
+
max_image_size=512,
|
| 331 |
+
prompt=prompt0,
|
| 332 |
+
prompt_dev=sys_prompt,
|
| 333 |
+
temperature=1
|
| 334 |
+
)
|
| 335 |
|
| 336 |
# For single mode, we still keep the structure compatible with comparison mode
|
| 337 |
image_results.append({
|
|
|
|
| 476 |
# Connect the analyze button
|
| 477 |
analyze_button.click(
|
| 478 |
fn=analyze_images,
|
| 479 |
+
inputs=[image_state, model_choice, length_choice, comparison_mode, filename_state],
|
| 480 |
outputs=[
|
| 481 |
all_images, all_results, current_index, current_image, image_counter,
|
| 482 |
analysis_text, model_a_text, model_b_text, preference_state,
|
|
|
|
| 520 |
This demo generates alt-text for uploaded images.
|
| 521 |
|
| 522 |
- Upload one or more images using the upload button
|
| 523 |
+
- Choose a model and response length for generation
|
| 524 |
- Choose between standard mode or A/B testing mode
|
| 525 |
- In standard mode, select one model to generate alt-text
|
| 526 |
- In A/B testing mode, compare outputs from two models and select your preference
|
library/utils_prompt.py
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
def prompt_new(title=None):
|
| 2 |
if title == None:
|
| 3 |
title_info = {}
|
|
|
|
| 1 |
+
def get_sys_prompt(length="medium"):
    """Return the system (developer) prompt for alt-text generation.

    Parameters
    ----------
    length : str
        One of "short", "medium", or "long". Any value other than
        "short" or "medium" falls through to the long variant.

    Returns
    -------
    str
        The museum-curator system prompt with the character-length
        constraint matching the requested response length.
    """
    if length == "short":
        # Short alt-text per WCAG 2.1: hard cap of 130 characters.
        dev_prompt = """You are a museum curator tasked with generating alt-text (as defined in WCAG 2.1) of museum objects for visually impaired and blind users from images. Use British English and follow museum accessibility best practices. Do not start with phrases like 'The image shows' or 'This is an image of'. Be precise, concise and avoid filler and subjective statements. Responses should be a maximum of 130 characters."""
    elif length == "medium":
        # Medium long-description: target window of 250-300 characters.
        dev_prompt = """You are a museum curator tasked with generating long descriptions (as defined in WCAG 2.1) of museum objects for visually impaired and blind users from images. Use British English and follow museum accessibility best practices. Do not start with phrases like 'The image shows' or 'This is an image of'. Be precise, concise and avoid filler and subjective statements. Responses should be between 250-300 characters in length."""
    else:
        # Long long-description: hard cap of 450 characters.
        dev_prompt = """You are a museum curator tasked with generating long descriptions (as defined in WCAG 2.1) of museum objects for visually impaired and blind users from images. Use British English and follow museum accessibility best practices. Do not start with phrases like 'The image shows' or 'This is an image of'. Be precise, concise and avoid filler and subjective statements. Responses should be a maximum of 450 characters."""
    # BUG FIX: the original added function body never returned dev_prompt,
    # so it implicitly returned None (its app.py twin does return it).
    return dev_prompt
dev_prompt = """You are a museum curator tasked with generating long descriptions (as defined in WCAG 2.1) of museum objects for visually impaired and blind users from images. Use British English and follow museum accessibility best practices. Do not start with phrases like 'The image shows' or 'This is an image of'. Be precise, concise and avoid filler and subjective statements. Repsonses should be a maxium of 450 characters."""
|
| 8 |
+
|
| 9 |
def prompt_new(title=None):
|
| 10 |
if title == None:
|
| 11 |
title_info = {}
|