SorrowTea committed on
Commit
01f4cb5
·
0 Parent(s):

Initial PhotoBench-Protected Leaderboard

Browse files
.gitignore ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data files
2
+ data/gt/
3
+ *.DS_Store
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.pyc
8
+ *.pyo
9
+ *.egg-info/
10
+ .venv/
11
+ venv/
12
+
13
+ # IDE
14
+ .vscode/
15
+ .idea/
16
+
17
+ # OS
18
+ *.swp
19
+ *.swo
20
+ *~
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: PhotoBench-Protected
3
+ emoji: 🛡️
4
+ colorFrom: green
5
+ colorTo: green
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: true
9
+ license: apache-2.0
10
+ short_description: PhotoBench-Protected agent-only leaderboard
11
+ tags:
12
+ - leaderboard
13
+ - image-retrieval
14
+ - benchmark
15
+ - agent
16
+ ---
17
+
18
+ # PhotoBench-Protected Leaderboard
19
+
20
+ Agent-only leaderboard for PhotoBench-Protected, where only partial captions, embeddings, and metadata are provided.
21
+
22
+ ## Quick Start
23
+
24
+ 1. Download the protected dataset from [PhotoBench-Protected Dataset](https://huggingface.co/datasets/SorrowTea/PhotoBench-Protected)
25
+ 2. Build your agent-based retrieval system using the provided features
26
+ 3. Submit predictions in JSON format
27
+
28
+ ## Important Notice
29
+
30
+ **PhotoBench-Protected** is our initial open-source release with limited information sources. This benchmark focuses exclusively on **agent planning** ability.
31
+
32
+ - For **unrestricted retrieval with raw images**, please use the [full PhotoBench Leaderboard](https://huggingface.co/spaces/SorrowTea/PhotoBench/).
33
+ - The test sets for PhotoBench-Protected and PhotoBench (full) are **different**.
34
+ - Please confirm you are submitting to the correct leaderboard before uploading.
35
+
36
+ ## Evaluation Metrics
37
+
38
+ - **Recall@k** for k ∈ {1, 5, 10, 20, 50, 100}
39
+ - **NDCG@k** for k ∈ {1, 5, 10, 20, 50, 100}
app.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import uuid
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+
9
+ from src.about import (
10
+ EVALUATION_INFO,
11
+ INTRODUCTION,
12
+ NAVIGATION,
13
+ SUBMISSION_GUIDE,
14
+ TITLE,
15
+ custom_css,
16
+ )
17
+ from src.evaluator import Evaluator
18
+ from src.leaderboard_manager import (
19
+ ALL_METRIC_COLS,
20
+ DEFAULT_DISPLAY_METRICS,
21
+ LeaderboardManager,
22
+ )
23
+ from src.storage import (
24
+ check_rate_limit,
25
+ record_submission_time,
26
+ save_submission,
27
+ )
28
+
29
# Initialize components
def _build_manager():
    """Create the LeaderboardManager, or log a warning and return None on failure."""
    try:
        return LeaderboardManager()
    except Exception as e:
        print(f"[WARN] Failed to init LeaderboardManager: {e}")
        return None


# `manager` may be None; the handlers below check for that before using it.
manager = _build_manager()

evaluator = Evaluator()
37
+
38
+
39
def refresh_leaderboard(sort_by):
    """Build the leaderboard table shown in the UI.

    Args:
        sort_by: Name of the metric column to rank by (the value of the
            "Sort by" dropdown). An empty or ``None`` value falls back to
            ``"Recall@10"``, the dropdown's initial value.

    Returns:
        The ``pandas.DataFrame`` produced by ``manager.get_display_df`` for
        the "Agent" method, limited to the top 30 rows. If the manager could
        not be initialised, an empty frame with ``rank`` and ``model_name``
        columns is returned. If the lookup raises, the failure is logged and a
        one-row frame with a single ``Error`` column holding the exception
        message is returned so the UI shows the problem instead of crashing.
    """
    if manager is None:
        return pd.DataFrame(columns=["rank", "model_name"])

    try:
        return manager.get_display_df(
            method_filter="Agent",
            sort_by=sort_by or "Recall@10",
            ascending=False,
            top_n=30,
            metric_cols=DEFAULT_DISPLAY_METRICS,
        )
    except Exception as e:
        # UI boundary: surface the message in the table, but also leave a
        # trace in the server log so the failure is not lost.
        print(f"[WARN] Failed to build leaderboard: {e}")
        return pd.DataFrame({"Error": [str(e)]})
52
+
53
+
54
def _submission_error(message, details=None):
    """Return the ``(summary, table)`` handler outputs for a failed submission."""
    payload = {"error": message}
    if details is not None:
        payload["details"] = details
    # A bare gr.update() leaves the leaderboard table as it is; returning
    # None for that output would reset the table's value instead.
    return payload, gr.update()


def handle_submission(file_obj, email, model_name, opt_in):
    """Validate, evaluate, store and optionally publish one uploaded submission.

    Args:
        file_obj: Value of the upload component; either an object exposing
            the file path as ``.name`` or something convertible to a path
            with ``str()``.
        email: Submitter e-mail. It is stripped and lower-cased, then used as
            the rate-limit key.
        model_name: Display name of the submitted model / system.
        opt_in: Whether the result should be published to the leaderboard.

    Returns:
        A ``(summary, table)`` pair for the result panel and the leaderboard
        table. On failure ``summary`` is ``{"error": ...}`` (plus
        ``"details"`` for validation failures) and ``table`` is a no-op
        update, so the table on screen is kept. On success ``summary``
        carries the submission id, metrics and leaderboard status, and
        ``table`` is the leaderboard re-read with ``Recall@10`` ordering.
    """
    if manager is None:
        return _submission_error("Leaderboard service unavailable.")

    if file_obj is None:
        return _submission_error("Please upload a JSON file.")

    if not email or not email.strip() or "@" not in email:
        return _submission_error("Please enter a valid email address.")

    email = email.strip().lower()

    if not model_name or not model_name.strip():
        return _submission_error("Please enter a model / system name.")

    model_name = model_name.strip()

    # Rate limit check
    allowed, msg = check_rate_limit(email)
    if not allowed:
        return _submission_error(msg)

    # Read uploaded file
    file_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        return _submission_error(f"Failed to parse JSON: {e}")

    # Validate format
    errors = evaluator.validate_json_format(data)
    if errors:
        return _submission_error("Validation failed", details=errors)

    # Run evaluation
    try:
        result = evaluator.evaluate(data)
    except Exception as e:
        return _submission_error(f"Evaluation failed: {e}")

    # Extract album coverage
    # NOTE(review): assumes validation guarantees every item has "album_id".
    albums = sorted({str(item["album_id"]) for item in data})

    # Save submission
    submission_id = str(uuid.uuid4())
    try:
        save_submission(
            submission_id,
            {
                "meta": {
                    "submission_id": submission_id,
                    "email": email,
                    "method": "Agent",
                    "model_name": model_name,
                    "albums": albums,
                    "opt_in": opt_in,
                },
                "submission": data,
                "result": result,
            },
        )
    except Exception as e:
        return _submission_error(f"Failed to save submission: {e}")

    # Record the rate-limit timestamp only once the submission is stored, so
    # a failed save does not use up the submitter's slot.
    record_submission_time(email)

    # Update leaderboard only if opted in and full submission
    if opt_in:
        try:
            entry = manager.add_result(
                email=email,
                method="Agent",
                model_name=model_name,
                albums=albums,
                evaluated_queries=result["evaluated_queries"],
                total_gt_queries=result["total_gt_queries"],
                global_metrics=result["global_metrics"],
            )
        except Exception as e:
            # The submission is already saved; report the publish failure
            # instead of discarding the evaluation result.
            print(f"[WARN] Failed to publish submission {submission_id}: {e}")
            leaderboard_msg = f"Result saved but could not be published to leaderboard: {e}"
        else:
            if entry is None:
                leaderboard_msg = "Result saved but not eligible for leaderboard (incomplete submission). Only full submissions across all 3 albums are ranked."
            else:
                leaderboard_msg = "Result published to leaderboard."
    else:
        leaderboard_msg = "Result recorded privately. Not published to leaderboard."

    # Build result summary
    summary = {
        "status": "Success",
        "submission_id": submission_id,
        "email": email,
        "model_name": model_name,
        "albums": albums,
        "evaluated_queries": result["evaluated_queries"],
        "total_gt_queries": result["total_gt_queries"],
        "metrics": result["global_metrics"],
        "leaderboard_status": leaderboard_msg,
        "notice": "Please download and save your results. Submission data is retained for 30 days only.",
    }
    # Only copy the warning when the evaluator actually supplied one.
    if result.get("is_partial") and "warning" in result:
        summary["warning"] = result["warning"]

    updated_df = refresh_leaderboard("Recall@10")
    return summary, updated_df
157
+
158
+
159
# Gradio interface
# NOTE(review): block nesting below was reconstructed from the `with`
# structure of an indentation-stripped source — confirm the layout.
with gr.Blocks(css=custom_css, title="PhotoBench-Protected Leaderboard") as demo:
    # Page header: title banner, navigation links, introduction text.
    gr.HTML(TITLE)
    gr.HTML(NAVIGATION)
    gr.Markdown(INTRODUCTION, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons"):
        # === Tab 1: Leaderboard ===
        with gr.TabItem("🏅 Leaderboard"):
            with gr.Row():
                sort_by = gr.Dropdown(choices=ALL_METRIC_COLS, value="Recall@10", label="Sort by")
                refresh_btn = gr.Button("Refresh", variant="primary")
            leaderboard_table = gr.DataFrame(label="Top 30", interactive=False, wrap=True)

            # The same handler fills the table on button click and on page load.
            for register in (refresh_btn.click, demo.load):
                register(
                    refresh_leaderboard,
                    inputs=[sort_by],
                    outputs=leaderboard_table,
                )

        # === Tab 2: Submit ===
        with gr.TabItem("📝 Submit"):
            gr.Markdown(SUBMISSION_GUIDE, elem_classes="markdown-text")

            with gr.Row():
                # Empty column on each side of the form.
                with gr.Column(scale=1):
                    pass
                with gr.Column(scale=3):
                    with gr.Row():
                        # Submission form inputs.
                        with gr.Column():
                            upload_file = gr.File(label="Upload results JSON", file_types=[".json"])
                            email_input = gr.Textbox(label="Email", placeholder="your@email.com")
                            model_name_input = gr.Textbox(
                                label="Model / System Name",
                                placeholder="e.g., GPT-4V-Agent",
                            )
                            opt_in_toggle = gr.Checkbox(label="Publish to public leaderboard", value=True)
                            submit_btn = gr.Button("Submit for Evaluation", variant="primary")

                        # Evaluation output panel.
                        with gr.Column():
                            result_json = gr.JSON(label="Evaluation Results")
                with gr.Column(scale=1):
                    pass

            # A submission updates both the result panel and the leaderboard table.
            submit_btn.click(
                handle_submission,
                inputs=[upload_file, email_input, model_name_input, opt_in_toggle],
                outputs=[result_json, leaderboard_table],
            )

        # === Tab 3: About ===
        with gr.TabItem("ℹ️ About"):
            gr.Markdown(EVALUATION_INFO, elem_classes="markdown-text")


demo.launch()
assets/leaderboard.jsonl ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"submission_id": "dbd32abd6ef16eef", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "clip-ViT-B-32-multilingual-v1", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 1.2, "Recall@5": 4.1, "Recall@10": 6.1, "Recall@20": 8.8, "NDCG@1": 3.2, "NDCG@5": 4, "NDCG@10": 4.7, "NDCG@20": 5.5}
2
+ {"submission_id": "a59247d2809373ba", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "siglip2-base-patch16-224", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 16.4, "Recall@5": 33.7, "Recall@10": 40, "Recall@20": 47.3, "NDCG@1": 33.1, "NDCG@5": 34.6, "NDCG@10": 36, "NDCG@20": 38.2}
3
+ {"submission_id": "95f663a7c7d53495", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "siglip2-giant-opt-patch16-256", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 20.8, "Recall@5": 40.5, "Recall@10": 47.6, "Recall@20": 54.5, "NDCG@1": 42, "NDCG@5": 42.4, "NDCG@10": 44.1, "NDCG@20": 46}
4
+ {"submission_id": "695d2f72f79a2e1b", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "VLM2Vec", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 23.1, "Recall@5": 44.4, "Recall@10": 52.4, "Recall@20": 60, "NDCG@1": 46, "NDCG@5": 47, "NDCG@10": 48.9, "NDCG@20": 51.3}
5
+ {"submission_id": "141053c18e438d49", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "B3_Qwen2_7B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 20.9, "Recall@5": 41.4, "Recall@10": 49.9, "Recall@20": 57.1, "NDCG@1": 41.9, "NDCG@5": 43.6, "NDCG@10": 45.6, "NDCG@20": 47.8}
6
+ {"submission_id": "200e8177772d5001", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Qwen3-VL-Embedding-2B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 22.7, "Recall@5": 42.5, "Recall@10": 50.6, "Recall@20": 58.2, "NDCG@1": 44.6, "NDCG@5": 45.4, "NDCG@10": 47.4, "NDCG@20": 49.5}
7
+ {"submission_id": "4b46272277c90449", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Qwen3-VL-Embedding-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24.9, "Recall@5": 46.2, "Recall@10": 53, "Recall@20": 59.2, "NDCG@1": 49.7, "NDCG@5": 49.7, "NDCG@10": 50.9, "NDCG@20": 52.6}
8
+ {"submission_id": "d30ad9a666c5a684", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Ops-MM-embedding-v1", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.8, "Recall@5": 48.7, "Recall@10": 56.6, "Recall@20": 63.7, "NDCG@1": 49.8, "NDCG@5": 51.7, "NDCG@10": 53.5, "NDCG@20": 55.5}
9
+ {"submission_id": "4bd67195881c1ba9", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "RzenEmbed-v2-7B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 27.2, "Recall@5": 49.9, "Recall@10": 58, "Recall@20": 65.1, "NDCG@1": 54.1, "NDCG@5": 54.3, "NDCG@10": 56, "NDCG@20": 57.9}
10
+ {"submission_id": "1a6b01520b759117", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "QQMM-embed-v2", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 26.6, "Recall@5": 50, "Recall@10": 57.8, "Recall@20": 65.4, "NDCG@1": 52.3, "NDCG@5": 53.7, "NDCG@10": 55.4, "NDCG@20": 57.6}
11
+ {"submission_id": "2da78d02a132aec3", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-small", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.6, "Recall@5": 37.5, "Recall@10": 42.8, "Recall@20": 48.8, "NDCG@1": 40.7, "NDCG@5": 39.6, "NDCG@10": 40.7, "NDCG@20": 42.5}
12
+ {"submission_id": "c3a345e59fb8a397", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-base", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.8, "Recall@5": 38.1, "Recall@10": 44.8, "Recall@20": 51.9, "NDCG@1": 41.6, "NDCG@5": 40.5, "NDCG@10": 42.2, "NDCG@20": 44.3}
13
+ {"submission_id": "4e2e72470575fb93", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-large", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24.2, "Recall@5": 40.6, "Recall@10": 47.1, "Recall@20": 54.6, "NDCG@1": 46.3, "NDCG@5": 44.7, "NDCG@10": 45.9, "NDCG@20": 48}
14
+ {"submission_id": "ee3448449553ac08", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "bge-m3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 23, "Recall@5": 41.6, "Recall@10": 48.5, "Recall@20": 56.6, "NDCG@1": 43.8, "NDCG@5": 44.5, "NDCG@10": 46, "NDCG@20": 48.5}
15
+ {"submission_id": "b0fa2bb87791bb47", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-0.6B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24, "Recall@5": 42, "Recall@10": 48.4, "Recall@20": 54.9, "NDCG@1": 45.9, "NDCG@5": 45.5, "NDCG@10": 46.8, "NDCG@20": 48.6}
16
+ {"submission_id": "7b6b91a8b64da303", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-4B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.6, "Recall@5": 44.5, "Recall@10": 51.6, "Recall@20": 57.9, "NDCG@1": 49.1, "NDCG@5": 48.3, "NDCG@10": 49.8, "NDCG@20": 51.4}
17
+ {"submission_id": "b38199eded78c794", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.4, "Recall@5": 44.8, "Recall@10": 51.7, "Recall@20": 57.8, "NDCG@1": 48.3, "NDCG@5": 48.2, "NDCG@10": 49.7, "NDCG@20": 51.2}
18
+ {"submission_id": "0dbc32eb7e40e180", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "ToolACE-2-Llama-3.1-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.3, "Recall@5": 44, "Recall@10": 47.7, "Recall@20": 51.2, "NDCG@1": 48.8, "NDCG@5": 50.9, "NDCG@10": 50.2, "NDCG@20": 50.7}
19
+ {"submission_id": "62d9047f0402776f", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 14.5, "Recall@5": 51, "Recall@10": 60.4, "Recall@20": 64.1, "NDCG@1": 35, "NDCG@5": 53.5, "NDCG@10": 56.4, "NDCG@20": 56.4}
20
+ {"submission_id": "6921c1b285657dfe", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-32B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 28.3, "Recall@5": 54.1, "Recall@10": 62.5, "Recall@20": 64.5, "NDCG@1": 63.6, "NDCG@5": 63, "NDCG@10": 63.5, "NDCG@20": 62.4}
21
+ {"submission_id": "60fd3fdf06d3f313", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "DeepSeek-v3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 26.8, "Recall@5": 51.6, "Recall@10": 60.1, "Recall@20": 62.4, "NDCG@1": 59.6, "NDCG@5": 59.9, "NDCG@10": 61, "NDCG@20": 60.6}
22
+ {"submission_id": "45249ed7e1ebc2f2", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-235B-A22B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 31, "Recall@5": 60.8, "Recall@10": 67, "Recall@20": 69.6, "NDCG@1": 70, "NDCG@5": 71.4, "NDCG@10": 72, "NDCG@20": 71.8}
23
+ {"submission_id": "695f07e99fefa554", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "GPT-4o", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 28, "Recall@5": 53.1, "Recall@10": 58.5, "Recall@20": 60.3, "NDCG@1": 60.3, "NDCG@5": 61.1, "NDCG@10": 60.5, "NDCG@20": 59.5}
24
+ {"submission_id": "2750290902c58d51", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "OpenAI-o3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 31.2, "Recall@5": 59.5, "Recall@10": 68.7, "Recall@20": 71.4, "NDCG@1": 70.2, "NDCG@5": 69.8, "NDCG@10": 70.6, "NDCG@20": 69.9}
25
+ {"submission_id": "d80bf05df2437697", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Claude-Sonnet-4-5", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 30.5, "Recall@5": 60.4, "Recall@10": 69.5, "Recall@20": 73.1, "NDCG@1": 69, "NDCG@5": 69.7, "NDCG@10": 70.3, "NDCG@20": 70}
26
+ {"submission_id": "16b8c0a0ce88851b", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Claude-Opus-4-5", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 32.1, "Recall@5": 60.6, "Recall@10": 68.7, "Recall@20": 71.7, "NDCG@1": 69.8, "NDCG@5": 69.9, "NDCG@10": 70.3, "NDCG@20": 69.9}
data/leaderboard.jsonl ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"submission_id": "dbd32abd6ef16eef", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "clip-ViT-B-32-multilingual-v1", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 1.2, "Recall@5": 4.1, "Recall@10": 6.1, "Recall@20": 8.8, "NDCG@1": 3.2, "NDCG@5": 4, "NDCG@10": 4.7, "NDCG@20": 5.5}
2
+ {"submission_id": "a59247d2809373ba", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "siglip2-base-patch16-224", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 16.4, "Recall@5": 33.7, "Recall@10": 40, "Recall@20": 47.3, "NDCG@1": 33.1, "NDCG@5": 34.6, "NDCG@10": 36, "NDCG@20": 38.2}
3
+ {"submission_id": "95f663a7c7d53495", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "siglip2-giant-opt-patch16-256", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 20.8, "Recall@5": 40.5, "Recall@10": 47.6, "Recall@20": 54.5, "NDCG@1": 42, "NDCG@5": 42.4, "NDCG@10": 44.1, "NDCG@20": 46}
4
+ {"submission_id": "695d2f72f79a2e1b", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "VLM2Vec", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 23.1, "Recall@5": 44.4, "Recall@10": 52.4, "Recall@20": 60, "NDCG@1": 46, "NDCG@5": 47, "NDCG@10": 48.9, "NDCG@20": 51.3}
5
+ {"submission_id": "141053c18e438d49", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "B3_Qwen2_7B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 20.9, "Recall@5": 41.4, "Recall@10": 49.9, "Recall@20": 57.1, "NDCG@1": 41.9, "NDCG@5": 43.6, "NDCG@10": 45.6, "NDCG@20": 47.8}
6
+ {"submission_id": "200e8177772d5001", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Qwen3-VL-Embedding-2B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 22.7, "Recall@5": 42.5, "Recall@10": 50.6, "Recall@20": 58.2, "NDCG@1": 44.6, "NDCG@5": 45.4, "NDCG@10": 47.4, "NDCG@20": 49.5}
7
+ {"submission_id": "4b46272277c90449", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Qwen3-VL-Embedding-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24.9, "Recall@5": 46.2, "Recall@10": 53, "Recall@20": 59.2, "NDCG@1": 49.7, "NDCG@5": 49.7, "NDCG@10": 50.9, "NDCG@20": 52.6}
8
+ {"submission_id": "d30ad9a666c5a684", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Ops-MM-embedding-v1", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.8, "Recall@5": 48.7, "Recall@10": 56.6, "Recall@20": 63.7, "NDCG@1": 49.8, "NDCG@5": 51.7, "NDCG@10": 53.5, "NDCG@20": 55.5}
9
+ {"submission_id": "4bd67195881c1ba9", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "RzenEmbed-v2-7B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 27.2, "Recall@5": 49.9, "Recall@10": 58, "Recall@20": 65.1, "NDCG@1": 54.1, "NDCG@5": 54.3, "NDCG@10": 56, "NDCG@20": 57.9}
10
+ {"submission_id": "1a6b01520b759117", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "QQMM-embed-v2", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 26.6, "Recall@5": 50, "Recall@10": 57.8, "Recall@20": 65.4, "NDCG@1": 52.3, "NDCG@5": 53.7, "NDCG@10": 55.4, "NDCG@20": 57.6}
11
+ {"submission_id": "2da78d02a132aec3", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-small", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.6, "Recall@5": 37.5, "Recall@10": 42.8, "Recall@20": 48.8, "NDCG@1": 40.7, "NDCG@5": 39.6, "NDCG@10": 40.7, "NDCG@20": 42.5}
12
+ {"submission_id": "c3a345e59fb8a397", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-base", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.8, "Recall@5": 38.1, "Recall@10": 44.8, "Recall@20": 51.9, "NDCG@1": 41.6, "NDCG@5": 40.5, "NDCG@10": 42.2, "NDCG@20": 44.3}
13
+ {"submission_id": "4e2e72470575fb93", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-large", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24.2, "Recall@5": 40.6, "Recall@10": 47.1, "Recall@20": 54.6, "NDCG@1": 46.3, "NDCG@5": 44.7, "NDCG@10": 45.9, "NDCG@20": 48}
14
+ {"submission_id": "ee3448449553ac08", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "bge-m3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 23, "Recall@5": 41.6, "Recall@10": 48.5, "Recall@20": 56.6, "NDCG@1": 43.8, "NDCG@5": 44.5, "NDCG@10": 46, "NDCG@20": 48.5}
15
+ {"submission_id": "b0fa2bb87791bb47", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-0.6B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24, "Recall@5": 42, "Recall@10": 48.4, "Recall@20": 54.9, "NDCG@1": 45.9, "NDCG@5": 45.5, "NDCG@10": 46.8, "NDCG@20": 48.6}
16
+ {"submission_id": "7b6b91a8b64da303", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-4B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.6, "Recall@5": 44.5, "Recall@10": 51.6, "Recall@20": 57.9, "NDCG@1": 49.1, "NDCG@5": 48.3, "NDCG@10": 49.8, "NDCG@20": 51.4}
17
+ {"submission_id": "b38199eded78c794", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.4, "Recall@5": 44.8, "Recall@10": 51.7, "Recall@20": 57.8, "NDCG@1": 48.3, "NDCG@5": 48.2, "NDCG@10": 49.7, "NDCG@20": 51.2}
18
+ {"submission_id": "0dbc32eb7e40e180", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "ToolACE-2-Llama-3.1-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.3, "Recall@5": 44, "Recall@10": 47.7, "Recall@20": 51.2, "NDCG@1": 48.8, "NDCG@5": 50.9, "NDCG@10": 50.2, "NDCG@20": 50.7}
19
+ {"submission_id": "62d9047f0402776f", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 14.5, "Recall@5": 51, "Recall@10": 60.4, "Recall@20": 64.1, "NDCG@1": 35, "NDCG@5": 53.5, "NDCG@10": 56.4, "NDCG@20": 56.4}
20
+ {"submission_id": "6921c1b285657dfe", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-32B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 28.3, "Recall@5": 54.1, "Recall@10": 62.5, "Recall@20": 64.5, "NDCG@1": 63.6, "NDCG@5": 63, "NDCG@10": 63.5, "NDCG@20": 62.4}
21
+ {"submission_id": "60fd3fdf06d3f313", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "DeepSeek-v3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 26.8, "Recall@5": 51.6, "Recall@10": 60.1, "Recall@20": 62.4, "NDCG@1": 59.6, "NDCG@5": 59.9, "NDCG@10": 61, "NDCG@20": 60.6}
22
+ {"submission_id": "45249ed7e1ebc2f2", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-235B-A22B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 31, "Recall@5": 60.8, "Recall@10": 67, "Recall@20": 69.6, "NDCG@1": 70, "NDCG@5": 71.4, "NDCG@10": 72, "NDCG@20": 71.8}
23
+ {"submission_id": "695f07e99fefa554", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "GPT-4o", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 28, "Recall@5": 53.1, "Recall@10": 58.5, "Recall@20": 60.3, "NDCG@1": 60.3, "NDCG@5": 61.1, "NDCG@10": 60.5, "NDCG@20": 59.5}
24
+ {"submission_id": "2750290902c58d51", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "OpenAI-o3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 31.2, "Recall@5": 59.5, "Recall@10": 68.7, "Recall@20": 71.4, "NDCG@1": 70.2, "NDCG@5": 69.8, "NDCG@10": 70.6, "NDCG@20": 69.9}
25
+ {"submission_id": "d80bf05df2437697", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Claude-Sonnet-4-5", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 30.5, "Recall@5": 60.4, "Recall@10": 69.5, "Recall@20": 73.1, "NDCG@1": 69, "NDCG@5": 69.7, "NDCG@10": 70.3, "NDCG@20": 70}
26
+ {"submission_id": "16b8c0a0ce88851b", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Claude-Opus-4-5", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 32.1, "Recall@5": 60.6, "Recall@10": 68.7, "Recall@20": 71.7, "NDCG@1": 69.8, "NDCG@5": 69.9, "NDCG@10": 70.3, "NDCG@20": 69.9}
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ pandas
3
+ numpy
src/__init__.py ADDED
File without changes
src/about.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NAVIGATION = """
2
+ <div style="text-align:center; margin-bottom: 24px;">
3
+ <div style="display:inline-flex; flex-wrap:wrap; gap:8px; justify-content:center;">
4
+ <a href="https://github.com/LaVieEnRose365/PhotoBench" target="_blank" style="text-decoration:none;">
5
+ <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500; transition:all 0.2s;">🏠 GitHub</span>
6
+ </a>
7
+ <a href="https://arxiv.org/abs/2603.01493v1" target="_blank" style="text-decoration:none;">
8
+ <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500;">📄 arXiv</span>
9
+ </a>
10
+ <a href="https://huggingface.co/spaces/SorrowTea/PhotoBench/" target="_blank" style="text-decoration:none;">
11
+ <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500;">🏅 Leaderboard</span>
12
+ </a>
13
+ <a href="https://huggingface.co/datasets/SorrowTea/PhotoBench" target="_blank" style="text-decoration:none;">
14
+ <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500;">📊 Dataset</span>
15
+ </a>
16
+ <span style="display:inline-block; padding:8px 14px; background:#7CB342; border:1px solid #7CB342; border-radius:10px; color:#fff; font-size:0.85em; font-weight:600;">🛡️ Protected LB</span>
17
+ <a href="https://huggingface.co/datasets/SorrowTea/PhotoBench-Protected" target="_blank" style="text-decoration:none;">
18
+ <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500;">📁 Protected Data</span>
19
+ </a>
20
+ <a href="https://sbox.myoas.com/l/Be5be4053f6b43840" target="_blank" style="text-decoration:none;">
21
+ <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500;">🔒 Full Data</span>
22
+ </a>
23
+ </div>
24
+ </div>
25
+ """
26
+
27
+ TITLE = """
28
+ <div style="text-align:center; padding: 48px 20px; background: linear-gradient(160deg, #f5f9f0 0%, #e8f0e0 100%); border-radius: 20px; margin-bottom: 32px; border: 1px solid #d4e0c8;">
29
+ <h1 style="color:#1a1a1a; font-size:3em; font-weight:600; letter-spacing:-1px; margin:0; font-family:-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;">
30
+ PhotoBench-Protected
31
+ </h1>
32
+ <p style="color:#7CB342; font-size:1.1em; font-weight:500; letter-spacing:2px; margin:12px 0 0 0;">
33
+ AGENT-ONLY LEADERBOARD
34
+ </p>
35
+ <div style="width:60px; height:3px; background:#7CB342; margin:20px auto; border-radius:2px;"></div>
36
+ <p style="color:#666; font-size:0.95em; margin-top:12px;">
37
+ Limited Information Source Benchmark
38
+ </p>
39
+ </div>
40
+ """
41
+
42
+ INTRODUCTION = """
43
+ <div style="text-align:center; max-width:720px; margin:0 auto 40px; color:#444; line-height:1.8;">
44
+
45
+ <strong>PhotoBench-Protected</strong> is our initial open-source release.
46
+ Because only partial model captions, embeddings, and metadata are provided,
47
+ this benchmark focuses exclusively on <strong>agent planning</strong> ability.
48
+
49
+ <p style="margin-top:16px; color:#7CB342; font-weight:600;">
50
+ ⚠️ Please confirm you are submitting to the correct leaderboard.
51
+ </p>
52
+
53
+ <p style="margin-top:12px;">
54
+ The test sets for PhotoBench-Protected and <a href="https://huggingface.co/spaces/SorrowTea/PhotoBench/" target="_blank" style="color:#7CB342; font-weight:600; text-decoration:none;">PhotoBench (full) ↗</a> are different.
55
+ For unrestricted retrieval with raw images, please use the
56
+ <a href="https://huggingface.co/spaces/SorrowTea/PhotoBench/" target="_blank" style="color:#7CB342; font-weight:600; text-decoration:none;">full PhotoBench leaderboard ↗</a>.
57
+ Full dataset download: <a href="https://sbox.myoas.com/l/Be5be4053f6b43840" target="_blank" style="color:#7CB342; font-weight:600; text-decoration:none;">OneBox ↗</a>.
58
+ </p>
59
+
60
+ </div>
61
+ """
62
+
63
+ SUBMISSION_GUIDE = """
64
+ ### Submission Format
65
+
66
+ Upload a JSON file containing an array of prediction objects:
67
+
68
+ ```json
69
+ [
70
+ {
71
+ "album_id": "1",
72
+ "query_en": "cluttered desk",
73
+ "pred": ["IMG_1234.jpg", "IMG_5678.jpg", ...]
74
+ }
75
+ ]
76
+ ```
77
+
78
+ **Required fields:**
79
+ - `album_id`: Album number (1, 2, or 3)
80
+ - `query_en`: The English query text (must match exactly)
81
+ - `pred`: Ordered list of predicted image filenames
82
+
83
+ You may submit results for any subset of albums. Partial submissions are accepted and evaluated.
84
+ """
85
+
86
+ EVALUATION_INFO = """
87
+ ### Evaluation Metrics
88
+
89
+ | Metric | Description |
90
+ |--------|-------------|
91
+ | **Recall@k** | Proportion of ground-truth images found in top-k predictions |
92
+ | **NDCG@k** | Normalized Discounted Cumulative Gain at rank k |
93
+
94
+ Supported k values: **1, 5, 10, 20, 50, 100**
95
+
96
+ Results are averaged across all evaluated queries per album, then averaged across albums for the final leaderboard score.
97
+ """
98
+
99
+ custom_css = """
100
+ /* Grass-green clean theme */
101
+ body {
102
+ background: #f5f9f0 !important;
103
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif !important;
104
+ font-size: 17px !important;
105
+ }
106
+
107
+ /* Tab buttons */
108
+ .tab-buttons button {
109
+ font-weight: 500 !important;
110
+ font-size: 0.9em !important;
111
+ border-radius: 10px 10px 0 0 !important;
112
+ padding: 12px 24px !important;
113
+ background: #e0ead8 !important;
114
+ color: #555 !important;
115
+ border: none !important;
116
+ transition: all 0.25s ease !important;
117
+ }
118
+
119
+ .tab-buttons button.selected {
120
+ background: #fff !important;
121
+ color: #1a1a1a !important;
122
+ box-shadow: 0 -2px 0 #7CB342 inset !important;
123
+ }
124
+
125
+ /* Primary buttons */
126
+ .gr-button-primary {
127
+ background: #7CB342 !important;
128
+ border: none !important;
129
+ border-radius: 10px !important;
130
+ color: #fff !important;
131
+ font-weight: 600 !important;
132
+ font-size: 0.95em !important;
133
+ padding: 12px 28px !important;
134
+ transition: all 0.25s ease !important;
135
+ }
136
+
137
+ .gr-button-primary:hover {
138
+ background: #6ba32e !important;
139
+ transform: translateY(-1px) !important;
140
+ box-shadow: 0 6px 20px rgba(124,179,66,0.25) !important;
141
+ }
142
+
143
+ /* Markdown */
144
+ .markdown-text {
145
+ max-width: 780px;
146
+ margin: 0 auto;
147
+ color: #333;
148
+ line-height: 1.8;
149
+ font-size: 1.05em;
150
+ }
151
+
152
+ /* DataFrame Table */
153
+ .gr-dataframe {
154
+ border-radius: 14px !important;
155
+ overflow: hidden !important;
156
+ box-shadow: 0 2px 16px rgba(0,0,0,0.06) !important;
157
+ border: 1px solid #d4e0c8 !important;
158
+ font-size: 0.95em !important;
159
+ }
160
+
161
+ .gr-dataframe th {
162
+ background: #e8f0e0 !important;
163
+ color: #444 !important;
164
+ font-weight: 600 !important;
165
+ font-size: 0.8em !important;
166
+ text-transform: uppercase !important;
167
+ letter-spacing: 0.5px !important;
168
+ padding: 14px 10px !important;
169
+ border-bottom: 2px solid #d4e0c8 !important;
170
+ }
171
+
172
+ .gr-dataframe td {
173
+ padding: 12px 10px !important;
174
+ border-bottom: 1px solid #e0ead8 !important;
175
+ color: #333 !important;
176
+ }
177
+
178
+ .gr-dataframe tr:hover td {
179
+ background: #f0f7e8 !important;
180
+ }
181
+
182
+ /* Inputs */
183
+ input, textarea, select {
184
+ border-radius: 10px !important;
185
+ border: 1px solid #c4d4b4 !important;
186
+ background: #fff !important;
187
+ font-size: 1em !important;
188
+ padding: 10px 14px !important;
189
+ }
190
+
191
+ input:focus, textarea:focus, select:focus {
192
+ border-color: #7CB342 !important;
193
+ box-shadow: 0 0 0 3px rgba(124,179,66,0.12) !important;
194
+ outline: none !important;
195
+ }
196
+
197
+ /* Form containers */
198
+ .gr-form .gr-box {
199
+ border-radius: 14px !important;
200
+ background: #fff !important;
201
+ border: 1px solid #d4e0c8 !important;
202
+ padding: 24px !important;
203
+ }
204
+
205
+ /* Labels */
206
+ .gr-input-label, .gr-dropdown-label {
207
+ font-weight: 500 !important;
208
+ color: #444 !important;
209
+ font-size: 0.9em !important;
210
+ margin-bottom: 6px !important;
211
+ }
212
+
213
+ /* JSON output */
214
+ .gr-json {
215
+ border-radius: 12px !important;
216
+ background: #f5f9f0 !important;
217
+ border: 1px solid #d4e0c8 !important;
218
+ font-size: 0.9em !important;
219
+ }
220
+
221
+ /* Center submit form */
222
+ #submit-form-container {
223
+ max-width: 600px;
224
+ margin: 0 auto;
225
+ }
226
+
227
+ /* Section headers */
228
+ .gr-tab-item h3 {
229
+ color: #1a1a1a !important;
230
+ font-weight: 600 !important;
231
+ font-size: 1.2em !important;
232
+ margin-top: 24px;
233
+ margin-bottom: 12px;
234
+ }
235
+ """
src/evaluator.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ from math import log2
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List
6
+
7
+ import numpy as np
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ GT_DIR = Path("/data") / "gt"
12
+
13
+ K_VALUES = [1, 5, 10, 20, 50, 100]
14
+
15
+
16
+ class Evaluator:
17
+ def __init__(self, gt_dir: str | Path | None = None):
18
+ self.gt_dir = Path(gt_dir) if gt_dir else GT_DIR
19
+ self._gt_cache: Dict[str, list] = {}
20
+
21
+ def _load_gt(self, album_id: str) -> list:
22
+ if album_id in self._gt_cache:
23
+ return self._gt_cache[album_id]
24
+ gt_file = self.gt_dir / f"album{album_id}_test_answer.json"
25
+ if not gt_file.exists():
26
+ raise FileNotFoundError(f"Ground truth file not found: {gt_file}")
27
+ with open(gt_file, "r", encoding="utf-8") as f:
28
+ data = json.load(f)
29
+ self._gt_cache[album_id] = data
30
+ return data
31
+
32
+ def validate_json_format(self, data: Any) -> list[str]:
33
+ errors = []
34
+ if not isinstance(data, list):
35
+ return ["Root must be a JSON array"]
36
+ if len(data) == 0:
37
+ return ["Submission is empty"]
38
+ for i, item in enumerate(data):
39
+ if not isinstance(item, dict):
40
+ errors.append(f"Item #{i} must be an object")
41
+ continue
42
+ if "album_id" not in item or str(item["album_id"]) not in ["1", "2", "3"]:
43
+ errors.append(f"Item #{i} 'album_id' must be '1', '2', or '3'")
44
+ if "query_en" not in item or not isinstance(item["query_en"], str):
45
+ errors.append(f"Item #{i} 'query_en' must be a string")
46
+ if (
47
+ "pred" not in item
48
+ or not isinstance(item["pred"], list)
49
+ or not all(isinstance(x, str) for x in item["pred"])
50
+ ):
51
+ errors.append(f"Item #{i} 'pred' must be a list of strings")
52
+ return errors
53
+
54
+ def _dcg_at_k(self, r, k):
55
+ r = np.asarray(r, dtype=float)[:k]
56
+ if r.size:
57
+ return np.sum(r / np.log2(np.arange(2, r.size + 2)))
58
+ return 0.0
59
+
60
+ def _ndcg_at_k(self, r, k):
61
+ dcg_max = self._dcg_at_k(sorted(r, reverse=True), k)
62
+ if not dcg_max:
63
+ return 0.0
64
+ return self._dcg_at_k(r, k) / dcg_max
65
+
66
+ def _recall_at_k(self, ground_truth, predictions, k):
67
+ k_preds = predictions[:k]
68
+ hits = len(set(ground_truth) & set(k_preds))
69
+ if len(ground_truth) == 0:
70
+ return 0.0
71
+ return hits / len(ground_truth)
72
+
73
+ def _evaluate_album(self, album_submissions: dict, album_id: str) -> dict:
74
+ """Evaluate a single album."""
75
+ gt_data = self._load_gt(album_id)
76
+ gt_map = {item["query_en"]: item for item in gt_data}
77
+
78
+ metrics_accum = {f"Recall@{k}": [] for k in K_VALUES}
79
+ metrics_accum.update({f"NDCG@{k}": [] for k in K_VALUES})
80
+ metrics_accum["Recall"] = []
81
+ metrics_accum["NDCG"] = []
82
+ source_accum = {}
83
+ empty_gt_queries = 0
84
+ evaluated_queries = 0
85
+
86
+ for q, pred in album_submissions.items():
87
+ if q not in gt_map:
88
+ continue
89
+
90
+ gt_item = gt_map[q]
91
+ gt_answers = gt_item.get("ground_truth", [])
92
+ source = gt_item.get("Source")
93
+ evaluated_queries += 1
94
+
95
+ if not gt_answers:
96
+ empty_gt_queries += 1
97
+ continue
98
+
99
+ r = [1 if p in gt_answers else 0 for p in pred]
100
+ dcg_r = [1.0] * len(gt_answers)
101
+ m = {}
102
+
103
+ for k in K_VALUES:
104
+ m[f"Recall@{k}"] = self._recall_at_k(gt_answers, pred, k)
105
+ idcg = self._dcg_at_k(dcg_r, k)
106
+ ndcg = self._dcg_at_k(r, k) / idcg if idcg > 0 else 0.0
107
+ m[f"NDCG@{k}"] = ndcg
108
+ metrics_accum[f"Recall@{k}"].append(m[f"Recall@{k}"])
109
+ metrics_accum[f"NDCG@{k}"].append(m[f"NDCG@{k}"])
110
+
111
+ m["Recall"] = sum(r) / len(gt_answers)
112
+ idcg_all = self._dcg_at_k(dcg_r, len(gt_answers))
113
+ ndcg_all = self._dcg_at_k(r, len(r)) / idcg_all if idcg_all > 0 else 0.0
114
+ m["NDCG"] = ndcg_all
115
+ metrics_accum["Recall"].append(m["Recall"])
116
+ metrics_accum["NDCG"].append(m["NDCG"])
117
+
118
+ if source is not None:
119
+ if source not in source_accum:
120
+ source_accum[source] = {f"Recall@{_k}": [] for _k in K_VALUES}
121
+ source_accum[source].update({f"NDCG@{_k}": [] for _k in K_VALUES})
122
+ source_accum[source]["Recall"] = []
123
+ source_accum[source]["NDCG"] = []
124
+ for k in K_VALUES:
125
+ source_accum[source][f"Recall@{k}"].append(m[f"Recall@{k}"])
126
+ source_accum[source][f"NDCG@{k}"].append(m[f"NDCG@{k}"])
127
+ source_accum[source]["Recall"].append(m["Recall"])
128
+ source_accum[source]["NDCG"].append(m["NDCG"])
129
+
130
+ global_metrics = {
131
+ k: float(np.mean(v)) if v else 0.0 for k, v in metrics_accum.items()
132
+ }
133
+ return {
134
+ "global_metrics": global_metrics,
135
+ "source_metrics": {
136
+ src: {k: float(np.mean(v)) if v else 0.0 for k, v in m_dict.items()}
137
+ for src, m_dict in source_accum.items()
138
+ },
139
+ "empty_gt_ratio": empty_gt_queries / evaluated_queries if evaluated_queries > 0 else 0.0,
140
+ "evaluated_queries": evaluated_queries,
141
+ "total_gt_queries": len(gt_data),
142
+ "is_partial": evaluated_queries < len(gt_data),
143
+ }
144
+
145
+ def evaluate(self, submission_data: list) -> dict:
146
+ albums = {}
147
+ for item in submission_data:
148
+ a_id = str(item["album_id"])
149
+ if a_id not in albums:
150
+ albums[a_id] = {}
151
+ albums[a_id][item["query_en"]] = item["pred"]
152
+
153
+ if not albums:
154
+ raise ValueError("No valid albums found in submission.")
155
+
156
+ # Evaluate each album separately
157
+ per_album = {}
158
+ for a_id in sorted(albums.keys()):
159
+ per_album[a_id] = self._evaluate_album(albums[a_id], a_id)
160
+
161
+ # Compute averaged metrics across all albums
162
+ avg_metrics = {}
163
+ for metric_key in per_album[list(per_album.keys())[0]]["global_metrics"].keys():
164
+ values = [alb["global_metrics"][metric_key] for alb in per_album.values() if metric_key in alb["global_metrics"]]
165
+ avg_metrics[metric_key] = float(np.mean(values)) if values else 0.0
166
+
167
+ total_evaluated = sum(alb["evaluated_queries"] for alb in per_album.values())
168
+ total_gt = sum(alb["total_gt_queries"] for alb in per_album.values())
169
+
170
+ result = {
171
+ "per_album": per_album,
172
+ "global_metrics": avg_metrics,
173
+ "evaluated_queries": total_evaluated,
174
+ "total_gt_queries": total_gt,
175
+ "is_partial": total_evaluated < total_gt,
176
+ "albums": sorted(albums.keys()),
177
+ }
178
+
179
+ if result["is_partial"]:
180
+ missing = [a for a in ["1", "2", "3"] if a not in albums]
181
+ result["warning"] = f"Submission incomplete. Missing albums: {', '.join(missing)}. Averaged results across submitted albums shown below."
182
+
183
+ return result
src/leaderboard_manager.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ from datetime import datetime, timedelta
4
+ from typing import Any
5
+
6
+ import pandas as pd
7
+
8
+ from src.storage import load_leaderboard, save_leaderboard
9
+
10
+ # All available metric columns (computed)
11
+ ALL_METRIC_COLS = [
12
+ "Recall@1", "Recall@5", "Recall@10", "Recall@20", "Recall@50", "Recall@100",
13
+ "NDCG@1", "NDCG@5", "NDCG@10", "NDCG@20", "NDCG@50", "NDCG@100",
14
+ ]
15
+
16
+ # Default columns shown on leaderboard
17
+ DEFAULT_DISPLAY_METRICS = [
18
+ "Recall@1", "Recall@5", "Recall@20", "Recall@50",
19
+ "NDCG@1", "NDCG@5", "NDCG@20", "NDCG@50",
20
+ ]
21
+
22
+ # Base columns always shown
23
+ BASE_COLS = ["rank", "model_name"]
24
+
25
+ _DEFAULT_SORT = "Recall@10"
26
+ _TOP_N = 30
27
+ _RETENTION_DAYS = 30
28
+
29
+
30
+ def make_id(email: str, model_name: str) -> str:
31
+ return hashlib.sha256(f"{email}:{model_name}".encode()).hexdigest()[:16]
32
+
33
+
34
+ class LeaderboardManager:
35
+ def __init__(self):
36
+ self._entries: list[dict] = []
37
+ self._load()
38
+ self._cleanup()
39
+
40
+ def _load(self):
41
+ raw = load_leaderboard()
42
+ self._entries = raw
43
+
44
+ def _save(self):
45
+ save_leaderboard(self._entries)
46
+
47
+ def _cleanup(self):
48
+ """Remove non-paper entries older than 30 days that are not in top 30."""
49
+ if not self._entries:
50
+ return
51
+
52
+ df = pd.DataFrame(self._entries)
53
+ if _DEFAULT_SORT in df.columns:
54
+ top_ids = set(
55
+ df.sort_values(by=_DEFAULT_SORT, ascending=False)
56
+ .head(_TOP_N)["submission_id"]
57
+ .tolist()
58
+ )
59
+ else:
60
+ top_ids = set()
61
+
62
+ cutoff = datetime.utcnow() - timedelta(days=_RETENTION_DAYS)
63
+ kept = []
64
+ for e in self._entries:
65
+ sid = e.get("submission_id", "")
66
+ is_paper = e.get("is_paper_data", False)
67
+ ts_str = e.get("timestamp", "")
68
+ try:
69
+ ts = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
70
+ except Exception:
71
+ ts = datetime.utcnow()
72
+
73
+ if is_paper or sid in top_ids or ts >= cutoff:
74
+ kept.append(e)
75
+
76
+ removed = len(self._entries) - len(kept)
77
+ if removed > 0:
78
+ print(f"[CLEANUP] Removed {removed} expired entries")
79
+ self._entries = kept
80
+ self._save()
81
+
82
+ def add_result(
83
+ self,
84
+ email: str,
85
+ method: str,
86
+ model_name: str,
87
+ albums: list[str],
88
+ evaluated_queries: int,
89
+ total_gt_queries: int,
90
+ global_metrics: dict,
91
+ ) -> dict | None:
92
+ """Add a new evaluation result. Returns entry if added, None if not eligible."""
93
+ # Must be a full submission (all 3 albums, all queries matched)
94
+ if set(albums) != {"1", "2", "3"}:
95
+ return None
96
+ if evaluated_queries < total_gt_queries:
97
+ return None
98
+
99
+ submission_id = make_id(email, model_name)
100
+ entry = {
101
+ "submission_id": submission_id,
102
+ "timestamp": datetime.utcnow().isoformat() + "Z",
103
+ "email": email,
104
+ "method": method,
105
+ "model_name": model_name,
106
+ "albums": ",".join(albums),
107
+ "is_paper_data": False,
108
+ **{k: round(v, 4) for k, v in global_metrics.items() if k in ALL_METRIC_COLS or k in ("Recall", "NDCG")},
109
+ }
110
+
111
+ # Keep best score per (email, model_name)
112
+ key = (email, model_name)
113
+ existing_idx = None
114
+ for i, e in enumerate(self._entries):
115
+ if (e.get("email"), e.get("model_name")) == key:
116
+ existing_idx = i
117
+ break
118
+
119
+ if existing_idx is not None:
120
+ old = self._entries[existing_idx]
121
+ if global_metrics.get(_DEFAULT_SORT, 0) >= old.get(_DEFAULT_SORT, 0):
122
+ self._entries[existing_idx] = entry
123
+ else:
124
+ self._entries.append(entry)
125
+
126
+ self._save()
127
+ return entry
128
+
129
+ def get_display_df(
130
+ self,
131
+ method_filter: str | None = None,
132
+ sort_by: str = _DEFAULT_SORT,
133
+ ascending: bool = False,
134
+ top_n: int = _TOP_N,
135
+ metric_cols: list[str] | None = None,
136
+ ) -> pd.DataFrame:
137
+ """Return a pandas DataFrame ready for gr.DataFrame."""
138
+ cols_to_show = BASE_COLS + (metric_cols or DEFAULT_DISPLAY_METRICS)
139
+
140
+ if not self._entries:
141
+ return pd.DataFrame(columns=cols_to_show)
142
+
143
+ df = pd.DataFrame(self._entries)
144
+
145
+ if method_filter and method_filter != "All":
146
+ df = df[df["method"] == method_filter]
147
+
148
+ if sort_by not in df.columns:
149
+ sort_by = _DEFAULT_SORT
150
+
151
+ df = df.sort_values(by=sort_by, ascending=ascending)
152
+ df = df.head(top_n).reset_index(drop=True)
153
+ df["rank"] = df.index + 1
154
+
155
+ available = [c for c in cols_to_show if c in df.columns]
156
+ df = df[available]
157
+ return df
158
+
159
+ def remove_entry(self, submission_id: str) -> bool:
160
+ """Remove an entry by submission_id. Returns True if removed."""
161
+ original_len = len(self._entries)
162
+ self._entries = [e for e in self._entries if e.get("submission_id") != submission_id]
163
+ if len(self._entries) < original_len:
164
+ self._save()
165
+ return True
166
+ return False
src/storage.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime, timedelta
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
# Root directory for everything this module reads or writes.
STORAGE_DIR = Path("/data")
# Directory where save_submission writes one JSON file per submission.
SUBMISSIONS_DIR = STORAGE_DIR / "submissions"
# JSON-lines file holding the leaderboard, one entry per line.
LEADERBOARD_FILE = STORAGE_DIR / "leaderboard.jsonl"
# JSON object mapping an email to the ISO timestamp of its last submission.
RATE_LIMIT_FILE = STORAGE_DIR / "rate_limits.json"

# Seed data bundled with the app (used on first boot)
SEED_LEADERBOARD = Path(__file__).parent.parent / "assets" / "leaderboard.jsonl"
14
+
15
+
16
def ensure_dirs():
    """Create the storage root and the submissions directory when missing."""
    for directory in (STORAGE_DIR, SUBMISSIONS_DIR):
        directory.mkdir(parents=True, exist_ok=True)
19
+
20
+
21
def _seed_leaderboard():
    """Copy bundled leaderboard data to /data on first boot.

    Does nothing when the leaderboard file already exists or when no seed
    file is bundled.
    """
    if LEADERBOARD_FILE.exists() or not SEED_LEADERBOARD.exists():
        return

    import shutil

    # load_leaderboard calls this before anything has created the storage
    # root; copying into a missing directory raises FileNotFoundError.
    ensure_dirs()
    shutil.copy(SEED_LEADERBOARD, LEADERBOARD_FILE)
    print(f"[SEED] Copied leaderboard data from {SEED_LEADERBOARD} to {LEADERBOARD_FILE}")
29
+
30
+
31
def save_submission(submission_id: str, payload: dict) -> str:
    """Write *payload* as ``<submission_id>.json`` in the submissions directory.

    Returns the path of the written file as a string.
    """
    ensure_dirs()
    target = SUBMISSIONS_DIR / f"{submission_id}.json"
    with target.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    return str(target)
38
+
39
+
40
def list_submissions() -> list[dict]:
    """List all submission metadata.

    Reads every ``*.json`` file in the submissions directory in sorted order
    and returns each file's ``meta`` object (empty when absent) with an added
    ``file`` key holding the file name. Files that cannot be read or parsed,
    or whose content or ``meta`` is not a JSON object, are skipped with a
    printed notice rather than silently ignored.
    """
    ensure_dirs()
    results = []
    for path in sorted(SUBMISSIONS_DIR.glob("*.json")):
        try:
            with open(path, "r", encoding="utf-8") as fp:
                data = json.load(fp)
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"[SUBMISSIONS] Skipping unreadable file {path.name}: {exc}")
            continue

        meta = data.get("meta", {}) if isinstance(data, dict) else None
        if not isinstance(meta, dict):
            print(f"[SUBMISSIONS] Skipping {path.name}: unexpected structure")
            continue

        meta["file"] = str(path.name)
        results.append(meta)
    return results
54
+
55
+
56
def load_leaderboard() -> list[dict]:
    """Load current leaderboard data from local storage.

    Seeds the leaderboard file on first boot, then reads it as JSON lines.
    Blank lines are ignored. A line that is not valid JSON, or is not a JSON
    object, is skipped with a printed notice so that one damaged line (for
    example a truncated final write) does not prevent the rest from loading.
    """
    _seed_leaderboard()
    if not LEADERBOARD_FILE.exists():
        return []

    entries = []
    with open(LEADERBOARD_FILE, "r", encoding="utf-8") as f:
        for line_no, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as exc:
                print(f"[LEADERBOARD] Skipping malformed line {line_no}: {exc}")
                continue
            if not isinstance(entry, dict):
                print(f"[LEADERBOARD] Skipping line {line_no}: not a JSON object")
                continue
            entries.append(entry)
    return entries
68
+
69
+
70
def save_leaderboard(entries: list[dict]) -> None:
    """Overwrite leaderboard file with the current entries.

    The entries are written as JSON lines to a temporary sibling file, which
    then replaces the leaderboard file in one step. A crash or serialization
    error part-way through therefore leaves the previous leaderboard intact
    instead of a truncated file.
    """
    ensure_dirs()
    tmp_file = LEADERBOARD_FILE.with_name(LEADERBOARD_FILE.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(entry, ensure_ascii=False) + "\n" for entry in entries)
    os.replace(tmp_file, LEADERBOARD_FILE)
76
+
77
+
78
+ # ---- Rate limiting ----
79
+
80
def check_rate_limit(email: str, cooldown_minutes: int = 60) -> tuple[bool, str]:
    """Check if email is allowed to submit. Returns (allowed, message).

    A submission is refused while fewer than *cooldown_minutes* have passed
    since the time recorded for *email*; the message then states the
    remaining wait, rounded up to whole minutes. An unreadable rate-limit
    file or an unparseable stored timestamp is treated as "no previous
    submission", so a damaged file cannot block every user.
    """
    ensure_dirs()
    limits = {}
    if RATE_LIMIT_FILE.exists():
        try:
            with open(RATE_LIMIT_FILE, "r", encoding="utf-8") as f:
                limits = json.load(f)
        except (OSError, ValueError) as exc:
            print(f"[RATE_LIMIT] Ignoring unreadable rate-limit file: {exc}")
            limits = {}
    if not isinstance(limits, dict):
        limits = {}

    last_str = limits.get(email)
    if not last_str:
        return True, ""

    try:
        last_time = datetime.fromisoformat(last_str)
    except (TypeError, ValueError):
        return True, ""

    # Read the clock once so the decision and the reported wait agree.
    now = datetime.utcnow()
    next_allowed = last_time + timedelta(minutes=cooldown_minutes)
    if now >= next_allowed:
        return True, ""

    seconds_left = (next_allowed - now).total_seconds()
    # Round up so the user is never told to "wait 0 minutes".
    remaining = max(1, int(-(-seconds_left // 60)))
    window = "hour" if cooldown_minutes == 60 else f"{cooldown_minutes} minutes"
    return False, f"This email has already submitted within the last {window}. Please wait {remaining} minutes."
97
+
98
+
99
def record_submission_time(email: str) -> None:
    """Record the current submission time for an email.

    The rate-limit file is read, the entry for *email* is set to the current
    naive UTC time in ISO format, and the file is written back. If the
    existing file cannot be read or does not hold a JSON object, it is
    replaced by a fresh mapping instead of raising, which would otherwise
    make every later submission fail at this step.
    """
    ensure_dirs()
    limits = {}
    if RATE_LIMIT_FILE.exists():
        try:
            with open(RATE_LIMIT_FILE, "r", encoding="utf-8") as f:
                limits = json.load(f)
        except (OSError, ValueError) as exc:
            print(f"[RATE_LIMIT] Resetting unreadable rate-limit file: {exc}")
            limits = {}
    if not isinstance(limits, dict):
        limits = {}

    limits[email] = datetime.utcnow().isoformat()
    with open(RATE_LIMIT_FILE, "w", encoding="utf-8") as f:
        json.dump(limits, f, ensure_ascii=False, indent=2)