Spaces:

SorrowTea
/

PhotoBench-Protected

Running

App Files Files Community

SorrowTea committed on Apr 23

Commit

01f4cb5

0 Parent(s):

Initial PhotoBench-Protected Leaderboard

Browse files

Files changed (11) hide show

.gitignore +20 -0
README.md +39 -0
app.py +236 -0
assets/leaderboard.jsonl +26 -0
data/leaderboard.jsonl +26 -0
requirements.txt +3 -0
src/__init__.py +0 -0
src/about.py +235 -0
src/evaluator.py +183 -0
src/leaderboard_manager.py +166 -0
src/storage.py +108 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,20 @@

+# Data files
+data/gt/
+*.DS_Store
+# Python
+__pycache__/
+*.pyc
+*.pyo
+*.egg-info/
+.venv/
+venv/
+# IDE
+.vscode/
+.idea/
+# OS
+*.swp
+*.swo
+*~

README.md ADDED Viewed

	@@ -0,0 +1,39 @@

+---
+title: PhotoBench-Protected
+emoji: 🛡️
+colorFrom: green
+colorTo: green
+sdk: gradio
+app_file: app.py
+pinned: true
+license: apache-2.0
+short_description: PhotoBench-Protected agent-only leaderboard
+tags:
+- leaderboard
+- image-retrieval
+- benchmark
+- agent
+---
+# PhotoBench-Protected Leaderboard
+Agent-only leaderboard for PhotoBench-Protected, where only partial captions, embeddings, and metadata are provided.
+## Quick Start
+1. Download the protected dataset from [PhotoBench-Protected Dataset](https://huggingface.co/datasets/SorrowTea/PhotoBench-Protected)
+2. Build your agent-based retrieval system using the provided features
+3. Submit predictions in JSON format
+## Important Notice
+**PhotoBench-Protected** is our initial open-source release with limited information sources. This benchmark focuses exclusively on **agent planning** ability.
+- For **unrestricted retrieval with raw images**, please use the [full PhotoBench Leaderboard](https://huggingface.co/spaces/SorrowTea/PhotoBench/).
+- The test sets for PhotoBench-Protected and PhotoBench (full) are **different**.
+- Please confirm you are submitting to the correct leaderboard before uploading.
+## Evaluation Metrics
+- **Recall@k** for k ∈ {1, 5, 10, 20, 50, 100}
+- **NDCG@k** for k ∈ {1, 5, 10, 20, 50, 100}

app.py ADDED Viewed

	@@ -0,0 +1,236 @@

+import json
+import os
+import uuid
+from pathlib import Path
+import gradio as gr
+import pandas as pd
+from src.about import (
+    EVALUATION_INFO,
+    INTRODUCTION,
+    NAVIGATION,
+    SUBMISSION_GUIDE,
+    TITLE,
+    custom_css,
+)
+from src.evaluator import Evaluator
+from src.leaderboard_manager import (
+    ALL_METRIC_COLS,
+    DEFAULT_DISPLAY_METRICS,
+    LeaderboardManager,
+)
+from src.storage import (
+    check_rate_limit,
+    record_submission_time,
+    save_submission,
+)
+# Initialize components
+try:
+    manager = LeaderboardManager()
+except Exception as e:
+    print(f"[WARN] Failed to init LeaderboardManager: {e}")
+    manager = None
+evaluator = Evaluator()
+def refresh_leaderboard(sort_by):
+    if manager is None:
+        return pd.DataFrame(columns=["rank", "model_name"])
+    try:
+        return manager.get_display_df(
+            method_filter="Agent",
+            sort_by=sort_by,
+            ascending=False,
+            top_n=30,
+            metric_cols=DEFAULT_DISPLAY_METRICS,
+        )
+    except Exception as e:
+        return pd.DataFrame({"Error": [str(e)]})
+def handle_submission(file_obj, email, model_name, opt_in):
+    if manager is None:
+        return {"error": "Leaderboard service unavailable."}, None
+    if file_obj is None:
+        return {"error": "Please upload a JSON file."}, None
+    if not email or not email.strip() or "@" not in email:
+        return {"error": "Please enter a valid email address."}, None
+    email = email.strip().lower()
+    if not model_name or not model_name.strip():
+        return {"error": "Please enter a model / system name."}, None
+    # Rate limit check
+    allowed, msg = check_rate_limit(email)
+    if not allowed:
+        return {"error": msg}, None
+    # Read uploaded file
+    file_path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+    except Exception as e:
+        return {"error": f"Failed to parse JSON: {e}"}, None
+    # Validate format
+    errors = evaluator.validate_json_format(data)
+    if errors:
+        return {"error": "Validation failed", "details": errors}, None
+    # Run evaluation
+    try:
+        result = evaluator.evaluate(data)
+    except Exception as e:
+        return {"error": f"Evaluation failed: {e}"}, None
+    # Extract album coverage
+    albums = sorted({str(item["album_id"]) for item in data})
+    # Record rate limit
+    record_submission_time(email)
+    # Save submission
+    submission_id = str(uuid.uuid4())
+    try:
+        save_submission(
+            submission_id,
+            {
+                "meta": {
+                    "submission_id": submission_id,
+                    "email": email,
+                    "method": "Agent",
+                    "model_name": model_name.strip(),
+                    "albums": albums,
+                    "opt_in": opt_in,
+                },
+                "submission": data,
+                "result": result,
+            },
+        )
+    except Exception as e:
+        return {"error": f"Failed to save submission: {e}"}, None
+    # Update leaderboard only if opted in and full submission
+    leaderboard_msg = ""
+    if opt_in:
+        entry = manager.add_result(
+            email=email,
+            method="Agent",
+            model_name=model_name.strip(),
+            albums=albums,
+            evaluated_queries=result["evaluated_queries"],
+            total_gt_queries=result["total_gt_queries"],
+            global_metrics=result["global_metrics"],
+        )
+        if entry is None:
+            leaderboard_msg = "Result saved but not eligible for leaderboard (incomplete submission). Only full submissions across all 3 albums are ranked."
+        else:
+            leaderboard_msg = "Result published to leaderboard."
+    else:
+        leaderboard_msg = "Result recorded privately. Not published to leaderboard."
+    # Build result summary
+    summary = {
+        "status": "Success",
+        "submission_id": submission_id,
+        "email": email,
+        "model_name": model_name.strip(),
+        "albums": albums,
+        "evaluated_queries": result["evaluated_queries"],
+        "total_gt_queries": result["total_gt_queries"],
+        "metrics": result["global_metrics"],
+        "leaderboard_status": leaderboard_msg,
+        "notice": "Please download and save your results. Submission data is retained for 30 days only.",
+    }
+    if result.get("is_partial"):
+        summary["warning"] = result["warning"]
+    updated_df = refresh_leaderboard("Recall@10")
+    return summary, updated_df
+# Gradio interface: header HTML, then three tabs (Leaderboard, Submit, About).
+# Components are created in layout order inside the nested context managers.
+with gr.Blocks(css=custom_css, title="PhotoBench-Protected Leaderboard") as demo:
+    gr.HTML(TITLE)
+    gr.HTML(NAVIGATION)
+    gr.Markdown(INTRODUCTION, elem_classes="markdown-text")
+    with gr.Tabs(elem_classes="tab-buttons"):
+        # === Tab 1: Leaderboard ===
+        with gr.TabItem("🏅 Leaderboard"):
+            with gr.Row():
+                sort_by = gr.Dropdown(
+                    choices=ALL_METRIC_COLS,
+                    value="Recall@10",
+                    label="Sort by",
+                )
+            refresh_btn = gr.Button("Refresh", variant="primary")
+            leaderboard_table = gr.DataFrame(
+                label="Top 30",
+                interactive=False,
+                wrap=True,
+            )
+            # The table is filled by the same callback on button click and
+            # on the Blocks `load` event, using the selected sort metric.
+            refresh_btn.click(
+                refresh_leaderboard,
+                inputs=[sort_by],
+                outputs=leaderboard_table,
+            )
+            demo.load(
+                refresh_leaderboard,
+                inputs=[sort_by],
+                outputs=leaderboard_table,
+            )
+        # === Tab 2: Submit ===
+        with gr.TabItem("📝 Submit"):
+            gr.Markdown(SUBMISSION_GUIDE, elem_classes="markdown-text")
+            with gr.Row():
+                # Empty scale-1 columns on both sides of the scale-3 form
+                # column; presumably spacers that centre the form.
+                with gr.Column(scale=1):
+                    pass
+                with gr.Column(scale=3):
+                    with gr.Row():
+                        # Left: submission inputs. Right: evaluation output.
+                        with gr.Column():
+                            upload_file = gr.File(
+                                label="Upload results JSON",
+                                file_types=[".json"],
+                            )
+                            email_input = gr.Textbox(
+                                label="Email",
+                                placeholder="your@email.com",
+                            )
+                            model_name_input = gr.Textbox(
+                                label="Model / System Name",
+                                placeholder="e.g., GPT-4V-Agent",
+                            )
+                            opt_in_toggle = gr.Checkbox(
+                                label="Publish to public leaderboard",
+                                value=True,
+                            )
+                            submit_btn = gr.Button("Submit for Evaluation", variant="primary")
+                        with gr.Column():
+                            result_json = gr.JSON(label="Evaluation Results")
+                with gr.Column(scale=1):
+                    pass
+            # handle_submission returns (summary, table): the summary goes to
+            # the JSON panel and the table updates the Leaderboard tab.
+            submit_btn.click(
+                handle_submission,
+                inputs=[upload_file, email_input, model_name_input, opt_in_toggle],
+                outputs=[result_json, leaderboard_table],
+            )
+        # === Tab 3: About ===
+        with gr.TabItem("ℹ️ About"):
+            gr.Markdown(EVALUATION_INFO, elem_classes="markdown-text")
+# Launched unconditionally at import time (no `__main__` guard).
+demo.launch()

assets/leaderboard.jsonl ADDED Viewed

	@@ -0,0 +1,26 @@

+{"submission_id": "dbd32abd6ef16eef", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "clip-ViT-B-32-multilingual-v1", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 1.2, "Recall@5": 4.1, "Recall@10": 6.1, "Recall@20": 8.8, "NDCG@1": 3.2, "NDCG@5": 4, "NDCG@10": 4.7, "NDCG@20": 5.5}
+{"submission_id": "a59247d2809373ba", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "siglip2-base-patch16-224", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 16.4, "Recall@5": 33.7, "Recall@10": 40, "Recall@20": 47.3, "NDCG@1": 33.1, "NDCG@5": 34.6, "NDCG@10": 36, "NDCG@20": 38.2}
+{"submission_id": "95f663a7c7d53495", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "siglip2-giant-opt-patch16-256", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 20.8, "Recall@5": 40.5, "Recall@10": 47.6, "Recall@20": 54.5, "NDCG@1": 42, "NDCG@5": 42.4, "NDCG@10": 44.1, "NDCG@20": 46}
+{"submission_id": "695d2f72f79a2e1b", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "VLM2Vec", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 23.1, "Recall@5": 44.4, "Recall@10": 52.4, "Recall@20": 60, "NDCG@1": 46, "NDCG@5": 47, "NDCG@10": 48.9, "NDCG@20": 51.3}
+{"submission_id": "141053c18e438d49", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "B3_Qwen2_7B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 20.9, "Recall@5": 41.4, "Recall@10": 49.9, "Recall@20": 57.1, "NDCG@1": 41.9, "NDCG@5": 43.6, "NDCG@10": 45.6, "NDCG@20": 47.8}
+{"submission_id": "200e8177772d5001", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Qwen3-VL-Embedding-2B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 22.7, "Recall@5": 42.5, "Recall@10": 50.6, "Recall@20": 58.2, "NDCG@1": 44.6, "NDCG@5": 45.4, "NDCG@10": 47.4, "NDCG@20": 49.5}
+{"submission_id": "4b46272277c90449", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Qwen3-VL-Embedding-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24.9, "Recall@5": 46.2, "Recall@10": 53, "Recall@20": 59.2, "NDCG@1": 49.7, "NDCG@5": 49.7, "NDCG@10": 50.9, "NDCG@20": 52.6}
+{"submission_id": "d30ad9a666c5a684", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Ops-MM-embedding-v1", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.8, "Recall@5": 48.7, "Recall@10": 56.6, "Recall@20": 63.7, "NDCG@1": 49.8, "NDCG@5": 51.7, "NDCG@10": 53.5, "NDCG@20": 55.5}
+{"submission_id": "4bd67195881c1ba9", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "RzenEmbed-v2-7B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 27.2, "Recall@5": 49.9, "Recall@10": 58, "Recall@20": 65.1, "NDCG@1": 54.1, "NDCG@5": 54.3, "NDCG@10": 56, "NDCG@20": 57.9}
+{"submission_id": "1a6b01520b759117", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "QQMM-embed-v2", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 26.6, "Recall@5": 50, "Recall@10": 57.8, "Recall@20": 65.4, "NDCG@1": 52.3, "NDCG@5": 53.7, "NDCG@10": 55.4, "NDCG@20": 57.6}
+{"submission_id": "2da78d02a132aec3", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-small", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.6, "Recall@5": 37.5, "Recall@10": 42.8, "Recall@20": 48.8, "NDCG@1": 40.7, "NDCG@5": 39.6, "NDCG@10": 40.7, "NDCG@20": 42.5}
+{"submission_id": "c3a345e59fb8a397", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-base", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.8, "Recall@5": 38.1, "Recall@10": 44.8, "Recall@20": 51.9, "NDCG@1": 41.6, "NDCG@5": 40.5, "NDCG@10": 42.2, "NDCG@20": 44.3}
+{"submission_id": "4e2e72470575fb93", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-large", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24.2, "Recall@5": 40.6, "Recall@10": 47.1, "Recall@20": 54.6, "NDCG@1": 46.3, "NDCG@5": 44.7, "NDCG@10": 45.9, "NDCG@20": 48}
+{"submission_id": "ee3448449553ac08", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "bge-m3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 23, "Recall@5": 41.6, "Recall@10": 48.5, "Recall@20": 56.6, "NDCG@1": 43.8, "NDCG@5": 44.5, "NDCG@10": 46, "NDCG@20": 48.5}
+{"submission_id": "b0fa2bb87791bb47", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-0.6B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24, "Recall@5": 42, "Recall@10": 48.4, "Recall@20": 54.9, "NDCG@1": 45.9, "NDCG@5": 45.5, "NDCG@10": 46.8, "NDCG@20": 48.6}
+{"submission_id": "7b6b91a8b64da303", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-4B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.6, "Recall@5": 44.5, "Recall@10": 51.6, "Recall@20": 57.9, "NDCG@1": 49.1, "NDCG@5": 48.3, "NDCG@10": 49.8, "NDCG@20": 51.4}
+{"submission_id": "b38199eded78c794", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.4, "Recall@5": 44.8, "Recall@10": 51.7, "Recall@20": 57.8, "NDCG@1": 48.3, "NDCG@5": 48.2, "NDCG@10": 49.7, "NDCG@20": 51.2}
+{"submission_id": "0dbc32eb7e40e180", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "ToolACE-2-Llama-3.1-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.3, "Recall@5": 44, "Recall@10": 47.7, "Recall@20": 51.2, "NDCG@1": 48.8, "NDCG@5": 50.9, "NDCG@10": 50.2, "NDCG@20": 50.7}
+{"submission_id": "62d9047f0402776f", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 14.5, "Recall@5": 51, "Recall@10": 60.4, "Recall@20": 64.1, "NDCG@1": 35, "NDCG@5": 53.5, "NDCG@10": 56.4, "NDCG@20": 56.4}
+{"submission_id": "6921c1b285657dfe", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-32B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 28.3, "Recall@5": 54.1, "Recall@10": 62.5, "Recall@20": 64.5, "NDCG@1": 63.6, "NDCG@5": 63, "NDCG@10": 63.5, "NDCG@20": 62.4}
+{"submission_id": "60fd3fdf06d3f313", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "DeepSeek-v3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 26.8, "Recall@5": 51.6, "Recall@10": 60.1, "Recall@20": 62.4, "NDCG@1": 59.6, "NDCG@5": 59.9, "NDCG@10": 61, "NDCG@20": 60.6}
+{"submission_id": "45249ed7e1ebc2f2", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-235B-A22B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 31, "Recall@5": 60.8, "Recall@10": 67, "Recall@20": 69.6, "NDCG@1": 70, "NDCG@5": 71.4, "NDCG@10": 72, "NDCG@20": 71.8}
+{"submission_id": "695f07e99fefa554", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "GPT-4o", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 28, "Recall@5": 53.1, "Recall@10": 58.5, "Recall@20": 60.3, "NDCG@1": 60.3, "NDCG@5": 61.1, "NDCG@10": 60.5, "NDCG@20": 59.5}
+{"submission_id": "2750290902c58d51", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "OpenAI-o3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 31.2, "Recall@5": 59.5, "Recall@10": 68.7, "Recall@20": 71.4, "NDCG@1": 70.2, "NDCG@5": 69.8, "NDCG@10": 70.6, "NDCG@20": 69.9}
+{"submission_id": "d80bf05df2437697", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Claude-Sonnet-4-5", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 30.5, "Recall@5": 60.4, "Recall@10": 69.5, "Recall@20": 73.1, "NDCG@1": 69, "NDCG@5": 69.7, "NDCG@10": 70.3, "NDCG@20": 70}
+{"submission_id": "16b8c0a0ce88851b", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Claude-Opus-4-5", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 32.1, "Recall@5": 60.6, "Recall@10": 68.7, "Recall@20": 71.7, "NDCG@1": 69.8, "NDCG@5": 69.9, "NDCG@10": 70.3, "NDCG@20": 69.9}

data/leaderboard.jsonl ADDED Viewed

	@@ -0,0 +1,26 @@

+{"submission_id": "dbd32abd6ef16eef", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "clip-ViT-B-32-multilingual-v1", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 1.2, "Recall@5": 4.1, "Recall@10": 6.1, "Recall@20": 8.8, "NDCG@1": 3.2, "NDCG@5": 4, "NDCG@10": 4.7, "NDCG@20": 5.5}
+{"submission_id": "a59247d2809373ba", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "siglip2-base-patch16-224", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 16.4, "Recall@5": 33.7, "Recall@10": 40, "Recall@20": 47.3, "NDCG@1": 33.1, "NDCG@5": 34.6, "NDCG@10": 36, "NDCG@20": 38.2}
+{"submission_id": "95f663a7c7d53495", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "siglip2-giant-opt-patch16-256", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 20.8, "Recall@5": 40.5, "Recall@10": 47.6, "Recall@20": 54.5, "NDCG@1": 42, "NDCG@5": 42.4, "NDCG@10": 44.1, "NDCG@20": 46}
+{"submission_id": "695d2f72f79a2e1b", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "VLM2Vec", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 23.1, "Recall@5": 44.4, "Recall@10": 52.4, "Recall@20": 60, "NDCG@1": 46, "NDCG@5": 47, "NDCG@10": 48.9, "NDCG@20": 51.3}
+{"submission_id": "141053c18e438d49", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "B3_Qwen2_7B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 20.9, "Recall@5": 41.4, "Recall@10": 49.9, "Recall@20": 57.1, "NDCG@1": 41.9, "NDCG@5": 43.6, "NDCG@10": 45.6, "NDCG@20": 47.8}
+{"submission_id": "200e8177772d5001", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Qwen3-VL-Embedding-2B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 22.7, "Recall@5": 42.5, "Recall@10": 50.6, "Recall@20": 58.2, "NDCG@1": 44.6, "NDCG@5": 45.4, "NDCG@10": 47.4, "NDCG@20": 49.5}
+{"submission_id": "4b46272277c90449", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Qwen3-VL-Embedding-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24.9, "Recall@5": 46.2, "Recall@10": 53, "Recall@20": 59.2, "NDCG@1": 49.7, "NDCG@5": 49.7, "NDCG@10": 50.9, "NDCG@20": 52.6}
+{"submission_id": "d30ad9a666c5a684", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "Ops-MM-embedding-v1", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.8, "Recall@5": 48.7, "Recall@10": 56.6, "Recall@20": 63.7, "NDCG@1": 49.8, "NDCG@5": 51.7, "NDCG@10": 53.5, "NDCG@20": 55.5}
+{"submission_id": "4bd67195881c1ba9", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "RzenEmbed-v2-7B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 27.2, "Recall@5": 49.9, "Recall@10": 58, "Recall@20": 65.1, "NDCG@1": 54.1, "NDCG@5": 54.3, "NDCG@10": 56, "NDCG@20": 57.9}
+{"submission_id": "1a6b01520b759117", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Embedding", "model_name": "QQMM-embed-v2", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 26.6, "Recall@5": 50, "Recall@10": 57.8, "Recall@20": 65.4, "NDCG@1": 52.3, "NDCG@5": 53.7, "NDCG@10": 55.4, "NDCG@20": 57.6}
+{"submission_id": "2da78d02a132aec3", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-small", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.6, "Recall@5": 37.5, "Recall@10": 42.8, "Recall@20": 48.8, "NDCG@1": 40.7, "NDCG@5": 39.6, "NDCG@10": 40.7, "NDCG@20": 42.5}
+{"submission_id": "c3a345e59fb8a397", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-base", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.8, "Recall@5": 38.1, "Recall@10": 44.8, "Recall@20": 51.9, "NDCG@1": 41.6, "NDCG@5": 40.5, "NDCG@10": 42.2, "NDCG@20": 44.3}
+{"submission_id": "4e2e72470575fb93", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "multilingual-e5-large", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24.2, "Recall@5": 40.6, "Recall@10": 47.1, "Recall@20": 54.6, "NDCG@1": 46.3, "NDCG@5": 44.7, "NDCG@10": 45.9, "NDCG@20": 48}
+{"submission_id": "ee3448449553ac08", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "bge-m3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 23, "Recall@5": 41.6, "Recall@10": 48.5, "Recall@20": 56.6, "NDCG@1": 43.8, "NDCG@5": 44.5, "NDCG@10": 46, "NDCG@20": 48.5}
+{"submission_id": "b0fa2bb87791bb47", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-0.6B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 24, "Recall@5": 42, "Recall@10": 48.4, "Recall@20": 54.9, "NDCG@1": 45.9, "NDCG@5": 45.5, "NDCG@10": 46.8, "NDCG@20": 48.6}
+{"submission_id": "7b6b91a8b64da303", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-4B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.6, "Recall@5": 44.5, "Recall@10": 51.6, "Recall@20": 57.9, "NDCG@1": 49.1, "NDCG@5": 48.3, "NDCG@10": 49.8, "NDCG@20": 51.4}
+{"submission_id": "b38199eded78c794", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Caption", "model_name": "Qwen3-Embedding-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 25.4, "Recall@5": 44.8, "Recall@10": 51.7, "Recall@20": 57.8, "NDCG@1": 48.3, "NDCG@5": 48.2, "NDCG@10": 49.7, "NDCG@20": 51.2}
+{"submission_id": "0dbc32eb7e40e180", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "ToolACE-2-Llama-3.1-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 21.3, "Recall@5": 44, "Recall@10": 47.7, "Recall@20": 51.2, "NDCG@1": 48.8, "NDCG@5": 50.9, "NDCG@10": 50.2, "NDCG@20": 50.7}
+{"submission_id": "62d9047f0402776f", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-8B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 14.5, "Recall@5": 51, "Recall@10": 60.4, "Recall@20": 64.1, "NDCG@1": 35, "NDCG@5": 53.5, "NDCG@10": 56.4, "NDCG@20": 56.4}
+{"submission_id": "6921c1b285657dfe", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-32B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 28.3, "Recall@5": 54.1, "Recall@10": 62.5, "Recall@20": 64.5, "NDCG@1": 63.6, "NDCG@5": 63, "NDCG@10": 63.5, "NDCG@20": 62.4}
+{"submission_id": "60fd3fdf06d3f313", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "DeepSeek-v3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 26.8, "Recall@5": 51.6, "Recall@10": 60.1, "Recall@20": 62.4, "NDCG@1": 59.6, "NDCG@5": 59.9, "NDCG@10": 61, "NDCG@20": 60.6}
+{"submission_id": "45249ed7e1ebc2f2", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Qwen3-235B-A22B", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 31, "Recall@5": 60.8, "Recall@10": 67, "Recall@20": 69.6, "NDCG@1": 70, "NDCG@5": 71.4, "NDCG@10": 72, "NDCG@20": 71.8}
+{"submission_id": "695f07e99fefa554", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "GPT-4o", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 28, "Recall@5": 53.1, "Recall@10": 58.5, "Recall@20": 60.3, "NDCG@1": 60.3, "NDCG@5": 61.1, "NDCG@10": 60.5, "NDCG@20": 59.5}
+{"submission_id": "2750290902c58d51", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "OpenAI-o3", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 31.2, "Recall@5": 59.5, "Recall@10": 68.7, "Recall@20": 71.4, "NDCG@1": 70.2, "NDCG@5": 69.8, "NDCG@10": 70.6, "NDCG@20": 69.9}
+{"submission_id": "d80bf05df2437697", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Claude-Sonnet-4-5", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 30.5, "Recall@5": 60.4, "Recall@10": 69.5, "Recall@20": 73.1, "NDCG@1": 69, "NDCG@5": 69.7, "NDCG@10": 70.3, "NDCG@20": 70}
+{"submission_id": "16b8c0a0ce88851b", "timestamp": "2026-04-22T17:31:10.846331Z", "email": "init@example.com", "method": "Agent", "model_name": "Claude-Opus-4-5", "albums": "1,2,3", "evaluated_queries": 0, "is_paper_data": true, "Recall@1": 32.1, "Recall@5": 60.6, "Recall@10": 68.7, "Recall@20": 71.7, "NDCG@1": 69.8, "NDCG@5": 69.9, "NDCG@10": 70.3, "NDCG@20": 69.9}

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio
+pandas
+numpy

src/__init__.py ADDED Viewed

File without changes

src/about.py ADDED Viewed

	@@ -0,0 +1,235 @@

+# HTML for a centred, wrapping row of pill-style links: GitHub, arXiv, the
+# full leaderboard, the dataset, the protected data, and the full data.
+# "Protected LB" is a filled, non-link pill, presumably marking the current page.
+NAVIGATION = """
+<div style="text-align:center; margin-bottom: 24px;">
+    <div style="display:inline-flex; flex-wrap:wrap; gap:8px; justify-content:center;">
+        <a href="https://github.com/LaVieEnRose365/PhotoBench" target="_blank" style="text-decoration:none;">
+            <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500; transition:all 0.2s;">🏠 GitHub</span>
+        </a>
+        <a href="https://arxiv.org/abs/2603.01493v1" target="_blank" style="text-decoration:none;">
+            <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500;">📄 arXiv</span>
+        </a>
+        <a href="https://huggingface.co/spaces/SorrowTea/PhotoBench/" target="_blank" style="text-decoration:none;">
+            <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500;">🏅 Leaderboard</span>
+        </a>
+        <a href="https://huggingface.co/datasets/SorrowTea/PhotoBench" target="_blank" style="text-decoration:none;">
+            <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500;">📊 Dataset</span>
+        </a>
+        <span style="display:inline-block; padding:8px 14px; background:#7CB342; border:1px solid #7CB342; border-radius:10px; color:#fff; font-size:0.85em; font-weight:600;">🛡️ Protected LB</span>
+        <a href="https://huggingface.co/datasets/SorrowTea/PhotoBench-Protected" target="_blank" style="text-decoration:none;">
+            <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500;">📁 Protected Data</span>
+        </a>
+        <a href="https://sbox.myoas.com/l/Be5be4053f6b43840" target="_blank" style="text-decoration:none;">
+            <span style="display:inline-block; padding:8px 14px; background:#fff; border:1px solid #d4e0c8; border-radius:10px; color:#555; font-size:0.85em; font-weight:500;">🔒 Full Data</span>
+        </a>
+    </div>
+</div>
+"""
+# HTML for the page banner: the "PhotoBench-Protected" heading, the
+# "AGENT-ONLY LEADERBOARD" subtitle, a short divider bar, and a tagline.
+TITLE = """
+<div style="text-align:center; padding: 48px 20px; background: linear-gradient(160deg, #f5f9f0 0%, #e8f0e0 100%); border-radius: 20px; margin-bottom: 32px; border: 1px solid #d4e0c8;">
+    <h1 style="color:#1a1a1a; font-size:3em; font-weight:600; letter-spacing:-1px; margin:0; font-family:-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;">
+        PhotoBench-Protected
+    </h1>
+    <p style="color:#7CB342; font-size:1.1em; font-weight:500; letter-spacing:2px; margin:12px 0 0 0;">
+        AGENT-ONLY LEADERBOARD
+    </p>
+    <div style="width:60px; height:3px; background:#7CB342; margin:20px auto; border-radius:2px;"></div>
+    <p style="color:#666; font-size:0.95em; margin-top:12px;">
+        Limited Information Source Benchmark
+    </p>
+</div>
+"""
+# HTML snippet introducing the benchmark; it also links to the full PhotoBench leaderboard and the full-data download.
+INTRODUCTION = """
+<div style="text-align:center; max-width:720px; margin:0 auto 40px; color:#444; line-height:1.8;">
+<strong>PhotoBench-Protected</strong> is our initial open-source release.
+Because only partial model captions, embeddings, and metadata are provided,
+this benchmark focuses exclusively on <strong>agent planning</strong> ability.
+<p style="margin-top:16px; color:#7CB342; font-weight:600;">
+⚠️ Please confirm you are submitting to the correct leaderboard.
+</p>
+<p style="margin-top:12px;">
+The test sets for PhotoBench-Protected and <a href="https://huggingface.co/spaces/SorrowTea/PhotoBench/" target="_blank" style="color:#7CB342; font-weight:600; text-decoration:none;">PhotoBench (full) ↗</a> are different.
+For unrestricted retrieval with raw images, please use the
+<a href="https://huggingface.co/spaces/SorrowTea/PhotoBench/" target="_blank" style="color:#7CB342; font-weight:600; text-decoration:none;">full PhotoBench leaderboard ↗</a>.
+Full dataset download: <a href="https://sbox.myoas.com/l/Be5be4053f6b43840" target="_blank" style="color:#7CB342; font-weight:600; text-decoration:none;">OneBox ↗</a>.
+</p>
+</div>
+"""
+# Markdown text describing the submission file: a JSON array of objects with `album_id`, `query_en` and `pred`.
+SUBMISSION_GUIDE = """
+### Submission Format
+Upload a JSON file containing an array of prediction objects:
+```json
+[
+  {
+    "album_id": "1",
+    "query_en": "cluttered desk",
+    "pred": ["IMG_1234.jpg", "IMG_5678.jpg", ...]
+  }
+]
+```
+**Required fields:**
+- `album_id`: Album number (1, 2, or 3)
+- `query_en`: The English query text (must match exactly)
+- `pred`: Ordered list of predicted image filenames
+You may submit results for any subset of albums. Partial submissions are accepted and evaluated.
+"""
+# Markdown text listing the reported metrics (Recall@k, NDCG@k), the supported k values and how scores are averaged.
+EVALUATION_INFO = """
+### Evaluation Metrics
+| Metric | Description |
+|--------|-------------|
+| **Recall@k** | Proportion of ground-truth images found in top-k predictions |
+| **NDCG@k** | Normalized Discounted Cumulative Gain at rank k |
+Supported k values: **1, 5, 10, 20, 50, 100**
+Results are averaged across all evaluated queries per album, then averaged across albums for the final leaderboard score.
+"""
+# CSS overrides for the green theme: page body, tabs, primary buttons, markdown, data table, inputs, labels and JSON output.
+# NOTE(review): selectors such as `.gr-button-primary`, `.gr-dataframe` and `.tab-buttons` are assumed to match the
+# class names emitted by the deployed Gradio version — confirm, otherwise these rules silently do nothing.
+custom_css = """
+/* Grass-green clean theme */
+body {
+    background: #f5f9f0 !important;
+    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif !important;
+    font-size: 17px !important;
+}
+/* Tab buttons */
+.tab-buttons button {
+    font-weight: 500 !important;
+    font-size: 0.9em !important;
+    border-radius: 10px 10px 0 0 !important;
+    padding: 12px 24px !important;
+    background: #e0ead8 !important;
+    color: #555 !important;
+    border: none !important;
+    transition: all 0.25s ease !important;
+}
+.tab-buttons button.selected {
+    background: #fff !important;
+    color: #1a1a1a !important;
+    box-shadow: 0 -2px 0 #7CB342 inset !important;
+}
+/* Primary buttons */
+.gr-button-primary {
+    background: #7CB342 !important;
+    border: none !important;
+    border-radius: 10px !important;
+    color: #fff !important;
+    font-weight: 600 !important;
+    font-size: 0.95em !important;
+    padding: 12px 28px !important;
+    transition: all 0.25s ease !important;
+}
+.gr-button-primary:hover {
+    background: #6ba32e !important;
+    transform: translateY(-1px) !important;
+    box-shadow: 0 6px 20px rgba(124,179,66,0.25) !important;
+}
+/* Markdown */
+.markdown-text {
+    max-width: 780px;
+    margin: 0 auto;
+    color: #333;
+    line-height: 1.8;
+    font-size: 1.05em;
+}
+/* DataFrame Table */
+.gr-dataframe {
+    border-radius: 14px !important;
+    overflow: hidden !important;
+    box-shadow: 0 2px 16px rgba(0,0,0,0.06) !important;
+    border: 1px solid #d4e0c8 !important;
+    font-size: 0.95em !important;
+}
+.gr-dataframe th {
+    background: #e8f0e0 !important;
+    color: #444 !important;
+    font-weight: 600 !important;
+    font-size: 0.8em !important;
+    text-transform: uppercase !important;
+    letter-spacing: 0.5px !important;
+    padding: 14px 10px !important;
+    border-bottom: 2px solid #d4e0c8 !important;
+}
+.gr-dataframe td {
+    padding: 12px 10px !important;
+    border-bottom: 1px solid #e0ead8 !important;
+    color: #333 !important;
+}
+.gr-dataframe tr:hover td {
+    background: #f0f7e8 !important;
+}
+/* Inputs */
+input, textarea, select {
+    border-radius: 10px !important;
+    border: 1px solid #c4d4b4 !important;
+    background: #fff !important;
+    font-size: 1em !important;
+    padding: 10px 14px !important;
+}
+input:focus, textarea:focus, select:focus {
+    border-color: #7CB342 !important;
+    box-shadow: 0 0 0 3px rgba(124,179,66,0.12) !important;
+    outline: none !important;
+}
+/* Form containers */
+.gr-form .gr-box {
+    border-radius: 14px !important;
+    background: #fff !important;
+    border: 1px solid #d4e0c8 !important;
+    padding: 24px !important;
+}
+/* Labels */
+.gr-input-label, .gr-dropdown-label {
+    font-weight: 500 !important;
+    color: #444 !important;
+    font-size: 0.9em !important;
+    margin-bottom: 6px !important;
+}
+/* JSON output */
+.gr-json {
+    border-radius: 12px !important;
+    background: #f5f9f0 !important;
+    border: 1px solid #d4e0c8 !important;
+    font-size: 0.9em !important;
+}
+/* Center submit form */
+#submit-form-container {
+    max-width: 600px;
+    margin: 0 auto;
+}
+/* Section headers */
+.gr-tab-item h3 {
+    color: #1a1a1a !important;
+    font-weight: 600 !important;
+    font-size: 1.2em !important;
+    margin-top: 24px;
+    margin-bottom: 12px;
+}
+"""

src/evaluator.py ADDED Viewed

	@@ -0,0 +1,183 @@

+import json
+import logging
+from math import log2
+from pathlib import Path
+from typing import Any, Dict, List
+import numpy as np
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Default directory holding the per-album ground-truth answer files.
+GT_DIR = Path("/data") / "gt"
+# Rank cut-offs k at which Recall@k and NDCG@k are computed.
+K_VALUES = [1, 5, 10, 20, 50, 100]
+class Evaluator:
+    def __init__(self, gt_dir: str | Path | None = None):
+        self.gt_dir = Path(gt_dir) if gt_dir else GT_DIR
+        self._gt_cache: Dict[str, list] = {}
+    def _load_gt(self, album_id: str) -> list:
+        if album_id in self._gt_cache:
+            return self._gt_cache[album_id]
+        gt_file = self.gt_dir / f"album{album_id}_test_answer.json"
+        if not gt_file.exists():
+            raise FileNotFoundError(f"Ground truth file not found: {gt_file}")
+        with open(gt_file, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        self._gt_cache[album_id] = data
+        return data
+    def validate_json_format(self, data: Any) -> list[str]:
+        errors = []
+        if not isinstance(data, list):
+            return ["Root must be a JSON array"]
+        if len(data) == 0:
+            return ["Submission is empty"]
+        for i, item in enumerate(data):
+            if not isinstance(item, dict):
+                errors.append(f"Item #{i} must be an object")
+                continue
+            if "album_id" not in item or str(item["album_id"]) not in ["1", "2", "3"]:
+                errors.append(f"Item #{i} 'album_id' must be '1', '2', or '3'")
+            if "query_en" not in item or not isinstance(item["query_en"], str):
+                errors.append(f"Item #{i} 'query_en' must be a string")
+            if (
+                "pred" not in item
+                or not isinstance(item["pred"], list)
+                or not all(isinstance(x, str) for x in item["pred"])
+            ):
+                errors.append(f"Item #{i} 'pred' must be a list of strings")
+        return errors
+    def _dcg_at_k(self, r, k):
+        r = np.asarray(r, dtype=float)[:k]
+        if r.size:
+            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
+        return 0.0
+    def _ndcg_at_k(self, r, k):
+        dcg_max = self._dcg_at_k(sorted(r, reverse=True), k)
+        if not dcg_max:
+            return 0.0
+        return self._dcg_at_k(r, k) / dcg_max
+    def _recall_at_k(self, ground_truth, predictions, k):
+        k_preds = predictions[:k]
+        hits = len(set(ground_truth) & set(k_preds))
+        if len(ground_truth) == 0:
+            return 0.0
+        return hits / len(ground_truth)
+    def _evaluate_album(self, album_submissions: dict, album_id: str) -> dict:
+        """Evaluate a single album."""
+        gt_data = self._load_gt(album_id)
+        gt_map = {item["query_en"]: item for item in gt_data}
+        metrics_accum = {f"Recall@{k}": [] for k in K_VALUES}
+        metrics_accum.update({f"NDCG@{k}": [] for k in K_VALUES})
+        metrics_accum["Recall"] = []
+        metrics_accum["NDCG"] = []
+        source_accum = {}
+        empty_gt_queries = 0
+        evaluated_queries = 0
+        for q, pred in album_submissions.items():
+            if q not in gt_map:
+                continue
+            gt_item = gt_map[q]
+            gt_answers = gt_item.get("ground_truth", [])
+            source = gt_item.get("Source")
+            evaluated_queries += 1
+            if not gt_answers:
+                empty_gt_queries += 1
+                continue
+            r = [1 if p in gt_answers else 0 for p in pred]
+            dcg_r = [1.0] * len(gt_answers)
+            m = {}
+            for k in K_VALUES:
+                m[f"Recall@{k}"] = self._recall_at_k(gt_answers, pred, k)
+                idcg = self._dcg_at_k(dcg_r, k)
+                ndcg = self._dcg_at_k(r, k) / idcg if idcg > 0 else 0.0
+                m[f"NDCG@{k}"] = ndcg
+                metrics_accum[f"Recall@{k}"].append(m[f"Recall@{k}"])
+                metrics_accum[f"NDCG@{k}"].append(m[f"NDCG@{k}"])
+            m["Recall"] = sum(r) / len(gt_answers)
+            idcg_all = self._dcg_at_k(dcg_r, len(gt_answers))
+            ndcg_all = self._dcg_at_k(r, len(r)) / idcg_all if idcg_all > 0 else 0.0
+            m["NDCG"] = ndcg_all
+            metrics_accum["Recall"].append(m["Recall"])
+            metrics_accum["NDCG"].append(m["NDCG"])
+            if source is not None:
+                if source not in source_accum:
+                    source_accum[source] = {f"Recall@{_k}": [] for _k in K_VALUES}
+                    source_accum[source].update({f"NDCG@{_k}": [] for _k in K_VALUES})
+                    source_accum[source]["Recall"] = []
+                    source_accum[source]["NDCG"] = []
+                for k in K_VALUES:
+                    source_accum[source][f"Recall@{k}"].append(m[f"Recall@{k}"])
+                    source_accum[source][f"NDCG@{k}"].append(m[f"NDCG@{k}"])
+                source_accum[source]["Recall"].append(m["Recall"])
+                source_accum[source]["NDCG"].append(m["NDCG"])
+        global_metrics = {
+            k: float(np.mean(v)) if v else 0.0 for k, v in metrics_accum.items()
+        }
+        return {
+            "global_metrics": global_metrics,
+            "source_metrics": {
+                src: {k: float(np.mean(v)) if v else 0.0 for k, v in m_dict.items()}
+                for src, m_dict in source_accum.items()
+            },
+            "empty_gt_ratio": empty_gt_queries / evaluated_queries if evaluated_queries > 0 else 0.0,
+            "evaluated_queries": evaluated_queries,
+            "total_gt_queries": len(gt_data),
+            "is_partial": evaluated_queries < len(gt_data),
+        }
+    def evaluate(self, submission_data: list) -> dict:
+        albums = {}
+        for item in submission_data:
+            a_id = str(item["album_id"])
+            if a_id not in albums:
+                albums[a_id] = {}
+            albums[a_id][item["query_en"]] = item["pred"]
+        if not albums:
+            raise ValueError("No valid albums found in submission.")
+        # Evaluate each album separately
+        per_album = {}
+        for a_id in sorted(albums.keys()):
+            per_album[a_id] = self._evaluate_album(albums[a_id], a_id)
+        # Compute averaged metrics across all albums
+        avg_metrics = {}
+        for metric_key in per_album[list(per_album.keys())[0]]["global_metrics"].keys():
+            values = [alb["global_metrics"][metric_key] for alb in per_album.values() if metric_key in alb["global_metrics"]]
+            avg_metrics[metric_key] = float(np.mean(values)) if values else 0.0
+        total_evaluated = sum(alb["evaluated_queries"] for alb in per_album.values())
+        total_gt = sum(alb["total_gt_queries"] for alb in per_album.values())
+        result = {
+            "per_album": per_album,
+            "global_metrics": avg_metrics,
+            "evaluated_queries": total_evaluated,
+            "total_gt_queries": total_gt,
+            "is_partial": total_evaluated < total_gt,
+            "albums": sorted(albums.keys()),
+        }
+        if result["is_partial"]:
+            missing = [a for a in ["1", "2", "3"] if a not in albums]
+            result["warning"] = f"Submission incomplete. Missing albums: {', '.join(missing)}. Averaged results across submitted albums shown below."
+        return result

src/leaderboard_manager.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import hashlib
+import json
+from datetime import datetime, timedelta
+from typing import Any
+import pandas as pd
+from src.storage import load_leaderboard, save_leaderboard
+# All available metric columns (computed)
+ALL_METRIC_COLS = [
+    "Recall@1", "Recall@5", "Recall@10", "Recall@20", "Recall@50", "Recall@100",
+    "NDCG@1", "NDCG@5", "NDCG@10", "NDCG@20", "NDCG@50", "NDCG@100",
+]
+# Default columns shown on leaderboard
+DEFAULT_DISPLAY_METRICS = [
+    "Recall@1", "Recall@5", "Recall@20", "Recall@50",
+    "NDCG@1", "NDCG@5", "NDCG@20", "NDCG@50",
+]
+# Base columns always shown
+BASE_COLS = ["rank", "model_name"]
+# Metric used for the default ordering, for picking the protected top entries during
+# cleanup, and for deciding whether a resubmission beats the stored one.
+# NOTE(review): "Recall@10" is not in DEFAULT_DISPLAY_METRICS, so by default the
+# table is ordered by a column that is not shown — confirm this is intended.
+_DEFAULT_SORT = "Recall@10"
+# Default number of rows returned for display; also the number of top entries exempt from cleanup.
+_TOP_N = 30
+# Age in days after which an entry becomes eligible for cleanup.
+_RETENTION_DAYS = 30
+def make_id(email: str, model_name: str) -> str:
+    return hashlib.sha256(f"{email}:{model_name}".encode()).hexdigest()[:16]
+class LeaderboardManager:
+    def __init__(self):
+        self._entries: list[dict] = []
+        self._load()
+        self._cleanup()
+    def _load(self):
+        raw = load_leaderboard()
+        self._entries = raw
+    def _save(self):
+        save_leaderboard(self._entries)
+    def _cleanup(self):
+        """Remove non-paper entries older than 30 days that are not in top 30."""
+        if not self._entries:
+            return
+        df = pd.DataFrame(self._entries)
+        if _DEFAULT_SORT in df.columns:
+            top_ids = set(
+                df.sort_values(by=_DEFAULT_SORT, ascending=False)
+                .head(_TOP_N)["submission_id"]
+                .tolist()
+            )
+        else:
+            top_ids = set()
+        cutoff = datetime.utcnow() - timedelta(days=_RETENTION_DAYS)
+        kept = []
+        for e in self._entries:
+            sid = e.get("submission_id", "")
+            is_paper = e.get("is_paper_data", False)
+            ts_str = e.get("timestamp", "")
+            try:
+                ts = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
+            except Exception:
+                ts = datetime.utcnow()
+            if is_paper or sid in top_ids or ts >= cutoff:
+                kept.append(e)
+        removed = len(self._entries) - len(kept)
+        if removed > 0:
+            print(f"[CLEANUP] Removed {removed} expired entries")
+            self._entries = kept
+            self._save()
+    def add_result(
+        self,
+        email: str,
+        method: str,
+        model_name: str,
+        albums: list[str],
+        evaluated_queries: int,
+        total_gt_queries: int,
+        global_metrics: dict,
+    ) -> dict | None:
+        """Add a new evaluation result. Returns entry if added, None if not eligible."""
+        # Must be a full submission (all 3 albums, all queries matched)
+        if set(albums) != {"1", "2", "3"}:
+            return None
+        if evaluated_queries < total_gt_queries:
+            return None
+        submission_id = make_id(email, model_name)
+        entry = {
+            "submission_id": submission_id,
+            "timestamp": datetime.utcnow().isoformat() + "Z",
+            "email": email,
+            "method": method,
+            "model_name": model_name,
+            "albums": ",".join(albums),
+            "is_paper_data": False,
+            **{k: round(v, 4) for k, v in global_metrics.items() if k in ALL_METRIC_COLS or k in ("Recall", "NDCG")},
+        }
+        # Keep best score per (email, model_name)
+        key = (email, model_name)
+        existing_idx = None
+        for i, e in enumerate(self._entries):
+            if (e.get("email"), e.get("model_name")) == key:
+                existing_idx = i
+                break
+        if existing_idx is not None:
+            old = self._entries[existing_idx]
+            if global_metrics.get(_DEFAULT_SORT, 0) >= old.get(_DEFAULT_SORT, 0):
+                self._entries[existing_idx] = entry
+        else:
+            self._entries.append(entry)
+        self._save()
+        return entry
+    def get_display_df(
+        self,
+        method_filter: str | None = None,
+        sort_by: str = _DEFAULT_SORT,
+        ascending: bool = False,
+        top_n: int = _TOP_N,
+        metric_cols: list[str] | None = None,
+    ) -> pd.DataFrame:
+        """Return a pandas DataFrame ready for gr.DataFrame."""
+        cols_to_show = BASE_COLS + (metric_cols or DEFAULT_DISPLAY_METRICS)
+        if not self._entries:
+            return pd.DataFrame(columns=cols_to_show)
+        df = pd.DataFrame(self._entries)
+        if method_filter and method_filter != "All":
+            df = df[df["method"] == method_filter]
+        if sort_by not in df.columns:
+            sort_by = _DEFAULT_SORT
+        df = df.sort_values(by=sort_by, ascending=ascending)
+        df = df.head(top_n).reset_index(drop=True)
+        df["rank"] = df.index + 1
+        available = [c for c in cols_to_show if c in df.columns]
+        df = df[available]
+        return df
+    def remove_entry(self, submission_id: str) -> bool:
+        """Remove an entry by submission_id. Returns True if removed."""
+        original_len = len(self._entries)
+        self._entries = [e for e in self._entries if e.get("submission_id") != submission_id]
+        if len(self._entries) < original_len:
+            self._save()
+            return True
+        return False

src/storage.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import json
+import os
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Any
+# Root directory for everything this module reads and writes.
+STORAGE_DIR = Path("/data")
+# One JSON file per saved submission.
+SUBMISSIONS_DIR = STORAGE_DIR / "submissions"
+# Leaderboard entries, one JSON object per line.
+LEADERBOARD_FILE = STORAGE_DIR / "leaderboard.jsonl"
+# JSON mapping of submitter email to the ISO timestamp of the last recorded submission.
+RATE_LIMIT_FILE = STORAGE_DIR / "rate_limits.json"
+# Seed data bundled with the app (used on first boot)
+SEED_LEADERBOARD = Path(__file__).parent.parent / "assets" / "leaderboard.jsonl"
+def ensure_dirs():
+    STORAGE_DIR.mkdir(parents=True, exist_ok=True)
+    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
+def _seed_leaderboard():
+    """Copy bundled leaderboard data to /data on first boot."""
+    if LEADERBOARD_FILE.exists():
+        return
+    if SEED_LEADERBOARD.exists():
+        import shutil
+        shutil.copy(SEED_LEADERBOARD, LEADERBOARD_FILE)
+        print(f"[SEED] Copied leaderboard data from {SEED_LEADERBOARD} to {LEADERBOARD_FILE}")
+def save_submission(submission_id: str, payload: dict) -> str:
+    """Save raw submission JSON to local storage."""
+    ensure_dirs()
+    file_path = SUBMISSIONS_DIR / f"{submission_id}.json"
+    with open(file_path, "w", encoding="utf-8") as f:
+        json.dump(payload, f, ensure_ascii=False, indent=2)
+    return str(file_path)
+def list_submissions() -> list[dict]:
+    """List all submission metadata."""
+    ensure_dirs()
+    results = []
+    for f in sorted(SUBMISSIONS_DIR.glob("*.json")):
+        try:
+            with open(f, "r", encoding="utf-8") as fp:
+                data = json.load(fp)
+            meta = data.get("meta", {})
+            meta["file"] = str(f.name)
+            results.append(meta)
+        except Exception:
+            continue
+    return results
+def load_leaderboard() -> list[dict]:
+    """Load current leaderboard data from local storage."""
+    _seed_leaderboard()
+    if not LEADERBOARD_FILE.exists():
+        return []
+    entries = []
+    with open(LEADERBOARD_FILE, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                entries.append(json.loads(line))
+    return entries
+def save_leaderboard(entries: list[dict]) -> None:
+    """Overwrite leaderboard file with the current entries."""
+    ensure_dirs()
+    with open(LEADERBOARD_FILE, "w", encoding="utf-8") as f:
+        for entry in entries:
+            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+# ---- Rate limiting ----
+def check_rate_limit(email: str, cooldown_minutes: int = 60) -> tuple[bool, str]:
+    """Check if email is allowed to submit. Returns (allowed, message)."""
+    ensure_dirs()
+    limits = {}
+    if RATE_LIMIT_FILE.exists():
+        with open(RATE_LIMIT_FILE, "r", encoding="utf-8") as f:
+            limits = json.load(f)
+    last_str = limits.get(email)
+    if last_str:
+        last_time = datetime.fromisoformat(last_str)
+        next_allowed = last_time + timedelta(minutes=cooldown_minutes)
+        if datetime.utcnow() < next_allowed:
+            remaining = int((next_allowed - datetime.utcnow()).total_seconds() / 60)
+            return False, f"This email has already submitted within the last hour. Please wait {remaining} minutes."
+    return True, ""
+def record_submission_time(email: str) -> None:
+    """Record the current submission time for an email."""
+    ensure_dirs()
+    limits = {}
+    if RATE_LIMIT_FILE.exists():
+        with open(RATE_LIMIT_FILE, "r", encoding="utf-8") as f:
+            limits = json.load(f)
+    limits[email] = datetime.utcnow().isoformat()
+    with open(RATE_LIMIT_FILE, "w", encoding="utf-8") as f:
+        json.dump(limits, f, ensure_ascii=False, indent=2)