|
|
""" |
|
|
app.py - DISBench Leaderboard Main Application |
|
|
|
|
|
Startup Flow: |
|
|
1. Space rebuild (triggered by PR merge) → Docker container starts |
|
|
2. Call evaluate.run_evaluation() to scan new submissions in submissions/ |
|
|
3. Calculate EM/F1 scores for new submissions, update leaderboard_data.json |
|
|
4. Commit updated data back to repository (persistence) |
|
|
5. Start Flask Web server |
|
|
""" |
|
|
|
|
|
import json
import logging
import os
from datetime import datetime, timezone

from flask import Flask, render_template, request, redirect, url_for, jsonify
from huggingface_hub import HfApi, CommitOperationAdd

from evaluate import run_evaluation, commit_leaderboard_to_repo
|
|
|
|
|
# Timestamped INFO-level logging so Space startup progress is visible in the logs.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

logger = logging.getLogger(__name__)

app = Flask(__name__)

# Session-signing key; override via the SECRET_KEY env var in production —
# the hard-coded fallback is only a local-development convenience.
app.secret_key = os.environ.get("SECRET_KEY", "disbench-leaderboard-secret-key")

# Persisted leaderboard state, refreshed by evaluate.run_evaluation() at startup.
LEADERBOARD_FILE = "leaderboard_data.json"

# Directory holding submission JSON files (the single source of truth);
# created eagerly so local saves never fail on a missing directory.
SUBMISSIONS_DIR = "submissions"

os.makedirs(SUBMISSIONS_DIR, exist_ok=True)

# HF_TOKEN must be configured as a Space secret for PR creation to work;
# without it, submissions are only saved locally (see upload_file()).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Target repository for submission PRs (the Space itself by default).
SPACE_ID = os.environ.get("SPACE_ID", "RUC-NLPIR/DISBench-Leaderboard")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def startup_evaluation():
    """
    Run the submission evaluation pipeline once when the app starts.

    When maintainers merge a PR containing new submission files, the HF
    Space rebuilds and restarts, and this function is invoked to:
    - scan every file under the submissions/ directory
    - re-evaluate all submissions (configuration combinations deduplicate)
    - score results against groundtruth.jsonl
    - refresh leaderboard_data.json
    - commit the refreshed data back to the repository for persistence

    Notes:
    - every startup re-evaluates all files, which keeps the logic simple
    - submissions/ is the single source of truth
    - evaluation is fast, so startup speed is unaffected
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("DISBench: Running startup evaluation...")
    logger.info(banner)

    try:
        total, _ = run_evaluation()

        if total > 0:
            logger.info("Evaluated all submissions. Committing to repo...")
            commit_leaderboard_to_repo()
        else:
            logger.info("No submissions found.")

        logger.info(f"Leaderboard has {total} unique configurations. Ready to serve.")
    except Exception as e:
        # Never block the web server from starting — fall back to whatever
        # leaderboard data is already on disk.
        logger.error(f"Startup evaluation failed: {e}")
        logger.info("Continuing with existing leaderboard data...")
|
|
|
|
|
|
|
|
|
|
|
# Run the evaluation pipeline at import time so the leaderboard is fresh
# before Flask begins serving requests.
startup_evaluation()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_leaderboard(path=None):
    """
    Load leaderboard entries from disk.

    Args:
        path: JSON file to read; defaults to LEADERBOARD_FILE. The parameter
            is optional so existing callers are unaffected.

    Returns:
        Parsed leaderboard data (a list of entries), or [] when the file is
        missing or unreadable — a corrupt file must not take the UI down.
    """
    if path is None:
        path = LEADERBOARD_FILE
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return []
    except (json.JSONDecodeError, OSError) as e:
        # Serve an empty board rather than crashing every page render.
        logging.getLogger(__name__).warning("Could not read leaderboard file %s: %s", path, e)
        return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def validate_submission(submission):
    """
    Check a parsed submission payload for structural problems.

    Args:
        submission: decoded JSON payload from the uploader.

    Returns:
        list[str]: human-readable error messages; empty when the submission
        is structurally valid.
    """
    if not isinstance(submission, dict):
        return ["Submission must be a JSON object with 'meta' and 'predictions' fields."]

    problems = []
    meta = submission.get("meta")
    preds = submission.get("predictions")

    if meta and isinstance(meta, dict):
        # Required metadata keys.
        problems.extend(
            f"Missing required field: meta.{field}"
            for field in ["method_name"]
            if field not in meta
        )
        # Track is optional, but when present it must be a known value.
        valid_tracks = ["Standard", "Open"]
        track = meta.get("track")
        if track and track not in valid_tracks:
            problems.append(f"meta.track must be one of: {valid_tracks}")
    else:
        problems.append("Missing or invalid 'meta' field.")

    if not (preds and isinstance(preds, dict)):
        problems.append("Missing or invalid 'predictions' field.")

    return problems
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_pr_submission(submission_json, method_name):
    """
    Open a Pull Request on the Space repo that adds the submission file.

    Uploads the submission JSON into the submissions/ directory via the HF
    Hub API as a PR, so maintainers can review it before it enters the
    leaderboard.

    Args:
        submission_json: validated submission payload (dict with
            'meta' and 'predictions').
        method_name: display name from meta.method_name; used in the
            file name and commit message.

    Returns:
        CommitInfo from huggingface_hub (typically carries a `pr_url`).

    Raises:
        RuntimeError: when HF_TOKEN is not configured in the Space secrets.
    """
    if not HF_TOKEN:
        raise RuntimeError(
            "HF_TOKEN not configured. Please set the HF_TOKEN secret in your Space settings."
        )

    api = HfApi(token=HF_TOKEN)

    # Build a unique, filesystem-safe name: "<method>_<timestamp>.json".
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_name = method_name.replace(" ", "-").replace("/", "_")
    filename = f"{safe_name}_{timestamp}.json"
    # BUG FIX: the generated filename was not interpolated here before, so
    # every PR targeted the same literal repo path instead of a unique file.
    path_in_repo = f"submissions/{filename}"

    content = json.dumps(submission_json, indent=2, ensure_ascii=False).encode("utf-8")

    # Look up the meta dict once instead of re-fetching it per line.
    meta = submission_json.get("meta", {})

    commit_info = api.create_commit(
        repo_id=SPACE_ID,
        repo_type="space",
        operations=[
            CommitOperationAdd(
                path_in_repo=path_in_repo,
                path_or_fileobj=content,
            )
        ],
        commit_message=f"[Submission] Add results for {method_name}",
        commit_description=(
            f"**Method**: {method_name}\n"
            f"**Organization**: {meta.get('organization', 'N/A')}\n"
            f"**Track**: {meta.get('track', 'N/A')}\n"
            f"**Agent**: {meta.get('agent_framework', 'N/A')}\n"
            f"**Backbone**: {meta.get('backbone_model', 'N/A')}\n"
            f"**Retriever**: {meta.get('retriever_model', 'N/A')}\n\n"
            # BUG FIX: the label says "UTC", so use an actual UTC timestamp
            # (previously this was server-local time mislabelled as UTC).
            f"Submitted at {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}"
        ),
        create_pr=True,
    )

    return commit_info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/')
def index():
    """Render the leaderboard landing page with the current standings."""
    return render_template('index.html', data=load_leaderboard())
|
|
|
|
|
|
|
|
@app.route('/upload', methods=['POST'])
def upload_file():
    """
    Handle a submission upload: validate, save a local copy, open a PR.

    Returns a JSON payload; HTTP 400 for client errors, otherwise 200 with
    a human-readable message and (when available) the PR URL.
    """
    uploaded = request.files.get('file')
    if uploaded is None:
        return jsonify({"success": False, "error": "No file uploaded."}), 400
    if uploaded.filename == '':
        return jsonify({"success": False, "error": "No file selected."}), 400

    try:
        submission = json.load(uploaded)
    except json.JSONDecodeError as e:
        return jsonify({"success": False, "error": f"Invalid JSON file: {e}"}), 400

    problems = validate_submission(submission)
    if problems:
        return jsonify({"success": False, "error": "Validation failed.", "details": problems}), 400

    method_name = submission["meta"]["method_name"]

    # Keep a local copy regardless of whether PR creation succeeds.
    safe_name = method_name.replace(" ", "-").replace("/", "_")
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    local_path = os.path.join(SUBMISSIONS_DIR, f"{safe_name}_{stamp}.json")
    with open(local_path, 'w', encoding='utf-8') as fh:
        json.dump(submission, fh, indent=2, ensure_ascii=False)

    try:
        commit_info = create_pr_submission(submission, method_name)
    except RuntimeError as e:
        # HF_TOKEN missing — a recoverable configuration state, not an error
        # from the submitter's point of view.
        return jsonify({
            "success": True,
            "message": (
                f"Submission for '{method_name}' saved locally. "
                f"PR creation skipped: {str(e)}. "
                f"Maintainers will review it manually."
            ),
            "pr_url": None,
        })
    except Exception as e:
        return jsonify({
            "success": True,
            "message": (
                f"Submission for '{method_name}' saved locally, "
                f"but PR creation failed: {str(e)}. "
                f"Please contact the maintainers."
            ),
            "pr_url": None,
        })

    pr_url = getattr(commit_info, 'pr_url', None)
    return jsonify({
        "success": True,
        "message": f"Submission for '{method_name}' has been submitted as a Pull Request!",
        "pr_url": pr_url or f"https://huggingface.co/spaces/{SPACE_ID}/discussions",
    })
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Port 7860 is the default exposed port for Hugging Face Spaces;
    # 0.0.0.0 is required so the container's port mapping reaches Flask.
    app.run(debug=False, host="0.0.0.0", port=7860)