"""
app.py - DISBench Leaderboard Main Application
Startup Flow:
1. Space rebuild (triggered by PR merge) → Docker container starts
2. Call evaluate.run_evaluation() to scan new submissions in submissions/
3. Calculate EM/F1 scores for new submissions, update leaderboard_data.json
4. Commit updated data back to repository (persistence)
5. Start Flask Web server
"""
import json
import logging
import os
from datetime import datetime, timezone

from flask import Flask, jsonify, redirect, render_template, request, url_for
from huggingface_hub import CommitOperationAdd, HfApi

# Evaluation module
from evaluate import commit_leaderboard_to_repo, run_evaluation
# Module-wide logging: timestamped INFO-level messages to stdout.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
app = Flask(__name__)
# NOTE(review): falls back to a hardcoded secret key when SECRET_KEY is not
# set — fine for a public read-mostly leaderboard, but confirm no session
# data needs to be tamper-proof.
app.secret_key = os.environ.get("SECRET_KEY", "disbench-leaderboard-secret-key")
# --- Configuration ---
LEADERBOARD_FILE = "leaderboard_data.json"  # evaluated scores, JSON list
SUBMISSIONS_DIR = "submissions"  # single source of truth for raw submissions
os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
# HuggingFace Space configuration
HF_TOKEN = os.environ.get("HF_TOKEN")  # write token; PR creation is skipped without it
SPACE_ID = os.environ.get("SPACE_ID", "RUC-NLPIR/DISBench-Leaderboard")
# ============================================================
# Automatic Evaluation on Startup
# ============================================================
def startup_evaluation():
    """
    Re-evaluate every submission at startup and persist the results.

    A merged submission PR triggers an HF Space rebuild and restart, which
    runs this function before the web server starts:
      - scan every file under submissions/ (the single source of truth),
      - re-score all submissions against the ground truth,
      - refresh leaderboard_data.json,
      - commit the refreshed data back to the repository.

    Any failure is logged and swallowed so the server can still start and
    serve whatever leaderboard data already exists.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("DISBench: Running startup evaluation...")
    logger.info(banner)
    try:
        total, _ = run_evaluation()
        if total > 0:
            logger.info("Evaluated all submissions. Committing to repo...")
            commit_leaderboard_to_repo()
        else:
            logger.info("No submissions found.")
        logger.info(f"Leaderboard has {total} unique configurations. Ready to serve.")
    except Exception as exc:
        logger.error(f"Startup evaluation failed: {exc}")
        logger.info("Continuing with existing leaderboard data...")
# Execute startup evaluation at import time so a Space rebuild re-scores
# all submissions before any request is served.
startup_evaluation()
# ============================================================
# Data Loading
# ============================================================
def load_leaderboard(path=None):
    """
    Load leaderboard entries from a JSON file.

    Args:
        path: Optional path to the leaderboard JSON file. Defaults to the
            module-level ``LEADERBOARD_FILE``.

    Returns:
        The parsed JSON content (a list of entries), or an empty list when
        the file does not exist.
    """
    if path is None:
        path = LEADERBOARD_FILE
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
# ============================================================
# Submission Validation
# ============================================================
def validate_submission(submission):
    """
    Check an uploaded submission for structural problems.

    A valid submission is a dict with a non-empty 'meta' dict (containing
    at least 'method_name', with an optional 'track' restricted to
    Standard/Open) and a non-empty 'predictions' dict.

    Returns:
        A list of human-readable error strings; empty when valid.
    """
    if not isinstance(submission, dict):
        return ["Submission must be a JSON object with 'meta' and 'predictions' fields."]

    problems = []
    meta = submission.get("meta")
    predictions = submission.get("predictions")

    if isinstance(meta, dict) and meta:
        problems.extend(
            f"Missing required field: meta.{name}"
            for name in ("method_name",)
            if name not in meta
        )
        allowed_tracks = ["Standard", "Open"]
        track = meta.get("track")
        if track and track not in allowed_tracks:
            problems.append(f"meta.track must be one of: {allowed_tracks}")
    else:
        problems.append("Missing or invalid 'meta' field.")

    if not (predictions and isinstance(predictions, dict)):
        problems.append("Missing or invalid 'predictions' field.")
    return problems
# ============================================================
# PR Creation
# ============================================================
def create_pr_submission(submission_json, method_name):
    """
    Open a Pull Request on the HF Space that adds the submission file
    under submissions/.

    Args:
        submission_json: The validated submission payload (dict with
            'meta' and 'predictions').
        method_name: Display name of the method; used in the filename and
            the commit message.

    Returns:
        The CommitInfo from ``HfApi.create_commit`` (its ``pr_url``
        attribute points at the created PR).

    Raises:
        RuntimeError: If the HF_TOKEN secret is not configured.
    """
    if not HF_TOKEN:
        raise RuntimeError(
            "HF_TOKEN not configured. Please set the HF_TOKEN secret in your Space settings."
        )
    api = HfApi(token=HF_TOKEN)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_name = method_name.replace(" ", "-").replace("/", "_")
    filename = f"{safe_name}_{timestamp}.json"
    # BUG FIX: the computed filename was previously not interpolated into the
    # repo path (it was the literal string "submissions/(unknown)"), so every
    # PR targeted the same file and submissions overwrote each other.
    path_in_repo = f"submissions/{filename}"
    content = json.dumps(submission_json, indent=2, ensure_ascii=False).encode("utf-8")
    meta = submission_json.get("meta", {})
    commit_info = api.create_commit(
        repo_id=SPACE_ID,
        repo_type="space",
        operations=[
            CommitOperationAdd(
                path_in_repo=path_in_repo,
                path_or_fileobj=content,
            )
        ],
        commit_message=f"[Submission] Add results for {method_name}",
        commit_description=(
            f"**Method**: {method_name}\n"
            f"**Organization**: {meta.get('organization', 'N/A')}\n"
            f"**Track**: {meta.get('track', 'N/A')}\n"
            f"**Agent**: {meta.get('agent_framework', 'N/A')}\n"
            f"**Backbone**: {meta.get('backbone_model', 'N/A')}\n"
            f"**Retriever**: {meta.get('retriever_model', 'N/A')}\n\n"
            # BUG FIX: the previous timestamp was local time labelled "UTC";
            # use an aware UTC datetime so the label is accurate.
            f"Submitted at {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}"
        ),
        create_pr=True,
    )
    return commit_info
# ============================================================
# Routes
# ============================================================
@app.route('/')
def index():
    """Render the leaderboard landing page with the current data."""
    return render_template('index.html', data=load_leaderboard())
@app.route('/upload', methods=['POST'])
def upload_file():
    """
    Accept an uploaded submission JSON, validate it, keep a local backup,
    and open a Pull Request containing the file.

    PR-creation failures are deliberately reported with success=True (the
    local copy is already saved); maintainers pick those up manually.
    """
    uploaded = request.files.get('file')
    if uploaded is None:
        return jsonify({"success": False, "error": "No file uploaded."}), 400
    if uploaded.filename == '':
        return jsonify({"success": False, "error": "No file selected."}), 400

    try:
        submission = json.load(uploaded)
    except json.JSONDecodeError as exc:
        return jsonify({"success": False, "error": f"Invalid JSON file: {exc}"}), 400

    problems = validate_submission(submission)
    if problems:
        return jsonify({"success": False, "error": "Validation failed.", "details": problems}), 400

    method_name = submission["meta"]["method_name"]

    # Local backup — written before attempting the PR so nothing is lost
    # even if the Hub call fails.
    sanitized = method_name.replace(" ", "-").replace("/", "_")
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = os.path.join(SUBMISSIONS_DIR, f"{sanitized}_{stamp}.json")
    with open(backup_path, 'w', encoding='utf-8') as fh:
        json.dump(submission, fh, indent=2, ensure_ascii=False)

    # Create PR
    try:
        commit_info = create_pr_submission(submission, method_name)
        pr_url = getattr(commit_info, 'pr_url', None)
        return jsonify({
            "success": True,
            "message": f"Submission for '{method_name}' has been submitted as a Pull Request!",
            "pr_url": pr_url or f"https://huggingface.co/spaces/{SPACE_ID}/discussions",
        })
    except RuntimeError as exc:
        # Raised only when HF_TOKEN is missing: soft-fail, local copy kept.
        return jsonify({
            "success": True,
            "message": (
                f"Submission for '{method_name}' saved locally. "
                f"PR creation skipped: {str(exc)}. "
                f"Maintainers will review it manually."
            ),
            "pr_url": None,
        })
    except Exception as exc:
        return jsonify({
            "success": True,
            "message": (
                f"Submission for '{method_name}' saved locally, "
                f"but PR creation failed: {str(exc)}. "
                f"Please contact the maintainers."
            ),
            "pr_url": None,
        })
if __name__ == '__main__':
    # Bind all interfaces so the app is reachable from outside the container;
    # 7860 is the port HuggingFace Spaces routes traffic to by convention.
    app.run(debug=False, host="0.0.0.0", port=7860)