"""
app.py - DISBench Leaderboard Main Application
Startup Flow:
1. Space rebuild (triggered by PR merge) → Docker container starts
2. Call evaluate.run_evaluation() to scan new submissions in submissions/
3. Calculate EM/F1 scores for new submissions, update leaderboard_data.json
4. Commit updated data back to repository (persistence)
5. Start Flask Web server
"""
import json
import logging
import os
from datetime import datetime, timezone

from flask import Flask, jsonify, redirect, render_template, request, url_for
from huggingface_hub import CommitOperationAdd, HfApi

# Evaluation module
from evaluate import commit_leaderboard_to_repo, run_evaluation
# Module-wide logging: timestamped INFO-level messages to stdout.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
app = Flask(__name__)
# NOTE(review): falls back to a hardcoded secret key when SECRET_KEY is not
# set — fine for a public read-mostly leaderboard, but confirm no session
# data needs to be tamper-proof.
app.secret_key = os.environ.get("SECRET_KEY", "disbench-leaderboard-secret-key")
# --- Configuration ---
LEADERBOARD_FILE = "leaderboard_data.json"  # evaluated scores, JSON list
SUBMISSIONS_DIR = "submissions"  # single source of truth for raw submissions
os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
# HuggingFace Space configuration
HF_TOKEN = os.environ.get("HF_TOKEN")  # write token; PR creation is skipped without it
SPACE_ID = os.environ.get("SPACE_ID", "RUC-NLPIR/DISBench-Leaderboard")
# ============================================================
# Automatic Evaluation on Startup
# ============================================================
def startup_evaluation():
    """
    Re-evaluate every submission at startup and persist the results.

    A merged submission PR triggers an HF Space rebuild and restart, which
    runs this function before the web server starts:
      - scan every file under submissions/ (the single source of truth),
      - re-score all submissions against the ground truth,
      - refresh leaderboard_data.json,
      - commit the refreshed data back to the repository.

    Any failure is logged and swallowed so the server can still start and
    serve whatever leaderboard data already exists.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("DISBench: Running startup evaluation...")
    logger.info(banner)
    try:
        total, _ = run_evaluation()
        if total > 0:
            logger.info("Evaluated all submissions. Committing to repo...")
            commit_leaderboard_to_repo()
        else:
            logger.info("No submissions found.")
        logger.info(f"Leaderboard has {total} unique configurations. Ready to serve.")
    except Exception as exc:
        logger.error(f"Startup evaluation failed: {exc}")
        logger.info("Continuing with existing leaderboard data...")
# Execute startup evaluation at import time so a Space rebuild re-scores
# all submissions before any request is served.
startup_evaluation()
# ============================================================
# Data Loading
# ============================================================
def load_leaderboard(path=None):
    """
    Load leaderboard entries from a JSON file.

    Args:
        path: Optional path to the leaderboard JSON file. Defaults to the
            module-level ``LEADERBOARD_FILE``.

    Returns:
        The parsed JSON content (a list of entries), or an empty list when
        the file does not exist.
    """
    if path is None:
        path = LEADERBOARD_FILE
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
# ============================================================
# Submission Validation
# ============================================================
def validate_submission(submission):
    """
    Check an uploaded submission for structural problems.

    A valid submission is a dict with a non-empty 'meta' dict (containing
    at least 'method_name', with an optional 'track' restricted to
    Standard/Open) and a non-empty 'predictions' dict.

    Returns:
        A list of human-readable error strings; empty when valid.
    """
    if not isinstance(submission, dict):
        return ["Submission must be a JSON object with 'meta' and 'predictions' fields."]

    problems = []
    meta = submission.get("meta")
    predictions = submission.get("predictions")

    if isinstance(meta, dict) and meta:
        problems.extend(
            f"Missing required field: meta.{name}"
            for name in ("method_name",)
            if name not in meta
        )
        allowed_tracks = ["Standard", "Open"]
        track = meta.get("track")
        if track and track not in allowed_tracks:
            problems.append(f"meta.track must be one of: {allowed_tracks}")
    else:
        problems.append("Missing or invalid 'meta' field.")

    if not (predictions and isinstance(predictions, dict)):
        problems.append("Missing or invalid 'predictions' field.")
    return problems
# ============================================================
# PR Creation
# ============================================================
def create_pr_submission(submission_json, method_name):
    """
    Open a Pull Request on the HF Space that adds the submission file
    under submissions/.

    Args:
        submission_json: The validated submission payload (dict with
            'meta' and 'predictions').
        method_name: Display name of the method; used in the filename and
            the commit message.

    Returns:
        The CommitInfo from ``HfApi.create_commit`` (its ``pr_url``
        attribute points at the created PR).

    Raises:
        RuntimeError: If the HF_TOKEN secret is not configured.
    """
    if not HF_TOKEN:
        raise RuntimeError(
            "HF_TOKEN not configured. Please set the HF_TOKEN secret in your Space settings."
        )
    api = HfApi(token=HF_TOKEN)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    safe_name = method_name.replace(" ", "-").replace("/", "_")
    filename = f"{safe_name}_{timestamp}.json"
    # BUG FIX: the computed filename was previously not interpolated into the
    # repo path (it was the literal string "submissions/(unknown)"), so every
    # PR targeted the same file and submissions overwrote each other.
    path_in_repo = f"submissions/{filename}"
    content = json.dumps(submission_json, indent=2, ensure_ascii=False).encode("utf-8")
    meta = submission_json.get("meta", {})
    commit_info = api.create_commit(
        repo_id=SPACE_ID,
        repo_type="space",
        operations=[
            CommitOperationAdd(
                path_in_repo=path_in_repo,
                path_or_fileobj=content,
            )
        ],
        commit_message=f"[Submission] Add results for {method_name}",
        commit_description=(
            f"**Method**: {method_name}\n"
            f"**Organization**: {meta.get('organization', 'N/A')}\n"
            f"**Track**: {meta.get('track', 'N/A')}\n"
            f"**Agent**: {meta.get('agent_framework', 'N/A')}\n"
            f"**Backbone**: {meta.get('backbone_model', 'N/A')}\n"
            f"**Retriever**: {meta.get('retriever_model', 'N/A')}\n\n"
            # BUG FIX: the previous timestamp was local time labelled "UTC";
            # use an aware UTC datetime so the label is accurate.
            f"Submitted at {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}"
        ),
        create_pr=True,
    )
    return commit_info
# ============================================================
# Routes
# ============================================================
@app.route('/')
def index():
    """Render the leaderboard landing page with the current data."""
    return render_template('index.html', data=load_leaderboard())
@app.route('/upload', methods=['POST'])
def upload_file():
    """
    Accept an uploaded submission JSON, validate it, keep a local backup,
    and open a Pull Request containing the file.

    PR-creation failures are deliberately reported with success=True (the
    local copy is already saved); maintainers pick those up manually.
    """
    uploaded = request.files.get('file')
    if uploaded is None:
        return jsonify({"success": False, "error": "No file uploaded."}), 400
    if uploaded.filename == '':
        return jsonify({"success": False, "error": "No file selected."}), 400

    try:
        submission = json.load(uploaded)
    except json.JSONDecodeError as exc:
        return jsonify({"success": False, "error": f"Invalid JSON file: {exc}"}), 400

    problems = validate_submission(submission)
    if problems:
        return jsonify({"success": False, "error": "Validation failed.", "details": problems}), 400

    method_name = submission["meta"]["method_name"]

    # Local backup — written before attempting the PR so nothing is lost
    # even if the Hub call fails.
    sanitized = method_name.replace(" ", "-").replace("/", "_")
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = os.path.join(SUBMISSIONS_DIR, f"{sanitized}_{stamp}.json")
    with open(backup_path, 'w', encoding='utf-8') as fh:
        json.dump(submission, fh, indent=2, ensure_ascii=False)

    # Create PR
    try:
        commit_info = create_pr_submission(submission, method_name)
        pr_url = getattr(commit_info, 'pr_url', None)
        return jsonify({
            "success": True,
            "message": f"Submission for '{method_name}' has been submitted as a Pull Request!",
            "pr_url": pr_url or f"https://huggingface.co/spaces/{SPACE_ID}/discussions",
        })
    except RuntimeError as exc:
        # Raised only when HF_TOKEN is missing: soft-fail, local copy kept.
        return jsonify({
            "success": True,
            "message": (
                f"Submission for '{method_name}' saved locally. "
                f"PR creation skipped: {str(exc)}. "
                f"Maintainers will review it manually."
            ),
            "pr_url": None,
        })
    except Exception as exc:
        return jsonify({
            "success": True,
            "message": (
                f"Submission for '{method_name}' saved locally, "
                f"but PR creation failed: {str(exc)}. "
                f"Please contact the maintainers."
            ),
            "pr_url": None,
        })
if __name__ == '__main__':
    # Bind all interfaces so the app is reachable from outside the container;
    # 7860 is the port HuggingFace Spaces routes traffic to by convention.
    app.run(debug=False, host="0.0.0.0", port=7860)