# Hugging Face Space: Ray368/GateMem-Submit (status: Running)
# File size: 9,705 bytes
# Revision: d565ce0

import base64
import json
import os
import shutil
import subprocess
import time
import uuid
from pathlib import Path

import gradio as gr
import requests


# Repository, branch and file paths used for the GitHub contents-API calls
# below. Each one is read from the environment variable of the same name and
# falls back to the default shown here.
GITHUB_OWNER = os.environ.get("GITHUB_OWNER", "rzhub")
GITHUB_REPO = os.environ.get("GITHUB_REPO", "GateMem")
GITHUB_BRANCH = os.environ.get("GITHUB_BRANCH", "main")
LEADERBOARD_PATH = os.environ.get("LEADERBOARD_PATH", "docs/assets/leaderboard.json")
PENDING_PATH = os.environ.get("PENDING_PATH", "docs/assets/pending_submissions.json")

# Secrets. GITHUB_TOKEN has no default (None when unset); it is sent as the
# Bearer credential by github_headers(). ADMIN_PASSWORD is compared against the
# password typed into the Admin tab and is the empty string when unset.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "")

# Scratch locations: the benchmark checkout and one directory per submission.
WORKDIR = Path("/tmp") / "gatemem_submit"
REPO_DIR = WORKDIR.joinpath("GateMem")
SUBMISSIONS_DIR = WORKDIR.joinpath("submissions")


def ensure_repo():
    """Prepare the scratch directories and a local checkout of the benchmark repo.

    Creates ``WORKDIR`` and ``SUBMISSIONS_DIR`` if they are missing. When
    ``REPO_DIR`` does not exist yet, the repository is shallow-cloned
    (``--depth 1``) into it; otherwise the existing checkout is refreshed with
    a best-effort ``git pull``.

    Raises:
        subprocess.CalledProcessError: if the initial ``git clone`` fails.
    """
    WORKDIR.mkdir(parents=True, exist_ok=True)
    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)

    if not REPO_DIR.exists():
        # NOTE(review): the clone URL is hard-coded and does not follow the
        # GITHUB_OWNER / GITHUB_REPO settings -- confirm this is intentional
        # (scorer repo vs. leaderboard-storage repo).
        subprocess.run(
            ["git", "clone", "--depth", "1", "https://github.com/rzhub/GateMem.git", str(REPO_DIR)],
            check=True,
        )
        return

    # Refreshing stays best-effort: a failed pull must not block scoring with
    # the checkout we already have, but it should not go unnoticed either.
    refresh = subprocess.run(["git", "-C", str(REPO_DIR), "pull"], check=False)
    if refresh.returncode != 0:
        print(
            f"Warning: 'git pull' in {REPO_DIR} exited with code "
            f"{refresh.returncode}; continuing with the existing checkout.",
            flush=True,
        )


def github_headers():
    """Build the HTTP headers for authenticated GitHub API requests.

    Returns:
        A dict with a Bearer ``Authorization`` header carrying ``GITHUB_TOKEN``
        and the ``application/vnd.github+json`` ``Accept`` media type.

    Raises:
        RuntimeError: if ``GITHUB_TOKEN`` is unset or empty.
    """
    if GITHUB_TOKEN:
        return dict(
            Authorization=f"Bearer {GITHUB_TOKEN}",
            Accept="application/vnd.github+json",
        )
    raise RuntimeError("GITHUB_TOKEN is not configured.")


def github_get_json_file(path):
    """Fetch a JSON file from the configured repository via the contents API.

    Args:
        path: repository-relative path of the file, read at ``GITHUB_BRANCH``.

    Returns:
        A ``(data, sha)`` tuple. ``data`` is the parsed JSON, or ``[]`` when the
        file content is blank; ``sha`` is the blob sha reported by the API.
        When the API answers 404 the result is ``([], None)``.

    Raises:
        requests.HTTPError: for any other non-success status.
        RuntimeError: from ``github_headers`` when no token is configured.
    """
    endpoint = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/contents/{path}"
    response = requests.get(
        endpoint,
        headers=github_headers(),
        params={"ref": GITHUB_BRANCH},
        timeout=30,
    )

    # A missing file is treated as an empty list that has no sha yet.
    if response.status_code == 404:
        return [], None
    response.raise_for_status()

    entry = response.json()
    text = base64.b64decode(entry["content"]).decode("utf-8")
    if text.strip():
        parsed = json.loads(text)
    else:
        parsed = []
    return parsed, entry["sha"]


def github_put_json_file(path, data, message, sha=None):
    """Create or update a JSON file in the configured repository.

    Args:
        path: repository-relative path of the file to write.
        data: JSON-serialisable object; written with 2-space indentation and
            non-ASCII characters left unescaped.
        message: commit message for the write.
        sha: sha of the version being replaced; only sent when truthy.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        RuntimeError: from ``github_headers`` when no token is configured.
    """
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    body = {
        "message": message,
        # The contents API expects the file body base64-encoded.
        "content": base64.b64encode(serialized.encode("utf-8")).decode("utf-8"),
        "branch": GITHUB_BRANCH,
    }
    if sha:
        body["sha"] = sha

    endpoint = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/contents/{path}"
    response = requests.put(endpoint, headers=github_headers(), json=body, timeout=30)
    response.raise_for_status()
    return response.json()


def extract_metrics(summary):
    """Map a scorer summary onto the four short leaderboard metric keys.

    Adjust the field names in the table below if your summary.json uses
    slightly different ones.

    Args:
        summary: mapping loaded from the scorer's summary file.

    Returns:
        A dict with keys ``u``, ``a``, ``f`` and ``mgs``, each converted with
        ``float``. A field that is absent from ``summary`` yields ``0.0``.
    """
    # NOTE(review): an absent leakage field reads as 0.0 here -- confirm that
    # silently defaulting is the intended behaviour for those metrics.
    short_to_field = (
        ("u", "utility_accuracy"),
        ("a", "privacy_leakage_rate"),
        ("f", "deletion_leakage_rate"),
        ("mgs", "compliance_utility_score"),
    )
    return {short: float(summary.get(field, 0.0)) for short, field in short_to_field}


def run_scorer(predictions_path, domain, use_llm_judge=False):
    """Score a predictions file with the benchmark's ``score_predictions.py``.

    The scorer runs as a subprocess inside the repository checkout and writes
    into a fresh ``SUBMISSIONS_DIR/<run id>/eval`` directory.

    Args:
        predictions_path: path passed to the scorer as ``--predictions``.
        domain: domain name; lower-cased to pick ``bench/data/<domain>`` inside
            the checkout.
        use_llm_judge: when true, appends the LLM-judge flags (provider
            ``openai``, model ``gpt-4o``) to the scorer command.

    Returns:
        ``(metrics, summary_path, output)``: the dict from ``extract_metrics``,
        the path of ``summary.json`` as a string, and the scorer's combined
        stdout/stderr text.

    Raises:
        RuntimeError: if the scorer exits non-zero or produces no
            ``summary.json``.
        subprocess.CalledProcessError: from ``ensure_repo`` if cloning fails.
    """
    import sys  # function-scope import: only this block needs the interpreter path

    ensure_repo()

    domain_name = domain.lower()
    run_id = str(uuid.uuid4())[:8]
    out_dir = SUBMISSIONS_DIR / run_id / "eval"
    out_dir.mkdir(parents=True, exist_ok=True)

    data_dir = REPO_DIR / "bench" / "data" / domain_name

    cmd = [
        # Use the interpreter running this app so the scorer sees the same
        # installed packages; a bare "python" depends on whatever is on PATH.
        sys.executable or "python",
        str(REPO_DIR / "bench" / "scripts" / "score_predictions.py"),
        "--data_dir",
        str(data_dir),
        "--predictions",
        str(predictions_path),
        "--out_dir",
        str(out_dir),
    ]

    if use_llm_judge:
        cmd += [
            "--use_llm_judge",
            "--judge_provider",
            "openai",
            "--judge_model",
            "gpt-4o",
        ]

    # The scorer parses user-uploaded files; do not hand it this app's own
    # credentials. Everything else (e.g. judge API keys) is passed through.
    app_only_secrets = ("GITHUB_TOKEN", "ADMIN_PASSWORD")
    env = {key: value for key, value in os.environ.items() if key not in app_only_secrets}

    proc = subprocess.run(
        cmd,
        cwd=str(REPO_DIR),
        env=env,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr so failures carry the full log
    )

    if proc.returncode != 0:
        raise RuntimeError(f"Scoring failed:\n{proc.stdout}")

    summary_path = out_dir / "summary.json"
    if not summary_path.exists():
        raise RuntimeError(f"summary.json not found. Scorer output:\n{proc.stdout}")

    summary = json.loads(summary_path.read_text(encoding="utf-8"))
    metrics = extract_metrics(summary)

    return metrics, str(summary_path), proc.stdout


def _append_pending_row(row, message, attempts=3):
    """Append ``row`` to the pending-submissions file, retrying on sha conflicts."""
    for attempt in range(1, attempts + 1):
        pending, sha = github_get_json_file(PENDING_PATH)
        pending.append(row)
        try:
            github_put_json_file(PENDING_PATH, pending, message, sha=sha)
        except requests.HTTPError as err:
            status = getattr(err.response, "status_code", None)
            # 409/422 mean the sha we read is stale or missing because another
            # write landed in between; re-read and try again.
            if status in (409, 422) and attempt < attempts:
                continue
            raise
        return


def submit_result(predictions_file, method, backbone, domain, family, contact, code_url, use_llm_judge):
    """Score an uploaded predictions file and record it as a pending submission.

    Args:
        predictions_file: the uploaded file from the ``gr.File`` input; either a
            path string or an object exposing the path as ``.name``.
        method: method name entered by the submitter (required).
        backbone: selected backbone model label.
        domain: selected domain; forwarded to ``run_scorer``.
        family: selected method family label.
        contact: contact email text (may be empty).
        code_url: code / artifact link text (may be empty).
        use_llm_judge: whether to ask the scorer to use the LLM judge.

    Returns:
        A status message for the UI. Validation, scoring and persistence
        failures are reported as text rather than raised.
    """
    if predictions_file is None:
        return "Please upload predictions.jsonl."

    # Text inputs can arrive as None (e.g. via API calls); treat that as empty.
    method_name = (method or "").strip()
    if not method_name:
        return "Please provide a method name."

    submit_id = f"{int(time.time())}-{uuid.uuid4().hex[:8]}"
    submit_dir = SUBMISSIONS_DIR / submit_id
    pred_path = submit_dir / "predictions.jsonl"

    try:
        submit_dir.mkdir(parents=True, exist_ok=True)
        # Accept both a plain path and a temp-file wrapper with ``.name``.
        upload_path = getattr(predictions_file, "name", predictions_file)
        shutil.copy(upload_path, pred_path)
        metrics, _summary_path, _logs = run_scorer(pred_path, domain, use_llm_judge=use_llm_judge)
    except Exception as e:
        return f"Submission failed:\n{e}"

    # NOTE(review): this row, including the contact field, is written to a file
    # under docs/assets in the repository -- confirm that is acceptable.
    row = {
        "submission_id": submit_id,
        "method": method_name,
        "backbone": backbone,
        "domain": domain,
        "family": family,
        "u": round(metrics["u"], 1),
        "a": round(metrics["a"], 1),
        "f": round(metrics["f"], 1),
        "mgs": round(metrics["mgs"], 1),
        "source": "external",
        "verified": False,
        "contact": (contact or "").strip(),
        "code_url": (code_url or "").strip(),
        "created_at": int(time.time()),
    }
    scores = f"U={row['u']}, A={row['a']}, F={row['f']}, MGS={row['mgs']}"

    # UI boundary: a GitHub failure must not discard the scores just computed.
    try:
        _append_pending_row(row, f"Add pending GateMem submission: {method_name}")
    except Exception as e:
        return (
            "Scoring succeeded, but the submission could not be recorded:\n"
            f"{e}\n\n"
            f"{scores}"
        )

    return (
        "Submitted and scored successfully.\n\n"
        f"Status: pending maintainer approval\n"
        f"Submission ID: {submit_id}\n\n"
        f"{scores}"
    )


def list_pending(password):
    """Return a text listing of all pending submissions for the Admin tab.

    Args:
        password: admin password typed into the UI.

    Returns:
        ``"Invalid admin password."`` when the check fails,
        ``"No pending submissions."`` when the pending file is empty, or one
        line per submission with its id, method, backbone, domain and scores.
    """
    import hmac  # function-scope import: used only for the password check

    # An unset ADMIN_PASSWORD is the empty string; it must never authenticate,
    # otherwise submitting a blank password would grant admin access.
    expected = (ADMIN_PASSWORD or "").encode("utf-8")
    supplied = (password or "").encode("utf-8")
    if not expected or not hmac.compare_digest(supplied, expected):
        return "Invalid admin password."

    pending, _ = github_get_json_file(PENDING_PATH)
    if not pending:
        return "No pending submissions."

    return "\n".join(
        f"{item['submission_id']} | {item['method']} | {item['backbone']} | "
        f"{item['domain']} | U={item['u']} A={item['a']} F={item['f']} MGS={item['mgs']}"
        for item in pending
    )


def approve_submission(password, submission_id):
    """Move one pending submission onto the leaderboard.

    The matching pending row is marked ``verified`` and stamped with
    ``approved_at``, appended to the leaderboard file, and then removed from
    the pending file. The leaderboard is written first so that a failure
    between the two writes cannot lose the row.

    Args:
        password: admin password typed into the UI.
        submission_id: id of the pending submission to approve; surrounding
            whitespace is ignored.

    Returns:
        A status message: invalid password, submission not found, or a summary
        of the approved row.

    Raises:
        requests.HTTPError: if a GitHub read or write fails.
    """
    import hmac  # function-scope import: used only for the password check

    # An unset ADMIN_PASSWORD is the empty string; it must never authenticate,
    # otherwise submitting a blank password would grant admin access.
    expected = (ADMIN_PASSWORD or "").encode("utf-8")
    supplied = (password or "").encode("utf-8")
    if not expected or not hmac.compare_digest(supplied, expected):
        return "Invalid admin password."

    wanted = (submission_id or "").strip()

    pending, pending_sha = github_get_json_file(PENDING_PATH)
    leaderboard, leaderboard_sha = github_get_json_file(LEADERBOARD_PATH)

    target = None
    remaining = []
    for item in pending:
        if item["submission_id"] == wanted:
            target = item
        else:
            remaining.append(item)

    if target is None:
        return f"Submission not found: {submission_id}"

    target["verified"] = True
    target["approved_at"] = int(time.time())

    # If an earlier attempt already wrote the leaderboard but failed before
    # clearing the pending entry, do not add the same row a second time.
    already_listed = any(
        isinstance(entry, dict) and entry.get("submission_id") == wanted
        for entry in leaderboard
    )
    if not already_listed:
        leaderboard.append(target)
        github_put_json_file(
            LEADERBOARD_PATH,
            leaderboard,
            f"Approve GateMem leaderboard submission: {target['method']}",
            sha=leaderboard_sha,
        )

    github_put_json_file(
        PENDING_PATH,
        remaining,
        f"Remove approved pending submission: {target['method']}",
        sha=pending_sha,
    )

    return (
        f"Approved and added to leaderboard:\n"
        f"{target['method']} | {target['backbone']} | {target['domain']} | "
        f"MGS={target['mgs']}"
    )


# Gradio UI: a "Submit Result" tab wired to submit_result() and an "Admin" tab
# wired to list_pending() / approve_submission(). Components appear in the
# page in the order they are created below.
with gr.Blocks(title="GateMem Result Submission") as demo:
    gr.Markdown("# GateMem Result Submission")
    gr.Markdown(
        "Upload `predictions.jsonl` generated by your method. "
        "The server scores it with the official GateMem evaluator and stores it as a pending submission."
    )

    with gr.Tab("Submit Result"):
        # Inputs; their order must match the `inputs=` list of submit_btn.click,
        # which in turn matches submit_result's positional parameters.
        predictions_file = gr.File(label="predictions.jsonl", file_types=[".jsonl"])
        method = gr.Textbox(label="Method name", placeholder="e.g., MyMemoryAgent")
        backbone = gr.Dropdown(
            ["GPT-5-mini", "GPT-4o-mini", "Gemini-2.5-Flash-Lite", "Other"],
            label="Backbone model",
            value="GPT-5-mini",
        )
        # The chosen domain is lower-cased by run_scorer to locate bench/data/<domain>.
        domain = gr.Dropdown(
            ["Medical", "Office", "Education", "Household"],
            label="Domain",
            value="Medical",
        )
        family = gr.Dropdown(
            ["Full-context", "RAG", "External memory", "Other"],
            label="Method family",
            value="Other",
        )
        contact = gr.Textbox(label="Contact email")
        code_url = gr.Textbox(label="Code URL / commit / artifact link")
        use_llm_judge = gr.Checkbox(
            label="Use LLM judge if server is configured",
            value=False,
        )
        submit_btn = gr.Button("Submit and Score", variant="primary")
        submit_out = gr.Textbox(label="Submission status", lines=8)

        # submit_result returns a single status string shown in submit_out.
        submit_btn.click(
            submit_result,
            inputs=[predictions_file, method, backbone, domain, family, contact, code_url, use_llm_judge],
            outputs=submit_out,
        )

    with gr.Tab("Admin"):
        # The same password field feeds both admin actions.
        admin_password = gr.Textbox(label="Admin password", type="password")
        list_btn = gr.Button("List Pending Submissions")
        pending_out = gr.Textbox(label="Pending submissions", lines=12)

        submission_id = gr.Textbox(label="Submission ID to approve")
        approve_btn = gr.Button("Approve and Update Leaderboard", variant="primary")
        approve_out = gr.Textbox(label="Approval status", lines=6)

        list_btn.click(list_pending, inputs=[admin_password], outputs=pending_out)
        approve_btn.click(approve_submission, inputs=[admin_password, submission_id], outputs=approve_out)


# Script entry point: prepare the benchmark checkout once up front, then start
# the Gradio server.
if __name__ == "__main__":
    ensure_repo()
    demo.launch()