"""Gradio front-end for submitting and approving GateMem leaderboard results.

Flow:
  1. A user uploads ``predictions.jsonl``; the server scores it with the
     scorer script from a local clone of the GateMem repository.
  2. The scored row is appended to a "pending" JSON file stored in a GitHub
     repository through the GitHub contents API.
  3. A maintainer who knows ``ADMIN_PASSWORD`` can list pending rows and
     approve one, which moves it into the leaderboard JSON file.
"""

import base64
import hmac
import json
import os
import shutil
import subprocess
import sys
import time
import uuid
from pathlib import Path

import gradio as gr
import requests

GITHUB_OWNER = os.getenv("GITHUB_OWNER", "rzhub")
GITHUB_REPO = os.getenv("GITHUB_REPO", "GateMem")
GITHUB_BRANCH = os.getenv("GITHUB_BRANCH", "main")
LEADERBOARD_PATH = os.getenv("LEADERBOARD_PATH", "docs/assets/leaderboard.json")
PENDING_PATH = os.getenv("PENDING_PATH", "docs/assets/pending_submissions.json")
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
ADMIN_PASSWORD = os.getenv("ADMIN_PASSWORD", "")

# Repository that provides the scorer script and benchmark data. Overridable
# so a fork or mirror can be used; the default matches the original URL.
SCORER_REPO_URL = os.getenv("SCORER_REPO_URL", "https://github.com/rzhub/GateMem.git")

WORKDIR = Path("/tmp/gatemem_submit")
REPO_DIR = WORKDIR / "GateMem"
SUBMISSIONS_DIR = WORKDIR / "submissions"

# Secrets that the scorer subprocess has no need to see.
_SCORER_ENV_BLOCKLIST = ("GITHUB_TOKEN", "ADMIN_PASSWORD")

# Failures expected from the GitHub round-trip: network/HTTP errors, a missing
# token (RuntimeError) and undecodable base64/JSON content (ValueError).
_GITHUB_ERRORS = (requests.RequestException, RuntimeError, ValueError)


def ensure_repo():
    """Create the working directories and clone or refresh the scorer repo.

    Raises:
        subprocess.CalledProcessError: if the initial ``git clone`` fails.
    """
    WORKDIR.mkdir(parents=True, exist_ok=True)
    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)

    if not REPO_DIR.exists():
        subprocess.run(
            ["git", "clone", "--depth", "1", SCORER_REPO_URL, str(REPO_DIR)],
            check=True,
        )
    else:
        # Best effort: a failed pull still leaves a usable (older) checkout.
        subprocess.run(["git", "-C", str(REPO_DIR), "pull"], check=False)


def github_headers():
    """Return the HTTP headers for authenticated GitHub API calls.

    Raises:
        RuntimeError: if ``GITHUB_TOKEN`` is not configured.
    """
    if not GITHUB_TOKEN:
        raise RuntimeError("GITHUB_TOKEN is not configured.")
    return {
        "Authorization": f"Bearer {GITHUB_TOKEN}",
        "Accept": "application/vnd.github+json",
    }


def github_get_json_file(path):
    """Fetch and decode a JSON file from the configured repo and branch.

    Args:
        path: Repository-relative path of the file.

    Returns:
        ``(data, sha)``. ``data`` is the parsed JSON (``[]`` for a missing or
        blank file) and ``sha`` is the blob SHA to pass to a later update
        (``None`` if the file does not exist).

    Raises:
        requests.HTTPError: on any non-404 error response.
        RuntimeError: if ``GITHUB_TOKEN`` is not configured.
    """
    url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/contents/{path}"
    r = requests.get(url, headers=github_headers(), params={"ref": GITHUB_BRANCH}, timeout=30)

    if r.status_code == 404:
        return [], None
    r.raise_for_status()

    obj = r.json()
    content = base64.b64decode(obj["content"]).decode("utf-8")
    data = json.loads(content) if content.strip() else []
    return data, obj["sha"]


def github_put_json_file(path, data, message, sha=None):
    """Create or update a JSON file in the configured repo and branch.

    Args:
        path: Repository-relative path of the file.
        data: JSON-serialisable object to store.
        message: Commit message.
        sha: Blob SHA of the version being replaced; omit to create the file.

    Returns:
        The decoded JSON response from the GitHub API.

    Raises:
        requests.HTTPError: on an error response.
        RuntimeError: if ``GITHUB_TOKEN`` is not configured.
    """
    url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/contents/{path}"
    encoded = base64.b64encode(
        json.dumps(data, indent=2, ensure_ascii=False).encode("utf-8")
    ).decode("utf-8")

    payload = {
        "message": message,
        "content": encoded,
        "branch": GITHUB_BRANCH,
    }
    if sha:
        payload["sha"] = sha

    r = requests.put(url, headers=github_headers(), json=payload, timeout=30)
    r.raise_for_status()
    return r.json()


def extract_metrics(summary):
    """Map the scorer's summary dict onto the leaderboard metric keys.

    Adjust this if your summary.json uses slightly different field names.

    NOTE(review): a field missing from ``summary`` becomes 0.0, which for the
    two leakage rates presumably reads as a perfect score -- confirm that the
    scorer always emits all four fields.
    """
    return {
        "u": float(summary.get("utility_accuracy", 0.0)),
        "a": float(summary.get("privacy_leakage_rate", 0.0)),
        "f": float(summary.get("deletion_leakage_rate", 0.0)),
        "mgs": float(summary.get("compliance_utility_score", 0.0)),
    }


def run_scorer(predictions_path, domain, use_llm_judge=False):
    """Score a predictions file with the scorer script from the cloned repo.

    Args:
        predictions_path: Path to the uploaded ``predictions.jsonl``.
        domain: Domain name; its lower-cased form selects the data directory.
        use_llm_judge: Whether to pass the LLM-judge flags to the scorer.

    Returns:
        ``(metrics, summary_path, scorer_output)``.

    Raises:
        RuntimeError: if the domain has no data directory, the scorer exits
            non-zero, or it does not produce ``summary.json``.
    """
    ensure_repo()

    domain_name = domain.lower()
    data_dir = REPO_DIR / "bench" / "data" / domain_name
    if not data_dir.is_dir():
        raise RuntimeError(f"Unknown domain: {domain}")

    run_id = str(uuid.uuid4())[:8]
    out_dir = SUBMISSIONS_DIR / run_id / "eval"
    out_dir.mkdir(parents=True, exist_ok=True)

    cmd = [
        # Same interpreter as this app, rather than whatever "python" is on PATH.
        sys.executable,
        str(REPO_DIR / "bench" / "scripts" / "score_predictions.py"),
        "--data_dir", str(data_dir),
        "--predictions", str(predictions_path),
        "--out_dir", str(out_dir),
    ]
    if use_llm_judge:
        cmd += [
            "--use_llm_judge",
            "--judge_provider", "openai",
            "--judge_model", "gpt-4o",
        ]

    # The scorer handles user-uploaded data; keep this app's own secrets out
    # of its environment.
    env = os.environ.copy()
    for secret in _SCORER_ENV_BLOCKLIST:
        env.pop(secret, None)

    proc = subprocess.run(
        cmd,
        cwd=str(REPO_DIR),
        env=env,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    if proc.returncode != 0:
        raise RuntimeError(f"Scoring failed:\n{proc.stdout}")

    summary_path = out_dir / "summary.json"
    if not summary_path.exists():
        raise RuntimeError(f"summary.json not found. Scorer output:\n{proc.stdout}")

    summary = json.loads(summary_path.read_text(encoding="utf-8"))
    metrics = extract_metrics(summary)
    return metrics, str(summary_path), proc.stdout


def _uploaded_path(uploaded):
    """Return the filesystem path of a ``gr.File`` value.

    Depending on the Gradio version/configuration the value is either a plain
    path or a file-like object exposing ``.name``.
    """
    if isinstance(uploaded, (str, os.PathLike)):
        return uploaded
    return uploaded.name


def _is_admin(password):
    """Return True only if ``password`` matches a configured admin password.

    Fails closed: with ``ADMIN_PASSWORD`` unset or empty nobody is an admin,
    so an empty password field can never unlock the admin actions.
    """
    if not ADMIN_PASSWORD:
        return False
    supplied = (password or "").encode("utf-8")
    # Constant-time comparison; bytes are used because compare_digest rejects
    # non-ASCII str arguments.
    return hmac.compare_digest(supplied, ADMIN_PASSWORD.encode("utf-8"))


def submit_result(predictions_file, method, backbone, domain, family, contact, code_url, use_llm_judge):
    """Score an uploaded predictions file and record it as pending.

    Returns:
        A human-readable status message for the UI. Failures are reported in
        the message rather than raised.
    """
    if predictions_file is None:
        return "Please upload predictions.jsonl."

    method_name = (method or "").strip()
    if not method_name:
        return "Please provide a method name."

    # Without a token the result cannot be stored, so do not spend time scoring.
    if not GITHUB_TOKEN:
        return "Submission failed:\nGITHUB_TOKEN is not configured."

    submit_id = f"{int(time.time())}-{uuid.uuid4().hex[:8]}"

    try:
        submit_dir = SUBMISSIONS_DIR / submit_id
        submit_dir.mkdir(parents=True, exist_ok=True)
        pred_path = submit_dir / "predictions.jsonl"
        shutil.copy(_uploaded_path(predictions_file), pred_path)
        metrics, _summary_path, _logs = run_scorer(pred_path, domain, use_llm_judge=use_llm_judge)
    except Exception as e:  # UI boundary: report any failure to the submitter.
        return f"Submission failed:\n{e}"

    row = {
        "submission_id": submit_id,
        "method": method_name,
        "backbone": backbone,
        "domain": domain,
        "family": family,
        "u": round(metrics["u"], 1),
        "a": round(metrics["a"], 1),
        "f": round(metrics["f"], 1),
        "mgs": round(metrics["mgs"], 1),
        "source": "external",
        "verified": False,
        "contact": (contact or "").strip(),
        "code_url": (code_url or "").strip(),
        "created_at": int(time.time()),
    }

    try:
        pending, sha = github_get_json_file(PENDING_PATH)
        pending.append(row)
        github_put_json_file(
            PENDING_PATH,
            pending,
            f"Add pending GateMem submission: {method_name}",
            sha=sha,
        )
    except _GITHUB_ERRORS as e:
        return f"Submission failed:\nScoring succeeded but the result could not be stored: {e}"

    return (
        "Submitted and scored successfully.\n\n"
        f"Status: pending maintainer approval\n"
        f"Submission ID: {submit_id}\n\n"
        f"U={row['u']}, A={row['a']}, F={row['f']}, MGS={row['mgs']}"
    )


def list_pending(password):
    """Return a one-line-per-row text listing of the pending submissions."""
    if not _is_admin(password):
        return "Invalid admin password."

    try:
        pending, _ = github_get_json_file(PENDING_PATH)
    except _GITHUB_ERRORS as e:
        return f"Could not load pending submissions:\n{e}"

    if not pending:
        return "No pending submissions."

    lines = [
        f"{item['submission_id']} | {item['method']} | {item['backbone']} | "
        f"{item['domain']} | U={item['u']} A={item['a']} F={item['f']} MGS={item['mgs']}"
        for item in pending
    ]
    return "\n".join(lines)


def approve_submission(password, submission_id):
    """Move a pending submission onto the leaderboard.

    The leaderboard file is written first and the pending file second. If the
    second write fails, calling this again with the same ID only removes the
    pending row: the leaderboard is not given a duplicate entry.

    Returns:
        A human-readable status message for the UI.
    """
    if not _is_admin(password):
        return "Invalid admin password."

    wanted = (submission_id or "").strip()
    if not wanted:
        return "Please provide a submission ID."

    try:
        pending, pending_sha = github_get_json_file(PENDING_PATH)
        leaderboard, leaderboard_sha = github_get_json_file(LEADERBOARD_PATH)
    except _GITHUB_ERRORS as e:
        return f"Approval failed:\n{e}"

    target = None
    remaining = []
    for item in pending:
        # Only the first match is approved; any further rows with the same ID
        # stay pending instead of being silently dropped.
        if target is None and item.get("submission_id") == wanted:
            target = item
        else:
            remaining.append(item)

    if target is None:
        return f"Submission not found: {submission_id}"

    already_listed = any(
        isinstance(entry, dict) and entry.get("submission_id") == wanted
        for entry in leaderboard
    )

    try:
        if not already_listed:
            target["verified"] = True
            target["approved_at"] = int(time.time())
            leaderboard.append(target)
            github_put_json_file(
                LEADERBOARD_PATH,
                leaderboard,
                f"Approve GateMem leaderboard submission: {target['method']}",
                sha=leaderboard_sha,
            )

        github_put_json_file(
            PENDING_PATH,
            remaining,
            f"Remove approved pending submission: {target['method']}",
            sha=pending_sha,
        )
    except _GITHUB_ERRORS as e:
        return f"Approval failed:\n{e}"

    return (
        f"Approved and added to leaderboard:\n"
        f"{target['method']} | {target['backbone']} | {target['domain']} | "
        f"MGS={target['mgs']}"
    )


with gr.Blocks(title="GateMem Result Submission") as demo:
    gr.Markdown("# GateMem Result Submission")
    gr.Markdown(
        "Upload `predictions.jsonl` generated by your method. "
        "The server scores it with the official GateMem evaluator and stores it as a pending submission."
    )

    with gr.Tab("Submit Result"):
        predictions_file = gr.File(label="predictions.jsonl", file_types=[".jsonl"])
        method = gr.Textbox(label="Method name", placeholder="e.g., MyMemoryAgent")
        backbone = gr.Dropdown(
            ["GPT-5-mini", "GPT-4o-mini", "Gemini-2.5-Flash-Lite", "Other"],
            label="Backbone model",
            value="GPT-5-mini",
        )
        domain = gr.Dropdown(
            ["Medical", "Office", "Education", "Household"],
            label="Domain",
            value="Medical",
        )
        family = gr.Dropdown(
            ["Full-context", "RAG", "External memory", "Other"],
            label="Method family",
            value="Other",
        )
        contact = gr.Textbox(label="Contact email")
        code_url = gr.Textbox(label="Code URL / commit / artifact link")
        use_llm_judge = gr.Checkbox(
            label="Use LLM judge if server is configured",
            value=False,
        )

        submit_btn = gr.Button("Submit and Score", variant="primary")
        submit_out = gr.Textbox(label="Submission status", lines=8)

        submit_btn.click(
            submit_result,
            inputs=[predictions_file, method, backbone, domain, family, contact, code_url, use_llm_judge],
            outputs=submit_out,
        )

    with gr.Tab("Admin"):
        admin_password = gr.Textbox(label="Admin password", type="password")
        list_btn = gr.Button("List Pending Submissions")
        pending_out = gr.Textbox(label="Pending submissions", lines=12)

        submission_id = gr.Textbox(label="Submission ID to approve")
        approve_btn = gr.Button("Approve and Update Leaderboard", variant="primary")
        approve_out = gr.Textbox(label="Approval status", lines=6)

        list_btn.click(list_pending, inputs=[admin_password], outputs=pending_out)
        approve_btn.click(approve_submission, inputs=[admin_password, submission_id], outputs=approve_out)


if __name__ == "__main__":
    ensure_repo()
    demo.launch()