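# GateMem-Submit / app.py
# File-viewer header captured together with this source (not program code):
#   Ray368's picture | commit "Create app.py" (d565ce0, verified)
#   Raw | History | Blame | Contribute | Delete | 9.71 kB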
import base64
import hmac
import json
import os
import shutil
import subprocess
import time
import uuid
from pathlib import Path

import gradio as gr
import requests
# Repository coordinates used to build GitHub contents-API URLs. Each value is
# read from the environment variable of the same name, with the default shown.
GITHUB_OWNER = os.getenv("GITHUB_OWNER", "rzhub")
GITHUB_REPO = os.getenv("GITHUB_REPO", "GateMem")
GITHUB_BRANCH = os.getenv("GITHUB_BRANCH", "main")
# Repository-relative paths of the two JSON files this app reads and writes:
# the leaderboard and the list of submissions still awaiting approval.
LEADERBOARD_PATH = os.getenv("LEADERBOARD_PATH", "docs/assets/leaderboard.json")
PENDING_PATH = os.getenv("PENDING_PATH", "docs/assets/pending_submissions.json")
# Token sent as a Bearer credential by github_headers(); None when unset.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
# Password the admin handlers compare against; an unset variable yields "".
ADMIN_PASSWORD = os.getenv("ADMIN_PASSWORD", "")
# Local scratch space: the scorer repository checkout and one folder per
# submission / scoring run.
WORKDIR = Path("/tmp/gatemem_submit")
REPO_DIR = WORKDIR / "GateMem"
SUBMISSIONS_DIR = WORKDIR / "submissions"
def ensure_repo():
    """Make sure a local checkout of the scorer repository is available.

    Creates ``WORKDIR`` and ``SUBMISSIONS_DIR`` if needed. When ``REPO_DIR``
    does not exist yet the repository is shallow-cloned into it; otherwise
    ``git pull`` is run in the existing checkout.

    Raises:
        subprocess.CalledProcessError: If the initial ``git clone`` fails.
    """
    WORKDIR.mkdir(parents=True, exist_ok=True)
    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)

    if REPO_DIR.exists():
        # Best effort on purpose: if the pull fails, scoring continues with
        # whatever checkout is already on disk.
        subprocess.run(["git", "-C", str(REPO_DIR), "pull"], check=False)
        return

    # Derive the URL from the same owner/repo settings the GitHub API helpers
    # use, so overriding GITHUB_OWNER / GITHUB_REPO affects both consistently.
    # With the default settings this is the URL that used to be hard-coded.
    clone_url = f"https://github.com/{GITHUB_OWNER}/{GITHUB_REPO}.git"
    subprocess.run(
        ["git", "clone", "--depth", "1", clone_url, str(REPO_DIR)],
        check=True,
    )
def github_headers():
    """Return the request headers for authenticated GitHub REST API calls.

    Raises:
        RuntimeError: If ``GITHUB_TOKEN`` is unset or empty.
    """
    if GITHUB_TOKEN:
        return {
            "Authorization": f"Bearer {GITHUB_TOKEN}",
            "Accept": "application/vnd.github+json",
        }
    raise RuntimeError("GITHUB_TOKEN is not configured.")
def github_get_json_file(path):
    """Fetch and parse a JSON file from the configured GitHub repository.

    Args:
        path: Repository-relative path of the file to read.

    Returns:
        A ``(data, sha)`` tuple. ``data`` is the parsed JSON document, or
        ``[]`` when the file is blank; ``sha`` is the blob SHA reported by
        GitHub. When the file does not exist (HTTP 404) the result is
        ``([], None)``.

    Raises:
        RuntimeError: If ``GITHUB_TOKEN`` is not configured, or if the
            response does not carry base64-encoded file content.
        requests.HTTPError: For any non-404 error status.
    """
    url = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/contents/{path}"
    r = requests.get(url, headers=github_headers(), params={"ref": GITHUB_BRANCH}, timeout=30)
    if r.status_code == 404:
        return [], None
    r.raise_for_status()
    obj = r.json()

    # The contents endpoint answers with a list for directories, and may
    # return an empty "content" with a non-base64 "encoding" for large files.
    # Decoding that as "blank file -> []" would make the caller's next write
    # replace the real file with a near-empty list, so fail loudly instead.
    if isinstance(obj, list):
        raise RuntimeError(f"{path} is a directory, not a JSON file.")
    encoding = obj.get("encoding", "base64")
    if encoding != "base64":
        raise RuntimeError(
            f"Cannot read {path}: GitHub returned encoding {encoding!r} instead of base64."
        )

    content = base64.b64decode(obj["content"]).decode("utf-8")
    data = json.loads(content) if content.strip() else []
    return data, obj["sha"]
def github_put_json_file(path, data, message, sha=None):
    """Write ``data`` as pretty-printed JSON to a file in the GitHub repository.

    Args:
        path: Repository-relative path of the file to create or update.
        data: JSON-serializable object to store.
        message: Commit message for the change.
        sha: Blob SHA of the version being replaced; left out of the request
            when falsy.

    Returns:
        The decoded JSON body of GitHub's response.

    Raises:
        RuntimeError: If ``GITHUB_TOKEN`` is not configured.
        requests.HTTPError: If GitHub answers with an error status.
    """
    endpoint = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/contents/{path}"
    serialized = json.dumps(data, indent=2, ensure_ascii=False)

    # The contents API expects the file body base64-encoded.
    body = {
        "message": message,
        "content": base64.b64encode(serialized.encode("utf-8")).decode("utf-8"),
        "branch": GITHUB_BRANCH,
    }
    if sha:
        body["sha"] = sha

    response = requests.put(endpoint, headers=github_headers(), json=body, timeout=30)
    response.raise_for_status()
    return response.json()
def extract_metrics(summary):
    """Map scorer summary fields onto the short leaderboard column names.

    Adjust the field names below if your summary.json uses slightly
    different ones. A field missing from ``summary`` is reported as ``0.0``.

    Args:
        summary: Mapping loaded from the scorer's ``summary.json``.

    Returns:
        A dict with the float-valued keys ``u``, ``a``, ``f`` and ``mgs``.
    """
    column_to_field = (
        ("u", "utility_accuracy"),
        ("a", "privacy_leakage_rate"),
        ("f", "deletion_leakage_rate"),
        ("mgs", "compliance_utility_score"),
    )
    return {
        column: float(summary.get(field, 0.0))
        for column, field in column_to_field
    }
def run_scorer(predictions_path, domain, use_llm_judge=False):
    """Run the repository's scoring script on a predictions file.

    Refreshes the local checkout, runs ``bench/scripts/score_predictions.py``
    in a subprocess against the data directory of the lower-cased ``domain``,
    and reads the ``summary.json`` it writes into a fresh output directory.

    Args:
        predictions_path: Path of the predictions file to score.
        domain: Domain name; its lower-cased form selects the data directory.
        use_llm_judge: When true, pass the LLM-judge flags to the scorer.

    Returns:
        A ``(metrics, summary_path, output)`` tuple: the metrics dict from
        ``extract_metrics``, the summary file path as a string, and the
        scorer's combined stdout/stderr text.

    Raises:
        RuntimeError: If the scorer exits non-zero or writes no summary.json.
    """
    ensure_repo()

    dataset_dir = REPO_DIR / "bench" / "data" / domain.lower()
    eval_dir = SUBMISSIONS_DIR / str(uuid.uuid4())[:8] / "eval"
    eval_dir.mkdir(parents=True, exist_ok=True)

    judge_flags = []
    if use_llm_judge:
        judge_flags = [
            "--use_llm_judge",
            "--judge_provider",
            "openai",
            "--judge_model",
            "gpt-4o",
        ]

    command = [
        "python",
        str(REPO_DIR / "bench" / "scripts" / "score_predictions.py"),
        "--data_dir",
        str(dataset_dir),
        "--predictions",
        str(predictions_path),
        "--out_dir",
        str(eval_dir),
        *judge_flags,
    ]

    # stderr is folded into stdout so a single string carries the whole log.
    completed = subprocess.run(
        command,
        cwd=str(REPO_DIR),
        env=os.environ.copy(),
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    output = completed.stdout

    if completed.returncode != 0:
        raise RuntimeError(f"Scoring failed:\n{output}")

    summary_file = eval_dir / "summary.json"
    if not summary_file.exists():
        raise RuntimeError(f"summary.json not found. Scorer output:\n{output}")

    parsed = json.loads(summary_file.read_text(encoding="utf-8"))
    return extract_metrics(parsed), str(summary_file), output
def submit_result(predictions_file, method, backbone, domain, family, contact, code_url, use_llm_judge):
    """Score an uploaded predictions file and queue it as a pending submission.

    Args:
        predictions_file: The upload from the file component: either a path
            string or an object exposing the path as ``.name``. ``None`` when
            nothing was uploaded.
        method: Method name entered by the submitter (required).
        backbone: Selected backbone model label.
        domain: Selected domain label; passed on to the scorer.
        family: Selected method-family label.
        contact: Contact e-mail text.
        code_url: Code / artifact link text.
        use_llm_judge: Whether to ask the scorer to use its LLM judge.

    Returns:
        A human-readable status message. Failures are reported through the
        message rather than raised, so the UI always has something to show.
    """
    if predictions_file is None:
        return "Please upload predictions.jsonl."
    # Text inputs may arrive as None; treat that like an empty string.
    method_name = (method or "").strip()
    if not method_name:
        return "Please provide a method name."

    # Depending on the Gradio version/configuration the upload is handed over
    # as a plain path string or as a file wrapper with a ``.name`` attribute.
    upload_path = getattr(predictions_file, "name", predictions_file)

    submit_id = f"{int(time.time())}-{uuid.uuid4().hex[:8]}"
    submit_dir = SUBMISSIONS_DIR / submit_id

    try:
        submit_dir.mkdir(parents=True, exist_ok=True)
        pred_path = submit_dir / "predictions.jsonl"
        shutil.copy(upload_path, pred_path)
        metrics, _summary_path, _logs = run_scorer(pred_path, domain, use_llm_judge=use_llm_judge)
    except Exception as e:
        # UI boundary: surface the failure text instead of an opaque error.
        return f"Submission failed:\n{e}"

    row = {
        "submission_id": submit_id,
        "method": method_name,
        "backbone": backbone,
        "domain": domain,
        "family": family,
        "u": round(metrics["u"], 1),
        "a": round(metrics["a"], 1),
        "f": round(metrics["f"], 1),
        "mgs": round(metrics["mgs"], 1),
        "source": "external",
        "verified": False,
        "contact": (contact or "").strip(),
        "code_url": (code_url or "").strip(),
        "created_at": int(time.time()),
    }
    scores = f"U={row['u']}, A={row['a']}, F={row['f']}, MGS={row['mgs']}"

    try:
        pending, sha = github_get_json_file(PENDING_PATH)
        pending.append(row)
        github_put_json_file(
            PENDING_PATH,
            pending,
            f"Add pending GateMem submission: {method_name}",
            sha=sha,
        )
    except Exception as e:
        # Scoring worked but storing the row did not (missing token, network
        # error, concurrent update, ...). Report it and still show the scores
        # so the submitter does not lose the result.
        return (
            "Scoring succeeded, but the submission could not be recorded:\n"
            f"{e}\n\n"
            f"{scores}"
        )

    return (
        "Submitted and scored successfully.\n\n"
        f"Status: pending maintainer approval\n"
        f"Submission ID: {submit_id}\n\n"
        f"{scores}"
    )
def list_pending(password):
    """Return a text listing of the pending submissions for the admin view.

    Args:
        password: Admin password typed into the UI.

    Returns:
        One line per pending submission, ``"No pending submissions."`` when
        the list is empty, or ``"Invalid admin password."`` when the password
        is wrong or no admin password is configured.
    """
    # ADMIN_PASSWORD falls back to "" when unset; without the explicit
    # emptiness check an empty password field would be accepted. Comparing
    # UTF-8 bytes with compare_digest is constant-time and accepts non-ASCII.
    supplied = (password or "").encode("utf-8")
    expected = ADMIN_PASSWORD.encode("utf-8")
    if not expected or not hmac.compare_digest(supplied, expected):
        return "Invalid admin password."

    pending, _ = github_get_json_file(PENDING_PATH)
    if not pending:
        return "No pending submissions."

    return "\n".join(
        f"{item['submission_id']} | {item['method']} | {item['backbone']} | "
        f"{item['domain']} | U={item['u']} A={item['a']} F={item['f']} MGS={item['mgs']}"
        for item in pending
    )
def approve_submission(password, submission_id):
    """Move a pending submission onto the leaderboard.

    Args:
        password: Admin password typed into the UI.
        submission_id: ID of the pending submission to approve; surrounding
            whitespace is ignored.

    Returns:
        A status message: the approval summary, ``"Submission not found: …"``,
        or ``"Invalid admin password."`` when the password is wrong or no
        admin password is configured.

    Raises:
        RuntimeError: If ``GITHUB_TOKEN`` is not configured.
        requests.HTTPError: If a GitHub read or write fails.
    """
    # ADMIN_PASSWORD falls back to "" when unset; without the explicit
    # emptiness check an empty password field would be accepted. Comparing
    # UTF-8 bytes with compare_digest is constant-time and accepts non-ASCII.
    supplied = (password or "").encode("utf-8")
    expected = ADMIN_PASSWORD.encode("utf-8")
    if not expected or not hmac.compare_digest(supplied, expected):
        return "Invalid admin password."

    wanted = (submission_id or "").strip()

    pending, pending_sha = github_get_json_file(PENDING_PATH)
    leaderboard, leaderboard_sha = github_get_json_file(LEADERBOARD_PATH)

    target = None
    remaining = []
    for item in pending:
        if item["submission_id"] == wanted:
            target = item
        else:
            remaining.append(item)

    if target is None:
        return f"Submission not found: {submission_id}"

    target["verified"] = True
    target["approved_at"] = int(time.time())

    # The two writes below are separate commits. If an earlier attempt added
    # the row to the leaderboard but failed to update the pending file, a
    # retry must not append the same submission a second time.
    already_listed = any(
        isinstance(entry, dict) and entry.get("submission_id") == wanted
        for entry in leaderboard
    )
    if not already_listed:
        leaderboard.append(target)
        github_put_json_file(
            LEADERBOARD_PATH,
            leaderboard,
            f"Approve GateMem leaderboard submission: {target['method']}",
            sha=leaderboard_sha,
        )

    github_put_json_file(
        PENDING_PATH,
        remaining,
        f"Remove approved pending submission: {target['method']}",
        sha=pending_sha,
    )

    return (
        f"Approved and added to leaderboard:\n"
        f"{target['method']} | {target['backbone']} | {target['domain']} | "
        f"MGS={target['mgs']}"
    )
# Gradio UI: a public submission tab and a password-protected admin tab.
with gr.Blocks(title="GateMem Result Submission") as demo:
    gr.Markdown(value="# GateMem Result Submission")
    gr.Markdown(
        value=(
            "Upload `predictions.jsonl` generated by your method. "
            "The server scores it with the official GateMem evaluator and stores it as a pending submission."
        )
    )

    with gr.Tab(label="Submit Result"):
        # Inputs, in the order submit_result() expects them.
        predictions_file = gr.File(label="predictions.jsonl", file_types=[".jsonl"])
        method = gr.Textbox(label="Method name", placeholder="e.g., MyMemoryAgent")
        backbone = gr.Dropdown(
            choices=["GPT-5-mini", "GPT-4o-mini", "Gemini-2.5-Flash-Lite", "Other"],
            label="Backbone model",
            value="GPT-5-mini",
        )
        domain = gr.Dropdown(
            choices=["Medical", "Office", "Education", "Household"],
            label="Domain",
            value="Medical",
        )
        family = gr.Dropdown(
            choices=["Full-context", "RAG", "External memory", "Other"],
            label="Method family",
            value="Other",
        )
        contact = gr.Textbox(label="Contact email")
        code_url = gr.Textbox(label="Code URL / commit / artifact link")
        use_llm_judge = gr.Checkbox(
            label="Use LLM judge if server is configured",
            value=False,
        )

        submit_btn = gr.Button(value="Submit and Score", variant="primary")
        submit_out = gr.Textbox(label="Submission status", lines=8)
        submit_btn.click(
            fn=submit_result,
            inputs=[
                predictions_file,
                method,
                backbone,
                domain,
                family,
                contact,
                code_url,
                use_llm_judge,
            ],
            outputs=submit_out,
        )

    with gr.Tab(label="Admin"):
        # Both admin actions read the same password field.
        admin_password = gr.Textbox(label="Admin password", type="password")

        list_btn = gr.Button(value="List Pending Submissions")
        pending_out = gr.Textbox(label="Pending submissions", lines=12)

        submission_id = gr.Textbox(label="Submission ID to approve")
        approve_btn = gr.Button(value="Approve and Update Leaderboard", variant="primary")
        approve_out = gr.Textbox(label="Approval status", lines=6)

        list_btn.click(fn=list_pending, inputs=[admin_password], outputs=pending_out)
        approve_btn.click(
            fn=approve_submission,
            inputs=[admin_password, submission_id],
            outputs=approve_out,
        )

if __name__ == "__main__":
    # Prepare the scorer checkout before serving the first request.
    ensure_repo()
    demo.launch()