# Spaces: Running
import base64
import hmac
import json
import os
import shutil
import subprocess
import time
import uuid
from pathlib import Path

import gradio as gr
import requests
# --- GitHub storage configuration (each value can be overridden via the environment) ---
GITHUB_OWNER = os.environ.get("GITHUB_OWNER", "rzhub")
GITHUB_REPO = os.environ.get("GITHUB_REPO", "GateMem")
GITHUB_BRANCH = os.environ.get("GITHUB_BRANCH", "main")
# Repository-relative paths of the JSON files this app reads and writes.
LEADERBOARD_PATH = os.environ.get("LEADERBOARD_PATH", "docs/assets/leaderboard.json")
PENDING_PATH = os.environ.get("PENDING_PATH", "docs/assets/pending_submissions.json")
# None when unset; github_headers() refuses to build headers without it.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
# Password checked by the admin callbacks; empty string when unset.
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "")

# --- Local scratch layout ---
WORKDIR = Path("/tmp/gatemem_submit")
REPO_DIR = WORKDIR.joinpath("GateMem")  # local checkout used for scoring
SUBMISSIONS_DIR = WORKDIR.joinpath("submissions")  # per-submission files
def ensure_repo():
    """Make sure the scratch directories and a local GateMem checkout exist.

    Creates ``WORKDIR`` and ``SUBMISSIONS_DIR`` when missing. If ``REPO_DIR``
    does not exist yet, the repository is shallow-cloned into it and a failed
    clone raises ``subprocess.CalledProcessError``. Otherwise ``git pull`` is
    run in the existing checkout and its exit status is ignored.
    """
    for directory in (WORKDIR, SUBMISSIONS_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    checkout = str(REPO_DIR)
    if REPO_DIR.exists():
        # Best-effort refresh: a failed pull keeps using the current checkout.
        subprocess.run(["git", "-C", checkout, "pull"], check=False)
        return

    clone_cmd = ["git", "clone", "--depth", "1", "https://github.com/rzhub/GateMem.git", checkout]
    subprocess.run(clone_cmd, check=True)
def github_headers():
    """Build the headers sent with every GitHub API request.

    Returns:
        A dict with a bearer ``Authorization`` header built from
        ``GITHUB_TOKEN`` and the GitHub JSON ``Accept`` header.

    Raises:
        RuntimeError: If ``GITHUB_TOKEN`` is unset or empty.
    """
    if GITHUB_TOKEN:
        headers = {}
        headers["Authorization"] = f"Bearer {GITHUB_TOKEN}"
        headers["Accept"] = "application/vnd.github+json"
        return headers
    raise RuntimeError("GITHUB_TOKEN is not configured.")
def github_get_json_file(path):
    """Read a JSON file from the configured repository and branch.

    Args:
        path: Repository-relative path of the file.

    Returns:
        A ``(data, sha)`` tuple. ``data`` is the parsed JSON, or ``[]`` when
        the file is empty or whitespace-only. ``sha`` is the blob SHA reported
        by the API. A 404 response yields ``([], None)``.

    Raises:
        Whatever ``Response.raise_for_status()`` raises for other error
        statuses, and ``RuntimeError`` from ``github_headers()`` when no token
        is configured.
    """
    endpoint = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/contents/{path}"
    response = requests.get(
        endpoint,
        headers=github_headers(),
        params={"ref": GITHUB_BRANCH},
        timeout=30,
    )
    if response.status_code == 404:
        # A file that does not exist yet is treated as an empty list.
        return [], None
    response.raise_for_status()

    file_info = response.json()
    text = base64.b64decode(file_info["content"]).decode("utf-8")
    if not text.strip():
        return [], file_info["sha"]
    return json.loads(text), file_info["sha"]
def github_put_json_file(path, data, message, sha=None):
    """Write ``data`` as pretty-printed JSON to a file on the configured branch.

    Args:
        path: Repository-relative path of the file.
        data: JSON-serialisable object to store.
        message: Commit message for the change.
        sha: Blob SHA of the version being replaced; left out of the request
            when falsy.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        Whatever ``Response.raise_for_status()`` raises on an error status, and
        ``RuntimeError`` from ``github_headers()`` when no token is configured.
    """
    endpoint = f"https://api.github.com/repos/{GITHUB_OWNER}/{GITHUB_REPO}/contents/{path}"

    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    payload = dict(
        message=message,
        content=base64.b64encode(serialized.encode("utf-8")).decode("utf-8"),
        branch=GITHUB_BRANCH,
    )
    if sha:
        payload.update(sha=sha)

    response = requests.put(endpoint, headers=github_headers(), json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
def extract_metrics(summary):
    """Pull the four leaderboard metrics out of a scorer summary mapping.

    Edit the field names below if ``summary.json`` uses different ones.

    Args:
        summary: Mapping loaded from the scorer's ``summary.json``.

    Returns:
        A dict with float values under the keys ``u``, ``a``, ``f`` and
        ``mgs``. A field missing from ``summary`` becomes ``0.0``.
    """
    # (leaderboard key, summary.json field) pairs, in output order.
    metric_fields = (
        ("u", "utility_accuracy"),
        ("a", "privacy_leakage_rate"),
        ("f", "deletion_leakage_rate"),
        ("mgs", "compliance_utility_score"),
    )
    return {key: float(summary.get(field, 0.0)) for key, field in metric_fields}
def run_scorer(predictions_path, domain, use_llm_judge=False):
    """Score a predictions file with the scoring script from the local checkout.

    Runs ``bench/scripts/score_predictions.py`` from ``REPO_DIR`` against the
    data directory named after the lower-cased ``domain``, writing results to
    a fresh ``eval`` directory under ``SUBMISSIONS_DIR``.

    Args:
        predictions_path: Path of the predictions file to score.
        domain: Domain name; its lower-cased form selects the data directory.
        use_llm_judge: When true, adds the LLM-judge flags (provider
            ``openai``, model ``gpt-4o``) to the scorer command.

    Returns:
        ``(metrics, summary_path, logs)``: the dict from ``extract_metrics``,
        the path of ``summary.json`` as a string, and the scorer's combined
        stdout/stderr text.

    Raises:
        RuntimeError: If the scorer exits non-zero or produces no
            ``summary.json``.
    """
    ensure_repo()

    domain_slug = domain.lower()
    eval_dir = SUBMISSIONS_DIR / str(uuid.uuid4())[:8] / "eval"
    eval_dir.mkdir(parents=True, exist_ok=True)

    scorer_script = REPO_DIR / "bench" / "scripts" / "score_predictions.py"
    dataset_dir = REPO_DIR / "bench" / "data" / domain_slug
    command = [
        "python", str(scorer_script),
        "--data_dir", str(dataset_dir),
        "--predictions", str(predictions_path),
        "--out_dir", str(eval_dir),
    ]
    if use_llm_judge:
        command.extend(
            ["--use_llm_judge", "--judge_provider", "openai", "--judge_model", "gpt-4o"]
        )

    # stderr is folded into stdout so one string carries the whole scorer log.
    completed = subprocess.run(
        command,
        cwd=str(REPO_DIR),
        env=os.environ.copy(),
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    scorer_log = completed.stdout
    if completed.returncode != 0:
        raise RuntimeError(f"Scoring failed:\n{scorer_log}")

    summary_file = eval_dir / "summary.json"
    if not summary_file.exists():
        raise RuntimeError(f"summary.json not found. Scorer output:\n{scorer_log}")

    parsed_summary = json.loads(summary_file.read_text(encoding="utf-8"))
    return extract_metrics(parsed_summary), str(summary_file), scorer_log
def submit_result(predictions_file, method, backbone, domain, family, contact, code_url, use_llm_judge):
    """Score an uploaded predictions file and queue it as a pending submission.

    Args:
        predictions_file: The uploaded file as delivered by ``gr.File``: either
            a filesystem path string or an object exposing the path as
            ``.name``. ``None`` means nothing was uploaded.
        method: Method name entered by the submitter (required).
        backbone: Backbone model label.
        domain: Benchmark domain; forwarded to ``run_scorer``.
        family: Method family label.
        contact: Contact e-mail (may be empty).
        code_url: Code or artifact link (may be empty).
        use_llm_judge: Whether to ask the scorer to use the LLM judge.

    Returns:
        A human-readable status message. Failures are reported in the message
        rather than raised, so the UI always shows a meaningful result.
    """
    if predictions_file is None:
        return "Please upload predictions.jsonl."
    # Text inputs are normalised once; None is treated like an empty field.
    method_name = (method or "").strip()
    if not method_name:
        return "Please provide a method name."

    # Depending on the gradio version / `type` setting, gr.File passes either a
    # path string or a tempfile-like wrapper whose `.name` is the path.
    upload_path = getattr(predictions_file, "name", predictions_file)

    submit_id = f"{int(time.time())}-{uuid.uuid4().hex[:8]}"
    submit_dir = SUBMISSIONS_DIR / submit_id
    try:
        submit_dir.mkdir(parents=True, exist_ok=True)
        pred_path = submit_dir / "predictions.jsonl"
        shutil.copy(upload_path, pred_path)
        metrics, _summary_path, _logs = run_scorer(pred_path, domain, use_llm_judge=use_llm_judge)
    except Exception as e:
        # UI boundary: report any staging/scoring failure to the submitter.
        return f"Submission failed:\n{e}"

    row = {
        "submission_id": submit_id,
        "method": method_name,
        "backbone": backbone,
        "domain": domain,
        "family": family,
        "u": round(metrics["u"], 1),
        "a": round(metrics["a"], 1),
        "f": round(metrics["f"], 1),
        "mgs": round(metrics["mgs"], 1),
        "source": "external",
        "verified": False,
        "contact": (contact or "").strip(),
        "code_url": (code_url or "").strip(),
        "created_at": int(time.time()),
    }
    scores = f"U={row['u']}, A={row['a']}, F={row['f']}, MGS={row['mgs']}"

    try:
        pending, sha = github_get_json_file(PENDING_PATH)
        pending.append(row)
        github_put_json_file(
            PENDING_PATH,
            pending,
            f"Add pending GateMem submission: {method_name}",
            sha=sha,
        )
    except Exception as e:
        # Scoring worked but the pending list could not be updated (missing
        # token, API error, concurrent write). Say so instead of crashing.
        return (
            "Scored successfully, but the submission could not be recorded:\n"
            f"{e}\n\n"
            f"Submission ID: {submit_id}\n"
            f"{scores}"
        )

    return (
        "Submitted and scored successfully.\n\n"
        f"Status: pending maintainer approval\n"
        f"Submission ID: {submit_id}\n\n"
        f"{scores}"
    )
def list_pending(password):
    """Return a text listing of pending submissions for an authenticated admin.

    Args:
        password: Admin password typed into the UI.

    Returns:
        One line per pending submission (id, method, backbone, domain and the
        four metrics), ``"No pending submissions."`` when the list is empty,
        or ``"Invalid admin password."`` when authentication fails.
    """
    supplied = (password or "").encode("utf-8")
    expected = ADMIN_PASSWORD.encode("utf-8")
    # An unset ADMIN_PASSWORD ("") must never authenticate anyone; otherwise an
    # empty password field would unlock the admin tab. compare_digest keeps
    # the comparison constant-time.
    if not expected or not hmac.compare_digest(supplied, expected):
        return "Invalid admin password."

    pending, _ = github_get_json_file(PENDING_PATH)
    if not pending:
        return "No pending submissions."

    return "\n".join(
        f"{item['submission_id']} | {item['method']} | {item['backbone']} | "
        f"{item['domain']} | U={item['u']} A={item['a']} F={item['f']} MGS={item['mgs']}"
        for item in pending
    )
def approve_submission(password, submission_id):
    """Move a pending submission onto the leaderboard.

    Args:
        password: Admin password typed into the UI.
        submission_id: ID of the pending submission to approve; surrounding
            whitespace is ignored.

    Returns:
        A status message: a confirmation, ``"Submission not found: ..."``, or
        ``"Invalid admin password."`` when authentication fails.
    """
    supplied = (password or "").encode("utf-8")
    expected = ADMIN_PASSWORD.encode("utf-8")
    # An unset ADMIN_PASSWORD ("") must never authenticate anyone; otherwise an
    # empty password field would allow writing to the leaderboard.
    # compare_digest keeps the comparison constant-time.
    if not expected or not hmac.compare_digest(supplied, expected):
        return "Invalid admin password."

    wanted_id = (submission_id or "").strip()
    pending, pending_sha = github_get_json_file(PENDING_PATH)

    target = None
    remaining = []
    for item in pending:
        if item["submission_id"] == wanted_id:
            target = item
        else:
            remaining.append(item)
    if target is None:
        return f"Submission not found: {submission_id}"

    # Fetch the leaderboard only once there is something to add to it.
    leaderboard, leaderboard_sha = github_get_json_file(LEADERBOARD_PATH)

    target["verified"] = True
    target["approved_at"] = int(time.time())
    leaderboard.append(target)

    # Leaderboard is written first: if the second write fails the entry is
    # duplicated in the pending list rather than lost.
    github_put_json_file(
        LEADERBOARD_PATH,
        leaderboard,
        f"Approve GateMem leaderboard submission: {target['method']}",
        sha=leaderboard_sha,
    )
    github_put_json_file(
        PENDING_PATH,
        remaining,
        f"Remove approved pending submission: {target['method']}",
        sha=pending_sha,
    )
    return (
        f"Approved and added to leaderboard:\n"
        f"{target['method']} | {target['backbone']} | {target['domain']} | "
        f"MGS={target['mgs']}"
    )
# Gradio UI: a public "Submit Result" tab and a password-gated "Admin" tab.
with gr.Blocks(title="GateMem Result Submission") as demo:
    gr.Markdown("# GateMem Result Submission")
    gr.Markdown(
        "Upload `predictions.jsonl` generated by your method. "
        "The server scores it with the official GateMem evaluator and stores it as a pending submission."
    )

    with gr.Tab("Submit Result"):
        # Components are created in display order.
        predictions_file = gr.File(label="predictions.jsonl", file_types=[".jsonl"])
        method = gr.Textbox(label="Method name", placeholder="e.g., MyMemoryAgent")
        backbone = gr.Dropdown(
            ["GPT-5-mini", "GPT-4o-mini", "Gemini-2.5-Flash-Lite", "Other"],
            label="Backbone model",
            value="GPT-5-mini",
        )
        domain = gr.Dropdown(
            ["Medical", "Office", "Education", "Household"],
            label="Domain",
            value="Medical",
        )
        family = gr.Dropdown(
            ["Full-context", "RAG", "External memory", "Other"],
            label="Method family",
            value="Other",
        )
        contact = gr.Textbox(label="Contact email")
        code_url = gr.Textbox(label="Code URL / commit / artifact link")
        use_llm_judge = gr.Checkbox(label="Use LLM judge if server is configured", value=False)
        submit_btn = gr.Button("Submit and Score", variant="primary")
        submit_out = gr.Textbox(label="Submission status", lines=8)

        # Input order must match the parameter order of submit_result.
        submit_btn.click(
            fn=submit_result,
            inputs=[
                predictions_file,
                method,
                backbone,
                domain,
                family,
                contact,
                code_url,
                use_llm_judge,
            ],
            outputs=submit_out,
        )

    with gr.Tab("Admin"):
        admin_password = gr.Textbox(label="Admin password", type="password")
        list_btn = gr.Button("List Pending Submissions")
        pending_out = gr.Textbox(label="Pending submissions", lines=12)
        submission_id = gr.Textbox(label="Submission ID to approve")
        approve_btn = gr.Button("Approve and Update Leaderboard", variant="primary")
        approve_out = gr.Textbox(label="Approval status", lines=6)

        list_btn.click(fn=list_pending, inputs=[admin_password], outputs=pending_out)
        approve_btn.click(
            fn=approve_submission,
            inputs=[admin_password, submission_id],
            outputs=approve_out,
        )


if __name__ == "__main__":
    # Set up the scratch directories and local checkout before serving.
    ensure_repo()
    demo.launch()