Spaces:

AMA-bench
/

AMA-bench-Leaderboard

Running

App Files Files Community

AMA-bench-Leaderboard / submission.py

uuuhjb

update push_to_hf

2581d07 about 2 months ago

raw

history blame contribute delete

26.2 kB

	"""
	Submission handling module for AMA-Bench Leaderboard

	Submission format:
	{
	"episode_id": str,
	"question_uuid_list": list[str], # required - UUIDs that map answers to groundtruth
	"answer_list": list[str], # required - same length as question_uuid_list
	"llm_as_judge_score_list": list[bool] # optional - same length as answer_list
	}

	Scoring logic:
	- Uses llm_as_judge_score_list (true/false) from the submission
	- Maps each question to its domain and capability (A/B/C/D) via groundtruth metadata
	- Computes per-domain, per-capability accuracy
	- Writes entry to data/agent.jsonl or data/model.jsonl (verified=False by default)
	"""

	import json
	import os
	import datetime
	from email.utils import parseaddr
	from collections import defaultdict
	from typing import Dict, List, Tuple, Optional

	try:
	from content import format_error, format_warning, format_log
	except ImportError:
	def format_error(msg): return f"❌ Error: {msg}"
	def format_warning(msg): return f"⚠️ Warning: {msg}"
	def format_log(msg): return f"✅ {msg}"


	# ---------------------------------------------------------------------------
	# Validation
	# ---------------------------------------------------------------------------

	def validate_submission_format(data: dict) -> Tuple[bool, str]:
	"""
	Validate a single submission record.

	Required fields: episode_id, question_uuid_list, answer_list
	Optional fields: llm_as_judge_score_list, reasoning_trace
	"""
	if not isinstance(data, dict):
	return False, "Submission must be a JSON object"

	# episode_id
	if "episode_id" not in data:
	return False, "Missing required field: episode_id"
	if not isinstance(data["episode_id"], str) or not data["episode_id"].strip():
	return False, "episode_id must be a non-empty string"

	# answer_list
	if "answer_list" not in data:
	return False, "Missing required field: answer_list"
	if not isinstance(data["answer_list"], list) or not data["answer_list"]:
	return False, "answer_list must be a non-empty list"

	# question_uuid_list
	if "question_uuid_list" not in data:
	return False, "Missing required field: question_uuid_list"
	if not isinstance(data["question_uuid_list"], list):
	return False, "question_uuid_list must be a list"
	if len(data["question_uuid_list"]) != len(data["answer_list"]):
	return False, (
	f"question_uuid_list length ({len(data['question_uuid_list'])}) must match "
	f"answer_list length ({len(data['answer_list'])})"
	)
	for i, q in enumerate(data["question_uuid_list"]):
	if not isinstance(q, str) or not q.strip():
	return False, f"question_uuid_list[{i}] must be a non-empty string"

	# llm_as_judge_score_list (required)
	if "llm_as_judge_score_list" not in data:
	return False, "Missing required field: llm_as_judge_score_list"
	score_list = data["llm_as_judge_score_list"]
	if not isinstance(score_list, list):
	return False, "llm_as_judge_score_list must be a list"
	if len(score_list) != len(data["answer_list"]):
	return False, (
	f"llm_as_judge_score_list length ({len(score_list)}) must match "
	f"answer_list length ({len(data['answer_list'])})"
	)
	for i, score in enumerate(score_list):
	if not isinstance(score, bool):
	return False, f"llm_as_judge_score_list[{i}] must be true or false (boolean)"

	return True, ""


	def validate_submission_file(file_path: str) -> Tuple[bool, str, List[dict]]:
	"""Validate and load a JSONL submission file."""
	if not os.path.exists(file_path):
	return False, "File not found", []
	if not file_path.endswith(".jsonl"):
	return False, "File must be in JSONL format (.jsonl)", []

	submissions = []
	seen_ids = set()

	try:
	with open(file_path, "r", encoding="utf-8") as f:
	for line_num, line in enumerate(f, 1):
	line = line.strip()
	if not line:
	continue
	try:
	data = json.loads(line)
	except json.JSONDecodeError as e:
	return False, f"JSON parse error on line {line_num}: {e}", []

	is_valid, error_msg = validate_submission_format(data)
	if not is_valid:
	return False, f"Validation error on line {line_num}: {error_msg}", []

	episode_id = data["episode_id"]
	if episode_id in seen_ids:
	return False, f"Duplicate episode_id '{episode_id}' on line {line_num}", []
	seen_ids.add(episode_id)
	submissions.append(data)

	if not submissions:
	return False, "File is empty or contains no valid submissions", []
	return True, "", submissions

	except Exception as e:
	return False, f"Error reading file: {e}", []


	# ---------------------------------------------------------------------------
	# Groundtruth loading
	# ---------------------------------------------------------------------------

	def load_groundtruth_metadata(dataset_name: str = "AMA-bench/AMA-bench",
	token: str = None) -> Dict[str, dict]:
	"""
	Load groundtruth metadata. Returns a dict with two sub-dicts:

	{
	"episode_domain": {
	"episode_id": "GAME" \| "TEXT2SQL" \| ...
	},
	"question_cap": {
	"question_uuid": "A" \| "B" \| "C" \| "D"
	}
	}

	- episode_domain: episode_id -> domain (used even when question uuid doesn't match)
	- question_cap: question_uuid -> capability letter
	"""
	episode_domain: Dict[str, str] = {}
	question_cap: Dict[str, str] = {}

	def _index_rows(rows):
	for row in rows:
	episode_id = str(row.get("episode_id", ""))
	domain = row.get("domain", "UNKNOWN").upper()
	episode_domain[episode_id] = domain
	for qa in row.get("qa_pairs", []):
	question_uuid = qa.get("question_uuid", "").strip()
	if not question_uuid:
	continue
	cap_letter = _normalize_cap(qa.get("type", "A"))
	question_cap[question_uuid] = cap_letter

	# --- Try HuggingFace ---
	try:
	from datasets import load_dataset, VerificationMode
	dataset = load_dataset(
	dataset_name, split="test", token=token,
	verification_mode=VerificationMode.NO_CHECKS,
	)
	_index_rows(dataset)
	print(f"[groundtruth] Loaded {len(episode_domain)} episodes, "
	f"{len(question_cap)} Q&A entries from HuggingFace (indexed by question_uuid).")
	return {"episode_domain": episode_domain, "question_cap": question_cap}
	except Exception as hf_err:
	print(f"[groundtruth] HuggingFace failed ({hf_err}), trying local fallback…")

	# --- Local fallback ---
	for local_path in ["test/open_end_qa_set.jsonl", "data/open_end_qa_set.jsonl"]:
	if not os.path.exists(local_path):
	continue
	try:
	rows = []
	with open(local_path, "r", encoding="utf-8") as f:
	for line in f:
	line = line.strip()
	if line:
	rows.append(json.loads(line))
	_index_rows(rows)
	print(f"[groundtruth] Loaded {len(episode_domain)} episodes, "
	f"{len(question_cap)} Q&A entries from {local_path} (indexed by question_uuid).")
	return {"episode_domain": episode_domain, "question_cap": question_cap}
	except Exception as e:
	print(f"[groundtruth] Error reading {local_path}: {e}")

	print("[groundtruth] WARNING: No groundtruth metadata available.")
	return {"episode_domain": {}, "question_cap": {}}


	def _normalize_cap(cap: str) -> str:
	"""Normalize capability label to single letter A/B/C/D."""
	mapping = {
	"A": "A", "Recall": "A",
	"B": "B", "Causal Inference": "B", "Causal": "B",
	"C": "C", "State Updating": "C", "State": "C",
	"D": "D", "State Abstraction": "D", "Abstraction": "D",
	}
	return mapping.get(cap.strip(), "A")


	# ---------------------------------------------------------------------------
	# Scoring
	# ---------------------------------------------------------------------------

	VALID_DOMAINS = {"TEXT2SQL", "SOFTWARE", "WEB", "GAME", "EMBODIED_AI", "OPENWORLD_QA"}
	VALID_CAPS = ["A", "B", "C", "D"]


	def compute_scores_from_submissions(
	submissions: List[dict],
	groundtruth_meta: Dict[str, dict],
	) -> Dict:
	"""
	Compute per-domain, per-capability accuracy using llm_as_judge_score_list.

	Each question is matched to groundtruth via question_uuid.
	Score structure matches agent.jsonl / model.jsonl:
	{
	"TEXT2SQL": [{"A": 0.xx}, {"B": 0.xx}, {"C": 0.xx}, {"D": 0.xx}],
	...
	}
	"""
	# domain -> capability -> [scores]
	domain_cap_scores: Dict[str, Dict[str, List[float]]] = defaultdict(
	lambda: defaultdict(list)
	)

	scored_questions = 0
	skipped_episodes = 0 # no judge scores
	unmatched_questions = 0 # question cap not found (domain still resolved via episode)

	# Unpack the two indexes from groundtruth metadata
	episode_domain: Dict[str, str] = groundtruth_meta.get("episode_domain", {})
	question_cap: Dict[str, str] = groundtruth_meta.get("question_cap", {})

	for sub in submissions:
	episode_id = str(sub["episode_id"])
	question_uuid_list = sub["question_uuid_list"]
	judge_scores = sub.get("llm_as_judge_score_list")

	# Resolve domain via episode_id
	domain = episode_domain.get(episode_id, "UNKNOWN").upper()

	for i, question_uuid in enumerate(question_uuid_list):
	if i >= len(judge_scores):
	break

	# Resolve capability via question_uuid
	cap = question_cap.get(question_uuid.strip())
	if cap is None:
	unmatched_questions += 1
	continue

	score = 1.0 if judge_scores[i] is True else 0.0
	domain_cap_scores[domain][cap].append(score)
	scored_questions += 1

	# Build Score dict — always include all 6 known domains
	score_dict: Dict[str, List[dict]] = {}
	for domain in sorted(VALID_DOMAINS \| set(domain_cap_scores.keys())):
	cap_data = domain_cap_scores.get(domain, {})
	score_dict[domain] = [
	{cap: round(sum(cap_data[cap]) / len(cap_data[cap]), 4)
	if cap_data.get(cap) else 0.0}
	for cap in VALID_CAPS
	]

	# Coverage warning
	coverage_warning = None
	parts = []
	if skipped_episodes:
	parts.append(f"{skipped_episodes} episode(s) had no llm_as_judge_score_list")
	if unmatched_questions:
	parts.append(f"{unmatched_questions} question(s) not matched in groundtruth")
	if parts:
	coverage_warning = "; ".join(parts)

	return {
	"Score": score_dict,
	"scored_questions": scored_questions,
	"skipped_episodes": skipped_episodes,
	"unmatched_questions": unmatched_questions,
	"coverage_warning": coverage_warning,
	}


	# ---------------------------------------------------------------------------
	# Leaderboard update
	# ---------------------------------------------------------------------------

	def update_leaderboard_data(
	model_or_agent_name: str,
	model_family: str,
	submission_type: str,
	organisation: str,
	score_dict: Dict,
	verified: bool = False,
	) -> bool:
	"""Append a scored entry to data/agent.jsonl or data/model.jsonl."""
	try:
	os.makedirs("data", exist_ok=True)
	data_file = "data/agent.jsonl" if submission_type == "agent" else "data/model.jsonl"
	name_key = "agent_name" if submission_type == "agent" else "model"

	entry = {
	name_key: model_or_agent_name,
	"model_family": model_family,
	"Date": datetime.datetime.today().strftime("%Y-%m-%d"),
	"verified": verified,
	"Score": score_dict,
	}
	with open(data_file, "a", encoding="utf-8") as f:
	f.write(json.dumps(entry, ensure_ascii=False) + "\n")

	print(f"[leaderboard] Appended to {data_file}: {model_or_agent_name}")
	return True
	except Exception as e:
	print(f"[leaderboard] Error: {e}")
	import traceback; traceback.print_exc()
	return False


	# ---------------------------------------------------------------------------
	# HuggingFace submission push
	# ---------------------------------------------------------------------------

	HF_SUBMISSIONS_DATASET = "AMA-bench/AMA_submissions_internal"


	def push_submission_to_hf(
	submissions: List[dict],
	metadata: dict,
	score_dict: Dict,
	token: str,
	timestamp: str,
	) -> Tuple[bool, str]:
	"""
	Push raw submission + metadata + scores to the private HuggingFace dataset
	``AMA-bench/AMA_submissions_internal``.

	The dataset is expected (or will be created) with a single ``data`` config.
	Each call appends one row per episode, using a Parquet shard named by
	``{organisation}_{model}_{timestamp}``.

	Row schema
	----------
	submission_id : str – "{organisation}_{model}_{timestamp}"
	organisation : str
	model_name : str
	submission_type : str – "agent" \| "model"
	timestamp : str – "YYYYMMDD_HHMMSS"
	date : str – "YYYY-MM-DD"
	episode_id : str
	question_uuid_list : str – JSON-encoded list
	answer_list : str – JSON-encoded list
	llm_as_judge_score_list : str – JSON-encoded list
	reasoning_trace : str – optional, empty string if absent
	score_json : str – JSON-encoded per-domain score dict
	metadata_json : str – JSON-encoded full metadata dict
	"""
	try:
	from huggingface_hub import HfApi
	import pandas as pd
	import io

	api = HfApi(token=token)
	organisation = metadata.get("organisation", "unknown")
	model_name = metadata.get("model", metadata.get("agent_name", "unknown"))
	submission_id = f"{organisation}_{model_name}_{timestamp}"

	# Build one row per episode submission
	rows = []
	for sub in submissions:
	rows.append({
	"submission_id": submission_id,
	"organisation": organisation,
	"model_name": model_name,
	"submission_type": metadata.get("submission_type", ""),
	"timestamp": timestamp,
	"date": metadata.get("Date", ""),
	"episode_id": str(sub.get("episode_id", "")),
	"question_uuid_list": json.dumps(sub.get("question_uuid_list", []), ensure_ascii=False),
	"answer_list": json.dumps(sub.get("answer_list", []), ensure_ascii=False),
	"llm_as_judge_score_list": json.dumps(sub.get("llm_as_judge_score_list", []), ensure_ascii=False),
	"reasoning_trace": str(sub.get("reasoning_trace", "")),
	"score_json": json.dumps(score_dict, ensure_ascii=False),
	"metadata_json": json.dumps(metadata, ensure_ascii=False),
	})

	df = pd.DataFrame(rows)

	# Serialise to Parquet in memory
	buf = io.BytesIO()
	df.to_parquet(buf, index=False)
	buf.seek(0)

	# Upload as a new shard under data/
	path_in_repo = f"data/{submission_id}.parquet"
	api.upload_file(
	path_or_fileobj=buf,
	path_in_repo=path_in_repo,
	repo_id=HF_SUBMISSIONS_DATASET,
	repo_type="dataset",
	commit_message=f"Add submission: {submission_id}",
	)

	print(f"[hf_push] Pushed {len(rows)} row(s) to {HF_SUBMISSIONS_DATASET}/{path_in_repo}")
	return True, submission_id

	except Exception as e:
	import traceback
	traceback.print_exc()
	print(f"[hf_push] ERROR: {e}")
	return False, str(e)


	# ---------------------------------------------------------------------------
	# Main entry point
	# ---------------------------------------------------------------------------

	def add_new_submission(
	model: str,
	submission_type: str,
	url: str,
	file,
	organisation: str,
	mail: str,
	model_family: str = "",
	) -> str:
	"""Validate, score, and record a new submission."""
	try:
	if file is None:
	return format_warning("Please attach a submission file.")
	_, parsed_mail = parseaddr(mail)
	if "@" not in parsed_mail:
	return format_warning("Please provide a valid email address.")
	if not model or not submission_type or not organisation:
	return format_warning("Please fill in all required fields.")

	print(f"[submission] Processing {organisation}/{model} ({submission_type})")

	is_valid, error_msg, submissions = validate_submission_file(file.name)
	if not is_valid:
	return format_error(error_msg)

	print(f"[submission] Validated {len(submissions)} episode submissions")

	groundtruth_meta = load_groundtruth_metadata(token=os.environ.get("HF_TOKEN") or os.environ.get("TOKEN"))
	score_result = compute_scores_from_submissions(submissions, groundtruth_meta)
	score_dict = score_result["Score"]

	# Save raw submission
	submission_dir = f"submissions/{organisation}_{model}"
	os.makedirs(submission_dir, exist_ok=True)
	timestamp = datetime.datetime.today().strftime("%Y%m%d_%H%M%S")
	saved_file = f"{submission_dir}/submission_{timestamp}.jsonl"
	with open(saved_file, "w", encoding="utf-8") as f_out:
	for sub in submissions:
	f_out.write(json.dumps(sub, ensure_ascii=False) + "\n")

	# Save metadata
	metadata = {
	"model" if submission_type.lower() == "model" else "agent_name": model,
	"model_family": model_family,
	"submission_type": submission_type.lower(),
	"organisation": organisation,
	"url": url,
	"mail": parsed_mail,
	"Date": datetime.datetime.today().strftime("%Y-%m-%d"),
	"timestamp": timestamp,
	"verified": False,
	"submission_count": len(submissions),
	"scored_questions": score_result["scored_questions"],
	"skipped_episodes": score_result["skipped_episodes"],
	"unmatched_questions": score_result["unmatched_questions"],
	"file_path": saved_file,
	}
	with open(f"{submission_dir}/metadata_{timestamp}.json", "w", encoding="utf-8") as f_meta:
	json.dump(metadata, f_meta, indent=2, ensure_ascii=False)

	# Push to HuggingFace private submissions dataset
	hf_token = os.environ.get("HF_TOKEN") or os.environ.get("TOKEN")
	if hf_token:
	hf_ok, hf_result = push_submission_to_hf(
	submissions=submissions,
	metadata=metadata,
	score_dict=score_dict,
	token=hf_token,
	timestamp=timestamp,
	)
	if not hf_ok:
	print(f"[hf_push] WARNING: Push to HuggingFace failed: {hf_result}")
	# Non-fatal — we continue even if HF push fails
	else:
	print("[hf_push] WARNING: No HF_TOKEN found, skipping HuggingFace push.")

	# Update leaderboard
	updated = update_leaderboard_data(
	model_or_agent_name=model,
	model_family=model_family,
	submission_type=submission_type.lower(),
	organisation=organisation,
	score_dict=score_dict,
	verified=False,
	)
	if not updated:
	return format_error("Submission validated but failed to update leaderboard data.")

	type_label = "Agent" if submission_type.lower() == "agent" else "Model"

	# Compute per-domain averages and overall avg
	domain_order = ["TEXT2SQL", "SOFTWARE", "WEB", "GAME", "EMBODIED_AI", "OPENWORLD_QA"]
	domain_avgs = {}
	for dom in domain_order:
	caps = score_dict.get(dom, [])
	vals = [list(c.values())[0] for c in caps if c]
	domain_avgs[dom] = sum(vals) / len(vals) if vals else 0.0
	overall_avg = sum(domain_avgs.values()) / len(domain_avgs) if domain_avgs else 0.0

	# Build domain rows — all colors explicit to override Gradio dark theme
	TD = 'style="padding:8px 12px;text-align:center;color:#2c3e50;background:#ffffff;"'
	TD_NAME = 'style="padding:8px 12px;font-weight:600;color:#1a1a2e;background:#ffffff;"'
	TD_AVG = 'style="padding:8px 12px;text-align:center;font-weight:700;color:#0e9e7a;background:#ffffff;"'

	dom_rows_html = ""
	for i, dom in enumerate(domain_order):
	caps = score_dict.get(dom, [])
	cap_cells = "".join(
	f'<td {TD}>{list(c.values())[0]*100:.1f}%</td>'
	for c in caps
	)
	avg = domain_avgs.get(dom, 0.0)
	row_bg = "#f9fbff" if i % 2 == 0 else "#ffffff"
	dom_rows_html += (
	f'<tr style="border-bottom:1px solid #e8eef3;">'
	f'<td style="padding:8px 12px;font-weight:600;color:#1a1a2e;background:{row_bg};">{dom}</td>'
	f'<td style="padding:8px 12px;text-align:center;font-weight:700;color:#0e9e7a;background:{row_bg};">{avg*100:.2f}%</td>'
	+ "".join(
	f'<td style="padding:8px 12px;text-align:center;color:#2c3e50;background:{row_bg};">{list(c.values())[0]*100:.1f}%</td>'
	for c in caps
	)
	+ '</tr>'
	)

	warning_html = (
	'<div style="margin-top:12px;padding:10px 14px;background:#fff8e1;'
	'border-left:4px solid #f0ad4e;border-radius:6px;font-size:13px;color:#7d5a00;">'
	f'⚠️ {score_result["coverage_warning"]}</div>'
	if score_result["coverage_warning"] else ""
	)

	result_html = (
	'<div style="border:1px solid #c8e6c9;border-radius:12px;overflow:hidden;'
	'margin-top:16px;font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
	'background:#ffffff;color:#1a1a2e;">'

	# Header
	'<div style="background:linear-gradient(135deg,#1abc9c,#16a085);padding:16px 22px;'
	'display:flex;align-items:center;gap:12px;">'
	'<span style="font-size:24px;">✅</span>'
	'<span style="color:#ffffff;font-size:18px;font-weight:700;letter-spacing:0.3px;">'
	'Submission Received Successfully</span>'
	'</div>'

	# Meta row
	'<div style="padding:18px 22px;background:#f0faf7;display:flex;flex-wrap:wrap;'
	'gap:28px;border-bottom:1px solid #d5eee8;">'
	+ "".join(
	f'<div><div style="color:#6b8f85;font-size:11px;font-weight:600;'
	f'letter-spacing:0.8px;text-transform:uppercase;">{label}</div>'
	f'<div style="font-weight:700;font-size:15px;color:{color};margin-top:3px;">{value}</div></div>'
	for label, value, color in [
	(type_label, model, "#1a1a2e"),
	("Organisation", organisation, "#1a1a2e"),
	("Episodes", str(len(submissions)), "#1a1a2e"),
	("Questions Scored", str(score_result["scored_questions"]), "#1a1a2e"),
	("Overall Avg", f"{overall_avg*100:.2f}%", "#0e9e7a"),
	("Submission ID", timestamp, "#666"),
	]
	)
	+ '</div>'

	# Score table
	'<div style="padding:18px 22px;background:#ffffff;">'
	'<div style="font-size:13px;font-weight:600;color:#444;margin-bottom:12px;">'
	'📊  Score Preview '
	'<span style="font-weight:400;color:#888;">(self-reported · pending official verification)</span>'
	'</div>'
	'<div style="border-radius:8px;overflow:hidden;border:1px solid #e0eaf0;">'
	'<table style="width:100%;border-collapse:collapse;font-size:13px;">'
	'<thead>'
	'<tr style="background:#e8f4f0;">'
	'<th style="padding:9px 12px;text-align:left;color:#1a1a2e;font-weight:600;">Domain</th>'
	'<th style="padding:9px 12px;text-align:center;color:#1a1a2e;font-weight:600;">Avg</th>'
	'<th style="padding:9px 12px;text-align:center;color:#1a1a2e;font-weight:600;">Recall (A)</th>'
	'<th style="padding:9px 12px;text-align:center;color:#1a1a2e;font-weight:600;">Causal Inf. (B)</th>'
	'<th style="padding:9px 12px;text-align:center;color:#1a1a2e;font-weight:600;">State Upd. (C)</th>'
	'<th style="padding:9px 12px;text-align:center;color:#1a1a2e;font-weight:600;">State Abs. (D)</th>'
	'</tr>'
	'</thead>'
	f'<tbody>{dom_rows_html}</tbody>'
	'</table>'
	'</div>'
	+ warning_html +
	'<div style="margin-top:14px;padding:10px 14px;background:#fffbea;border-radius:6px;'
	'font-size:12px;color:#7d5a00;line-height:1.7;border-left:3px solid #f5c518;">'
	'ℹ️  This is a <strong style="color:#5a3e00;">self-reported preview</strong> based on your '
	'<code style="background:#f5e9b8;color:#5a3e00;padding:1px 4px;border-radius:3px;">llm_as_judge_score_list</code>. '
	'Official scores will be recomputed by LLM-as-Judge — your entry will appear on the leaderboard after weekly verification.'
	'</div>'
	'</div>'
	'</div>'
	'</div>'
	)
	return result_html

	except Exception as e:
	import traceback; traceback.print_exc()
	return format_error(f"An error occurred: {str(e)}")