"""Gradio app for the AVBench leaderboard.

Results and pending submission requests are stored as CSV files in Hugging
Face dataset repositories. Visitors submit a request; an admin holding
``ADMIN_TOKEN`` approves or rejects it, and approved requests are copied into
the results CSV that backs the leaderboard table.
"""
import hmac
import json
import os
import re
import tempfile
from datetime import datetime, timezone
from uuid import uuid4

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

from constants import AVBENCH_INTRODUCTION, HARD_ROWS, METRIC_COLUMNS, METRIC_WEIGHTS, NORMAL_ROWS

DATASET_REPO = os.getenv("AVBENCH_DATASET_REPO", "iiiiii123/AVBench_results")
DATASET_FILE = os.getenv("AVBENCH_DATASET_FILE", "results.csv")
REQUEST_REPO = os.getenv("AVBENCH_REQUEST_REPO", "iiiiii123/AVBench_requests")
REQUEST_FILE = os.getenv("AVBENCH_REQUEST_FILE", "requests.csv")
HF_TOKEN = os.getenv("HF_TOKEN")
ADMIN_TOKEN = os.getenv("ADMIN_TOKEN")

VALID_SPLITS = ["Normal", "Hard"]
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

DISPLAY_COLUMNS = ["Medal", "Model"] + METRIC_COLUMNS + ["Overall", "UpdatedAt"]
BASE_COLUMNS = ["Model", "Split"] + METRIC_COLUMNS + ["Team", "Contact", "ModelLink", "UpdatedAt"]
REQUEST_COLUMNS = [
    "RequestID",
    "Status",
    "Model",
    "Split",
] + METRIC_COLUMNS + ["Team", "Contact", "ModelLink", "SubmittedAt", "ReviewedAt", "ReviewNote"]

# AT scores that override whatever the results CSV holds for these
# (model, split) pairs; applied every time the results dataset is loaded.
AT_CANONICAL = {
    ("Sora 2", "Normal"): 0.8675,
    ("Veo 3 Fast", "Normal"): 0.8300,
    ("Wan 2.6", "Normal"): 0.8227,
    ("Kling 2.6", "Normal"): 0.8061,
    ("Seedance 1.5 Pro", "Normal"): 0.8554,
    ("Sora 2", "Hard"): 0.8575,
    ("Veo 3 Fast", "Hard"): 0.8117,
    ("Wan 2.6", "Hard"): 0.8418,
    ("Kling 2.6", "Hard"): 0.7602,
    ("Seedance 1.5 Pro", "Hard"): 0.8646,
}


def _utc_now():
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


def _coerce_columns(df, columns):
    """Return a copy of ``df`` in which every name in ``columns`` exists.

    Metric columns are coerced to floats (unparseable or missing -> 0.0).
    All other columns become object dtype with missing values replaced by
    ``""``: ``pd.read_csv`` turns blank cells into NaN, which would otherwise
    be stringified as ``"nan"`` and would leave all-blank columns as float64.
    """
    out = df.copy()
    for col in columns:
        if col in METRIC_COLUMNS:
            if col not in out.columns:
                out[col] = 0.0
            out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0.0)
        else:
            if col not in out.columns:
                out[col] = ""
            as_object = out[col].astype(object)
            out[col] = as_object.where(as_object.notna(), "")
    return out


def _request_repo_candidates():
    """Return the repos to try for the requests CSV, preferred repo first."""
    candidates = [REQUEST_REPO]
    if DATASET_REPO not in candidates:
        candidates.append(DATASET_REPO)
    return candidates


def default_df():
    """Build the built-in leaderboard rows from ``NORMAL_ROWS`` and ``HARD_ROWS``.

    Returns:
        A DataFrame with ``BASE_COLUMNS``; text metadata is empty and
        ``UpdatedAt`` is the current UTC time.
    """
    normal = pd.DataFrame(NORMAL_ROWS, columns=["Model"] + METRIC_COLUMNS)
    normal["Split"] = "Normal"
    hard = pd.DataFrame(HARD_ROWS, columns=["Model"] + METRIC_COLUMNS)
    hard["Split"] = "Hard"

    df = pd.concat([normal, hard], ignore_index=True)
    df["Team"] = ""
    df["Contact"] = ""
    df["ModelLink"] = ""
    df["UpdatedAt"] = _utc_now().strftime(TIMESTAMP_FORMAT)
    return df[BASE_COLUMNS]


def ensure_schema(df):
    """Normalise a results frame to ``BASE_COLUMNS``.

    Missing columns are added, metrics are coerced to floats, ``Split`` is
    stripped and title-cased, and rows whose split is not in ``VALID_SPLITS``
    are dropped. The input frame is not modified.
    """
    out = _coerce_columns(df, BASE_COLUMNS)
    out["Split"] = out["Split"].astype(str).str.strip().str.title()
    out = out[out["Split"].isin(VALID_SPLITS)].copy()
    return out[BASE_COLUMNS]


def ensure_request_schema(df):
    """Normalise a requests frame to ``REQUEST_COLUMNS``.

    Missing columns are added, metrics are coerced to floats, ``Split`` is
    stripped and title-cased, and ``Status`` is stripped and lower-cased.
    Unlike ``ensure_schema`` no rows are dropped. The input frame is not
    modified.
    """
    out = _coerce_columns(df, REQUEST_COLUMNS)
    out["Split"] = out["Split"].astype(str).str.strip().str.title()
    out["Status"] = out["Status"].astype(str).str.strip().str.lower()
    return out[REQUEST_COLUMNS]


def apply_canonical_at_updates(df):
    """Return a copy of ``df`` with ``AT`` overwritten from ``AT_CANONICAL``."""
    out = df.copy()
    models = out["Model"].astype(str)
    splits = out["Split"].astype(str)
    for (model, split), at_value in AT_CANONICAL.items():
        mask = (models == model) & (splits == split)
        out.loc[mask, "AT"] = float(at_value)
    return out


def load_dataset_df():
    """Download the results CSV and return ``(frame, status_message)``.

    On any failure the built-in default rows are returned instead and the
    message carries the first 120 characters of the error.
    """
    try:
        local_csv = hf_hub_download(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            filename=DATASET_FILE,
            token=HF_TOKEN,
        )
        df = pd.read_csv(local_csv)
        df = ensure_schema(df)
        df = apply_canonical_at_updates(df)
        return df, f"Loaded from dataset: {DATASET_REPO}/{DATASET_FILE}"
    except Exception as e:
        # Deliberate best-effort: the UI must still render without the hub.
        return default_df(), f"Fallback to local default rows (dataset read failed: {str(e)[:120]})"


def load_request_df():
    """Download the requests CSV and return ``(frame, status_message)``.

    The dedicated request repo is tried first, then the results repo. If
    neither yields the file, an empty frame with ``REQUEST_COLUMNS`` is
    returned together with the collected (truncated) errors.
    """
    errors = []
    for repo in _request_repo_candidates():
        try:
            local_csv = hf_hub_download(
                repo_id=repo,
                repo_type="dataset",
                filename=REQUEST_FILE,
                token=HF_TOKEN,
            )
            df = pd.read_csv(local_csv)
            return ensure_request_schema(df), f"Loaded requests: {repo}/{REQUEST_FILE}"
        except Exception as e:
            errors.append(f"{repo}: {str(e)[:80]}")
    return pd.DataFrame(columns=REQUEST_COLUMNS), f"Requests file not found yet. Tried: {' | '.join(errors)}"


def save_requests_csv(df, commit_message):
    """Save requests CSV with repo fallback for robustness.

    Returns:
        ``("ok", repo)`` naming the repo that accepted the upload, or
        ``(error_message, None)`` when every candidate repo failed.
    """
    errors = []
    for repo in _request_repo_candidates():
        msg = save_csv_to_dataset(df, repo, REQUEST_FILE, commit_message)
        if msg == "ok":
            return "ok", repo
        errors.append(f"{repo}: {msg}")
    return f"Submit failed. Tried repos -> {' | '.join(errors)}", None


def compute_normalized_overall(df):
    """Return a copy of ``df`` with an ``Overall`` score column.

    Each metric is min-max normalised over the rows of ``df``, multiplied by
    its ``METRIC_WEIGHTS`` entry, summed, divided by the total weight and
    rounded to 4 decimals. A metric whose values are all equal contributes 0.
    """
    norm_df = df.copy()
    total_weight = sum(METRIC_WEIGHTS[c] for c in METRIC_COLUMNS)
    # Accumulate into a Series, not a float: if every metric is constant
    # (e.g. a single-row table) nothing is added, and a plain float has no
    # ``.round`` method.
    weighted_sum = pd.Series(0.0, index=norm_df.index)
    for c in METRIC_COLUMNS:
        col_min = norm_df[c].min()
        col_max = norm_df[c].max()
        if col_max > col_min:
            col_norm = (norm_df[c] - col_min) / (col_max - col_min)
            weighted_sum = weighted_sum + col_norm * METRIC_WEIGHTS[c]
    norm_df["Overall"] = (weighted_sum / total_weight).round(4)
    return norm_df


def add_medals(df):
    """Return a re-indexed copy of ``df`` with a ``Medal`` column.

    The first three rows (in the order given) get gold, silver and bronze;
    every other row gets an empty string.
    """
    medal_df = df.copy().reset_index(drop=True)
    podium = ["🥇", "🥈", "🥉"]
    row_count = len(medal_df)
    medal_df["Medal"] = podium[:row_count] + [""] * max(row_count - len(podium), 0)
    return medal_df


def view_table(df, split_name, sort_metric):
    """Build the leaderboard table shown in the UI.

    Args:
        df: Results frame with ``BASE_COLUMNS``.
        split_name: ``"Normal"`` or ``"Hard"`` to filter; any other value
            keeps all rows. The ``Split`` column is shown only for ``"All"``.
        sort_metric: Column to sort by, descending (``"Overall"`` or a metric).

    Returns:
        A DataFrame restricted to the display columns, with medals assigned
        after sorting.
    """
    show_split = split_name == "All"
    columns = (["Split"] + DISPLAY_COLUMNS) if show_split else DISPLAY_COLUMNS

    if split_name in VALID_SPLITS:
        cur = df[df["Split"] == split_name].copy()
    else:
        cur = df.copy()
    if cur.empty:
        return pd.DataFrame(columns=columns)

    cur = compute_normalized_overall(cur)
    cur = cur.sort_values(by=sort_metric, ascending=False).reset_index(drop=True)
    cur = add_medals(cur)
    return cur[columns]


def refresh_table(split_name, sort_metric):
    """Reload the results dataset and return ``(status_message, table)``."""
    df, msg = load_dataset_df()
    return msg, view_table(df, split_name, sort_metric)


def save_csv_to_dataset(df, repo_id, file_name, commit_message):
    """Upload ``df`` as CSV to ``repo_id``/``file_name``.

    Returns:
        ``"ok"`` on success, otherwise a human-readable failure message
        (errors are reported, never raised).
    """
    if not HF_TOKEN:
        return "Submit failed: HF_TOKEN secret is missing."
    try:
        api = HfApi(token=HF_TOKEN)
        with tempfile.TemporaryDirectory() as tmpdir:
            csv_path = os.path.join(tmpdir, "data.csv")
            df.to_csv(csv_path, index=False)
            api.upload_file(
                path_or_fileobj=csv_path,
                path_in_repo=file_name,
                repo_id=repo_id,
                repo_type="dataset",
                commit_message=commit_message,
            )
        return "ok"
    except Exception as e:
        return f"Submit failed while uploading {repo_id}/{file_name}: {str(e)[:300]}"


def save_request_artifact(payload, repo_id=None):
    """Upload the raw submission as JSON under ``submissions/``.

    Args:
        payload: Request row dict; ``RequestID``, ``Model`` and ``Split`` are
            used to build the file name.
        repo_id: Target dataset repo; defaults to ``REQUEST_REPO``.

    Returns:
        ``"ok"`` on success, otherwise a failure message.
    """
    if not HF_TOKEN:
        return "Submit failed: HF_TOKEN secret is missing."
    target_repo = repo_id or REQUEST_REPO
    try:
        api = HfApi(token=HF_TOKEN)
        with tempfile.TemporaryDirectory() as tmpdir:
            raw_path = os.path.join(tmpdir, "submission.json")
            with open(raw_path, "w", encoding="utf-8") as f:
                json.dump(payload, f, ensure_ascii=False, indent=2)
            ts = _utc_now().strftime("%Y%m%d_%H%M%S")
            # The model name is user-supplied: replace anything that is not a
            # word character, dot or dash (notably "/") so it cannot create
            # extra path segments inside the repo. Spaces still become "_".
            safe_model = re.sub(r"[^\w.-]", "_", str(payload["Model"]))
            raw_name = f"submissions/{ts}_{payload['RequestID']}_{safe_model}_{payload['Split']}.json"
            api.upload_file(
                path_or_fileobj=raw_path,
                path_in_repo=raw_name,
                repo_id=target_repo,
                repo_type="dataset",
                commit_message=f"New request: {payload['Model']} ({payload['Split']})",
            )
        return "ok"
    except Exception as e:
        return f"Submit failed while saving raw request artifact: {str(e)[:300]}"


def submit_entry(
    model_name,
    split_name,
    team,
    contact,
    model_link,
    av,
    at,
    vt,
    syncnet,
    sc,
    df_arena,
    nisqa,
    audiobox,
    dover,
    aesthetic,
    split_selector,
    sort_selector,
):
    """Queue a new submission as a pending request.

    The request is appended to the requests CSV and the raw payload is stored
    as a JSON artifact; the leaderboard itself is not changed.

    Returns:
        ``(submit_status, leaderboard_table, dataset_status)`` for the UI.
    """
    clean_model = (model_name or "").strip()
    # Validate the stripped name so whitespace-only input is rejected too.
    if not clean_model or split_name not in VALID_SPLITS:
        df, msg = load_dataset_df()
        return "Submit failed: model name and split are required.", view_table(df, split_selector, sort_selector), msg

    metrics = {
        "AV": av,
        "AT": at,
        "VT": vt,
        "SyncNet": syncnet,
        "SC": sc,
        "DF-Arena": df_arena,
        "NISQA": nisqa,
        "Audiobox": audiobox,
        "DOVER++": dover,
        "Aesthetic": aesthetic,
    }
    try:
        metrics = {k: float(v) for k, v in metrics.items()}
    except (TypeError, ValueError):
        # TypeError covers an empty number box (None); ValueError bad text.
        df, msg = load_dataset_df()
        return "Submit failed: all metric values must be numbers.", view_table(df, split_selector, sort_selector), msg

    # NOTE(review): if this download fails transiently, load_request_df
    # returns an empty frame and the upload below would replace the stored
    # requests CSV with just this row - confirm whether that is acceptable.
    req_df, _ = load_request_df()
    now = _utc_now()
    request_id = now.strftime("REQ%Y%m%d%H%M%S") + "_" + uuid4().hex[:6]
    row = {
        "RequestID": request_id,
        "Status": "pending",
        "Model": clean_model,
        "Split": split_name,
        **metrics,
        "Team": (team or "").strip(),
        "Contact": (contact or "").strip(),
        "ModelLink": (model_link or "").strip(),
        "SubmittedAt": now.strftime(TIMESTAMP_FORMAT),
        "ReviewedAt": "",
        "ReviewNote": "",
    }
    req_df = pd.concat([req_df, pd.DataFrame([row])], ignore_index=True)
    req_df = ensure_request_schema(req_df)

    save_msg, request_repo_used = save_requests_csv(
        req_df,
        f"Add request: {row['Model']} ({row['Split']})",
    )
    if save_msg == "ok":
        save_msg = save_request_artifact(row, repo_id=request_repo_used)
    if save_msg == "ok":
        save_msg = f"Submit succeeded. Request queued: {request_id}. Stored in {request_repo_used}. Awaiting review."

    latest_df, load_msg = load_dataset_df()
    return save_msg, view_table(latest_df, split_selector, sort_selector), load_msg


def refresh_requests():
    """Reload requests and return ``(status, pending_frame, dropdown_update)``.

    The dropdown update lists the pending request ids and preselects the
    first one (or ``None`` when nothing is pending).
    """
    req_df, msg = load_request_df()
    if req_df.empty:
        pending = req_df
    else:
        pending = req_df[req_df["Status"] == "pending"].copy()
    options = pending["RequestID"].tolist() if not pending.empty else []
    return msg, pending, gr.update(choices=options, value=(options[0] if options else None))


def _review_outputs(message, split_selector, sort_selector):
    """Build the 6-tuple ``review_request`` returns, loading each dataset once."""
    req_msg, pending_df, choice_update = refresh_requests()
    latest_df, load_msg = load_dataset_df()
    return (
        message,
        req_msg,
        pending_df,
        choice_update,
        view_table(latest_df, split_selector, sort_selector),
        load_msg,
    )


def _is_admin(token):
    """Compare ``token`` with ``ADMIN_TOKEN`` in constant time."""
    supplied = str(token or "").encode("utf-8")
    return hmac.compare_digest(supplied, ADMIN_TOKEN.encode("utf-8"))


def review_request(request_id, decision, review_note, admin_token, split_selector, sort_selector):
    """Approve or reject a pending request.

    The request row is updated in the requests CSV first; on ``"Approve"``
    the row is then written into the results CSV, replacing any existing row
    with the same model and split. Any other ``decision`` marks the request
    rejected.

    Returns:
        ``(review_status, request_status, pending_frame, dropdown_update,
        leaderboard_table, dataset_status)`` for the UI.
    """
    def respond(message):
        return _review_outputs(message, split_selector, sort_selector)

    if not ADMIN_TOKEN:
        return respond("Review failed: ADMIN_TOKEN secret is missing.")
    if not _is_admin(admin_token):
        return respond("Review failed: invalid admin token.")
    if not request_id:
        return respond("Review failed: request id is required.")

    req_df, _ = load_request_df()
    mask = req_df["RequestID"].astype(str) == str(request_id)
    if not mask.any():
        return respond("Review failed: request id not found.")

    idx = req_df[mask].index[0]
    status = str(req_df.loc[idx, "Status"]).lower().strip()
    if status != "pending":
        return respond("Review failed: request already reviewed.")

    now = _utc_now().strftime(TIMESTAMP_FORMAT)
    req_df.loc[idx, "Status"] = "approved" if decision == "Approve" else "rejected"
    req_df.loc[idx, "ReviewedAt"] = now
    req_df.loc[idx, "ReviewNote"] = (review_note or "").strip()

    save_msg, _ = save_requests_csv(
        req_df,
        f"{decision} request: {request_id}",
    )
    if save_msg != "ok":
        return respond(f"Review failed: {save_msg}")

    # Approved requests are promoted to results dataset.
    if decision == "Approve":
        # NOTE(review): load_dataset_df falls back to the default rows when
        # the download fails, so a transient failure here would upload the
        # defaults plus this row over the stored results - confirm.
        res_df, _ = load_dataset_df()
        row = req_df.loc[idx]
        result_row = {
            "Model": str(row["Model"]),
            "Split": str(row["Split"]),
            **{c: float(row[c]) for c in METRIC_COLUMNS},
            "Team": str(row.get("Team", "")),
            "Contact": str(row.get("Contact", "")),
            "ModelLink": str(row.get("ModelLink", "")),
            "UpdatedAt": now,
        }
        # Drop any existing entry for the same (model, split) before adding.
        key_mask = (res_df["Model"].astype(str) == result_row["Model"]) & (
            res_df["Split"].astype(str) == result_row["Split"]
        )
        res_df = res_df[~key_mask].copy()
        res_df = pd.concat([res_df, pd.DataFrame([result_row])], ignore_index=True)
        res_df = ensure_schema(res_df)
        save_res = save_csv_to_dataset(
            res_df,
            DATASET_REPO,
            DATASET_FILE,
            f"Approve leaderboard entry: {result_row['Model']} ({result_row['Split']})",
        )
        if save_res != "ok":
            # The request is already marked approved at this point.
            return respond(f"Review partially succeeded but failed to update results: {save_res}")

    return respond(f"{decision} succeeded for {request_id}.")


with gr.Blocks(title="AVBench Leaderboard") as demo:
    gr.Markdown(AVBENCH_INTRODUCTION)
    # Two trailing spaces + newline is a Markdown hard line break.
    status_md = gr.Markdown(
        f"Results backend: {DATASET_REPO}/{DATASET_FILE}  \n"
        f"Requests backend: {REQUEST_REPO}/{REQUEST_FILE}"
    )

    with gr.Row():
        split_selector = gr.Dropdown(choices=["All", "Normal", "Hard"], value="All", label="Split")
        sort_selector = gr.Dropdown(choices=["Overall"] + METRIC_COLUMNS, value="Overall", label="Sort By")
        refresh_btn = gr.Button("Refresh")

    leaderboard = gr.Dataframe(
        value=view_table(default_df(), "All", "Overall"),
        interactive=False,
        wrap=True,
        label="Leaderboard",
    )

    with gr.Accordion("Metric Groups", open=False):
        gr.Markdown(
            "- Cross-Modal Alignment & Sync: AV, AT, VT, SyncNet\n"
            "- Unimodal Generation Quality: SC, DF-Arena, NISQA, Audiobox, DOVER++, Aesthetic\n"
            "- Overall: min-max normalize each metric first, then weighted sum."
        )

    with gr.Accordion("Submit New Result", open=False):
        gr.Markdown("Submit creates a pending request in requests dataset. It appears on leaderboard only after approval.")
        with gr.Row():
            model_name = gr.Textbox(label="Model Name", placeholder="e.g. MyModel-1")
            split_name = gr.Dropdown(choices=["Normal", "Hard"], value="Normal", label="Split")
            team = gr.Textbox(label="Team (optional)")
        with gr.Row():
            contact = gr.Textbox(label="Contact (optional)")
            model_link = gr.Textbox(label="Model Link (optional)")
        with gr.Row():
            av = gr.Number(label="AV")
            at = gr.Number(label="AT")
            vt = gr.Number(label="VT")
            syncnet = gr.Number(label="SyncNet")
            sc = gr.Number(label="SC")
        with gr.Row():
            df_arena = gr.Number(label="DF-Arena")
            nisqa = gr.Number(label="NISQA")
            audiobox = gr.Number(label="Audiobox")
            dover = gr.Number(label="DOVER++")
            aesthetic = gr.Number(label="Aesthetic")
        submit_btn = gr.Button("Submit")
        submit_status = gr.Markdown()

    with gr.Accordion("Review Requests (Admin)", open=False):
        gr.Markdown("Approve/reject pending requests. Approval writes into results dataset.")
        admin_token = gr.Textbox(label="Admin Token", type="password")
        with gr.Row():
            req_refresh_btn = gr.Button("Refresh Requests")
            request_id = gr.Dropdown(choices=[], label="Pending RequestID")
            decision = gr.Radio(choices=["Approve", "Reject"], value="Approve", label="Decision")
        review_note = gr.Textbox(label="Review Note (optional)")
        review_btn = gr.Button("Apply Review")
        review_status = gr.Markdown()
        request_status_md = gr.Markdown()
        pending_table = gr.Dataframe(interactive=False, wrap=True, label="Pending Requests")

    demo.load(refresh_table, inputs=[split_selector, sort_selector], outputs=[status_md, leaderboard])
    refresh_btn.click(refresh_table, inputs=[split_selector, sort_selector], outputs=[status_md, leaderboard])
    split_selector.change(refresh_table, inputs=[split_selector, sort_selector], outputs=[status_md, leaderboard])
    sort_selector.change(refresh_table, inputs=[split_selector, sort_selector], outputs=[status_md, leaderboard])

    submit_btn.click(
        submit_entry,
        inputs=[
            model_name,
            split_name,
            team,
            contact,
            model_link,
            av,
            at,
            vt,
            syncnet,
            sc,
            df_arena,
            nisqa,
            audiobox,
            dover,
            aesthetic,
            split_selector,
            sort_selector,
        ],
        outputs=[submit_status, leaderboard, status_md],
    )

    req_refresh_btn.click(
        refresh_requests,
        outputs=[request_status_md, pending_table, request_id],
    )

    review_btn.click(
        review_request,
        inputs=[request_id, decision, review_note, admin_token, split_selector, sort_selector],
        outputs=[review_status, request_status_md, pending_table, request_id, leaderboard, status_md],
    )

    demo.load(
        refresh_requests,
        outputs=[request_status_md, pending_table, request_id],
    )


if __name__ == "__main__":
    demo.launch(ssr_mode=False)