import hmac
import json
import math
import os
import tempfile
from datetime import datetime, timezone
from uuid import uuid4

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

from constants import AVBENCH_INTRODUCTION, HARD_ROWS, METRIC_COLUMNS, METRIC_WEIGHTS, NORMAL_ROWS
|
|
|
|
# Hub locations of the results CSV and the review-queue CSV. Each value can be
# overridden through the environment; the literals are the built-in defaults.
DATASET_REPO = os.environ.get("AVBENCH_DATASET_REPO", "iiiiii123/AVBench_results")
DATASET_FILE = os.environ.get("AVBENCH_DATASET_FILE", "results.csv")
REQUEST_REPO = os.environ.get("AVBENCH_REQUEST_REPO", "iiiiii123/AVBench_requests")
REQUEST_FILE = os.environ.get("AVBENCH_REQUEST_FILE", "requests.csv")

# Secrets: HF_TOKEN is passed to the Hub client calls, ADMIN_TOKEN gates the
# review action. Both are None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
ADMIN_TOKEN = os.environ.get("ADMIN_TOKEN")
|
|
# Column order of the rendered leaderboard table.
DISPLAY_COLUMNS = ["Medal", "Model", *METRIC_COLUMNS, "Overall", "UpdatedAt"]

# Column order of a row in the results CSV.
BASE_COLUMNS = ["Model", "Split", *METRIC_COLUMNS, "Team", "Contact", "ModelLink", "UpdatedAt"]

# Column order of a row in the review-queue CSV.
REQUEST_COLUMNS = [
    "RequestID",
    "Status",
    "Model",
    "Split",
    *METRIC_COLUMNS,
    "Team",
    "Contact",
    "ModelLink",
    "SubmittedAt",
    "ReviewedAt",
    "ReviewNote",
]
|
|
# Canonical "AT" scores, grouped by split for readability and flattened below
# into the (model, split) -> value mapping the rest of the module looks up.
_AT_BY_SPLIT = {
    "Normal": {
        "Sora 2": 0.8675,
        "Veo 3 Fast": 0.8300,
        "Wan 2.6": 0.8227,
        "Kling 2.6": 0.8061,
        "Seedance 1.5 Pro": 0.8554,
    },
    "Hard": {
        "Sora 2": 0.8575,
        "Veo 3 Fast": 0.8117,
        "Wan 2.6": 0.8418,
        "Kling 2.6": 0.7602,
        "Seedance 1.5 Pro": 0.8646,
    },
}

AT_CANONICAL = {
    (model, split): score
    for split, per_model in _AT_BY_SPLIT.items()
    for model, score in per_model.items()
}
|
|
|
|
def default_df():
    """Build the built-in leaderboard rows from NORMAL_ROWS and HARD_ROWS.

    Each row list is loaded with the columns ``["Model"] + METRIC_COLUMNS``,
    tagged with its split name, and the two frames are stacked. The
    ``Team``/``Contact``/``ModelLink`` columns are left blank and ``UpdatedAt``
    is stamped with the current UTC time.

    Returns:
        pandas.DataFrame with exactly the columns in BASE_COLUMNS, in order.
    """
    frames = []
    for split_label, rows in (("Normal", NORMAL_ROWS), ("Hard", HARD_ROWS)):
        frame = pd.DataFrame(rows, columns=["Model"] + METRIC_COLUMNS)
        frame["Split"] = split_label
        frames.append(frame)

    df = pd.concat(frames, ignore_index=True)
    for text_col in ("Team", "Contact", "ModelLink"):
        df[text_col] = ""

    # datetime.utcnow() is deprecated; an aware UTC "now" formats to the same text.
    df["UpdatedAt"] = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    return df[BASE_COLUMNS]
|
|
|
|
def ensure_schema(df):
    """Return a copy of *df* coerced to the results-table layout.

    - Columns missing from BASE_COLUMNS are added (0.0 for metrics, "" otherwise).
    - Metric columns are converted to numbers; unparseable or missing values
      become 0.0.
    - Missing values in the non-metric columns become "". Empty CSV cells are
      read back by pandas as NaN, which would otherwise turn into the text
      "nan" as soon as a cell is passed through ``str``.
    - ``Split`` is stripped and title-cased, and only rows whose split is
      "Normal" or "Hard" are kept.

    The input frame is not modified.

    Returns:
        pandas.DataFrame with exactly the columns in BASE_COLUMNS, in order.
    """
    df = df.copy()  # adding/rewriting columns below must not leak into the caller's frame

    for col in BASE_COLUMNS:
        if col not in df.columns:
            df[col] = 0.0 if col in METRIC_COLUMNS else ""

    for col in METRIC_COLUMNS:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)

    for col in BASE_COLUMNS:
        if col not in METRIC_COLUMNS:
            # Cast to object first so an all-NaN (float64) column can hold "".
            df[col] = df[col].astype(object).fillna("")

    df["Split"] = df["Split"].astype(str).str.strip().str.title()
    df = df[df["Split"].isin(["Normal", "Hard"])].copy()
    return df[BASE_COLUMNS]
|
|
|
|
def ensure_request_schema(df):
    """Return a copy of *df* coerced to the review-queue layout.

    - Columns missing from REQUEST_COLUMNS are added (0.0 for metrics, ""
      otherwise).
    - Metric columns are converted to numbers; unparseable or missing values
      become 0.0.
    - Missing values in the non-metric columns become "" and those columns are
      stored with object dtype. A column whose CSV cells are all empty is read
      back by pandas as float64 NaN; without this step a later string
      assignment into it hits a dtype mismatch and its cells stringify as
      "nan".
    - ``Split`` is stripped and title-cased; ``Status`` is stripped and
      lower-cased.

    The input frame is not modified.

    Returns:
        pandas.DataFrame with exactly the columns in REQUEST_COLUMNS, in order.
    """
    df = df.copy()  # adding/rewriting columns below must not leak into the caller's frame

    for col in REQUEST_COLUMNS:
        if col not in df.columns:
            df[col] = 0.0 if col in METRIC_COLUMNS else ""

    for col in METRIC_COLUMNS:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)

    for col in REQUEST_COLUMNS:
        if col not in METRIC_COLUMNS:
            df[col] = df[col].astype(object).fillna("")

    df["Split"] = df["Split"].astype(str).str.strip().str.title()
    df["Status"] = df["Status"].astype(str).str.strip().str.lower()
    return df[REQUEST_COLUMNS]
|
|
|
|
def apply_canonical_at_updates(df):
    """Return a copy of *df* whose "AT" cells are overwritten from AT_CANONICAL.

    Every row whose (Model, Split) pair, compared as strings, is a key of
    AT_CANONICAL gets that entry's value in its "AT" column. Other rows and
    the input frame itself are left untouched.
    """
    patched = df.copy()
    # Only "AT" is written below, so the string views of the key columns can
    # be computed once up front.
    model_text = patched["Model"].astype(str)
    split_text = patched["Split"].astype(str)

    for key, canonical in AT_CANONICAL.items():
        wanted_model, wanted_split = key
        hit = (model_text == wanted_model) & (split_text == wanted_split)
        patched.loc[hit, "AT"] = float(canonical)

    return patched
|
|
|
|
def load_dataset_df():
    """Load the results table from the Hub, falling back to the built-in rows.

    Returns:
        (DataFrame, status message). On success the frame is the downloaded
        CSV after ensure_schema and apply_canonical_at_updates. If anything in
        that chain raises, the frame is default_df() and the message carries
        the first 120 characters of the error.
    """
    try:
        csv_path = hf_hub_download(
            filename=DATASET_FILE,
            repo_id=DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        results = apply_canonical_at_updates(ensure_schema(pd.read_csv(csv_path)))
    except Exception as exc:
        reason = str(exc)[:120]
        return default_df(), f"Fallback to local default rows (dataset read failed: {reason})"

    return results, f"Loaded from dataset: {DATASET_REPO}/{DATASET_FILE}"
|
|
|
|
def load_request_df():
    """Load the review queue, trying REQUEST_REPO first and then DATASET_REPO.

    Returns:
        (DataFrame, status message). The first repo whose REQUEST_FILE can be
        downloaded and parsed wins. If every attempt raises, an empty frame
        with REQUEST_COLUMNS is returned together with a message listing the
        first 80 characters of each error.
    """
    if DATASET_REPO == REQUEST_REPO:
        repos = [REQUEST_REPO]
    else:
        repos = [REQUEST_REPO, DATASET_REPO]

    failures = []
    for repo_id in repos:
        try:
            csv_path = hf_hub_download(
                filename=REQUEST_FILE,
                repo_id=repo_id,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            queue = ensure_request_schema(pd.read_csv(csv_path))
        except Exception as exc:
            failures.append(f"{repo_id}: {str(exc)[:80]}")
            continue
        return queue, f"Loaded requests: {repo_id}/{REQUEST_FILE}"

    tried = " | ".join(failures)
    return pd.DataFrame(columns=REQUEST_COLUMNS), f"Requests file not found yet. Tried: {tried}"
|
|
|
|
def save_requests_csv(df, commit_message):
    """Upload the review-queue CSV, trying REQUEST_REPO and then DATASET_REPO.

    Returns:
        ("ok", repo_id) for the first repo that accepts the upload, otherwise
        (failure message listing every attempt, None).
    """
    # dict.fromkeys drops the duplicate when both settings name the same repo.
    targets = list(dict.fromkeys([REQUEST_REPO, DATASET_REPO]))

    failures = []
    for repo_id in targets:
        outcome = save_csv_to_dataset(df, repo_id, REQUEST_FILE, commit_message)
        if outcome != "ok":
            failures.append(f"{repo_id}: {outcome}")
            continue
        return "ok", repo_id

    return f"Submit failed. Tried repos -> {' | '.join(failures)}", None
|
|
|
|
def compute_normalized_overall(df, metric_columns=None, weights=None):
    """Return a copy of *df* with an "Overall" column added.

    Each metric column is min-max normalised over the rows of *df*, multiplied
    by its weight, summed, divided by the total weight and rounded to four
    decimals. A column whose values are all equal contributes 0 for every row.

    Args:
        df: Frame holding one numeric column per metric.
        metric_columns: Column names to score. Defaults to METRIC_COLUMNS.
        weights: Mapping of column name to weight. Defaults to METRIC_WEIGHTS.

    Returns:
        A new DataFrame; *df* itself is not modified.
    """
    columns = METRIC_COLUMNS if metric_columns is None else list(metric_columns)
    weight_of = METRIC_WEIGHTS if weights is None else weights

    norm_df = df.copy()
    total_weight = sum(weight_of[c] for c in columns)

    # Start from a Series rather than a bare 0.0: when every column is constant
    # (e.g. a single-row table) nothing is ever added, and a plain float has no
    # .round() method.
    weighted_sum = pd.Series(0.0, index=norm_df.index)

    for c in columns:
        col_min = norm_df[c].min()
        col_max = norm_df[c].max()
        if col_max > col_min:
            col_norm = (norm_df[c] - col_min) / (col_max - col_min)
            weighted_sum = weighted_sum + col_norm * weight_of[c]

    if total_weight:
        norm_df["Overall"] = (weighted_sum / total_weight).round(4)
    else:
        # No columns or all-zero weights: avoid dividing by zero.
        norm_df["Overall"] = 0.0
    return norm_df
|
|
|
|
def add_medals(df):
    """Return a re-indexed copy of *df* with a "Medal" column.

    The first three rows, in their current order, receive the gold, silver and
    bronze emoji; every later row receives an empty string.
    """
    ranked = df.reset_index(drop=True).copy()
    podium = ["🥇", "🥈", "🥉"]
    row_count = len(ranked)
    ranked["Medal"] = podium[:row_count] + [""] * max(row_count - len(podium), 0)
    return ranked
|
|
|
|
def view_table(df, split_name, sort_metric):
    """Build the leaderboard table shown in the UI.

    Rows are restricted to *split_name* when it is "Normal" or "Hard" (any
    other value keeps all rows), scored with compute_normalized_overall,
    sorted descending by *sort_metric* and decorated with medals. The "Split"
    column is included only when *split_name* is "All". An empty selection
    yields an empty frame with the same columns.
    """
    if split_name == "All":
        columns = ["Split"] + DISPLAY_COLUMNS
    else:
        columns = DISPLAY_COLUMNS

    if split_name in ("Normal", "Hard"):
        selected = df[df["Split"] == split_name].copy()
    else:
        selected = df.copy()

    if selected.empty:
        return pd.DataFrame(columns=columns)

    scored = compute_normalized_overall(selected)
    ranked = scored.sort_values(by=sort_metric, ascending=False).reset_index(drop=True)
    return add_medals(ranked)[columns]
|
|
|
|
def refresh_table(split_name, sort_metric):
    """Reload the results and return (status message, leaderboard table)."""
    results, status = load_dataset_df()
    table = view_table(results, split_name, sort_metric)
    return status, table
|
|
|
|
def save_csv_to_dataset(df, repo_id, file_name, commit_message):
    """Write *df* as CSV and upload it to *file_name* in dataset repo *repo_id*.

    Returns:
        "ok" on success, otherwise a "Submit failed..." message (missing
        HF_TOKEN, or the first 300 characters of the upload error).
    """
    if not HF_TOKEN:
        return "Submit failed: HF_TOKEN secret is missing."

    try:
        hub = HfApi(token=HF_TOKEN)
        # The CSV is staged in a throwaway directory that is removed after upload.
        with tempfile.TemporaryDirectory() as workdir:
            staged = os.path.join(workdir, "data.csv")
            df.to_csv(staged, index=False)
            hub.upload_file(
                repo_id=repo_id,
                repo_type="dataset",
                path_in_repo=file_name,
                path_or_fileobj=staged,
                commit_message=commit_message,
            )
    except Exception as exc:
        return f"Submit failed while uploading {repo_id}/{file_name}: {str(exc)[:300]}"

    return "ok"
|
|
|
|
def _safe_path_part(value):
    """Return *value* as text with every character outside letters, digits, ".", "_" and "-" replaced by "_"."""
    return "".join(ch if (ch.isalnum() or ch in "._-") else "_" for ch in str(value))


def save_request_artifact(payload, repo_id=None):
    """Upload the raw submission *payload* as a JSON file under ``submissions/``.

    Args:
        payload: Mapping describing the request. The keys "RequestID", "Model"
            and "Split" are read to build the file name and commit message;
            the whole mapping is serialised as the file content.
        repo_id: Target dataset repo. Defaults to REQUEST_REPO.

    Returns:
        "ok" on success, otherwise a "Submit failed..." message (missing
        HF_TOKEN, or the first 300 characters of the error).
    """
    if not HF_TOKEN:
        return "Submit failed: HF_TOKEN secret is missing."

    target_repo = repo_id or REQUEST_REPO

    try:
        api = HfApi(token=HF_TOKEN)
        with tempfile.TemporaryDirectory() as tmpdir:
            raw_path = os.path.join(tmpdir, "submission.json")
            with open(raw_path, "w", encoding="utf-8") as f:
                json.dump(payload, f, ensure_ascii=False, indent=2)

            # datetime.utcnow() is deprecated; an aware UTC "now" formats identically.
            ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

            # The model name is free text typed by the submitter. Sanitise every
            # component so characters such as "/" cannot create extra path
            # segments inside the repo. Spaces still become "_", so ordinary
            # names produce the same file name as before.
            file_stem = "_".join(
                [
                    ts,
                    _safe_path_part(payload["RequestID"]),
                    _safe_path_part(payload["Model"]),
                    _safe_path_part(payload["Split"]),
                ]
            )
            raw_name = f"submissions/{file_stem}.json"

            api.upload_file(
                path_or_fileobj=raw_path,
                path_in_repo=raw_name,
                repo_id=target_repo,
                repo_type="dataset",
                commit_message=f"New request: {payload['Model']} ({payload['Split']})",
            )
        return "ok"
    except Exception as e:
        return f"Submit failed while saving raw request artifact: {str(e)[:300]}"
|
|
|
|
def submit_entry(
    model_name,
    split_name,
    team,
    contact,
    model_link,
    av,
    at,
    vt,
    syncnet,
    sc,
    df_arena,
    nisqa,
    audiobox,
    dover,
    aesthetic,
    split_selector,
    sort_selector,
):
    """Queue a new leaderboard submission as a "pending" review request.

    The submission is validated, appended to the review-queue CSV and also
    stored as a raw JSON artifact. It is not written to the results table.

    Args:
        model_name, split_name, team, contact, model_link: Form fields.
            *model_name* must be non-blank and *split_name* must be "Normal"
            or "Hard".
        av ... aesthetic: The ten metric values; each must convert to a finite
            float.
        split_selector, sort_selector: Current leaderboard view settings, used
            only to re-render the table that is returned.

    Returns:
        (submit status message, leaderboard table, dataset load message).
    """

    def _reject(reason):
        # Every early exit still returns a freshly rendered leaderboard.
        current, load_status = load_dataset_df()
        return reason, view_table(current, split_selector, sort_selector), load_status

    # Strip before validating so a whitespace-only name is rejected instead of
    # being stored as an empty model name.
    clean_name = (model_name or "").strip()
    if not clean_name or split_name not in ("Normal", "Hard"):
        return _reject("Submit failed: model name and split are required.")

    raw_metrics = {
        "AV": av,
        "AT": at,
        "VT": vt,
        "SyncNet": syncnet,
        "SC": sc,
        "DF-Arena": df_arena,
        "NISQA": nisqa,
        "Audiobox": audiobox,
        "DOVER++": dover,
        "Aesthetic": aesthetic,
    }

    try:
        metrics = {name: float(value) for name, value in raw_metrics.items()}
    except (TypeError, ValueError, OverflowError):
        # TypeError covers a field left empty (None); ValueError covers
        # non-numeric text.
        return _reject("Submit failed: all metric values must be numbers.")
    if not all(math.isfinite(value) for value in metrics.values()):
        # NaN/inf would break the min-max normalisation of the leaderboard.
        return _reject("Submit failed: all metric values must be numbers.")

    # NOTE(review): load_request_df returns an empty frame when the download
    # fails for any reason, so a transient read error here means the upload
    # below replaces the stored queue with only this row. Confirm whether the
    # read needs to distinguish "file missing" from other failures.
    req_df, _ = load_request_df()

    now = datetime.now(timezone.utc)  # utcnow() is deprecated; same formatted text
    submitted_at = now.strftime("%Y-%m-%d %H:%M:%S")
    request_id = now.strftime("REQ%Y%m%d%H%M%S") + "_" + uuid4().hex[:6]
    row = {
        "RequestID": request_id,
        "Status": "pending",
        "Model": clean_name,
        "Split": split_name,
        **metrics,
        "Team": (team or "").strip(),
        "Contact": (contact or "").strip(),
        "ModelLink": (model_link or "").strip(),
        "SubmittedAt": submitted_at,
        "ReviewedAt": "",
        "ReviewNote": "",
    }

    new_row_df = pd.DataFrame([row])
    if req_df.empty:
        # Concatenating with an empty frame is deprecated in pandas; the schema
        # pass below restores the full column set and order.
        req_df = new_row_df
    else:
        req_df = pd.concat([req_df, new_row_df], ignore_index=True)
    req_df = ensure_request_schema(req_df)

    save_msg, request_repo_used = save_requests_csv(
        req_df,
        f"Add request: {row['Model']} ({row['Split']})",
    )
    if save_msg == "ok":
        save_msg = save_request_artifact(row, repo_id=request_repo_used)
    if save_msg == "ok":
        save_msg = f"Submit succeeded. Request queued: {request_id}. Stored in {request_repo_used}. Awaiting review."

    latest_df, load_msg = load_dataset_df()
    return save_msg, view_table(latest_df, split_selector, sort_selector), load_msg
|
|
|
|
def refresh_requests():
    """Reload the review queue for the admin panel.

    Returns:
        (load message, frame of pending requests, dropdown update listing the
        pending RequestIDs with the first one preselected, or None if there
        are none).
    """
    queue, status = load_request_df()

    if queue.empty:
        pending = queue
    else:
        is_pending = queue["Status"] == "pending"
        pending = queue[is_pending].copy()

    request_ids = [] if pending.empty else pending["RequestID"].tolist()
    preselected = request_ids[0] if request_ids else None
    return status, pending, gr.update(choices=request_ids, value=preselected)
|
|
|
|
def _review_outcome(message, split_selector, sort_selector):
    """Assemble the six UI outputs of review_request for *message*, loading each dataset once."""
    req_msg, pending_df, choice_update = refresh_requests()
    latest_df, load_msg = load_dataset_df()
    table = view_table(latest_df, split_selector, sort_selector)
    return message, req_msg, pending_df, choice_update, table, load_msg


def _cell_text(value):
    """Return a table cell as text, mapping a missing value (None/NaN) to ""."""
    return "" if pd.isna(value) else str(value)


def review_request(request_id, decision, review_note, admin_token, split_selector, sort_selector):
    """Approve or reject a pending request on behalf of an admin.

    Args:
        request_id: RequestID of the request to review.
        decision: "Approve" publishes the request to the results table and
            marks it approved; any other value marks it rejected.
        review_note: Optional free-text note stored with the request.
        admin_token: Must equal the ADMIN_TOKEN secret.
        split_selector, sort_selector: Current leaderboard view settings, used
            only to re-render the table that is returned.

    Returns:
        (review message, request load message, pending-requests frame,
        dropdown update, leaderboard table, dataset load message).
    """

    def _fail(reason):
        return _review_outcome(f"Review failed: {reason}", split_selector, sort_selector)

    if not ADMIN_TOKEN:
        return _fail("ADMIN_TOKEN secret is missing.")
    # Constant-time comparison so response timing does not leak how much of the
    # token matched. Encoding to bytes also allows non-ASCII tokens.
    supplied = (admin_token or "").encode("utf-8")
    if not hmac.compare_digest(supplied, ADMIN_TOKEN.encode("utf-8")):
        return _fail("invalid admin token.")
    if not request_id:
        return _fail("request id is required.")

    req_df, _ = load_request_df()
    mask = req_df["RequestID"].astype(str) == str(request_id)
    if not mask.any():
        return _fail("request id not found.")

    idx = req_df[mask].index[0]
    status = str(req_df.loc[idx, "Status"]).lower().strip()
    if status != "pending":
        return _fail("request already reviewed.")

    # datetime.utcnow() is deprecated; an aware UTC "now" formats identically.
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    approved = decision == "Approve"

    if approved:
        # Publish to the results table BEFORE closing the request. If this
        # upload fails the request stays pending and can simply be approved
        # again; doing it the other way round would leave an "approved" request
        # that never reached the leaderboard and can no longer be reviewed.
        res_df, _ = load_dataset_df()
        row = req_df.loc[idx]
        result_row = {
            "Model": str(row["Model"]),
            "Split": str(row["Split"]),
            **{c: float(row[c]) for c in METRIC_COLUMNS},
            # Empty CSV cells come back as NaN; do not publish the text "nan".
            "Team": _cell_text(row.get("Team", "")),
            "Contact": _cell_text(row.get("Contact", "")),
            "ModelLink": _cell_text(row.get("ModelLink", "")),
            "UpdatedAt": now,
        }
        # An existing entry for the same (Model, Split) is replaced, which also
        # makes a repeated approval harmless.
        key_mask = (res_df["Model"].astype(str) == result_row["Model"]) & (
            res_df["Split"].astype(str) == result_row["Split"]
        )
        res_df = res_df[~key_mask].copy()
        res_df = pd.concat([res_df, pd.DataFrame([result_row])], ignore_index=True)
        res_df = ensure_schema(res_df)

        save_res = save_csv_to_dataset(
            res_df,
            DATASET_REPO,
            DATASET_FILE,
            f"Approve leaderboard entry: {result_row['Model']} ({result_row['Split']})",
        )
        if save_res != "ok":
            return _fail(f"results were not updated and the request is still pending: {save_res}")

    # A column whose cells were all empty is loaded as float64; make sure the
    # columns written below can hold text.
    for col in ("Status", "ReviewedAt", "ReviewNote"):
        req_df[col] = req_df[col].astype(object)
    req_df.loc[idx, "Status"] = "approved" if approved else "rejected"
    req_df.loc[idx, "ReviewedAt"] = now
    req_df.loc[idx, "ReviewNote"] = (review_note or "").strip()

    save_msg, _ = save_requests_csv(
        req_df,
        f"{decision} request: {request_id}",
    )
    if save_msg != "ok":
        if approved:
            return _fail(
                f"{save_msg} (results were already updated; apply the review again to close the request)"
            )
        return _fail(save_msg)

    return _review_outcome(f"{decision} succeeded for {request_id}.", split_selector, sort_selector)
|
|
|
|
# ---------------------------------------------------------------------------
# UI layout and event wiring.
# NOTE(review): the source this was taken from had lost its indentation, so the
# nesting of widgets inside each Row/Accordion below is reconstructed — confirm
# against the intended layout.
# ---------------------------------------------------------------------------
with gr.Blocks(title="AVBench Leaderboard") as demo:
    gr.Markdown(AVBENCH_INTRODUCTION)
    # Two trailing spaces followed by a newline form a Markdown hard line break,
    # so the two backends render on separate lines. (The previous literal
    # contained a stray "\\n+" that was rendered verbatim.)
    status_md = gr.Markdown(
        f"Results backend: {DATASET_REPO}/{DATASET_FILE}  \n"
        f"Requests backend: {REQUEST_REPO}/{REQUEST_FILE}"
    )

    with gr.Row():
        split_selector = gr.Dropdown(choices=["All", "Normal", "Hard"], value="All", label="Split")
        sort_selector = gr.Dropdown(choices=["Overall"] + METRIC_COLUMNS, value="Overall", label="Sort By")
        refresh_btn = gr.Button("Refresh")

    # Rendered from the built-in rows first; the load hooks below replace it
    # with the rows returned by refresh_table.
    leaderboard = gr.Dataframe(
        value=view_table(default_df(), "All", "Overall"),
        interactive=False,
        wrap=True,
        label="Leaderboard",
    )

    with gr.Accordion("Metric Groups", open=False):
        gr.Markdown(
            "- Cross-Modal Alignment & Sync: AV, AT, VT, SyncNet\n"
            "- Unimodal Generation Quality: SC, DF-Arena, NISQA, Audiobox, DOVER++, Aesthetic\n"
            "- Overall: min-max normalize each metric first, then weighted sum."
        )

    with gr.Accordion("Submit New Result", open=False):
        gr.Markdown("Submit creates a pending request in requests dataset. It appears on leaderboard only after approval.")
        with gr.Row():
            model_name = gr.Textbox(label="Model Name", placeholder="e.g. MyModel-1")
            split_name = gr.Dropdown(choices=["Normal", "Hard"], value="Normal", label="Split")
            team = gr.Textbox(label="Team (optional)")
        with gr.Row():
            contact = gr.Textbox(label="Contact (optional)")
            model_link = gr.Textbox(label="Model Link (optional)")
        with gr.Row():
            av = gr.Number(label="AV")
            at = gr.Number(label="AT")
            vt = gr.Number(label="VT")
            syncnet = gr.Number(label="SyncNet")
            sc = gr.Number(label="SC")
        with gr.Row():
            df_arena = gr.Number(label="DF-Arena")
            nisqa = gr.Number(label="NISQA")
            audiobox = gr.Number(label="Audiobox")
            dover = gr.Number(label="DOVER++")
            aesthetic = gr.Number(label="Aesthetic")

        submit_btn = gr.Button("Submit")
        submit_status = gr.Markdown()

    with gr.Accordion("Review Requests (Admin)", open=False):
        gr.Markdown("Approve/reject pending requests. Approval writes into results dataset.")
        admin_token = gr.Textbox(label="Admin Token", type="password")
        with gr.Row():
            req_refresh_btn = gr.Button("Refresh Requests")
            request_id = gr.Dropdown(choices=[], label="Pending RequestID")
            decision = gr.Radio(choices=["Approve", "Reject"], value="Approve", label="Decision")
        review_note = gr.Textbox(label="Review Note (optional)")
        review_btn = gr.Button("Apply Review")
        review_status = gr.Markdown()
        request_status_md = gr.Markdown()
        pending_table = gr.Dataframe(interactive=False, wrap=True, label="Pending Requests")

    # Leaderboard: reload on page load, on the Refresh button, and whenever the
    # split or sort selection changes.
    view_inputs = [split_selector, sort_selector]
    view_outputs = [status_md, leaderboard]
    demo.load(refresh_table, inputs=view_inputs, outputs=view_outputs)
    refresh_btn.click(refresh_table, inputs=view_inputs, outputs=view_outputs)
    split_selector.change(refresh_table, inputs=view_inputs, outputs=view_outputs)
    sort_selector.change(refresh_table, inputs=view_inputs, outputs=view_outputs)

    # Submission form: the input order must match submit_entry's parameters.
    submit_btn.click(
        submit_entry,
        inputs=[
            model_name,
            split_name,
            team,
            contact,
            model_link,
            av,
            at,
            vt,
            syncnet,
            sc,
            df_arena,
            nisqa,
            audiobox,
            dover,
            aesthetic,
            split_selector,
            sort_selector,
        ],
        outputs=[submit_status, leaderboard, status_md],
    )

    # Admin panel.
    req_refresh_btn.click(
        refresh_requests,
        outputs=[request_status_md, pending_table, request_id],
    )

    review_btn.click(
        review_request,
        inputs=[request_id, decision, review_note, admin_token, split_selector, sort_selector],
        outputs=[review_status, request_status_md, pending_table, request_id, leaderboard, status_md],
    )

    # Populate the pending-request list on page load as well.
    demo.load(
        refresh_requests,
        outputs=[request_status_md, pending_table, request_id],
    )


if __name__ == "__main__":
    demo.launch(ssr_mode=False)
|
|