# AVBenchLB / app.py
# Uploaded by iiiiii123
# Commit message: "Upload app.py with huggingface_hub"
# Commit 4daed9e (verified)
import hmac
import json
import math
import os
import re
import tempfile
from datetime import datetime, timezone
from uuid import uuid4

import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download

from constants import AVBENCH_INTRODUCTION, HARD_ROWS, METRIC_COLUMNS, METRIC_WEIGHTS, NORMAL_ROWS
# Storage locations. Each value can be overridden through the environment
# variable named in the call; the second argument is the built-in default.
DATASET_REPO = os.getenv("AVBENCH_DATASET_REPO", "iiiiii123/AVBench_results")
DATASET_FILE = os.getenv("AVBENCH_DATASET_FILE", "results.csv")
REQUEST_REPO = os.getenv("AVBENCH_REQUEST_REPO", "iiiiii123/AVBench_requests")
REQUEST_FILE = os.getenv("AVBENCH_REQUEST_FILE", "requests.csv")
# Secrets read from the environment; both are None when unset.
# HF_TOKEN is passed to the Hugging Face Hub calls, ADMIN_TOKEN gates review_request.
HF_TOKEN = os.getenv("HF_TOKEN")
ADMIN_TOKEN = os.getenv("ADMIN_TOKEN")
# Columns shown in the leaderboard table ("Medal" and "Overall" are computed at view time).
DISPLAY_COLUMNS = ["Medal", "Model"] + METRIC_COLUMNS + ["Overall", "UpdatedAt"]
# Columns persisted in the results CSV.
BASE_COLUMNS = ["Model", "Split"] + METRIC_COLUMNS + ["Team", "Contact", "ModelLink", "UpdatedAt"]
# Columns persisted in the requests CSV (results columns plus review bookkeeping).
REQUEST_COLUMNS = [
"RequestID",
"Status",
"Model",
"Split",
] + METRIC_COLUMNS + ["Team", "Contact", "ModelLink", "SubmittedAt", "ReviewedAt", "ReviewNote"]
# Pinned "AT" values keyed by (model name, split). apply_canonical_at_updates
# writes these over whatever the loaded results CSV contains for the same rows.
AT_CANONICAL = {
("Sora 2", "Normal"): 0.8675,
("Veo 3 Fast", "Normal"): 0.8300,
("Wan 2.6", "Normal"): 0.8227,
("Kling 2.6", "Normal"): 0.8061,
("Seedance 1.5 Pro", "Normal"): 0.8554,
("Sora 2", "Hard"): 0.8575,
("Veo 3 Fast", "Hard"): 0.8117,
("Wan 2.6", "Hard"): 0.8418,
("Kling 2.6", "Hard"): 0.7602,
("Seedance 1.5 Pro", "Hard"): 0.8646,
}
def default_df():
    """Build the fallback leaderboard from the rows bundled in ``constants``.

    Returns:
        A DataFrame with exactly ``BASE_COLUMNS``: the ``NORMAL_ROWS`` tagged
        with split "Normal" followed by the ``HARD_ROWS`` tagged with split
        "Hard". Team, Contact and ModelLink are empty strings and UpdatedAt is
        the current UTC time formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    frames = []
    for split_label, rows in (("Normal", NORMAL_ROWS), ("Hard", HARD_ROWS)):
        frame = pd.DataFrame(rows, columns=["Model"] + METRIC_COLUMNS)
        frame["Split"] = split_label
        frames.append(frame)
    df = pd.concat(frames, ignore_index=True)
    for text_col in ("Team", "Contact", "ModelLink"):
        df[text_col] = ""
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # formats to the same string.
    df["UpdatedAt"] = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    return df[BASE_COLUMNS]
def ensure_schema(df):
    """Return a copy of ``df`` coerced to the results schema.

    The input frame is left untouched. In the returned frame:

    * every column in ``BASE_COLUMNS`` exists (missing metrics become 0.0,
      missing text columns become "");
    * metric columns are numeric, with unparseable or missing values set to 0.0;
    * missing values in text columns are replaced by "" (an empty CSV cell is
      read back as NaN, which would otherwise be written out again and turn
      into the literal "nan" wherever it is passed through ``str``);
    * ``Split`` is stripped and title-cased, and rows whose split is neither
      "Normal" nor "Hard" are dropped.

    Args:
        df: DataFrame as loaded from the results CSV (or built in memory).

    Returns:
        A new DataFrame containing exactly ``BASE_COLUMNS``, in that order.
    """
    out = df.copy()  # never mutate the caller's frame
    for col in BASE_COLUMNS:
        if col not in out.columns:
            out[col] = 0.0 if col in METRIC_COLUMNS else ""
    for col in METRIC_COLUMNS:
        out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0.0)
    for col in BASE_COLUMNS:
        if col in METRIC_COLUMNS:
            continue
        # Cast to object first so "" can be stored even when the column was
        # parsed as an all-NaN float column.
        text = out[col].astype(object)
        out[col] = text.where(text.notna(), "")
    out["Split"] = out["Split"].astype(str).str.strip().str.title()
    out = out[out["Split"].isin(["Normal", "Hard"])].copy()
    return out[BASE_COLUMNS]
def ensure_request_schema(df):
    """Return a copy of ``df`` coerced to the requests schema.

    The input frame is left untouched. In the returned frame:

    * every column in ``REQUEST_COLUMNS`` exists (missing metrics become 0.0,
      missing text columns become "");
    * metric columns are numeric, with unparseable or missing values set to 0.0;
    * missing values in text columns are replaced by "" and the columns are
      stored as object dtype, so later string assignments (review timestamp,
      review note) do not hit a float column and ``str(value)`` never yields
      "nan" for an empty cell;
    * ``Split`` is stripped and title-cased; ``Status`` is stripped and
      lower-cased.

    Unlike ``ensure_schema`` no rows are dropped.

    Args:
        df: DataFrame as loaded from the requests CSV (or built in memory).

    Returns:
        A new DataFrame containing exactly ``REQUEST_COLUMNS``, in that order.
    """
    out = df.copy()  # never mutate the caller's frame
    for col in REQUEST_COLUMNS:
        if col not in out.columns:
            out[col] = 0.0 if col in METRIC_COLUMNS else ""
    for col in METRIC_COLUMNS:
        out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0.0)
    for col in REQUEST_COLUMNS:
        if col in METRIC_COLUMNS:
            continue
        # Empty CSV cells are read back as NaN; normalise them to "".
        text = out[col].astype(object)
        out[col] = text.where(text.notna(), "")
    out["Split"] = out["Split"].astype(str).str.strip().str.title()
    out["Status"] = out["Status"].astype(str).str.strip().str.lower()
    return out[REQUEST_COLUMNS]
def apply_canonical_at_updates(df):
    """Return a copy of ``df`` with the pinned "AT" scores written in.

    For every ``(model, split)`` key in ``AT_CANONICAL``, rows whose ``Model``
    and ``Split`` (compared as strings) match the key get their "AT" value
    replaced by the pinned float. Rows without a matching key are unchanged,
    and the input frame itself is not modified.
    """
    patched = df.copy()
    # Model/Split are not touched inside the loop, so stringify them once.
    model_names = patched["Model"].astype(str)
    split_names = patched["Split"].astype(str)
    for key, canonical in AT_CANONICAL.items():
        wanted_model, wanted_split = key
        hit = (model_names == wanted_model) & (split_names == wanted_split)
        patched.loc[hit, "AT"] = float(canonical)
    return patched
def load_dataset_df():
    """Load the results table from the Hub, falling back to the bundled rows.

    Downloads ``DATASET_FILE`` from the ``DATASET_REPO`` dataset repo, coerces
    it with ``ensure_schema`` and applies ``apply_canonical_at_updates``.

    Returns:
        A ``(DataFrame, status message)`` pair. If any step raises, the frame
        is ``default_df()`` and the message carries the first 120 characters
        of the error.
    """
    try:
        csv_path = hf_hub_download(
            repo_id=DATASET_REPO,
            repo_type="dataset",
            filename=DATASET_FILE,
            token=HF_TOKEN,
        )
        frame = apply_canonical_at_updates(ensure_schema(pd.read_csv(csv_path)))
    except Exception as exc:
        # Deliberate best effort: the UI must still render without the dataset.
        return default_df(), f"Fallback to local default rows (dataset read failed: {str(exc)[:120]})"
    return frame, f"Loaded from dataset: {DATASET_REPO}/{DATASET_FILE}"
def load_request_df():
    """Load the requests table, trying the request repo and then the results repo.

    ``REQUEST_FILE`` is looked up in ``REQUEST_REPO`` first and, when that is a
    different repo, in ``DATASET_REPO`` second. The first file that downloads
    and parses wins and is passed through ``ensure_request_schema``.

    Returns:
        A ``(DataFrame, status message)`` pair. When every candidate fails the
        frame is empty (columns = ``REQUEST_COLUMNS``) and the message lists
        the first 80 characters of each error.
    """
    # NOTE(review): every exception is treated as "file not found yet", so a
    # caller cannot tell a missing file from a transient download failure.
    repos = [REQUEST_REPO]
    if DATASET_REPO != REQUEST_REPO:
        repos.append(DATASET_REPO)

    failures = []
    for repo in repos:
        try:
            csv_path = hf_hub_download(
                repo_id=repo,
                repo_type="dataset",
                filename=REQUEST_FILE,
                token=HF_TOKEN,
            )
            requests = ensure_request_schema(pd.read_csv(csv_path))
        except Exception as exc:
            failures.append(f"{repo}: {str(exc)[:80]}")
            continue
        return requests, f"Loaded requests: {repo}/{REQUEST_FILE}"

    empty = pd.DataFrame(columns=REQUEST_COLUMNS)
    return empty, f"Requests file not found yet. Tried: {' | '.join(failures)}"
def save_requests_csv(df, commit_message):
    """Upload the requests CSV, falling back to the results repo on failure.

    Tries ``REQUEST_REPO`` first and, when it is a different repo,
    ``DATASET_REPO`` second, stopping at the first successful upload.

    Args:
        df: Requests table to write as ``REQUEST_FILE``.
        commit_message: Commit message for the upload.

    Returns:
        ``("ok", repo_id)`` naming the repo that accepted the upload, or
        ``(error message, None)`` when every repo failed.
    """
    targets = [REQUEST_REPO]
    if DATASET_REPO != REQUEST_REPO:
        targets.append(DATASET_REPO)

    failures = []
    for target in targets:
        outcome = save_csv_to_dataset(df, target, REQUEST_FILE, commit_message)
        if outcome == "ok":
            return "ok", target
        failures.append(f"{target}: {outcome}")

    return f"Submit failed. Tried repos -> {' | '.join(failures)}", None
def compute_normalized_overall(df, *, metric_columns=None, metric_weights=None):
    """Return a copy of ``df`` with a weighted, min-max normalised "Overall" column.

    Each metric column is rescaled to [0, 1] using the minimum and maximum of
    the rows in ``df``; a column whose values are all equal contributes 0. The
    rescaled columns are combined as a weighted average and rounded to four
    decimals.

    The accumulator is a Series from the start, so the result is well defined
    even when every metric is constant (for example a single-row table). A
    plain float accumulator has no ``.round`` method and would fail there.

    Args:
        df: Table holding one numeric column per metric.
        metric_columns: Metric column names; defaults to ``METRIC_COLUMNS``.
        metric_weights: Mapping of metric name to weight; defaults to
            ``METRIC_WEIGHTS``.

    Returns:
        A new DataFrame equal to ``df`` plus the "Overall" column. "Overall"
        is 0.0 for every row when the weights sum to zero.
    """
    columns = METRIC_COLUMNS if metric_columns is None else list(metric_columns)
    weights = METRIC_WEIGHTS if metric_weights is None else metric_weights

    norm_df = df.copy()
    total_weight = sum(weights[c] for c in columns)
    weighted_sum = pd.Series(0.0, index=norm_df.index)
    for c in columns:
        values = norm_df[c]
        col_min = values.min()
        col_max = values.max()
        if col_max > col_min:
            weighted_sum = weighted_sum + (values - col_min) / (col_max - col_min) * weights[c]
        # else: a constant (or empty) column carries no ranking information.

    if total_weight:
        norm_df["Overall"] = (weighted_sum / total_weight).round(4)
    else:
        norm_df["Overall"] = 0.0
    return norm_df
def add_medals(df):
    """Return a re-indexed copy of ``df`` with a "Medal" column.

    The first three rows (in their current order) receive the gold, silver and
    bronze emoji; every other row gets an empty string. The index is reset to
    0..n-1 and the input frame is not modified.
    """
    podium = ("🥇", "🥈", "🥉")
    ranked = df.copy().reset_index(drop=True)
    ranked["Medal"] = [
        podium[position] if position < len(podium) else ""
        for position in range(len(ranked))
    ]
    return ranked
def view_table(df, split_name, sort_metric):
    """Build the leaderboard table shown in the UI.

    Args:
        df: Results table (``BASE_COLUMNS`` schema).
        split_name: "Normal" or "Hard" to show one split; any other value
            shows all rows. Only the exact value "All" adds the "Split" column
            to the output.
        sort_metric: Column to sort by, descending. When it is not the name of
            a column of the scored table (for example ``None`` from a cleared
            dropdown) the table is sorted by "Overall" instead of raising.

    Returns:
        A DataFrame with ``DISPLAY_COLUMNS`` (prefixed by "Split" for "All"),
        scored by ``compute_normalized_overall`` over the selected rows, sorted,
        and decorated by ``add_medals``. An empty selection yields an empty
        frame with the same columns.
    """
    columns = (["Split"] + DISPLAY_COLUMNS) if split_name == "All" else DISPLAY_COLUMNS

    if split_name in ("Normal", "Hard"):
        cur = df[df["Split"] == split_name].copy()
    else:
        cur = df.copy()
    if cur.empty:
        return pd.DataFrame(columns=columns)

    # Normalisation runs over the selected rows only, so "Overall" is relative
    # to whatever is currently displayed.
    cur = compute_normalized_overall(cur)
    known_column = isinstance(sort_metric, str) and sort_metric in cur.columns
    sort_col = sort_metric if known_column else "Overall"
    cur = cur.sort_values(by=sort_col, ascending=False).reset_index(drop=True)
    cur = add_medals(cur)
    return cur[columns]
def refresh_table(split_name, sort_metric):
    """Reload the results and return ``(status message, leaderboard table)``.

    The order matches the Gradio outputs this callback is wired to: the status
    markdown first, the leaderboard dataframe second.
    """
    frame, status = load_dataset_df()
    table = view_table(frame, split_name, sort_metric)
    return status, table
def save_csv_to_dataset(df, repo_id, file_name, commit_message):
    """Write ``df`` as CSV and upload it to a Hub dataset repo.

    Args:
        df: Table to serialise (written without the index).
        repo_id: Target dataset repo.
        file_name: Path of the file inside the repo.
        commit_message: Commit message for the upload.

    Returns:
        "ok" on success, otherwise a human-readable failure message. Errors
        are reported through the return value, never raised.
    """
    if not HF_TOKEN:
        return "Submit failed: HF_TOKEN secret is missing."

    try:
        client = HfApi(token=HF_TOKEN)
        # Stage the CSV in a throwaway directory that is removed afterwards.
        with tempfile.TemporaryDirectory() as workdir:
            staged_csv = os.path.join(workdir, "data.csv")
            df.to_csv(staged_csv, index=False)
            client.upload_file(
                path_or_fileobj=staged_csv,
                path_in_repo=file_name,
                repo_id=repo_id,
                repo_type="dataset",
                commit_message=commit_message,
            )
    except Exception as exc:
        return f"Submit failed while uploading {repo_id}/{file_name}: {str(exc)[:300]}"
    return "ok"
def save_request_artifact(payload, repo_id=None):
    """Upload the raw submission as a JSON file under ``submissions/``.

    The file name is built from the current UTC timestamp, the request id, the
    model name and the split. Model name and split are reduced to word
    characters, dots and dashes first: the model name is typed by the
    submitter, and characters such as "/" would otherwise create nested paths
    inside the repo.

    Args:
        payload: Mapping with at least "RequestID", "Model" and "Split"; it is
            dumped as JSON unchanged.
        repo_id: Target dataset repo; defaults to ``REQUEST_REPO``.

    Returns:
        "ok" on success, otherwise a human-readable failure message. Errors
        are reported through the return value, never raised.
    """
    if not HF_TOKEN:
        return "Submit failed: HF_TOKEN secret is missing."
    target_repo = repo_id or REQUEST_REPO

    def _path_safe(value):
        """Collapse anything that is not a word character, dot or dash into "_"."""
        cleaned = re.sub(r"[^\w.-]+", "_", str(value)).strip("._")
        return cleaned or "unnamed"

    try:
        api = HfApi(token=HF_TOKEN)
        ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        raw_name = (
            f"submissions/{ts}_{payload['RequestID']}"
            f"_{_path_safe(payload['Model'])}_{_path_safe(payload['Split'])}.json"
        )
        with tempfile.TemporaryDirectory() as tmpdir:
            raw_path = os.path.join(tmpdir, "submission.json")
            with open(raw_path, "w", encoding="utf-8") as f:
                json.dump(payload, f, ensure_ascii=False, indent=2)
            api.upload_file(
                path_or_fileobj=raw_path,
                path_in_repo=raw_name,
                repo_id=target_repo,
                repo_type="dataset",
                commit_message=f"New request: {payload['Model']} ({payload['Split']})",
            )
        return "ok"
    except Exception as e:
        return f"Submit failed while saving raw request artifact: {str(e)[:300]}"
def submit_entry(model_name, split_name, team, contact, model_link, av, at, vt, syncnet, sc, df_arena, nisqa, audiobox, dover, aesthetic, split_selector, sort_selector):
    """Queue a new leaderboard submission as a pending request.

    Validates the form, appends a "pending" row to the requests CSV, uploads
    the raw submission as a JSON artifact, and returns the refreshed
    leaderboard. Nothing is written to the results dataset here.

    Args:
        model_name: Model name; must be non-empty after stripping whitespace.
        split_name: "Normal" or "Hard".
        team, contact, model_link: Optional free-text fields.
        av, at, vt, syncnet, sc, df_arena, nisqa, audiobox, dover, aesthetic:
            Metric values; each must convert to a finite float.
        split_selector, sort_selector: Current leaderboard view settings, used
            only to re-render the table that is returned.

    Returns:
        ``(submit status message, leaderboard table, dataset load message)``.
    """

    def _reply(message):
        """Pair ``message`` with a freshly loaded leaderboard view."""
        current_df, load_msg = load_dataset_df()
        return message, view_table(current_df, split_selector, sort_selector), load_msg

    clean_model = (model_name or "").strip()
    # Validate the stripped name so a whitespace-only entry is rejected too.
    if not clean_model or split_name not in ("Normal", "Hard"):
        return _reply("Submit failed: model name and split are required.")

    raw_metrics = {
        "AV": av,
        "AT": at,
        "VT": vt,
        "SyncNet": syncnet,
        "SC": sc,
        "DF-Arena": df_arena,
        "NISQA": nisqa,
        "Audiobox": audiobox,
        "DOVER++": dover,
        "Aesthetic": aesthetic,
    }
    try:
        metrics = {k: float(v) for k, v in raw_metrics.items()}
    except (TypeError, ValueError):  # None from an empty field, or non-numeric text
        metrics = None
    # NaN/inf convert to float without error but would poison min-max scoring.
    if metrics is None or not all(math.isfinite(v) for v in metrics.values()):
        return _reply("Submit failed: all metric values must be numbers.")

    # NOTE(review): load_request_df returns an empty frame on any download
    # error, so a transient failure here followed by a successful upload would
    # replace the stored requests with just this row. Fixing that needs
    # load_request_df to report "missing" and "failed" separately.
    req_df, _ = load_request_df()

    # Read the clock once so the request id and SubmittedAt always agree.
    now = datetime.now(timezone.utc)
    submitted_at = now.strftime("%Y-%m-%d %H:%M:%S")
    request_id = now.strftime("REQ%Y%m%d%H%M%S") + "_" + uuid4().hex[:6]
    row = {
        "RequestID": request_id,
        "Status": "pending",
        "Model": clean_model,
        "Split": split_name,
        **metrics,
        "Team": (team or "").strip(),
        "Contact": (contact or "").strip(),
        "ModelLink": (model_link or "").strip(),
        "SubmittedAt": submitted_at,
        "ReviewedAt": "",
        "ReviewNote": "",
    }
    new_row_df = pd.DataFrame([row])
    # Skip the concat when there is nothing to append to; concatenating with
    # an empty frame is deprecated in recent pandas.
    req_df = new_row_df if req_df.empty else pd.concat([req_df, new_row_df], ignore_index=True)
    req_df = ensure_request_schema(req_df)

    save_msg, request_repo_used = save_requests_csv(
        req_df,
        f"Add request: {row['Model']} ({row['Split']})",
    )
    if save_msg == "ok":
        save_msg = save_request_artifact(row, repo_id=request_repo_used)
    if save_msg == "ok":
        save_msg = f"Submit succeeded. Request queued: {request_id}. Stored in {request_repo_used}. Awaiting review."
    return _reply(save_msg)
def refresh_requests():
    """Reload the requests and return the pending ones for the admin panel.

    Returns:
        ``(load message, pending requests table, dropdown update)``. The
        dropdown update lists the pending request ids and preselects the first
        one, or ``None`` when nothing is pending.
    """
    req_df, msg = load_request_df()
    # An empty frame is passed through as-is; otherwise keep only pending rows.
    pending = req_df if req_df.empty else req_df[req_df["Status"] == "pending"].copy()
    request_ids = [] if pending.empty else pending["RequestID"].tolist()
    preselected = request_ids[0] if request_ids else None
    return msg, pending, gr.update(choices=request_ids, value=preselected)
def review_request(request_id, decision, review_note, admin_token, split_selector, sort_selector):
    """Approve or reject a pending request (admin only).

    On approval the request's row replaces any existing results row with the
    same model and split, and the results CSV is uploaded *before* the request
    is marked as reviewed. If the results upload fails the request therefore
    stays "pending" and the review can simply be retried, instead of being
    stuck as "approved" without ever reaching the leaderboard.

    Args:
        request_id: Id of the request to review.
        decision: "Approve" or "Reject"; anything else is refused.
        review_note: Optional note stored with the request.
        admin_token: Token typed by the reviewer; must equal ``ADMIN_TOKEN``.
        split_selector, sort_selector: Current leaderboard view settings, used
            only to re-render the table that is returned.

    Returns:
        ``(review status, requests load message, pending requests table,
        request-id dropdown update, leaderboard table, dataset load message)``.
    """

    def _respond(message):
        """Assemble the six UI outputs, reloading each backend exactly once."""
        req_msg, pending_df, choice_update = refresh_requests()
        latest_df, load_msg = load_dataset_df()
        table = view_table(latest_df, split_selector, sort_selector)
        return message, req_msg, pending_df, choice_update, table, load_msg

    def _text(value):
        """Stringify a cell, mapping missing values to "" instead of "nan"."""
        return "" if pd.isna(value) else str(value)

    if not ADMIN_TOKEN:
        return _respond("Review failed: ADMIN_TOKEN secret is missing.")
    # Constant-time comparison; bytes are used because compare_digest rejects
    # non-ASCII str arguments.
    supplied_token = str(admin_token or "").encode("utf-8")
    if not hmac.compare_digest(supplied_token, ADMIN_TOKEN.encode("utf-8")):
        return _respond("Review failed: invalid admin token.")
    if not request_id:
        return _respond("Review failed: request id is required.")
    if decision not in ("Approve", "Reject"):
        # Do not silently treat an unexpected or missing decision as a rejection.
        return _respond("Review failed: decision must be Approve or Reject.")

    req_df, _ = load_request_df()
    mask = req_df["RequestID"].astype(str) == str(request_id)
    if not mask.any():
        return _respond("Review failed: request id not found.")
    idx = req_df[mask].index[0]
    status = str(req_df.loc[idx, "Status"]).lower().strip()
    if status != "pending":
        return _respond("Review failed: request already reviewed.")

    approved = decision == "Approve"
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

    if approved:
        # NOTE(review): load_dataset_df falls back to the bundled default rows
        # when the download fails, so a transient error here would upload the
        # defaults plus this row over the stored results.
        res_df, _ = load_dataset_df()
        row = req_df.loc[idx]
        result_row = {
            "Model": _text(row["Model"]),
            "Split": _text(row["Split"]),
            **{c: float(row[c]) for c in METRIC_COLUMNS},
            "Team": _text(row.get("Team", "")),
            "Contact": _text(row.get("Contact", "")),
            "ModelLink": _text(row.get("ModelLink", "")),
            "UpdatedAt": now,
        }
        # Replace any existing entry for the same (model, split) pair.
        key_mask = (res_df["Model"].astype(str) == result_row["Model"]) & (res_df["Split"].astype(str) == result_row["Split"])
        res_df = res_df[~key_mask].copy()
        res_df = pd.concat([res_df, pd.DataFrame([result_row])], ignore_index=True)
        res_df = ensure_schema(res_df)
        save_res = save_csv_to_dataset(
            res_df,
            DATASET_REPO,
            DATASET_FILE,
            f"Approve leaderboard entry: {result_row['Model']} ({result_row['Split']})",
        )
        if save_res != "ok":
            return _respond(f"Review failed: {save_res}")

    # A column of empty CSV cells may have been parsed as float; cast so that
    # assigning strings below is valid on every pandas version.
    for col in ("Status", "ReviewedAt", "ReviewNote"):
        req_df[col] = req_df[col].astype(object)
    req_df.loc[idx, "Status"] = "approved" if approved else "rejected"
    req_df.loc[idx, "ReviewedAt"] = now
    req_df.loc[idx, "ReviewNote"] = (review_note or "").strip()
    save_msg, _ = save_requests_csv(
        req_df,
        f"{decision} request: {request_id}",
    )
    if save_msg != "ok":
        if approved:
            return _respond(
                "Review partially succeeded: results were updated but the request status "
                f"could not be saved (retry the review): {save_msg}"
            )
        return _respond(f"Review failed: {save_msg}")

    return _respond(f"{decision} succeeded for {request_id}.")
# ---------------------------------------------------------------------------
# Gradio UI: leaderboard view, submission form and admin review panel.
# ---------------------------------------------------------------------------
with gr.Blocks(title="AVBench Leaderboard") as demo:
    gr.Markdown(AVBENCH_INTRODUCTION)
    # Two trailing spaces + newline is a Markdown hard line break. (The text
    # previously contained a literal "\n+" that was rendered verbatim.)
    status_md = gr.Markdown(
        f"Results backend: {DATASET_REPO}/{DATASET_FILE}  \n"
        f"Requests backend: {REQUEST_REPO}/{REQUEST_FILE}"
    )

    # --- Leaderboard controls and table ---
    with gr.Row():
        split_selector = gr.Dropdown(choices=["All", "Normal", "Hard"], value="All", label="Split")
        sort_selector = gr.Dropdown(choices=["Overall"] + METRIC_COLUMNS, value="Overall", label="Sort By")
        refresh_btn = gr.Button("Refresh")
    # Rendered from the bundled rows first; demo.load below swaps in the Hub data.
    leaderboard = gr.Dataframe(
        value=view_table(default_df(), "All", "Overall"),
        interactive=False,
        wrap=True,
        label="Leaderboard",
    )

    with gr.Accordion("Metric Groups", open=False):
        gr.Markdown(
            "- Cross-Modal Alignment & Sync: AV, AT, VT, SyncNet\n"
            "- Unimodal Generation Quality: SC, DF-Arena, NISQA, Audiobox, DOVER++, Aesthetic\n"
            "- Overall: min-max normalize each metric first, then weighted sum."
        )

    # --- Submission form ---
    with gr.Accordion("Submit New Result", open=False):
        gr.Markdown("Submit creates a pending request in requests dataset. It appears on leaderboard only after approval.")
        with gr.Row():
            model_name = gr.Textbox(label="Model Name", placeholder="e.g. MyModel-1")
            split_name = gr.Dropdown(choices=["Normal", "Hard"], value="Normal", label="Split")
            team = gr.Textbox(label="Team (optional)")
        with gr.Row():
            contact = gr.Textbox(label="Contact (optional)")
            model_link = gr.Textbox(label="Model Link (optional)")
        with gr.Row():
            av = gr.Number(label="AV")
            at = gr.Number(label="AT")
            vt = gr.Number(label="VT")
            syncnet = gr.Number(label="SyncNet")
            sc = gr.Number(label="SC")
        with gr.Row():
            df_arena = gr.Number(label="DF-Arena")
            nisqa = gr.Number(label="NISQA")
            audiobox = gr.Number(label="Audiobox")
            dover = gr.Number(label="DOVER++")
            aesthetic = gr.Number(label="Aesthetic")
        submit_btn = gr.Button("Submit")
        submit_status = gr.Markdown()

    # --- Admin review panel ---
    with gr.Accordion("Review Requests (Admin)", open=False):
        gr.Markdown("Approve/reject pending requests. Approval writes into results dataset.")
        admin_token = gr.Textbox(label="Admin Token", type="password")
        with gr.Row():
            req_refresh_btn = gr.Button("Refresh Requests")
            request_id = gr.Dropdown(choices=[], label="Pending RequestID")
            decision = gr.Radio(choices=["Approve", "Reject"], value="Approve", label="Decision")
        review_note = gr.Textbox(label="Review Note (optional)")
        review_btn = gr.Button("Apply Review")
        review_status = gr.Markdown()
        request_status_md = gr.Markdown()
        pending_table = gr.Dataframe(interactive=False, wrap=True, label="Pending Requests")

    # --- Event wiring ---
    # Page load and any change of the view controls re-read the results dataset.
    demo.load(refresh_table, inputs=[split_selector, sort_selector], outputs=[status_md, leaderboard])
    refresh_btn.click(refresh_table, inputs=[split_selector, sort_selector], outputs=[status_md, leaderboard])
    split_selector.change(refresh_table, inputs=[split_selector, sort_selector], outputs=[status_md, leaderboard])
    sort_selector.change(refresh_table, inputs=[split_selector, sort_selector], outputs=[status_md, leaderboard])
    # The input order must match the positional parameters of submit_entry.
    submit_btn.click(
        submit_entry,
        inputs=[
            model_name,
            split_name,
            team,
            contact,
            model_link,
            av,
            at,
            vt,
            syncnet,
            sc,
            df_arena,
            nisqa,
            audiobox,
            dover,
            aesthetic,
            split_selector,
            sort_selector,
        ],
        outputs=[submit_status, leaderboard, status_md],
    )
    req_refresh_btn.click(
        refresh_requests,
        outputs=[request_status_md, pending_table, request_id],
    )
    # The six outputs match the tuple returned by review_request.
    review_btn.click(
        review_request,
        inputs=[request_id, decision, review_note, admin_token, split_selector, sort_selector],
        outputs=[review_status, request_status_md, pending_table, request_id, leaderboard, status_md],
    )
    # Populate the pending-requests panel on page load as well.
    demo.load(
        refresh_requests,
        outputs=[request_status_md, pending_table, request_id],
    )

if __name__ == "__main__":
    demo.launch(ssr_mode=False)