"""FysicsWorld leaderboard Space.

Downloads leaderboard result CSVs from a Hugging Face dataset repo,
renders them as Gradio tables, and appends validated JSON submissions
back to the dataset repo.
"""

import json
import os
from pathlib import Path

import gradio as gr
import numpy as np
import pandas as pd
from huggingface_hub import HfApi, snapshot_download

# =========================
# Basic Config
# =========================
# NOTE: "Leaderborad" is how the upstream repo id is actually spelled —
# do not "fix" it here or downloads will 404.
DATASET_REPO = "Fysics-AI/FysicsWorld-Leaderborad-Result"
HF_TOKEN = os.environ.get("HF_TOKEN")

TRACK_TO_CSV = {
    "omni-mllm": "omni-mllm.csv",
    "image-gen": "image-gen.csv",
    "video-gen": "video-gen.csv",
}

# Benchmark names accepted in submissions. "OmniWorld" is kept for
# backward compatibility with older submission files.
ACCEPTED_BENCHMARKS = {"FysicsWorld", "OmniWorld"}

# =========================
# Download Dataset (once, at import time)
# =========================
LOCAL_DATA_DIR = Path(
    snapshot_download(
        repo_id=DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )
)
print("📂 Dataset dir:", LOCAL_DATA_DIR)
print("📄 Files:", [p.name for p in LOCAL_DATA_DIR.iterdir()])

# =========================
# Column Rename Maps (key display fix: raw CSV task ids -> readable headers)
# =========================
OMNI_MLLM_RENAME = {
    "Task1-1": "Image\nUnderstanding",
    "Task1-2": "Video\nUnderstanding",
    "Task2-1": "Speech-Driven\nImage Understanding",
    "Task2-2": "Image-Audio\nReasoning",
    "Task2-3": "Speech-Based\nImage QA",
    "Task2-4": "Speech Generation\nfrom Image",
    "Task2-5": "Audio Matching\nfrom Image",
    "Task3-1": "Speech-Driven\nVideo Understanding",
    "Task3-2": "Video-Audio\nReasoning",
    "Task3-3": "Speech-Based\nVideo QA",
    "Task3-4": "Speech Generation\nfrom Video",
    "Task3-5": "Audio Matching\nfrom Video",
    "Task3-6": "Next-Action\nPrediction",
}

AUDIO_RENAME = {
    "Task1-3": "Audio Reasoning",
}

IMAGE_GEN_RENAME = {
    "WIScore": "WIScore",
    "SC": "Semantic\nConsistency",
    "PQ": "Perceptual\nQuality",
    "OR": "Overall\nQuality",
}

VIDEO_GEN_RENAME = {
    "Imaging": "Imaging",
    "Aesthetic": "Aesthetic",
    "Motion": "Motion",
    "Temporal": "Temporal",
}


# =========================
# Utils
# =========================
def format_numeric_columns(df, decimals=2):
    """Return a copy of *df* with every numeric column rendered as a
    fixed-point string with *decimals* places (NaN becomes "")."""
    df = df.copy()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df[col] = df[col].map(
            lambda x: f"{x:.{decimals}f}" if pd.notnull(x) else ""
        )
    return df


def load_csv(filename, sort_key=None, ascending=False):
    """Load a leaderboard CSV from the local snapshot.

    Optionally sorts by *sort_key* (silently skipped if the column is
    absent, so a malformed CSV still renders), then formats numeric
    columns as strings for display.
    """
    csv_path = LOCAL_DATA_DIR / filename
    df = pd.read_csv(csv_path)
    if sort_key and sort_key in df.columns:
        df = df.sort_values(sort_key, ascending=ascending)
    return format_numeric_columns(df, decimals=2)


# =========================
# Submission Logic (kept behavior-compatible)
# =========================
api = HfApi()


def parse_submission(file_bytes):
    """Parse and validate a JSON submission.

    Accepts raw JSON as ``bytes``/``bytearray``, a filesystem path
    (Gradio's ``gr.File`` passes a path string with its default
    ``type="filepath"``), or a raw JSON string.

    Raises:
        ValueError: on a missing field, unknown benchmark, or unknown track.
    """
    if isinstance(file_bytes, (bytes, bytearray)):
        raw = bytes(file_bytes).decode("utf-8")
    elif isinstance(file_bytes, str) and os.path.exists(file_bytes):
        raw = Path(file_bytes).read_text(encoding="utf-8")
    else:
        raw = file_bytes
    data = json.loads(raw)

    required = ["benchmark", "track", "model", "type", "metrics"]
    for k in required:
        if k not in data:
            raise ValueError(f"Missing field: {k}")
    if data["benchmark"] not in ACCEPTED_BENCHMARKS:
        raise ValueError("Invalid benchmark")
    if data["track"] not in TRACK_TO_CSV:
        raise ValueError("Invalid track")
    return data


def append_submission(data):
    """Append a validated submission row to its track CSV and push it
    back to the dataset repo.

    Raises:
        ValueError: if the model name already exists in the leaderboard.
    """
    csv_name = TRACK_TO_CSV[data["track"]]
    # NOTE(review): this writes into the snapshot_download cache directory,
    # which is not guaranteed writable/persistent; the authoritative copy
    # is the one uploaded to the hub below.
    csv_path = LOCAL_DATA_DIR / csv_name
    df = pd.read_csv(csv_path)

    if data["model"] in df["Model"].values:
        raise ValueError("Model already exists in leaderboard")

    row = {
        "Model": data["model"],
        "Type": data["type"],
    }
    row.update(data["metrics"])

    df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    df.to_csv(csv_path, index=False)

    api.upload_file(
        path_or_fileobj=str(csv_path),
        path_in_repo=csv_name,
        repo_id=DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )


def handle_submit(file):
    """UI callback: parse, validate, and persist an uploaded submission.

    Returns a user-facing status string; this is the top-level UI
    boundary, so all exceptions are caught and reported as text.
    """
    if file is None:
        return "❌ No file uploaded"
    try:
        data = parse_submission(file)
        append_submission(data)
        return "✅ Submission successful! Please refresh leaderboard."
    except Exception as e:
        return f"❌ Error: {str(e)}"


def _refresh_all():
    """Reload every leaderboard table from disk (wired to the Refresh button)."""
    return (
        load_csv("omni-mllm.csv", "Overall").rename(columns=OMNI_MLLM_RENAME),
        load_csv("image-gen.csv", "Overall").rename(columns=IMAGE_GEN_RENAME),
        load_csv("video-gen.csv", "Overall").rename(columns=VIDEO_GEN_RENAME),
        load_csv("audio-reasoning.csv", "Task1-3").rename(columns=AUDIO_RENAME),
    )


# =========================
# Gradio UI
# =========================
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
.container { max-width: 1200px; margin: auto; }
.leaderboard-links a { display: inline-block; margin: 0 8px; padding: 6px 12px; border-radius: 20px; background: #f4f4f5; color: #111827; text-decoration: none; font-weight: 500; font-size: 14px; }
.leaderboard-links a:hover { background: #e5e7eb; }
.description { max-width: 900px; margin: 18px auto 30px auto; font-size: 16px; line-height: 1.7; color: #374151; text-align: center; }
body, .gradio-container { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", "Liberation Sans", sans-serif; }
/* OmniLLM 表格:第 1 列(Model) */
table th:nth-child(1), table td:nth-child(1) { min-width: 220px; max-width: 220px; white-space: nowrap; }
/* 第 2 列(Type) */
table th:nth-child(2), table td:nth-child(2) { min-width: 120px; max-width: 120px; }
.overall-definition { max-width: 900px; margin: 30px auto 40px auto; padding: 22px 28px; background: #f9fafb; border: 1px solid #e5e7eb; border-radius: 14px; font-size: 15px; line-height: 1.7; color: #1f2937; }
.overall-definition h3 { text-align: center; font-size: 22px; margin-bottom: 16px; }
.overall-definition strong { color: #111827; }
""",
) as demo:
    gr.Markdown(
        """

🏆 FysicsWorld Leaderboard

We introduce FysicsWorld, the first unified full-modality benchmark that supports bidirectional input-output across image, video, audio, and text, enabling comprehensive any-to-any evaluation across understanding, generation, and reasoning. Our systematic design spans uni-modal perception tasks to fusion-dependent reasoning under strong cross-modal coupling, allowing us to diagnose, with unprecedented clarity, the limitations and emerging strengths of modern multimodal and omni-modal architectures.
"""
    )

    with gr.Tabs():
        # ---------- OmniLLM / MLLM ----------
        with gr.Tab("🧠 OmniLLM / MLLM"):
            gr.Markdown("Evaluation results for OmniLLM / MLLM models.")
            df_omni = load_csv("omni-mllm.csv", sort_key="Overall")
            df_omni = df_omni.rename(columns=OMNI_MLLM_RENAME)
            omni_table = gr.Dataframe(
                value=df_omni,
                interactive=False,
                wrap=True,
            )

        # ---------- Image Generation ----------
        with gr.Tab("🎨 Image Generation"):
            gr.Markdown("Evaluation results for image generation models.")
            df_img = load_csv("image-gen.csv", sort_key="Overall")
            df_img = df_img.rename(columns=IMAGE_GEN_RENAME)
            image_table = gr.Dataframe(
                value=df_img,
                interactive=False,
            )

        # ---------- Video Generation ----------
        with gr.Tab("🎬 Video Generation"):
            gr.Markdown("Evaluation results for video generation models.")
            df_vid = load_csv("video-gen.csv", sort_key="Overall")
            df_vid = df_vid.rename(columns=VIDEO_GEN_RENAME)
            video_table = gr.Dataframe(
                value=df_vid,
                interactive=False,
            )

        # ---------- Audio Reasoning ----------
        with gr.Tab("🎵 Audio Reasoning"):
            gr.Markdown("Evaluation results for OmniLLMs, MLLMs, and AudioLLMs.")
            df_aud = load_csv("audio-reasoning.csv", sort_key="Task1-3")
            df_aud = df_aud.rename(columns=AUDIO_RENAME)
            audio_table = gr.Dataframe(
                value=df_aud,
                interactive=False,
            )

    # ---------- Refresh ----------
    gr.Button("🔄 Refresh All").click(
        fn=_refresh_all,
        outputs=[omni_table, image_table, video_table, audio_table],
    )

    gr.Markdown(
        r"""
### 📊 Overall Score Definition

To facilitate clearer and more consistent comparison across models, we introduce an **Overall** score for each leaderboard track.

**1. OmniLLM / MLLM**

The **Overall** score is computed as the arithmetic mean of all reported task-specific scores.

**2. Image Generation**

The evaluation involves metrics defined on different numerical scales. **WIScore** is used for image generation, while **VIEScore** (averaged over three dimensions) is used for image editing. The **Overall** score is defined as:

$$ \text{Overall}=\frac{(\text{WIScore}\times 10)+\left(\frac{\sum \text{VIEScore}}{3}\right)}{2} $$

This normalization-based formulation ensures a balanced contribution from both image generation and image editing performance.

**3. Video Generation**

The **Overall** score is calculated as the arithmetic mean of all evaluated dimensions, including imaging quality, aesthetics, motion, and temporal consistency.
"""
    )


if __name__ == "__main__":
    demo.launch()