AlanLiangC committed on
Commit
9b69df9
·
1 Parent(s): 7fdfd5f

first commit

Browse files
app.py CHANGED
@@ -1,204 +1,337 @@
1
- import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 
 
 
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
  )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
  )
90
 
 
91
 
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
 
191
  with gr.Row():
192
- with gr.Accordion("📙 Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
- elem_id="citation-button",
198
- show_copy_button=True,
199
- )
200
-
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import json
4
+ from typing import Dict, Literal, Tuple, List, Optional
5
+
6
  import pandas as pd
7
+ import matplotlib.pyplot as plt
8
+ import gradio as gr
9
+
10
+ RESULTS_DIR = "./worldlens-results"
11
+
12
+ # 指标好坏方向
13
+ METRICS_MIN_BETTER = [
14
+ "Depth Discrepancy", "Perceptual Discrepancy",
15
+ "Photometric Error", "Geometric Discrepancy",
16
+ "Novel-View Discrepancy",
17
+ "Displacement Error",
18
+ ]
19
+
20
+ METRICS_MAX_BETTER = [
21
+ "Subject Fidelity", "Subject Coherence", "Subject Consistency",
22
+ "Temporal Consistency", "Semantic Consistency",
23
+ "View Consistency", # 你的 JSON 里有这个,默认认为越大越好
24
+ "Novel-View Quality",
25
+ "Open-Loop Adherence", "Route Completion", "Closed-Loop Adherence",
26
+ "Map Segmentation", "3D Object Detection", "3D Object Tracking",
27
+ "Occupancy Prediction",
28
+ ]
29
+
30
+ METRIC_BETTER: Dict[str, Literal["min", "max"]] = {
31
+ m: "min" for m in METRICS_MIN_BETTER
32
+ }
33
+ METRIC_BETTER.update({m: "max" for m in METRICS_MAX_BETTER})
34
+
35
+ # 下拉框展示的所有指标(去重+排序)
36
+ METRIC_CHOICES: List[str] = sorted(set(METRICS_MIN_BETTER + METRICS_MAX_BETTER))
37
+ DEFAULT_METRIC = "Subject Fidelity" if "Subject Fidelity" in METRIC_CHOICES else METRIC_CHOICES[0]
38
+
39
+ # 全局 DataFrame(所有模型)
40
+ df_all: Optional[pd.DataFrame] = None
41
+
42
+
43
+ def load_results() -> pd.DataFrame:
44
+ """
45
+ 从 ./worldlens-results 读取所有 json,整理成一个宽表:
46
+ 每一行是一个模型,每一列是一个指标。
47
+ """
48
+ rows = []
49
+
50
+ json_files = sorted(glob.glob(os.path.join(RESULTS_DIR, "*.json")))
51
+ if not json_files:
52
+ return pd.DataFrame()
53
+
54
+ for path in json_files:
55
+ with open(path, "r") as f:
56
+ data = json.load(f)
57
+
58
+ model_name = os.path.splitext(os.path.basename(path))[0]
59
+ venue = data.get("venue", "")
60
+ date = data.get("data", "") # 你这边字段叫 data,我就直接用
61
+
62
+ row = {
63
+ "Model": model_name,
64
+ "venue": venue,
65
+ "date": date,
66
+ }
67
+
68
+ metrics = data.get("Metrics", {})
69
+ # 展开所有子字典,列名直接用 metric 名称(假设唯一)
70
+ for category, metric_dict in metrics.items():
71
+ if not isinstance(metric_dict, dict):
72
+ continue
73
+ for metric_name, value in metric_dict.items():
74
+ row[metric_name] = value
75
+
76
+ rows.append(row)
77
+
78
+ df = pd.DataFrame(rows)
79
+
80
+ # 统一列顺序:meta + 指标
81
+ meta_cols = ["Model", "venue", "date"]
82
+ metric_cols = [c for c in df.columns if c not in meta_cols]
83
+ df = df[meta_cols + metric_cols]
84
+
85
+ return df
86
+
87
+
88
+ def get_venue_choices(df: pd.DataFrame) -> List[str]:
89
+ if "venue" not in df.columns:
90
+ return ["All"]
91
+ venues = sorted([v for v in df["venue"].dropna().unique() if v != ""])
92
+ return ["All"] + venues
93
+
94
+
95
+ def update_leaderboard(
96
+ metric: str,
97
+ top_k: int,
98
+ model_filter: str,
99
+ venue_filter: str,
100
+ sort_mode: str,
101
+ selected_metrics: Optional[List[str]],
102
+ ) -> Tuple[pd.DataFrame, plt.Figure]:
103
+ """
104
+ 根据用户选择更新排行榜表格与条形图。
105
+ metric: 用于排序 & 画图的主指标
106
+ selected_metrics: 勾选的“想在表格中展示”的其它指标(可以多个)
107
+ """
108
+ global df_all
109
+
110
+ if df_all is None or df_all.empty:
111
+ # 空表兜底
112
+ fig, ax = plt.subplots(figsize=(6, 3))
113
+ ax.text(0.5, 0.5, "No results found in ./worldlens-results",
114
+ ha="center", va="center")
115
+ ax.axis("off")
116
+ return pd.DataFrame(), fig
117
+
118
+ df = df_all.copy()
119
+
120
+ # 模型名过滤
121
+ if model_filter:
122
+ df = df[df["Model"].str.contains(model_filter, case=False, regex=False)]
123
+
124
+ # venue 过滤
125
+ if venue_filter and venue_filter != "All":
126
+ df = df[df["venue"] == venue_filter]
127
+
128
+ if metric not in df.columns:
129
+ fig, ax = plt.subplots(figsize=(6, 3))
130
+ ax.text(0.5, 0.5, f"Metric '{metric}' not found in current data.", ha="center", va="center")
131
+ ax.axis("off")
132
+ return pd.DataFrame(), fig
133
+
134
+ # 排序方向
135
+ better = METRIC_BETTER.get(metric, "max")
136
+ if sort_mode == "Auto":
137
+ ascending = (better == "min")
138
+ elif sort_mode == "Ascending (small → large)":
139
+ ascending = True
140
+ else: # "Descending (large → small)"
141
+ ascending = False
142
+
143
+ df_sorted = df.sort_values(metric, ascending=ascending)
144
+
145
+ # Top-K
146
+ df_top = df_sorted.head(top_k).copy()
147
+
148
+ # 构造表格列:
149
+ # 固定: Model, venue, date
150
+ # + 勾选的指标
151
+ # + 排序指标(如果没选)
152
+ cols = ["Model", "venue", "date"]
153
+
154
+ if selected_metrics is None:
155
+ selected_metrics = []
156
+
157
+ # 去掉不在 df_top 里的指标(有些 metric 可能某些 json 里没计算)
158
+ for m in selected_metrics:
159
+ if m in df_top.columns and m not in cols:
160
+ cols.append(m)
161
+
162
+ if metric in df_top.columns and metric not in cols:
163
+ cols.append(metric)
164
+
165
+ table_df = df_top[cols].round(3)
166
+
167
+ # 画条形图(只画排序指标)
168
+ fig, ax = plt.subplots(figsize=(9, 4))
169
+ ax.barh(table_df["Model"], df_top[metric].iloc[:len(table_df)])
170
+ ax.set_xlabel(metric)
171
+ ax.set_ylabel("Model")
172
+ ax.set_title(f"Leaderboard by {metric}")
173
+
174
+ # 为了让「最好的」在上面:如果按升序(小→大),我们反转 y 轴,让更小的在上。
175
+ if ascending:
176
+ ax.invert_yaxis()
177
+
178
+ plt.tight_layout()
179
+
180
+ return table_df, fig
181
+
182
+
183
+ def reload_data():
184
+ """
185
+ 点击“Reload JSONs” / 页面加载时调用:
186
+ 重新加载所有 json,并返回:
187
+ - 状态文字
188
+ - venue_dropdown 的更新
189
+ - 默认的表格和图
190
+ """
191
+ global df_all
192
+ df_all = load_results()
193
+
194
+ if df_all is None or df_all.empty:
195
+ msg = "No JSON files found in ./worldlens-results. Please upload some results."
196
+ dummy_fig, ax = plt.subplots(figsize=(6, 3))
197
+ ax.text(0.5, 0.5, msg, ha="center", va="center")
198
+ ax.axis("off")
199
+
200
+ venue_update = gr.update(choices=["All"], value="All")
201
+
202
+ return msg, venue_update, pd.DataFrame(), dummy_fig
203
+
204
+ venue_choices = get_venue_choices(df_all)
205
+ msg = f"Loaded {len(df_all)} models from {RESULTS_DIR}"
206
+
207
+ # 用默认 metric 画一次(selected_metrics 先用一个简单默认)
208
+ default_selected = ["Subject Fidelity", "Temporal Consistency", "Map Segmentation"]
209
+ default_selected = [m for m in default_selected if m in METRIC_CHOICES]
210
+
211
+ table_df, fig = update_leaderboard(
212
+ metric=DEFAULT_METRIC,
213
+ top_k=10,
214
+ model_filter="",
215
+ venue_filter="All",
216
+ sort_mode="Auto",
217
+ selected_metrics=default_selected,
218
  )
219
+
220
+ venue_update = gr.update(
221
+ choices=venue_choices,
222
+ value="All",
223
+ interactive=True,
 
224
  )
225
+
226
+ return msg, venue_update, table_df, fig
227
+
228
+
229
+ with gr.Blocks(css="""
230
+ #title {
231
+ text-align: center;
232
+ }
233
+ """) as demo:
234
+ gr.Markdown(
235
+ """
236
+ # 🌍 WorldLens Leaderboard
237
+
238
+ 基于 `./worldlens-results/*.json` 的自动排行榜:
239
+ - 选择一个**排序指标**用来排名
240
+ - 勾选多个指标一起在表格中展示
241
+ - 支持模型名搜索 & venue 筛选
242
+ - 自动区分“越大越好 / 越小越好”的指标
243
+ """,
244
+ elem_id="title"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  )
246
 
247
+ status_box = gr.Markdown("Loading results...", elem_id="status")
248
 
249
+ with gr.Row():
250
+ metric_dropdown = gr.Dropdown(
251
+ label="排序指标 / Metric (for ranking)",
252
+ choices=METRIC_CHOICES, # 固定 choices,避免动态更新不兼容
253
+ value=DEFAULT_METRIC,
254
+ interactive=True,
255
+ )
256
+ sort_mode_radio = gr.Radio(
257
+ label="排序方式 / Sort mode",
258
+ choices=[
259
+ "Auto",
260
+ "Ascending (small → large)",
261
+ "Descending (large small)",
262
+ ],
263
+ value="Auto",
264
+ interactive=True,
265
+ )
266
+ topk_slider = gr.Slider(
267
+ label="显示 Top-K 模型 / Top-K",
268
+ minimum=3,
269
+ maximum=50,
270
+ value=10,
271
+ step=1,
272
+ interactive=True,
273
+ )
274
+
275
+ # 新增:表格中展示的多个指标
276
+ metrics_select = gr.CheckboxGroup(
277
+ label="在表格中一起展示的指标 / Metrics to show in table",
278
+ choices=METRIC_CHOICES,
279
+ value=["Subject Fidelity", "Temporal Consistency", "Map Segmentation"],
280
+ interactive=True,
281
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
  with gr.Row():
284
+ model_filter_box = gr.Textbox(
285
+ label="模型名过滤(包含关系)/ Filter by model name",
286
+ placeholder="例如: magic, dream, ...",
287
+ interactive=True,
288
+ )
289
+ venue_dropdown = gr.Dropdown(
290
+ label="按 Venue 筛选 / Filter by venue",
291
+ choices=["All"],
292
+ value="All",
293
+ interactive=True,
294
+ )
295
+
296
+ with gr.Row():
297
+ reload_button = gr.Button("🔄 Reload JSONs", variant="secondary")
298
+ update_button = gr.Button("✅ Update leaderboard", variant="primary")
299
+
300
+ leaderboard_table = gr.DataFrame(
301
+ label="Leaderboard",
302
+ interactive=False,
303
+ )
304
+ # 显式指定 format="png",避免 webp 不支持的问题
305
+ leaderboard_plot = gr.Plot(label="Metric comparison", format="png")
306
+
307
+ # 点击 Reload:重新加载 + 更新 venue + 表格与图
308
+ reload_button.click(
309
+ fn=reload_data,
310
+ inputs=[],
311
+ outputs=[status_box, venue_dropdown, leaderboard_table, leaderboard_plot],
312
+ )
313
+
314
+ # 更新排行榜(多传一个 selected_metrics)
315
+ update_button.click(
316
+ fn=update_leaderboard,
317
+ inputs=[
318
+ metric_dropdown,
319
+ topk_slider,
320
+ model_filter_box,
321
+ venue_dropdown,
322
+ sort_mode_radio,
323
+ metrics_select,
324
+ ],
325
+ outputs=[leaderboard_table, leaderboard_plot],
326
+ )
327
+
328
+ # 页面加载时自动尝试加载一次
329
+ demo.load(
330
+ fn=reload_data,
331
+ inputs=[],
332
+ outputs=[status_box, venue_dropdown, leaderboard_table, leaderboard_plot],
333
+ )
334
+
335
+
336
+ if __name__ == "__main__":
337
+ demo.launch() # 本地想公网访问可以改成 demo.launch(share=True)
src/UNKNOWN.egg-info/PKG-INFO ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: UNKNOWN
3
+ Version: 0.0.0
src/UNKNOWN.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ pyproject.toml
3
+ src/about.py
4
+ src/envs.py
5
+ src/populate.py
6
+ src/UNKNOWN.egg-info/PKG-INFO
7
+ src/UNKNOWN.egg-info/SOURCES.txt
8
+ src/UNKNOWN.egg-info/dependency_links.txt
9
+ src/UNKNOWN.egg-info/top_level.txt
10
+ src/display/css_html_js.py
11
+ src/display/formatting.py
12
+ src/display/utils.py
13
+ src/leaderboard/read_evals.py
14
+ src/submission/check_validity.py
15
+ src/submission/submit.py
src/UNKNOWN.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
src/UNKNOWN.egg-info/top_level.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ about
2
+ display
3
+ envs
4
+ leaderboard
5
+ populate
6
+ submission
worldlens-results/dreamforge.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "venue": "Arxiv'2024",
3
+ "date": "2024-10-04",
4
+ "Metrics": {
5
+ "Generation": {
6
+ "Subject Fidelity": 28.49,
7
+ "Subject Coherence": 75.95,
8
+ "Subject Consistency": 65.22,
9
+ "Depth Discrepancy": 24.19,
10
+ "Temporal Consistency": 74.44,
11
+ "Semantic Consistency": 80.63,
12
+ "Perceptual Discrepancy": 222.00,
13
+ "View Consistency": 185.77
14
+ },
15
+ "Reconstruction":{
16
+ "Photometric Error": 0.140,
17
+ "Geometric Discrepancy": 0.115,
18
+ "Novel-View Quality": 39.82,
19
+ "Novel-View Discrepancy": 427.30
20
+ },
21
+ "Action-Following":{
22
+ "Displacement Error": 0.57,
23
+ "Open-Loop Adherence": 71.23,
24
+ "Route Completion": 6.89,
25
+ "Closed-Loop Adherence": 4.82
26
+ },
27
+ "Downstream Task":{
28
+ "Map Segmentation": 18.34,
29
+ "3D Object Detection": 22.41,
30
+ "3D Object Tracking": 7.90,
31
+ "Occupancy Prediction": 23.14
32
+ }
33
+ }
34
+ }
worldlens-results/magicdrive.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "venue": "ICLR'2023",
3
+ "date": "2023-10-04",
4
+ "Metrics": {
5
+ "Generation": {
6
+ "Subject Fidelity": 28.49,
7
+ "Subject Coherence": 75.95,
8
+ "Subject Consistency": 65.22,
9
+ "Depth Discrepancy": 24.19,
10
+ "Temporal Consistency": 74.44,
11
+ "Semantic Consistency": 80.63,
12
+ "Perceptual Discrepancy": 222.00,
13
+ "View Consistency": 185.77
14
+ },
15
+ "Reconstruction":{
16
+ "Photometric Error": 0.140,
17
+ "Geometric Discrepancy": 0.115,
18
+ "Novel-View Quality": 39.82,
19
+ "Novel-View Discrepancy": 427.30
20
+ },
21
+ "Action-Following":{
22
+ "Displacement Error": 0.57,
23
+ "Open-Loop Adherence": 71.23,
24
+ "Route Completion": 6.89,
25
+ "Closed-Loop Adherence": 4.82
26
+ },
27
+ "Downstream Task":{
28
+ "Map Segmentation": 18.34,
29
+ "3D Object Detection": 22.41,
30
+ "3D Object Tracking": 7.90,
31
+ "Occupancy Prediction": 23.14
32
+ }
33
+ }
34
+ }