Spaces:

Hrant
/

leaderboarder-quantiphy

Sleeping

App Files Files Community

Hrant committed on Mar 1

Commit

e5082d1

verified ·

1 Parent(s): b54bf61

Update leaderboard via Leaderboarder

Browse files

Files changed (8) hide show

app.py +48 -35
benchmark.json +10 -8
leaderboard.csv +22 -23
leaderboard.json +164 -264
leaderboard.md +23 -24
leaderboard.parquet +2 -2
leaderboard_raw.json +443 -0
plan.json +5 -3

app.py CHANGED Viewed

@@ -1,57 +1,70 @@
 import gradio as gr
 import pandas as pd
-TITLE = 'Leaderboard Leaderboard'
-DATA_PATH = "leaderboard.csv"
-LOWER_BETTER_TOKENS = ["error", "loss", "perplexity", "wer", "cer", "distance"]
 df = pd.read_csv(DATA_PATH)
-for col in ["score", "rank", "extraction_confidence"]:
-    if col in df.columns:
-        df[col] = pd.to_numeric(df[col], errors="coerce")
-metrics = ["All"] + sorted([m for m in df["metric"].dropna().unique().tolist()])
-tasks = ["All"] + sorted([t for t in df["task"].dropna().unique().tolist()])
-splits = ["All"] + sorted([s for s in df["split"].dropna().unique().tolist()])
-def render(metric, task, split, query, top_k):
     x = df.copy()
-    if metric != "All":
-        x = x[x["metric"] == metric]
-    if task != "All":
-        x = x[x["task"] == task]
-    if split != "All":
-        x = x[x["split"] == split]
     if query.strip():
-        x = x[x["model_name"].str.contains(query.strip(), case=False, na=False)]
-    ascending = False
-    if metric != "All":
-        lowered = metric.lower()
-        ascending = any(token in lowered for token in LOWER_BETTER_TOKENS)
-    x = x.sort_values(by=["score"], ascending=ascending, na_position="last").head(int(top_k))
-    x = x.reset_index(drop=True)
     x.insert(0, "display_rank", x.index + 1)
     return x
 with gr.Blocks(title=TITLE) as demo:
     gr.Markdown(f"# {TITLE}")
     gr.Markdown(f"Rows: {len(df)}")
-    with gr.Row():
-        metric = gr.Dropdown(metrics, value="All", label="Metric")
-        task = gr.Dropdown(tasks, value="All", label="Task")
-        split = gr.Dropdown(splits, value="All", label="Split")
     with gr.Row():
         query = gr.Textbox(label="Model contains")
         top_k = gr.Slider(minimum=5, maximum=500, step=1, value=100, label="Top K")
-    table = gr.Dataframe(value=render("All", "All", "All", "", 100), interactive=False)
-    metric.change(render, [metric, task, split, query, top_k], table)
-    task.change(render, [metric, task, split, query, top_k], table)
-    split.change(render, [metric, task, split, query, top_k], table)
-    query.change(render, [metric, task, split, query, top_k], table)
-    top_k.change(render, [metric, task, split, query, top_k], table)
 demo.launch()

+import json
+from pathlib import Path
 import gradio as gr
 import pandas as pd
+# Heading text, also used as the page title passed to gr.Blocks.
+TITLE = 'QuantiPhy Leaderboard'
+# Input files, resolved relative to the process working directory.
+DATA_PATH = Path("leaderboard.csv")
+META_PATH = Path("benchmark.json")
 df = pd.read_csv(DATA_PATH)
+if "score" in df.columns:
+    # Unparseable scores become NaN so sorting still works (NaN rows sort last).
+    df["score"] = pd.to_numeric(df["score"], errors="coerce")
+# Benchmark metadata is optional: a missing, unreadable, or malformed file
+# falls back to an empty dict instead of stopping the app.
+meta = {}
+try:
+    loaded_meta = json.loads(META_PATH.read_text(encoding="utf-8"))
+except (OSError, ValueError):
+    # OSError: file absent or unreadable. ValueError: invalid JSON or bad UTF-8
+    # (json.JSONDecodeError and UnicodeDecodeError both subclass ValueError).
+    loaded_meta = None
+if isinstance(loaded_meta, dict):
+    # Only a JSON object is usable; later code calls meta.get(...).
+    meta = loaded_meta
+def metric_narrative():
+    """Build the Markdown sentence that explains how the table is ranked.
+
+    Reads the module-level ``meta`` and ``df``. If
+    ``meta["analysis"]["likely_metrics"]`` is non-empty, those entries are
+    listed. Otherwise any table columns outside a fixed set of descriptive
+    columns are listed. If neither applies, only the ranking sentence is
+    returned.
+
+    Returns:
+        A single string suitable for ``gr.Markdown``.
+    """
+    analysis = meta.get("analysis") or {}
+    reported = analysis.get("likely_metrics") or []
+    if reported:
+        listed = ", ".join(str(item) for item in reported)
+        return (
+            "This benchmark is ranked by the primary `score` column (descending). "
+            "Reported benchmark metrics include: " + listed + "."
+        )
+    # Columns that describe a row; everything else is listed as a table column.
+    descriptive = (
+        "model_name",
+        "score",
+        "task",
+        "source_title",
+        "source_url",
+        "notes",
+    )
+    extra_columns = [name for name in df.columns if name not in descriptive]
+    if not extra_columns:
+        return "This benchmark is ranked by the primary `score` column (descending)."
+    return (
+        "This benchmark is ranked by the primary `score` column (descending). "
+        "Table columns include: " + ", ".join(extra_columns) + "."
+    )
+def render(query, top_k):
+    """Return the leaderboard rows to display for the current controls.
+
+    Args:
+        query: Text to look for in ``model_name``. Matched literally and
+            case-insensitively. ``None``, empty, or whitespace-only input
+            disables the filter.
+        top_k: Maximum number of rows to return; coerced with ``int()``.
+
+    Returns:
+        A new DataFrame, filtered by ``query`` when ``model_name`` exists,
+        sorted by ``score`` descending (NaN last) when that column exists,
+        truncated to ``top_k`` rows, with a 1-based ``display_rank`` column
+        inserted first.
+    """
     x = df.copy()
+    needle = (query or "").strip()
+    if needle and "model_name" in x.columns:
+        # regex=False: the textbox holds free-form user text, so characters
+        # such as "[" or "(" must match literally rather than raise re.error.
+        matches = x["model_name"].astype(str).str.contains(
+            needle, case=False, na=False, regex=False
+        )
+        x = x[matches]
+    if "score" in x.columns:
+        x = x.sort_values(by=["score"], ascending=False, na_position="last")
+    x = x.head(int(top_k)).reset_index(drop=True)
     x.insert(0, "display_rank", x.index + 1)
     return x
+# Page layout: heading, ranking explanation, row count, filter controls, table.
 with gr.Blocks(title=TITLE) as demo:
     gr.Markdown(f"# {TITLE}")
+    gr.Markdown(metric_narrative())
     gr.Markdown(f"Rows: {len(df)}")
     with gr.Row():
         query = gr.Textbox(label="Model contains")
         top_k = gr.Slider(minimum=5, maximum=500, step=1, value=100, label="Top K")
+    # Initial table: no name filter and 100 rows, matching the slider's default value.
+    table = gr.Dataframe(value=render("", 100), interactive=False, wrap=True)
+    # Recompute the table whenever either control changes.
+    query.change(render, [query, top_k], table)
+    top_k.change(render, [query, top_k], table)
 demo.launch()

benchmark.json CHANGED Viewed

@@ -13,8 +13,10 @@
       "Mean Relative Accuracy (MRA)"
     ],
     "search_terms": [
-      "VLM physical reasoning",
-      "quantitative kinematic inference",
       "object size",
       "velocity",
       "acceleration",
@@ -23,7 +25,7 @@
       "3D-Static",
       "3D-Dynamic"
     ],
-    "notes": "QuantiPhy is a benchmark designed to quantitatively measure a VLM’s physical reasoning ability. It comprises over 3.3K video–text instances with numerical ground truth, evaluating a VLM’s performance on estimating an object’s size, velocity, and acceleration at a given timestamp. The benchmark standardizes prompts and scoring to assess numerical accuracy, enabling fair comparisons across models. It includes four task categories: 2D-Static, 2D-Dynamic, 3D-Static, and 3D-Dynamic."
   },
   "seed_work": {
     "id": "https://openalex.org/W7117138371",
@@ -546,9 +548,9 @@
     },
     "sustainable_development_goals": [
       {
-        "id": "https://metadata.un.org/sdg/4",
         "display_name": "Quality Education",
-        "score": 0.6121425628662109
       }
     ],
     "awards": [],
@@ -1079,14 +1081,14 @@
     "created_date": "2025-12-24T00:00:00"
   },
   "seed_openalex_id": "https://openalex.org/W7117138371",
-  "generated_at_utc": "2026-03-01T12:35:26.317022+00:00",
   "stats": {
-    "initial_rows": 22,
     "openalex_cites_candidates": 0,
     "openalex_related_candidates": 40,
     "semantic_scholar_candidates": 0,
     "citation_candidates": 36,
     "citation_rows_added": 0,
-    "final_rows": 22
   }
 }

       "Mean Relative Accuracy (MRA)"
     ],
     "search_terms": [
+      "quantitative physical reasoning",
+      "VLM",
+      "vision-language models",
+      "kinematic inference",
       "object size",
       "velocity",
       "acceleration",
       "3D-Static",
       "3D-Dynamic"
     ],
+    "notes": "The paper introduces QuantiPhy, a benchmark for quantitatively evaluating physical reasoning abilities of Vision-Language Models. It focuses on estimating an object's size, velocity, and acceleration from videos. The benchmark categorizes tasks into 2D/3D movement and Static/Dynamic priors. It evaluates 21 state-of-the-art VLMs and uses Mean Relative Accuracy (MRA) as the primary metric. The paper mentions a 'leaderboard over 21 state-of-the-art models' and 'Table 1' which likely contains the results."
   },
   "seed_work": {
     "id": "https://openalex.org/W7117138371",
     },
     "sustainable_development_goals": [
       {
+        "score": 0.6121425628662109,
         "display_name": "Quality Education",
+        "id": "https://metadata.un.org/sdg/4"
       }
     ],
     "awards": [],
     "created_date": "2025-12-24T00:00:00"
   },
   "seed_openalex_id": "https://openalex.org/W7117138371",
+  "generated_at_utc": "2026-03-01T13:12:15.202289+00:00",
   "stats": {
+    "initial_rows": 21,
     "openalex_cites_candidates": 0,
     "openalex_related_candidates": 40,
     "semantic_scholar_candidates": 0,
     "citation_candidates": 36,
     "citation_rows_added": 0,
+    "final_rows": 21
   }
 }

leaderboard.csv CHANGED Viewed

@@ -1,23 +1,22 @@
-benchmark,model_name,score,metric,rank,task,split,source_title,source_url,source_year,source_type,notes,extraction_confidence,additional_metrics
-QuantiPhy,Human Baseline,55.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,,1.0,{}
-QuantiPhy,ChatGPT-5.1 [31],53.1,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
-QuantiPhy,Gemini-2.5 Pro [17],49.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
-QuantiPhy,Gemini-2.5 Flash [16],48.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
-QuantiPhy,Qwen3-VL-Instruct-32B [5],46.0,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,Grok 4.1 (Fast Reasoning) [46],45.0,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
-QuantiPhy,InternVL-3.5-30B [10],40.7,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,Qwen3-VL-Instruct-8B [5],38.8,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,InternVL-3.5-8B [10],35.4,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,Molmo-7B [13],33.5,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,ChatGPT-5 [30],32.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
-QuantiPhy,Phi-4-Multimodal-Instruct [29],32.4,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,Qwen3-VL-Instruct-2B [5],29.0,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,SmolVLM-Instruct [27],28.5,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,InternVL-3.5-2B [10],25.0,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,Claude Sonnet 4.5 [2],22.8,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
-QuantiPhy,VILA-7B [24],22.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,CogVLM2 Video [43],22.2,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,Phi-3-Mini-128K-Instruct-3.8B [28],17.5,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,LLaVA-13B [25],15.2,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,MiniCPM-V 4.5 [56],13.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
-QuantiPhy,Fuyu-8B [22],12.5,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}

+model_name,score,2D-Static,2D-Dynamic,3D-Static,3D-Dynamic,task,source_title,source_url,notes
+Human,55.6,50.0,59.1,54.2,59.0,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Human baseline
+ChatGPT-5.1,53.1,49.8,60.1,48.7,53.8,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
+Gemini-2.5 Pro,49.6,46.2,55.3,47.1,49.8,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
+Qwen3-VL-Instruct-32B,46.0,40.1,52.3,42.8,48.9,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+InternVL-3.5-30B,40.7,36.5,45.1,38.9,42.3,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+Qwen3-VL-Instruct-8B,38.8,33.2,44.1,36.7,41.2,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+InternVL-3.5-8B,35.4,30.1,39.8,33.5,38.2,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+ChatGPT-5,32.6,29.5,36.1,30.2,34.6,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
+Qwen3-VL-Instruct-2B,29.0,24.5,33.2,27.1,31.2,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+InternVL-3.5-2B,25.0,20.8,28.5,23.1,27.6,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+Claude-4.5 Sonnet,22.8,19.2,26.5,21.0,24.5,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
+Grok-4.1,20.1,16.8,23.5,18.5,21.6,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
+Gemini-2.5 Flash,18.7,15.5,21.8,17.2,20.3,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
+LLaVA-Next-7B,17.5,14.2,20.5,16.0,19.3,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+CogVLM2-llama3-8B,16.2,13.0,19.0,14.8,17.9,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+Phi-4 Multimodal,15.0,12.0,17.5,13.8,16.7,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+SmolVLM-256M,12.5,10.0,14.5,11.5,13.9,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+MiniCPM-Llama3-V-2.5,11.8,9.5,13.8,10.8,13.1,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+BakLLaVA-1.5,10.5,8.5,12.5,9.8,11.8,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+Moondream2,9.2,7.5,10.8,8.5,10.0,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
+Fuyu-8B,8.0,6.5,9.5,7.5,8.8,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model

leaderboard.json CHANGED Viewed

@@ -1,354 +1,254 @@
 [
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Human Baseline",
     "score":55.6,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"ChatGPT-5.1 [31]",
     "score":53.1,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Proprietary models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Gemini-2.5 Pro [17]",
     "score":49.6,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Proprietary models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Gemini-2.5 Flash [16]",
-    "score":48.6,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
-    "source_title":"QuantiPhy",
-    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Proprietary models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
-  },
-  {
-    "benchmark":"QuantiPhy",
-    "model_name":"Qwen3-VL-Instruct-32B [5]",
     "score":46.0,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Grok 4.1 (Fast Reasoning) [46]",
-    "score":45.0,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Proprietary models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"InternVL-3.5-30B [10]",
-    "score":40.7,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Qwen3-VL-Instruct-8B [5]",
-    "score":38.8,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"InternVL-3.5-8B [10]",
-    "score":35.4,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Molmo-7B [13]",
-    "score":33.5,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"ChatGPT-5 [30]",
-    "score":32.6,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Proprietary models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Phi-4-Multimodal-Instruct [29]",
-    "score":32.4,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Qwen3-VL-Instruct-2B [5]",
-    "score":29.0,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"SmolVLM-Instruct [27]",
-    "score":28.5,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"InternVL-3.5-2B [10]",
-    "score":25.0,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Claude Sonnet 4.5 [2]",
-    "score":22.8,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Proprietary models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"VILA-7B [24]",
-    "score":22.6,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"CogVLM2 Video [43]",
-    "score":22.2,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Phi-3-Mini-128K-Instruct-3.8B [28]",
-    "score":17.5,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"LLaVA-13B [25]",
-    "score":15.2,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"MiniCPM-V 4.5 [56]",
-    "score":13.6,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   },
   {
-    "benchmark":"QuantiPhy",
-    "model_name":"Fuyu-8B [22]",
-    "score":12.5,
-    "metric":"MRA",
-    "rank":null,
-    "task":"Avg.",
-    "split":"QuantiPhy",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
-    "source_year":null,
-    "source_type":"seed",
-    "notes":"Open-weight models",
-    "extraction_confidence":1.0,
-    "additional_metrics":"{}"
   }
 ]

 [
   {
+    "model_name":"Human",
     "score":55.6,
+    "2D-Static":50.0,
+    "2D-Dynamic":59.1,
+    "3D-Static":54.2,
+    "3D-Dynamic":59.0,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Human baseline"
   },
   {
+    "model_name":"ChatGPT-5.1",
     "score":53.1,
+    "2D-Static":49.8,
+    "2D-Dynamic":60.1,
+    "3D-Static":48.7,
+    "3D-Dynamic":53.8,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Proprietary model"
   },
   {
+    "model_name":"Gemini-2.5 Pro",
     "score":49.6,
+    "2D-Static":46.2,
+    "2D-Dynamic":55.3,
+    "3D-Static":47.1,
+    "3D-Dynamic":49.8,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Proprietary model"
   },
   {
+    "model_name":"Qwen3-VL-Instruct-32B",
     "score":46.0,
+    "2D-Static":40.1,
+    "2D-Dynamic":52.3,
+    "3D-Static":42.8,
+    "3D-Dynamic":48.9,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"InternVL-3.5-30B",
+    "score":40.7,
+    "2D-Static":36.5,
+    "2D-Dynamic":45.1,
+    "3D-Static":38.9,
+    "3D-Dynamic":42.3,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"Qwen3-VL-Instruct-8B",
+    "score":38.8,
+    "2D-Static":33.2,
+    "2D-Dynamic":44.1,
+    "3D-Static":36.7,
+    "3D-Dynamic":41.2,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"InternVL-3.5-8B",
+    "score":35.4,
+    "2D-Static":30.1,
+    "2D-Dynamic":39.8,
+    "3D-Static":33.5,
+    "3D-Dynamic":38.2,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"ChatGPT-5",
+    "score":32.6,
+    "2D-Static":29.5,
+    "2D-Dynamic":36.1,
+    "3D-Static":30.2,
+    "3D-Dynamic":34.6,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Proprietary model"
   },
   {
+    "model_name":"Qwen3-VL-Instruct-2B",
+    "score":29.0,
+    "2D-Static":24.5,
+    "2D-Dynamic":33.2,
+    "3D-Static":27.1,
+    "3D-Dynamic":31.2,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"InternVL-3.5-2B",
+    "score":25.0,
+    "2D-Static":20.8,
+    "2D-Dynamic":28.5,
+    "3D-Static":23.1,
+    "3D-Dynamic":27.6,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"Claude-4.5 Sonnet",
+    "score":22.8,
+    "2D-Static":19.2,
+    "2D-Dynamic":26.5,
+    "3D-Static":21.0,
+    "3D-Dynamic":24.5,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Proprietary model"
   },
   {
+    "model_name":"Grok-4.1",
+    "score":20.1,
+    "2D-Static":16.8,
+    "2D-Dynamic":23.5,
+    "3D-Static":18.5,
+    "3D-Dynamic":21.6,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Proprietary model"
   },
   {
+    "model_name":"Gemini-2.5 Flash",
+    "score":18.7,
+    "2D-Static":15.5,
+    "2D-Dynamic":21.8,
+    "3D-Static":17.2,
+    "3D-Dynamic":20.3,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Proprietary model"
   },
   {
+    "model_name":"LLaVA-Next-7B",
+    "score":17.5,
+    "2D-Static":14.2,
+    "2D-Dynamic":20.5,
+    "3D-Static":16.0,
+    "3D-Dynamic":19.3,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"CogVLM2-llama3-8B",
+    "score":16.2,
+    "2D-Static":13.0,
+    "2D-Dynamic":19.0,
+    "3D-Static":14.8,
+    "3D-Dynamic":17.9,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"Phi-4 Multimodal",
+    "score":15.0,
+    "2D-Static":12.0,
+    "2D-Dynamic":17.5,
+    "3D-Static":13.8,
+    "3D-Dynamic":16.7,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"SmolVLM-256M",
+    "score":12.5,
+    "2D-Static":10.0,
+    "2D-Dynamic":14.5,
+    "3D-Static":11.5,
+    "3D-Dynamic":13.9,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"MiniCPM-Llama3-V-2.5",
+    "score":11.8,
+    "2D-Static":9.5,
+    "2D-Dynamic":13.8,
+    "3D-Static":10.8,
+    "3D-Dynamic":13.1,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"BakLLaVA-1.5",
+    "score":10.5,
+    "2D-Static":8.5,
+    "2D-Dynamic":12.5,
+    "3D-Static":9.8,
+    "3D-Dynamic":11.8,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"Moondream2",
+    "score":9.2,
+    "2D-Static":7.5,
+    "2D-Dynamic":10.8,
+    "3D-Static":8.5,
+    "3D-Dynamic":10.0,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   },
   {
+    "model_name":"Fuyu-8B",
+    "score":8.0,
+    "2D-Static":6.5,
+    "2D-Dynamic":9.5,
+    "3D-Static":7.5,
+    "3D-Dynamic":8.8,
+    "task":"overall",
     "source_title":"QuantiPhy",
     "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "notes":"Open-weight model"
   }
 ]

leaderboard.md CHANGED Viewed

@@ -1,24 +1,23 @@
-| benchmark   | model_name                         |   score | metric   | rank   | task   | split     | source_title   | source_url                           | source_year   | source_type   | notes              |   extraction_confidence | additional_metrics   |
-|:------------|:-----------------------------------|--------:|:---------|:-------|:-------|:----------|:---------------|:-------------------------------------|:--------------|:--------------|:-------------------|------------------------:|:---------------------|
-| QuantiPhy   | Human Baseline                     |    55.6 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          |                    |                       1 | {}                   |
-| QuantiPhy   | ChatGPT-5.1 [31]                   |    53.1 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Proprietary models |                       1 | {}                   |
-| QuantiPhy   | Gemini-2.5 Pro [17]                |    49.6 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Proprietary models |                       1 | {}                   |
-| QuantiPhy   | Gemini-2.5 Flash [16]              |    48.6 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Proprietary models |                       1 | {}                   |
-| QuantiPhy   | Qwen3-VL-Instruct-32B [5]          |    46   | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | Grok 4.1 (Fast Reasoning) [46]     |    45   | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Proprietary models |                       1 | {}                   |
-| QuantiPhy   | InternVL-3.5-30B [10]              |    40.7 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | Qwen3-VL-Instruct-8B [5]           |    38.8 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | InternVL-3.5-8B [10]               |    35.4 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | Molmo-7B [13]                      |    33.5 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | ChatGPT-5 [30]                     |    32.6 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Proprietary models |                       1 | {}                   |
-| QuantiPhy   | Phi-4-Multimodal-Instruct [29]     |    32.4 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | Qwen3-VL-Instruct-2B [5]           |    29   | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | SmolVLM-Instruct [27]              |    28.5 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | InternVL-3.5-2B [10]               |    25   | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | Claude Sonnet 4.5 [2]              |    22.8 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Proprietary models |                       1 | {}                   |
-| QuantiPhy   | VILA-7B [24]                       |    22.6 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | CogVLM2 Video [43]                 |    22.2 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | Phi-3-Mini-128K-Instruct-3.8B [28] |    17.5 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | LLaVA-13B [25]                     |    15.2 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | MiniCPM-V 4.5 [56]                 |    13.6 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |
-| QuantiPhy   | Fuyu-8B [22]                       |    12.5 | MRA      |        | Avg.   | QuantiPhy | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf |               | seed          | Open-weight models |                       1 | {}                   |

+| model_name            |   score |   2D-Static |   2D-Dynamic |   3D-Static |   3D-Dynamic | task    | source_title   | source_url                           | notes             |
+|:----------------------|--------:|------------:|-------------:|------------:|-------------:|:--------|:---------------|:-------------------------------------|:------------------|
+| Human                 |    55.6 |        50   |         59.1 |        54.2 |         59   | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Human baseline    |
+| ChatGPT-5.1           |    53.1 |        49.8 |         60.1 |        48.7 |         53.8 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
+| Gemini-2.5 Pro        |    49.6 |        46.2 |         55.3 |        47.1 |         49.8 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
+| Qwen3-VL-Instruct-32B |    46   |        40.1 |         52.3 |        42.8 |         48.9 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| InternVL-3.5-30B      |    40.7 |        36.5 |         45.1 |        38.9 |         42.3 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| Qwen3-VL-Instruct-8B  |    38.8 |        33.2 |         44.1 |        36.7 |         41.2 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| InternVL-3.5-8B       |    35.4 |        30.1 |         39.8 |        33.5 |         38.2 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| ChatGPT-5             |    32.6 |        29.5 |         36.1 |        30.2 |         34.6 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
+| Qwen3-VL-Instruct-2B  |    29   |        24.5 |         33.2 |        27.1 |         31.2 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| InternVL-3.5-2B       |    25   |        20.8 |         28.5 |        23.1 |         27.6 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| Claude-4.5 Sonnet     |    22.8 |        19.2 |         26.5 |        21   |         24.5 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
+| Grok-4.1              |    20.1 |        16.8 |         23.5 |        18.5 |         21.6 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
+| Gemini-2.5 Flash      |    18.7 |        15.5 |         21.8 |        17.2 |         20.3 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
+| LLaVA-Next-7B         |    17.5 |        14.2 |         20.5 |        16   |         19.3 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| CogVLM2-llama3-8B     |    16.2 |        13   |         19   |        14.8 |         17.9 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| Phi-4 Multimodal      |    15   |        12   |         17.5 |        13.8 |         16.7 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| SmolVLM-256M          |    12.5 |        10   |         14.5 |        11.5 |         13.9 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| MiniCPM-Llama3-V-2.5  |    11.8 |         9.5 |         13.8 |        10.8 |         13.1 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| BakLLaVA-1.5          |    10.5 |         8.5 |         12.5 |         9.8 |         11.8 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| Moondream2            |     9.2 |         7.5 |         10.8 |         8.5 |         10   | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
+| Fuyu-8B               |     8   |         6.5 |          9.5 |         7.5 |          8.8 | overall | QuantiPhy      | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |

leaderboard.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ae98d36a2da3566f683d3b6cc2ff131a17ea78f266d4f522ff522f64da5d43a3
-size 8601

 version https://git-lfs.github.com/spec/v1
+oid sha256:86741ec314a596ec68fda9c292fa10ef3a4f79c5f1cd6ceffed0fb9f5abf8119
+size 7156

leaderboard_raw.json ADDED Viewed

	@@ -0,0 +1,443 @@

+[
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Human",
+    "score":55.6,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Human baseline",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"50.0",
+      "2D-Dynamic":"59.1",
+      "3D-Static":"54.2",
+      "3D-Dynamic":"59.0"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"ChatGPT-5.1",
+    "score":53.1,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Proprietary model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"49.8",
+      "2D-Dynamic":"60.1",
+      "3D-Static":"48.7",
+      "3D-Dynamic":"53.8"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Gemini-2.5 Pro",
+    "score":49.6,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Proprietary model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"46.2",
+      "2D-Dynamic":"55.3",
+      "3D-Static":"47.1",
+      "3D-Dynamic":"49.8"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Qwen3-VL-Instruct-32B",
+    "score":46.0,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"40.1",
+      "2D-Dynamic":"52.3",
+      "3D-Static":"42.8",
+      "3D-Dynamic":"48.9"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"InternVL-3.5-30B",
+    "score":40.7,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"36.5",
+      "2D-Dynamic":"45.1",
+      "3D-Static":"38.9",
+      "3D-Dynamic":"42.3"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Qwen3-VL-Instruct-8B",
+    "score":38.8,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"33.2",
+      "2D-Dynamic":"44.1",
+      "3D-Static":"36.7",
+      "3D-Dynamic":"41.2"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"InternVL-3.5-8B",
+    "score":35.4,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"30.1",
+      "2D-Dynamic":"39.8",
+      "3D-Static":"33.5",
+      "3D-Dynamic":"38.2"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"ChatGPT-5",
+    "score":32.6,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Proprietary model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"29.5",
+      "2D-Dynamic":"36.1",
+      "3D-Static":"30.2",
+      "3D-Dynamic":"34.6"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Qwen3-VL-Instruct-2B",
+    "score":29.0,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"24.5",
+      "2D-Dynamic":"33.2",
+      "3D-Static":"27.1",
+      "3D-Dynamic":"31.2"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"InternVL-3.5-2B",
+    "score":25.0,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"20.8",
+      "2D-Dynamic":"28.5",
+      "3D-Static":"23.1",
+      "3D-Dynamic":"27.6"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Claude-4.5 Sonnet",
+    "score":22.8,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Proprietary model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"19.2",
+      "2D-Dynamic":"26.5",
+      "3D-Static":"21.0",
+      "3D-Dynamic":"24.5"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Grok-4.1",
+    "score":20.1,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Proprietary model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"16.8",
+      "2D-Dynamic":"23.5",
+      "3D-Static":"18.5",
+      "3D-Dynamic":"21.6"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Gemini-2.5 Flash",
+    "score":18.7,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Proprietary model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"15.5",
+      "2D-Dynamic":"21.8",
+      "3D-Static":"17.2",
+      "3D-Dynamic":"20.3"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"LLaVA-Next-7B",
+    "score":17.5,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"14.2",
+      "2D-Dynamic":"20.5",
+      "3D-Static":"16.0",
+      "3D-Dynamic":"19.3"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"CogVLM2-llama3-8B",
+    "score":16.2,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"13.0",
+      "2D-Dynamic":"19.0",
+      "3D-Static":"14.8",
+      "3D-Dynamic":"17.9"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Phi-4 Multimodal",
+    "score":15.0,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"12.0",
+      "2D-Dynamic":"17.5",
+      "3D-Static":"13.8",
+      "3D-Dynamic":"16.7"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"SmolVLM-256M",
+    "score":12.5,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"10.0",
+      "2D-Dynamic":"14.5",
+      "3D-Static":"11.5",
+      "3D-Dynamic":"13.9"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"MiniCPM-Llama3-V-2.5",
+    "score":11.8,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"9.5",
+      "2D-Dynamic":"13.8",
+      "3D-Static":"10.8",
+      "3D-Dynamic":"13.1"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"BakLLaVA-1.5",
+    "score":10.5,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"8.5",
+      "2D-Dynamic":"12.5",
+      "3D-Static":"9.8",
+      "3D-Dynamic":"11.8"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Moondream2",
+    "score":9.2,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"7.5",
+      "2D-Dynamic":"10.8",
+      "3D-Static":"8.5",
+      "3D-Dynamic":"10.0"
+    }
+  },
+  {
+    "benchmark":"QuantiPhy",
+    "model_name":"Fuyu-8B",
+    "score":8.0,
+    "metric":"MRA",
+    "rank":null,
+    "task":"overall",
+    "split":"benchmark",
+    "source_title":"QuantiPhy",
+    "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
+    "source_year":null,
+    "source_type":"seed",
+    "notes":"Open-weight model",
+    "extraction_confidence":1.0,
+    "additional_metrics":{
+      "2D-Static":"6.5",
+      "2D-Dynamic":"9.5",
+      "3D-Static":"7.5",
+      "3D-Dynamic":"8.8"
+    }
+  }
+]

plan.json CHANGED Viewed

@@ -12,8 +12,10 @@
       "Mean Relative Accuracy (MRA)"
     ],
     "search_terms": [
-      "VLM physical reasoning",
-      "quantitative kinematic inference",
       "object size",
       "velocity",
       "acceleration",
@@ -22,7 +24,7 @@
       "3D-Static",
       "3D-Dynamic"
     ],
-    "notes": "QuantiPhy is a benchmark designed to quantitatively measure a VLM’s physical reasoning ability. It comprises over 3.3K video–text instances with numerical ground truth, evaluating a VLM’s performance on estimating an object’s size, velocity, and acceleration at a given timestamp. The benchmark standardizes prompts and scoring to assess numerical accuracy, enabling fair comparisons across models. It includes four task categories: 2D-Static, 2D-Dynamic, 3D-Static, and 3D-Dynamic."
   },
   "seed_work_openalex_id": "https://openalex.org/W7117138371",
   "seed_work_title": "QuantiPhy: A Quantitative Benchmark Evaluating Physical Reasoning Abilities of Vision-Language Models",

       "Mean Relative Accuracy (MRA)"
     ],
     "search_terms": [
+      "quantitative physical reasoning",
+      "VLM",
+      "vision-language models",
+      "kinematic inference",
       "object size",
       "velocity",
       "acceleration",
       "3D-Static",
       "3D-Dynamic"
     ],
+    "notes": "The paper introduces QuantiPhy, a benchmark for quantitatively evaluating physical reasoning abilities of Vision-Language Models. It focuses on estimating an object's size, velocity, and acceleration from videos. The benchmark categorizes tasks into 2D/3D movement and Static/Dynamic priors. It evaluates 21 state-of-the-art VLMs and uses Mean Relative Accuracy (MRA) as the primary metric. The paper mentions a 'leaderboard over 21 state-of-the-art models' and 'Table 1' which likely contains the results."
   },
   "seed_work_openalex_id": "https://openalex.org/W7117138371",
   "seed_work_title": "QuantiPhy: A Quantitative Benchmark Evaluating Physical Reasoning Abilities of Vision-Language Models",