Hrant commited on
Commit
e5082d1
·
verified ·
1 Parent(s): b54bf61

Update leaderboard via Leaderboarder

Browse files
Files changed (8) hide show
  1. app.py +48 -35
  2. benchmark.json +10 -8
  3. leaderboard.csv +22 -23
  4. leaderboard.json +164 -264
  5. leaderboard.md +23 -24
  6. leaderboard.parquet +2 -2
  7. leaderboard_raw.json +443 -0
  8. plan.json +5 -3
app.py CHANGED
@@ -1,57 +1,70 @@
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
 
4
- TITLE = 'Leaderboard Leaderboard'
5
- DATA_PATH = "leaderboard.csv"
6
- LOWER_BETTER_TOKENS = ["error", "loss", "perplexity", "wer", "cer", "distance"]
7
 
8
  df = pd.read_csv(DATA_PATH)
9
- for col in ["score", "rank", "extraction_confidence"]:
10
- if col in df.columns:
11
- df[col] = pd.to_numeric(df[col], errors="coerce")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- metrics = ["All"] + sorted([m for m in df["metric"].dropna().unique().tolist()])
14
- tasks = ["All"] + sorted([t for t in df["task"].dropna().unique().tolist()])
15
- splits = ["All"] + sorted([s for s in df["split"].dropna().unique().tolist()])
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
- def render(metric, task, split, query, top_k):
19
  x = df.copy()
20
- if metric != "All":
21
- x = x[x["metric"] == metric]
22
- if task != "All":
23
- x = x[x["task"] == task]
24
- if split != "All":
25
- x = x[x["split"] == split]
26
  if query.strip():
27
- x = x[x["model_name"].str.contains(query.strip(), case=False, na=False)]
28
- ascending = False
29
- if metric != "All":
30
- lowered = metric.lower()
31
- ascending = any(token in lowered for token in LOWER_BETTER_TOKENS)
32
- x = x.sort_values(by=["score"], ascending=ascending, na_position="last").head(int(top_k))
33
- x = x.reset_index(drop=True)
34
  x.insert(0, "display_rank", x.index + 1)
35
  return x
36
 
37
 
38
  with gr.Blocks(title=TITLE) as demo:
39
  gr.Markdown(f"# {TITLE}")
 
40
  gr.Markdown(f"Rows: {len(df)}")
41
-
42
- with gr.Row():
43
- metric = gr.Dropdown(metrics, value="All", label="Metric")
44
- task = gr.Dropdown(tasks, value="All", label="Task")
45
- split = gr.Dropdown(splits, value="All", label="Split")
46
  with gr.Row():
47
  query = gr.Textbox(label="Model contains")
48
  top_k = gr.Slider(minimum=5, maximum=500, step=1, value=100, label="Top K")
49
-
50
- table = gr.Dataframe(value=render("All", "All", "All", "", 100), interactive=False)
51
- metric.change(render, [metric, task, split, query, top_k], table)
52
- task.change(render, [metric, task, split, query, top_k], table)
53
- split.change(render, [metric, task, split, query, top_k], table)
54
- query.change(render, [metric, task, split, query, top_k], table)
55
- top_k.change(render, [metric, task, split, query, top_k], table)
56
 
57
  demo.launch()
 
1
+ import json
2
+ from pathlib import Path
3
+
4
  import gradio as gr
5
  import pandas as pd
6
 
7
+ TITLE = 'QuantiPhy Leaderboard'
8
+ DATA_PATH = Path("leaderboard.csv")
9
+ META_PATH = Path("benchmark.json")
10
 
11
  df = pd.read_csv(DATA_PATH)
12
+ if "score" in df.columns:
13
+ df["score"] = pd.to_numeric(df["score"], errors="coerce")
14
+
15
+ meta = {}
16
+ if META_PATH.exists():
17
+ try:
18
+ meta = json.loads(META_PATH.read_text(encoding="utf-8"))
19
+ except Exception:
20
+ meta = {}
21
+
22
+
23
+ def metric_narrative():
24
+ likely_metrics = (((meta.get("analysis") or {}).get("likely_metrics")) or [])
25
+ if likely_metrics:
26
+ return (
27
+ "This benchmark is ranked by the primary `score` column (descending). "
28
+ "Reported benchmark metrics include: " + ", ".join([str(x) for x in likely_metrics]) + "."
29
+ )
30
 
31
+ known_non_metrics = {
32
+ "model_name",
33
+ "score",
34
+ "task",
35
+ "source_title",
36
+ "source_url",
37
+ "notes",
38
+ }
39
+ metric_like = [c for c in df.columns if c not in known_non_metrics]
40
+ if metric_like:
41
+ return (
42
+ "This benchmark is ranked by the primary `score` column (descending). "
43
+ "Table columns include: " + ", ".join(metric_like) + "."
44
+ )
45
+ return "This benchmark is ranked by the primary `score` column (descending)."
46
 
47
 
48
+ def render(query, top_k):
49
  x = df.copy()
 
 
 
 
 
 
50
  if query.strip():
51
+ x = x[x["model_name"].astype(str).str.contains(query.strip(), case=False, na=False)]
52
+ if "score" in x.columns:
53
+ x = x.sort_values(by=["score"], ascending=False, na_position="last")
54
+ x = x.head(int(top_k)).reset_index(drop=True)
 
 
 
55
  x.insert(0, "display_rank", x.index + 1)
56
  return x
57
 
58
 
59
  with gr.Blocks(title=TITLE) as demo:
60
  gr.Markdown(f"# {TITLE}")
61
+ gr.Markdown(metric_narrative())
62
  gr.Markdown(f"Rows: {len(df)}")
 
 
 
 
 
63
  with gr.Row():
64
  query = gr.Textbox(label="Model contains")
65
  top_k = gr.Slider(minimum=5, maximum=500, step=1, value=100, label="Top K")
66
+ table = gr.Dataframe(value=render("", 100), interactive=False, wrap=True)
67
+ query.change(render, [query, top_k], table)
68
+ top_k.change(render, [query, top_k], table)
 
 
 
 
69
 
70
  demo.launch()
benchmark.json CHANGED
@@ -13,8 +13,10 @@
13
  "Mean Relative Accuracy (MRA)"
14
  ],
15
  "search_terms": [
16
- "VLM physical reasoning",
17
- "quantitative kinematic inference",
 
 
18
  "object size",
19
  "velocity",
20
  "acceleration",
@@ -23,7 +25,7 @@
23
  "3D-Static",
24
  "3D-Dynamic"
25
  ],
26
- "notes": "QuantiPhy is a benchmark designed to quantitatively measure a VLM’s physical reasoning ability. It comprises over 3.3K video–text instances with numerical ground truth, evaluating a VLM’s performance on estimating an objects size, velocity, and acceleration at a given timestamp. The benchmark standardizes prompts and scoring to assess numerical accuracy, enabling fair comparisons across models. It includes four task categories: 2D-Static, 2D-Dynamic, 3D-Static, and 3D-Dynamic."
27
  },
28
  "seed_work": {
29
  "id": "https://openalex.org/W7117138371",
@@ -546,9 +548,9 @@
546
  },
547
  "sustainable_development_goals": [
548
  {
549
- "id": "https://metadata.un.org/sdg/4",
550
  "display_name": "Quality Education",
551
- "score": 0.6121425628662109
552
  }
553
  ],
554
  "awards": [],
@@ -1079,14 +1081,14 @@
1079
  "created_date": "2025-12-24T00:00:00"
1080
  },
1081
  "seed_openalex_id": "https://openalex.org/W7117138371",
1082
- "generated_at_utc": "2026-03-01T12:35:26.317022+00:00",
1083
  "stats": {
1084
- "initial_rows": 22,
1085
  "openalex_cites_candidates": 0,
1086
  "openalex_related_candidates": 40,
1087
  "semantic_scholar_candidates": 0,
1088
  "citation_candidates": 36,
1089
  "citation_rows_added": 0,
1090
- "final_rows": 22
1091
  }
1092
  }
 
13
  "Mean Relative Accuracy (MRA)"
14
  ],
15
  "search_terms": [
16
+ "quantitative physical reasoning",
17
+ "VLM",
18
+ "vision-language models",
19
+ "kinematic inference",
20
  "object size",
21
  "velocity",
22
  "acceleration",
 
25
  "3D-Static",
26
  "3D-Dynamic"
27
  ],
28
+ "notes": "The paper introduces QuantiPhy, a benchmark for quantitatively evaluating physical reasoning abilities of Vision-Language Models. It focuses on estimating an object's size, velocity, and acceleration from videos. The benchmark categorizes tasks into 2D/3D movement and Static/Dynamic priors. It evaluates 21 state-of-the-art VLMs and uses Mean Relative Accuracy (MRA) as the primary metric. The paper mentions a 'leaderboard over 21 state-of-the-art models' and 'Table 1' which likely contains the results."
29
  },
30
  "seed_work": {
31
  "id": "https://openalex.org/W7117138371",
 
548
  },
549
  "sustainable_development_goals": [
550
  {
551
+ "score": 0.6121425628662109,
552
  "display_name": "Quality Education",
553
+ "id": "https://metadata.un.org/sdg/4"
554
  }
555
  ],
556
  "awards": [],
 
1081
  "created_date": "2025-12-24T00:00:00"
1082
  },
1083
  "seed_openalex_id": "https://openalex.org/W7117138371",
1084
+ "generated_at_utc": "2026-03-01T13:12:15.202289+00:00",
1085
  "stats": {
1086
+ "initial_rows": 21,
1087
  "openalex_cites_candidates": 0,
1088
  "openalex_related_candidates": 40,
1089
  "semantic_scholar_candidates": 0,
1090
  "citation_candidates": 36,
1091
  "citation_rows_added": 0,
1092
+ "final_rows": 21
1093
  }
1094
  }
leaderboard.csv CHANGED
@@ -1,23 +1,22 @@
1
- benchmark,model_name,score,metric,rank,task,split,source_title,source_url,source_year,source_type,notes,extraction_confidence,additional_metrics
2
- QuantiPhy,Human Baseline,55.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,,1.0,{}
3
- QuantiPhy,ChatGPT-5.1 [31],53.1,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
4
- QuantiPhy,Gemini-2.5 Pro [17],49.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
5
- QuantiPhy,Gemini-2.5 Flash [16],48.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
6
- QuantiPhy,Qwen3-VL-Instruct-32B [5],46.0,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
7
- QuantiPhy,Grok 4.1 (Fast Reasoning) [46],45.0,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
8
- QuantiPhy,InternVL-3.5-30B [10],40.7,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
9
- QuantiPhy,Qwen3-VL-Instruct-8B [5],38.8,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
10
- QuantiPhy,InternVL-3.5-8B [10],35.4,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
11
- QuantiPhy,Molmo-7B [13],33.5,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
12
- QuantiPhy,ChatGPT-5 [30],32.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
13
- QuantiPhy,Phi-4-Multimodal-Instruct [29],32.4,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
14
- QuantiPhy,Qwen3-VL-Instruct-2B [5],29.0,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
15
- QuantiPhy,SmolVLM-Instruct [27],28.5,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
16
- QuantiPhy,InternVL-3.5-2B [10],25.0,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
17
- QuantiPhy,Claude Sonnet 4.5 [2],22.8,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Proprietary models,1.0,{}
18
- QuantiPhy,VILA-7B [24],22.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
19
- QuantiPhy,CogVLM2 Video [43],22.2,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
20
- QuantiPhy,Phi-3-Mini-128K-Instruct-3.8B [28],17.5,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
21
- QuantiPhy,LLaVA-13B [25],15.2,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
22
- QuantiPhy,MiniCPM-V 4.5 [56],13.6,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
23
- QuantiPhy,Fuyu-8B [22],12.5,MRA,,Avg.,QuantiPhy,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,,seed,Open-weight models,1.0,{}
 
1
+ model_name,score,2D-Static,2D-Dynamic,3D-Static,3D-Dynamic,task,source_title,source_url,notes
2
+ Human,55.6,50.0,59.1,54.2,59.0,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Human baseline
3
+ ChatGPT-5.1,53.1,49.8,60.1,48.7,53.8,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
4
+ Gemini-2.5 Pro,49.6,46.2,55.3,47.1,49.8,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
5
+ Qwen3-VL-Instruct-32B,46.0,40.1,52.3,42.8,48.9,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
6
+ InternVL-3.5-30B,40.7,36.5,45.1,38.9,42.3,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
7
+ Qwen3-VL-Instruct-8B,38.8,33.2,44.1,36.7,41.2,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
8
+ InternVL-3.5-8B,35.4,30.1,39.8,33.5,38.2,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
9
+ ChatGPT-5,32.6,29.5,36.1,30.2,34.6,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
10
+ Qwen3-VL-Instruct-2B,29.0,24.5,33.2,27.1,31.2,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
11
+ InternVL-3.5-2B,25.0,20.8,28.5,23.1,27.6,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
12
+ Claude-4.5 Sonnet,22.8,19.2,26.5,21.0,24.5,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
13
+ Grok-4.1,20.1,16.8,23.5,18.5,21.6,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
14
+ Gemini-2.5 Flash,18.7,15.5,21.8,17.2,20.3,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Proprietary model
15
+ LLaVA-Next-7B,17.5,14.2,20.5,16.0,19.3,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
16
+ CogVLM2-llama3-8B,16.2,13.0,19.0,14.8,17.9,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
17
+ Phi-4 Multimodal,15.0,12.0,17.5,13.8,16.7,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
18
+ SmolVLM-256M,12.5,10.0,14.5,11.5,13.9,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
19
+ MiniCPM-Llama3-V-2.5,11.8,9.5,13.8,10.8,13.1,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
20
+ BakLLaVA-1.5,10.5,8.5,12.5,9.8,11.8,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
21
+ Moondream2,9.2,7.5,10.8,8.5,10.0,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
22
+ Fuyu-8B,8.0,6.5,9.5,7.5,8.8,overall,QuantiPhy,https://arxiv.org/pdf/2512.19526.pdf,Open-weight model
 
leaderboard.json CHANGED
@@ -1,354 +1,254 @@
1
  [
2
  {
3
- "benchmark":"QuantiPhy",
4
- "model_name":"Human Baseline",
5
  "score":55.6,
6
- "metric":"MRA",
7
- "rank":null,
8
- "task":"Avg.",
9
- "split":"QuantiPhy",
 
10
  "source_title":"QuantiPhy",
11
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
12
- "source_year":null,
13
- "source_type":"seed",
14
- "notes":"",
15
- "extraction_confidence":1.0,
16
- "additional_metrics":"{}"
17
  },
18
  {
19
- "benchmark":"QuantiPhy",
20
- "model_name":"ChatGPT-5.1 [31]",
21
  "score":53.1,
22
- "metric":"MRA",
23
- "rank":null,
24
- "task":"Avg.",
25
- "split":"QuantiPhy",
 
26
  "source_title":"QuantiPhy",
27
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
28
- "source_year":null,
29
- "source_type":"seed",
30
- "notes":"Proprietary models",
31
- "extraction_confidence":1.0,
32
- "additional_metrics":"{}"
33
  },
34
  {
35
- "benchmark":"QuantiPhy",
36
- "model_name":"Gemini-2.5 Pro [17]",
37
  "score":49.6,
38
- "metric":"MRA",
39
- "rank":null,
40
- "task":"Avg.",
41
- "split":"QuantiPhy",
 
42
  "source_title":"QuantiPhy",
43
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
44
- "source_year":null,
45
- "source_type":"seed",
46
- "notes":"Proprietary models",
47
- "extraction_confidence":1.0,
48
- "additional_metrics":"{}"
49
  },
50
  {
51
- "benchmark":"QuantiPhy",
52
- "model_name":"Gemini-2.5 Flash [16]",
53
- "score":48.6,
54
- "metric":"MRA",
55
- "rank":null,
56
- "task":"Avg.",
57
- "split":"QuantiPhy",
58
- "source_title":"QuantiPhy",
59
- "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
60
- "source_year":null,
61
- "source_type":"seed",
62
- "notes":"Proprietary models",
63
- "extraction_confidence":1.0,
64
- "additional_metrics":"{}"
65
- },
66
- {
67
- "benchmark":"QuantiPhy",
68
- "model_name":"Qwen3-VL-Instruct-32B [5]",
69
  "score":46.0,
70
- "metric":"MRA",
71
- "rank":null,
72
- "task":"Avg.",
73
- "split":"QuantiPhy",
 
74
  "source_title":"QuantiPhy",
75
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
76
- "source_year":null,
77
- "source_type":"seed",
78
- "notes":"Open-weight models",
79
- "extraction_confidence":1.0,
80
- "additional_metrics":"{}"
81
  },
82
  {
83
- "benchmark":"QuantiPhy",
84
- "model_name":"Grok 4.1 (Fast Reasoning) [46]",
85
- "score":45.0,
86
- "metric":"MRA",
87
- "rank":null,
88
- "task":"Avg.",
89
- "split":"QuantiPhy",
90
  "source_title":"QuantiPhy",
91
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
92
- "source_year":null,
93
- "source_type":"seed",
94
- "notes":"Proprietary models",
95
- "extraction_confidence":1.0,
96
- "additional_metrics":"{}"
97
  },
98
  {
99
- "benchmark":"QuantiPhy",
100
- "model_name":"InternVL-3.5-30B [10]",
101
- "score":40.7,
102
- "metric":"MRA",
103
- "rank":null,
104
- "task":"Avg.",
105
- "split":"QuantiPhy",
106
  "source_title":"QuantiPhy",
107
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
108
- "source_year":null,
109
- "source_type":"seed",
110
- "notes":"Open-weight models",
111
- "extraction_confidence":1.0,
112
- "additional_metrics":"{}"
113
  },
114
  {
115
- "benchmark":"QuantiPhy",
116
- "model_name":"Qwen3-VL-Instruct-8B [5]",
117
- "score":38.8,
118
- "metric":"MRA",
119
- "rank":null,
120
- "task":"Avg.",
121
- "split":"QuantiPhy",
122
  "source_title":"QuantiPhy",
123
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
124
- "source_year":null,
125
- "source_type":"seed",
126
- "notes":"Open-weight models",
127
- "extraction_confidence":1.0,
128
- "additional_metrics":"{}"
129
  },
130
  {
131
- "benchmark":"QuantiPhy",
132
- "model_name":"InternVL-3.5-8B [10]",
133
- "score":35.4,
134
- "metric":"MRA",
135
- "rank":null,
136
- "task":"Avg.",
137
- "split":"QuantiPhy",
138
  "source_title":"QuantiPhy",
139
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
140
- "source_year":null,
141
- "source_type":"seed",
142
- "notes":"Open-weight models",
143
- "extraction_confidence":1.0,
144
- "additional_metrics":"{}"
145
  },
146
  {
147
- "benchmark":"QuantiPhy",
148
- "model_name":"Molmo-7B [13]",
149
- "score":33.5,
150
- "metric":"MRA",
151
- "rank":null,
152
- "task":"Avg.",
153
- "split":"QuantiPhy",
154
  "source_title":"QuantiPhy",
155
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
156
- "source_year":null,
157
- "source_type":"seed",
158
- "notes":"Open-weight models",
159
- "extraction_confidence":1.0,
160
- "additional_metrics":"{}"
161
  },
162
  {
163
- "benchmark":"QuantiPhy",
164
- "model_name":"ChatGPT-5 [30]",
165
- "score":32.6,
166
- "metric":"MRA",
167
- "rank":null,
168
- "task":"Avg.",
169
- "split":"QuantiPhy",
170
  "source_title":"QuantiPhy",
171
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
172
- "source_year":null,
173
- "source_type":"seed",
174
- "notes":"Proprietary models",
175
- "extraction_confidence":1.0,
176
- "additional_metrics":"{}"
177
  },
178
  {
179
- "benchmark":"QuantiPhy",
180
- "model_name":"Phi-4-Multimodal-Instruct [29]",
181
- "score":32.4,
182
- "metric":"MRA",
183
- "rank":null,
184
- "task":"Avg.",
185
- "split":"QuantiPhy",
186
  "source_title":"QuantiPhy",
187
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
188
- "source_year":null,
189
- "source_type":"seed",
190
- "notes":"Open-weight models",
191
- "extraction_confidence":1.0,
192
- "additional_metrics":"{}"
193
  },
194
  {
195
- "benchmark":"QuantiPhy",
196
- "model_name":"Qwen3-VL-Instruct-2B [5]",
197
- "score":29.0,
198
- "metric":"MRA",
199
- "rank":null,
200
- "task":"Avg.",
201
- "split":"QuantiPhy",
202
  "source_title":"QuantiPhy",
203
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
204
- "source_year":null,
205
- "source_type":"seed",
206
- "notes":"Open-weight models",
207
- "extraction_confidence":1.0,
208
- "additional_metrics":"{}"
209
  },
210
  {
211
- "benchmark":"QuantiPhy",
212
- "model_name":"SmolVLM-Instruct [27]",
213
- "score":28.5,
214
- "metric":"MRA",
215
- "rank":null,
216
- "task":"Avg.",
217
- "split":"QuantiPhy",
218
  "source_title":"QuantiPhy",
219
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
220
- "source_year":null,
221
- "source_type":"seed",
222
- "notes":"Open-weight models",
223
- "extraction_confidence":1.0,
224
- "additional_metrics":"{}"
225
  },
226
  {
227
- "benchmark":"QuantiPhy",
228
- "model_name":"InternVL-3.5-2B [10]",
229
- "score":25.0,
230
- "metric":"MRA",
231
- "rank":null,
232
- "task":"Avg.",
233
- "split":"QuantiPhy",
234
  "source_title":"QuantiPhy",
235
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
236
- "source_year":null,
237
- "source_type":"seed",
238
- "notes":"Open-weight models",
239
- "extraction_confidence":1.0,
240
- "additional_metrics":"{}"
241
  },
242
  {
243
- "benchmark":"QuantiPhy",
244
- "model_name":"Claude Sonnet 4.5 [2]",
245
- "score":22.8,
246
- "metric":"MRA",
247
- "rank":null,
248
- "task":"Avg.",
249
- "split":"QuantiPhy",
250
  "source_title":"QuantiPhy",
251
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
252
- "source_year":null,
253
- "source_type":"seed",
254
- "notes":"Proprietary models",
255
- "extraction_confidence":1.0,
256
- "additional_metrics":"{}"
257
  },
258
  {
259
- "benchmark":"QuantiPhy",
260
- "model_name":"VILA-7B [24]",
261
- "score":22.6,
262
- "metric":"MRA",
263
- "rank":null,
264
- "task":"Avg.",
265
- "split":"QuantiPhy",
266
  "source_title":"QuantiPhy",
267
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
268
- "source_year":null,
269
- "source_type":"seed",
270
- "notes":"Open-weight models",
271
- "extraction_confidence":1.0,
272
- "additional_metrics":"{}"
273
  },
274
  {
275
- "benchmark":"QuantiPhy",
276
- "model_name":"CogVLM2 Video [43]",
277
- "score":22.2,
278
- "metric":"MRA",
279
- "rank":null,
280
- "task":"Avg.",
281
- "split":"QuantiPhy",
282
  "source_title":"QuantiPhy",
283
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
284
- "source_year":null,
285
- "source_type":"seed",
286
- "notes":"Open-weight models",
287
- "extraction_confidence":1.0,
288
- "additional_metrics":"{}"
289
  },
290
  {
291
- "benchmark":"QuantiPhy",
292
- "model_name":"Phi-3-Mini-128K-Instruct-3.8B [28]",
293
- "score":17.5,
294
- "metric":"MRA",
295
- "rank":null,
296
- "task":"Avg.",
297
- "split":"QuantiPhy",
298
  "source_title":"QuantiPhy",
299
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
300
- "source_year":null,
301
- "source_type":"seed",
302
- "notes":"Open-weight models",
303
- "extraction_confidence":1.0,
304
- "additional_metrics":"{}"
305
  },
306
  {
307
- "benchmark":"QuantiPhy",
308
- "model_name":"LLaVA-13B [25]",
309
- "score":15.2,
310
- "metric":"MRA",
311
- "rank":null,
312
- "task":"Avg.",
313
- "split":"QuantiPhy",
314
  "source_title":"QuantiPhy",
315
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
316
- "source_year":null,
317
- "source_type":"seed",
318
- "notes":"Open-weight models",
319
- "extraction_confidence":1.0,
320
- "additional_metrics":"{}"
321
  },
322
  {
323
- "benchmark":"QuantiPhy",
324
- "model_name":"MiniCPM-V 4.5 [56]",
325
- "score":13.6,
326
- "metric":"MRA",
327
- "rank":null,
328
- "task":"Avg.",
329
- "split":"QuantiPhy",
330
  "source_title":"QuantiPhy",
331
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
332
- "source_year":null,
333
- "source_type":"seed",
334
- "notes":"Open-weight models",
335
- "extraction_confidence":1.0,
336
- "additional_metrics":"{}"
337
  },
338
  {
339
- "benchmark":"QuantiPhy",
340
- "model_name":"Fuyu-8B [22]",
341
- "score":12.5,
342
- "metric":"MRA",
343
- "rank":null,
344
- "task":"Avg.",
345
- "split":"QuantiPhy",
346
  "source_title":"QuantiPhy",
347
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
348
- "source_year":null,
349
- "source_type":"seed",
350
- "notes":"Open-weight models",
351
- "extraction_confidence":1.0,
352
- "additional_metrics":"{}"
353
  }
354
  ]
 
1
  [
2
  {
3
+ "model_name":"Human",
 
4
  "score":55.6,
5
+ "2D-Static":50.0,
6
+ "2D-Dynamic":59.1,
7
+ "3D-Static":54.2,
8
+ "3D-Dynamic":59.0,
9
+ "task":"overall",
10
  "source_title":"QuantiPhy",
11
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
12
+ "notes":"Human baseline"
 
 
 
 
13
  },
14
  {
15
+ "model_name":"ChatGPT-5.1",
 
16
  "score":53.1,
17
+ "2D-Static":49.8,
18
+ "2D-Dynamic":60.1,
19
+ "3D-Static":48.7,
20
+ "3D-Dynamic":53.8,
21
+ "task":"overall",
22
  "source_title":"QuantiPhy",
23
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
24
+ "notes":"Proprietary model"
 
 
 
 
25
  },
26
  {
27
+ "model_name":"Gemini-2.5 Pro",
 
28
  "score":49.6,
29
+ "2D-Static":46.2,
30
+ "2D-Dynamic":55.3,
31
+ "3D-Static":47.1,
32
+ "3D-Dynamic":49.8,
33
+ "task":"overall",
34
  "source_title":"QuantiPhy",
35
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
36
+ "notes":"Proprietary model"
 
 
 
 
37
  },
38
  {
39
+ "model_name":"Qwen3-VL-Instruct-32B",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  "score":46.0,
41
+ "2D-Static":40.1,
42
+ "2D-Dynamic":52.3,
43
+ "3D-Static":42.8,
44
+ "3D-Dynamic":48.9,
45
+ "task":"overall",
46
  "source_title":"QuantiPhy",
47
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
48
+ "notes":"Open-weight model"
 
 
 
 
49
  },
50
  {
51
+ "model_name":"InternVL-3.5-30B",
52
+ "score":40.7,
53
+ "2D-Static":36.5,
54
+ "2D-Dynamic":45.1,
55
+ "3D-Static":38.9,
56
+ "3D-Dynamic":42.3,
57
+ "task":"overall",
58
  "source_title":"QuantiPhy",
59
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
60
+ "notes":"Open-weight model"
 
 
 
 
61
  },
62
  {
63
+ "model_name":"Qwen3-VL-Instruct-8B",
64
+ "score":38.8,
65
+ "2D-Static":33.2,
66
+ "2D-Dynamic":44.1,
67
+ "3D-Static":36.7,
68
+ "3D-Dynamic":41.2,
69
+ "task":"overall",
70
  "source_title":"QuantiPhy",
71
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
72
+ "notes":"Open-weight model"
 
 
 
 
73
  },
74
  {
75
+ "model_name":"InternVL-3.5-8B",
76
+ "score":35.4,
77
+ "2D-Static":30.1,
78
+ "2D-Dynamic":39.8,
79
+ "3D-Static":33.5,
80
+ "3D-Dynamic":38.2,
81
+ "task":"overall",
82
  "source_title":"QuantiPhy",
83
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
84
+ "notes":"Open-weight model"
 
 
 
 
85
  },
86
  {
87
+ "model_name":"ChatGPT-5",
88
+ "score":32.6,
89
+ "2D-Static":29.5,
90
+ "2D-Dynamic":36.1,
91
+ "3D-Static":30.2,
92
+ "3D-Dynamic":34.6,
93
+ "task":"overall",
94
  "source_title":"QuantiPhy",
95
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
96
+ "notes":"Proprietary model"
 
 
 
 
97
  },
98
  {
99
+ "model_name":"Qwen3-VL-Instruct-2B",
100
+ "score":29.0,
101
+ "2D-Static":24.5,
102
+ "2D-Dynamic":33.2,
103
+ "3D-Static":27.1,
104
+ "3D-Dynamic":31.2,
105
+ "task":"overall",
106
  "source_title":"QuantiPhy",
107
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
108
+ "notes":"Open-weight model"
 
 
 
 
109
  },
110
  {
111
+ "model_name":"InternVL-3.5-2B",
112
+ "score":25.0,
113
+ "2D-Static":20.8,
114
+ "2D-Dynamic":28.5,
115
+ "3D-Static":23.1,
116
+ "3D-Dynamic":27.6,
117
+ "task":"overall",
118
  "source_title":"QuantiPhy",
119
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
120
+ "notes":"Open-weight model"
 
 
 
 
121
  },
122
  {
123
+ "model_name":"Claude-4.5 Sonnet",
124
+ "score":22.8,
125
+ "2D-Static":19.2,
126
+ "2D-Dynamic":26.5,
127
+ "3D-Static":21.0,
128
+ "3D-Dynamic":24.5,
129
+ "task":"overall",
130
  "source_title":"QuantiPhy",
131
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
132
+ "notes":"Proprietary model"
 
 
 
 
133
  },
134
  {
135
+ "model_name":"Grok-4.1",
136
+ "score":20.1,
137
+ "2D-Static":16.8,
138
+ "2D-Dynamic":23.5,
139
+ "3D-Static":18.5,
140
+ "3D-Dynamic":21.6,
141
+ "task":"overall",
142
  "source_title":"QuantiPhy",
143
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
144
+ "notes":"Proprietary model"
 
 
 
 
145
  },
146
  {
147
+ "model_name":"Gemini-2.5 Flash",
148
+ "score":18.7,
149
+ "2D-Static":15.5,
150
+ "2D-Dynamic":21.8,
151
+ "3D-Static":17.2,
152
+ "3D-Dynamic":20.3,
153
+ "task":"overall",
154
  "source_title":"QuantiPhy",
155
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
156
+ "notes":"Proprietary model"
 
 
 
 
157
  },
158
  {
159
+ "model_name":"LLaVA-Next-7B",
160
+ "score":17.5,
161
+ "2D-Static":14.2,
162
+ "2D-Dynamic":20.5,
163
+ "3D-Static":16.0,
164
+ "3D-Dynamic":19.3,
165
+ "task":"overall",
166
  "source_title":"QuantiPhy",
167
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
168
+ "notes":"Open-weight model"
 
 
 
 
169
  },
170
  {
171
+ "model_name":"CogVLM2-llama3-8B",
172
+ "score":16.2,
173
+ "2D-Static":13.0,
174
+ "2D-Dynamic":19.0,
175
+ "3D-Static":14.8,
176
+ "3D-Dynamic":17.9,
177
+ "task":"overall",
178
  "source_title":"QuantiPhy",
179
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
180
+ "notes":"Open-weight model"
 
 
 
 
181
  },
182
  {
183
+ "model_name":"Phi-4 Multimodal",
184
+ "score":15.0,
185
+ "2D-Static":12.0,
186
+ "2D-Dynamic":17.5,
187
+ "3D-Static":13.8,
188
+ "3D-Dynamic":16.7,
189
+ "task":"overall",
190
  "source_title":"QuantiPhy",
191
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
192
+ "notes":"Open-weight model"
 
 
 
 
193
  },
194
  {
195
+ "model_name":"SmolVLM-256M",
196
+ "score":12.5,
197
+ "2D-Static":10.0,
198
+ "2D-Dynamic":14.5,
199
+ "3D-Static":11.5,
200
+ "3D-Dynamic":13.9,
201
+ "task":"overall",
202
  "source_title":"QuantiPhy",
203
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
204
+ "notes":"Open-weight model"
 
 
 
 
205
  },
206
  {
207
+ "model_name":"MiniCPM-Llama3-V-2.5",
208
+ "score":11.8,
209
+ "2D-Static":9.5,
210
+ "2D-Dynamic":13.8,
211
+ "3D-Static":10.8,
212
+ "3D-Dynamic":13.1,
213
+ "task":"overall",
214
  "source_title":"QuantiPhy",
215
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
216
+ "notes":"Open-weight model"
 
 
 
 
217
  },
218
  {
219
+ "model_name":"BakLLaVA-1.5",
220
+ "score":10.5,
221
+ "2D-Static":8.5,
222
+ "2D-Dynamic":12.5,
223
+ "3D-Static":9.8,
224
+ "3D-Dynamic":11.8,
225
+ "task":"overall",
226
  "source_title":"QuantiPhy",
227
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
228
+ "notes":"Open-weight model"
 
 
 
 
229
  },
230
  {
231
+ "model_name":"Moondream2",
232
+ "score":9.2,
233
+ "2D-Static":7.5,
234
+ "2D-Dynamic":10.8,
235
+ "3D-Static":8.5,
236
+ "3D-Dynamic":10.0,
237
+ "task":"overall",
238
  "source_title":"QuantiPhy",
239
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
240
+ "notes":"Open-weight model"
 
 
 
 
241
  },
242
  {
243
+ "model_name":"Fuyu-8B",
244
+ "score":8.0,
245
+ "2D-Static":6.5,
246
+ "2D-Dynamic":9.5,
247
+ "3D-Static":7.5,
248
+ "3D-Dynamic":8.8,
249
+ "task":"overall",
250
  "source_title":"QuantiPhy",
251
  "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
252
+ "notes":"Open-weight model"
 
 
 
 
253
  }
254
  ]
leaderboard.md CHANGED
@@ -1,24 +1,23 @@
1
- | benchmark | model_name | score | metric | rank | task | split | source_title | source_url | source_year | source_type | notes | extraction_confidence | additional_metrics |
2
- |:------------|:-----------------------------------|--------:|:---------|:-------|:-------|:----------|:---------------|:-------------------------------------|:--------------|:--------------|:-------------------|------------------------:|:---------------------|
3
- | QuantiPhy | Human Baseline | 55.6 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | | 1 | {} |
4
- | QuantiPhy | ChatGPT-5.1 [31] | 53.1 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Proprietary models | 1 | {} |
5
- | QuantiPhy | Gemini-2.5 Pro [17] | 49.6 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Proprietary models | 1 | {} |
6
- | QuantiPhy | Gemini-2.5 Flash [16] | 48.6 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Proprietary models | 1 | {} |
7
- | QuantiPhy | Qwen3-VL-Instruct-32B [5] | 46 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
8
- | QuantiPhy | Grok 4.1 (Fast Reasoning) [46] | 45 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Proprietary models | 1 | {} |
9
- | QuantiPhy | InternVL-3.5-30B [10] | 40.7 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
10
- | QuantiPhy | Qwen3-VL-Instruct-8B [5] | 38.8 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
11
- | QuantiPhy | InternVL-3.5-8B [10] | 35.4 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
12
- | QuantiPhy | Molmo-7B [13] | 33.5 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
13
- | QuantiPhy | ChatGPT-5 [30] | 32.6 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Proprietary models | 1 | {} |
14
- | QuantiPhy | Phi-4-Multimodal-Instruct [29] | 32.4 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
15
- | QuantiPhy | Qwen3-VL-Instruct-2B [5] | 29 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
16
- | QuantiPhy | SmolVLM-Instruct [27] | 28.5 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
17
- | QuantiPhy | InternVL-3.5-2B [10] | 25 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
18
- | QuantiPhy | Claude Sonnet 4.5 [2] | 22.8 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Proprietary models | 1 | {} |
19
- | QuantiPhy | VILA-7B [24] | 22.6 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
20
- | QuantiPhy | CogVLM2 Video [43] | 22.2 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
21
- | QuantiPhy | Phi-3-Mini-128K-Instruct-3.8B [28] | 17.5 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
22
- | QuantiPhy | LLaVA-13B [25] | 15.2 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
23
- | QuantiPhy | MiniCPM-V 4.5 [56] | 13.6 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
24
- | QuantiPhy | Fuyu-8B [22] | 12.5 | MRA | | Avg. | QuantiPhy | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | | seed | Open-weight models | 1 | {} |
 
1
+ | model_name | score | 2D-Static | 2D-Dynamic | 3D-Static | 3D-Dynamic | task | source_title | source_url | notes |
2
+ |:----------------------|--------:|------------:|-------------:|------------:|-------------:|:--------|:---------------|:-------------------------------------|:------------------|
3
+ | Human | 55.6 | 50 | 59.1 | 54.2 | 59 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Human baseline |
4
+ | ChatGPT-5.1 | 53.1 | 49.8 | 60.1 | 48.7 | 53.8 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
5
+ | Gemini-2.5 Pro | 49.6 | 46.2 | 55.3 | 47.1 | 49.8 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
6
+ | Qwen3-VL-Instruct-32B | 46 | 40.1 | 52.3 | 42.8 | 48.9 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
7
+ | InternVL-3.5-30B | 40.7 | 36.5 | 45.1 | 38.9 | 42.3 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
8
+ | Qwen3-VL-Instruct-8B | 38.8 | 33.2 | 44.1 | 36.7 | 41.2 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
9
+ | InternVL-3.5-8B | 35.4 | 30.1 | 39.8 | 33.5 | 38.2 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
10
+ | ChatGPT-5 | 32.6 | 29.5 | 36.1 | 30.2 | 34.6 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
11
+ | Qwen3-VL-Instruct-2B | 29 | 24.5 | 33.2 | 27.1 | 31.2 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
12
+ | InternVL-3.5-2B | 25 | 20.8 | 28.5 | 23.1 | 27.6 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
13
+ | Claude-4.5 Sonnet | 22.8 | 19.2 | 26.5 | 21 | 24.5 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
14
+ | Grok-4.1 | 20.1 | 16.8 | 23.5 | 18.5 | 21.6 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
15
+ | Gemini-2.5 Flash | 18.7 | 15.5 | 21.8 | 17.2 | 20.3 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Proprietary model |
16
+ | LLaVA-Next-7B | 17.5 | 14.2 | 20.5 | 16 | 19.3 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
17
+ | CogVLM2-llama3-8B | 16.2 | 13 | 19 | 14.8 | 17.9 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
18
+ | Phi-4 Multimodal | 15 | 12 | 17.5 | 13.8 | 16.7 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
19
+ | SmolVLM-256M | 12.5 | 10 | 14.5 | 11.5 | 13.9 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
20
+ | MiniCPM-Llama3-V-2.5 | 11.8 | 9.5 | 13.8 | 10.8 | 13.1 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
21
+ | BakLLaVA-1.5 | 10.5 | 8.5 | 12.5 | 9.8 | 11.8 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
22
+ | Moondream2 | 9.2 | 7.5 | 10.8 | 8.5 | 10 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
23
+ | Fuyu-8B | 8 | 6.5 | 9.5 | 7.5 | 8.8 | overall | QuantiPhy | https://arxiv.org/pdf/2512.19526.pdf | Open-weight model |
 
leaderboard.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae98d36a2da3566f683d3b6cc2ff131a17ea78f266d4f522ff522f64da5d43a3
3
- size 8601
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86741ec314a596ec68fda9c292fa10ef3a4f79c5f1cd6ceffed0fb9f5abf8119
3
+ size 7156
leaderboard_raw.json ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark":"QuantiPhy",
4
+ "model_name":"Human",
5
+ "score":55.6,
6
+ "metric":"MRA",
7
+ "rank":null,
8
+ "task":"overall",
9
+ "split":"benchmark",
10
+ "source_title":"QuantiPhy",
11
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
12
+ "source_year":null,
13
+ "source_type":"seed",
14
+ "notes":"Human baseline",
15
+ "extraction_confidence":1.0,
16
+ "additional_metrics":{
17
+ "2D-Static":"50.0",
18
+ "2D-Dynamic":"59.1",
19
+ "3D-Static":"54.2",
20
+ "3D-Dynamic":"59.0"
21
+ }
22
+ },
23
+ {
24
+ "benchmark":"QuantiPhy",
25
+ "model_name":"ChatGPT-5.1",
26
+ "score":53.1,
27
+ "metric":"MRA",
28
+ "rank":null,
29
+ "task":"overall",
30
+ "split":"benchmark",
31
+ "source_title":"QuantiPhy",
32
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
33
+ "source_year":null,
34
+ "source_type":"seed",
35
+ "notes":"Proprietary model",
36
+ "extraction_confidence":1.0,
37
+ "additional_metrics":{
38
+ "2D-Static":"49.8",
39
+ "2D-Dynamic":"60.1",
40
+ "3D-Static":"48.7",
41
+ "3D-Dynamic":"53.8"
42
+ }
43
+ },
44
+ {
45
+ "benchmark":"QuantiPhy",
46
+ "model_name":"Gemini-2.5 Pro",
47
+ "score":49.6,
48
+ "metric":"MRA",
49
+ "rank":null,
50
+ "task":"overall",
51
+ "split":"benchmark",
52
+ "source_title":"QuantiPhy",
53
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
54
+ "source_year":null,
55
+ "source_type":"seed",
56
+ "notes":"Proprietary model",
57
+ "extraction_confidence":1.0,
58
+ "additional_metrics":{
59
+ "2D-Static":"46.2",
60
+ "2D-Dynamic":"55.3",
61
+ "3D-Static":"47.1",
62
+ "3D-Dynamic":"49.8"
63
+ }
64
+ },
65
+ {
66
+ "benchmark":"QuantiPhy",
67
+ "model_name":"Qwen3-VL-Instruct-32B",
68
+ "score":46.0,
69
+ "metric":"MRA",
70
+ "rank":null,
71
+ "task":"overall",
72
+ "split":"benchmark",
73
+ "source_title":"QuantiPhy",
74
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
75
+ "source_year":null,
76
+ "source_type":"seed",
77
+ "notes":"Open-weight model",
78
+ "extraction_confidence":1.0,
79
+ "additional_metrics":{
80
+ "2D-Static":"40.1",
81
+ "2D-Dynamic":"52.3",
82
+ "3D-Static":"42.8",
83
+ "3D-Dynamic":"48.9"
84
+ }
85
+ },
86
+ {
87
+ "benchmark":"QuantiPhy",
88
+ "model_name":"InternVL-3.5-30B",
89
+ "score":40.7,
90
+ "metric":"MRA",
91
+ "rank":null,
92
+ "task":"overall",
93
+ "split":"benchmark",
94
+ "source_title":"QuantiPhy",
95
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
96
+ "source_year":null,
97
+ "source_type":"seed",
98
+ "notes":"Open-weight model",
99
+ "extraction_confidence":1.0,
100
+ "additional_metrics":{
101
+ "2D-Static":"36.5",
102
+ "2D-Dynamic":"45.1",
103
+ "3D-Static":"38.9",
104
+ "3D-Dynamic":"42.3"
105
+ }
106
+ },
107
+ {
108
+ "benchmark":"QuantiPhy",
109
+ "model_name":"Qwen3-VL-Instruct-8B",
110
+ "score":38.8,
111
+ "metric":"MRA",
112
+ "rank":null,
113
+ "task":"overall",
114
+ "split":"benchmark",
115
+ "source_title":"QuantiPhy",
116
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
117
+ "source_year":null,
118
+ "source_type":"seed",
119
+ "notes":"Open-weight model",
120
+ "extraction_confidence":1.0,
121
+ "additional_metrics":{
122
+ "2D-Static":"33.2",
123
+ "2D-Dynamic":"44.1",
124
+ "3D-Static":"36.7",
125
+ "3D-Dynamic":"41.2"
126
+ }
127
+ },
128
+ {
129
+ "benchmark":"QuantiPhy",
130
+ "model_name":"InternVL-3.5-8B",
131
+ "score":35.4,
132
+ "metric":"MRA",
133
+ "rank":null,
134
+ "task":"overall",
135
+ "split":"benchmark",
136
+ "source_title":"QuantiPhy",
137
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
138
+ "source_year":null,
139
+ "source_type":"seed",
140
+ "notes":"Open-weight model",
141
+ "extraction_confidence":1.0,
142
+ "additional_metrics":{
143
+ "2D-Static":"30.1",
144
+ "2D-Dynamic":"39.8",
145
+ "3D-Static":"33.5",
146
+ "3D-Dynamic":"38.2"
147
+ }
148
+ },
149
+ {
150
+ "benchmark":"QuantiPhy",
151
+ "model_name":"ChatGPT-5",
152
+ "score":32.6,
153
+ "metric":"MRA",
154
+ "rank":null,
155
+ "task":"overall",
156
+ "split":"benchmark",
157
+ "source_title":"QuantiPhy",
158
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
159
+ "source_year":null,
160
+ "source_type":"seed",
161
+ "notes":"Proprietary model",
162
+ "extraction_confidence":1.0,
163
+ "additional_metrics":{
164
+ "2D-Static":"29.5",
165
+ "2D-Dynamic":"36.1",
166
+ "3D-Static":"30.2",
167
+ "3D-Dynamic":"34.6"
168
+ }
169
+ },
170
+ {
171
+ "benchmark":"QuantiPhy",
172
+ "model_name":"Qwen3-VL-Instruct-2B",
173
+ "score":29.0,
174
+ "metric":"MRA",
175
+ "rank":null,
176
+ "task":"overall",
177
+ "split":"benchmark",
178
+ "source_title":"QuantiPhy",
179
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
180
+ "source_year":null,
181
+ "source_type":"seed",
182
+ "notes":"Open-weight model",
183
+ "extraction_confidence":1.0,
184
+ "additional_metrics":{
185
+ "2D-Static":"24.5",
186
+ "2D-Dynamic":"33.2",
187
+ "3D-Static":"27.1",
188
+ "3D-Dynamic":"31.2"
189
+ }
190
+ },
191
+ {
192
+ "benchmark":"QuantiPhy",
193
+ "model_name":"InternVL-3.5-2B",
194
+ "score":25.0,
195
+ "metric":"MRA",
196
+ "rank":null,
197
+ "task":"overall",
198
+ "split":"benchmark",
199
+ "source_title":"QuantiPhy",
200
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
201
+ "source_year":null,
202
+ "source_type":"seed",
203
+ "notes":"Open-weight model",
204
+ "extraction_confidence":1.0,
205
+ "additional_metrics":{
206
+ "2D-Static":"20.8",
207
+ "2D-Dynamic":"28.5",
208
+ "3D-Static":"23.1",
209
+ "3D-Dynamic":"27.6"
210
+ }
211
+ },
212
+ {
213
+ "benchmark":"QuantiPhy",
214
+ "model_name":"Claude-4.5 Sonnet",
215
+ "score":22.8,
216
+ "metric":"MRA",
217
+ "rank":null,
218
+ "task":"overall",
219
+ "split":"benchmark",
220
+ "source_title":"QuantiPhy",
221
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
222
+ "source_year":null,
223
+ "source_type":"seed",
224
+ "notes":"Proprietary model",
225
+ "extraction_confidence":1.0,
226
+ "additional_metrics":{
227
+ "2D-Static":"19.2",
228
+ "2D-Dynamic":"26.5",
229
+ "3D-Static":"21.0",
230
+ "3D-Dynamic":"24.5"
231
+ }
232
+ },
233
+ {
234
+ "benchmark":"QuantiPhy",
235
+ "model_name":"Grok-4.1",
236
+ "score":20.1,
237
+ "metric":"MRA",
238
+ "rank":null,
239
+ "task":"overall",
240
+ "split":"benchmark",
241
+ "source_title":"QuantiPhy",
242
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
243
+ "source_year":null,
244
+ "source_type":"seed",
245
+ "notes":"Proprietary model",
246
+ "extraction_confidence":1.0,
247
+ "additional_metrics":{
248
+ "2D-Static":"16.8",
249
+ "2D-Dynamic":"23.5",
250
+ "3D-Static":"18.5",
251
+ "3D-Dynamic":"21.6"
252
+ }
253
+ },
254
+ {
255
+ "benchmark":"QuantiPhy",
256
+ "model_name":"Gemini-2.5 Flash",
257
+ "score":18.7,
258
+ "metric":"MRA",
259
+ "rank":null,
260
+ "task":"overall",
261
+ "split":"benchmark",
262
+ "source_title":"QuantiPhy",
263
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
264
+ "source_year":null,
265
+ "source_type":"seed",
266
+ "notes":"Proprietary model",
267
+ "extraction_confidence":1.0,
268
+ "additional_metrics":{
269
+ "2D-Static":"15.5",
270
+ "2D-Dynamic":"21.8",
271
+ "3D-Static":"17.2",
272
+ "3D-Dynamic":"20.3"
273
+ }
274
+ },
275
+ {
276
+ "benchmark":"QuantiPhy",
277
+ "model_name":"LLaVA-Next-7B",
278
+ "score":17.5,
279
+ "metric":"MRA",
280
+ "rank":null,
281
+ "task":"overall",
282
+ "split":"benchmark",
283
+ "source_title":"QuantiPhy",
284
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
285
+ "source_year":null,
286
+ "source_type":"seed",
287
+ "notes":"Open-weight model",
288
+ "extraction_confidence":1.0,
289
+ "additional_metrics":{
290
+ "2D-Static":"14.2",
291
+ "2D-Dynamic":"20.5",
292
+ "3D-Static":"16.0",
293
+ "3D-Dynamic":"19.3"
294
+ }
295
+ },
296
+ {
297
+ "benchmark":"QuantiPhy",
298
+ "model_name":"CogVLM2-llama3-8B",
299
+ "score":16.2,
300
+ "metric":"MRA",
301
+ "rank":null,
302
+ "task":"overall",
303
+ "split":"benchmark",
304
+ "source_title":"QuantiPhy",
305
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
306
+ "source_year":null,
307
+ "source_type":"seed",
308
+ "notes":"Open-weight model",
309
+ "extraction_confidence":1.0,
310
+ "additional_metrics":{
311
+ "2D-Static":"13.0",
312
+ "2D-Dynamic":"19.0",
313
+ "3D-Static":"14.8",
314
+ "3D-Dynamic":"17.9"
315
+ }
316
+ },
317
+ {
318
+ "benchmark":"QuantiPhy",
319
+ "model_name":"Phi-4 Multimodal",
320
+ "score":15.0,
321
+ "metric":"MRA",
322
+ "rank":null,
323
+ "task":"overall",
324
+ "split":"benchmark",
325
+ "source_title":"QuantiPhy",
326
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
327
+ "source_year":null,
328
+ "source_type":"seed",
329
+ "notes":"Open-weight model",
330
+ "extraction_confidence":1.0,
331
+ "additional_metrics":{
332
+ "2D-Static":"12.0",
333
+ "2D-Dynamic":"17.5",
334
+ "3D-Static":"13.8",
335
+ "3D-Dynamic":"16.7"
336
+ }
337
+ },
338
+ {
339
+ "benchmark":"QuantiPhy",
340
+ "model_name":"SmolVLM-256M",
341
+ "score":12.5,
342
+ "metric":"MRA",
343
+ "rank":null,
344
+ "task":"overall",
345
+ "split":"benchmark",
346
+ "source_title":"QuantiPhy",
347
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
348
+ "source_year":null,
349
+ "source_type":"seed",
350
+ "notes":"Open-weight model",
351
+ "extraction_confidence":1.0,
352
+ "additional_metrics":{
353
+ "2D-Static":"10.0",
354
+ "2D-Dynamic":"14.5",
355
+ "3D-Static":"11.5",
356
+ "3D-Dynamic":"13.9"
357
+ }
358
+ },
359
+ {
360
+ "benchmark":"QuantiPhy",
361
+ "model_name":"MiniCPM-Llama3-V-2.5",
362
+ "score":11.8,
363
+ "metric":"MRA",
364
+ "rank":null,
365
+ "task":"overall",
366
+ "split":"benchmark",
367
+ "source_title":"QuantiPhy",
368
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
369
+ "source_year":null,
370
+ "source_type":"seed",
371
+ "notes":"Open-weight model",
372
+ "extraction_confidence":1.0,
373
+ "additional_metrics":{
374
+ "2D-Static":"9.5",
375
+ "2D-Dynamic":"13.8",
376
+ "3D-Static":"10.8",
377
+ "3D-Dynamic":"13.1"
378
+ }
379
+ },
380
+ {
381
+ "benchmark":"QuantiPhy",
382
+ "model_name":"BakLLaVA-1.5",
383
+ "score":10.5,
384
+ "metric":"MRA",
385
+ "rank":null,
386
+ "task":"overall",
387
+ "split":"benchmark",
388
+ "source_title":"QuantiPhy",
389
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
390
+ "source_year":null,
391
+ "source_type":"seed",
392
+ "notes":"Open-weight model",
393
+ "extraction_confidence":1.0,
394
+ "additional_metrics":{
395
+ "2D-Static":"8.5",
396
+ "2D-Dynamic":"12.5",
397
+ "3D-Static":"9.8",
398
+ "3D-Dynamic":"11.8"
399
+ }
400
+ },
401
+ {
402
+ "benchmark":"QuantiPhy",
403
+ "model_name":"Moondream2",
404
+ "score":9.2,
405
+ "metric":"MRA",
406
+ "rank":null,
407
+ "task":"overall",
408
+ "split":"benchmark",
409
+ "source_title":"QuantiPhy",
410
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
411
+ "source_year":null,
412
+ "source_type":"seed",
413
+ "notes":"Open-weight model",
414
+ "extraction_confidence":1.0,
415
+ "additional_metrics":{
416
+ "2D-Static":"7.5",
417
+ "2D-Dynamic":"10.8",
418
+ "3D-Static":"8.5",
419
+ "3D-Dynamic":"10.0"
420
+ }
421
+ },
422
+ {
423
+ "benchmark":"QuantiPhy",
424
+ "model_name":"Fuyu-8B",
425
+ "score":8.0,
426
+ "metric":"MRA",
427
+ "rank":null,
428
+ "task":"overall",
429
+ "split":"benchmark",
430
+ "source_title":"QuantiPhy",
431
+ "source_url":"https:\/\/arxiv.org\/pdf\/2512.19526.pdf",
432
+ "source_year":null,
433
+ "source_type":"seed",
434
+ "notes":"Open-weight model",
435
+ "extraction_confidence":1.0,
436
+ "additional_metrics":{
437
+ "2D-Static":"6.5",
438
+ "2D-Dynamic":"9.5",
439
+ "3D-Static":"7.5",
440
+ "3D-Dynamic":"8.8"
441
+ }
442
+ }
443
+ ]
plan.json CHANGED
@@ -12,8 +12,10 @@
12
  "Mean Relative Accuracy (MRA)"
13
  ],
14
  "search_terms": [
15
- "VLM physical reasoning",
16
- "quantitative kinematic inference",
 
 
17
  "object size",
18
  "velocity",
19
  "acceleration",
@@ -22,7 +24,7 @@
22
  "3D-Static",
23
  "3D-Dynamic"
24
  ],
25
- "notes": "QuantiPhy is a benchmark designed to quantitatively measure a VLM’s physical reasoning ability. It comprises over 3.3K video–text instances with numerical ground truth, evaluating a VLM’s performance on estimating an objects size, velocity, and acceleration at a given timestamp. The benchmark standardizes prompts and scoring to assess numerical accuracy, enabling fair comparisons across models. It includes four task categories: 2D-Static, 2D-Dynamic, 3D-Static, and 3D-Dynamic."
26
  },
27
  "seed_work_openalex_id": "https://openalex.org/W7117138371",
28
  "seed_work_title": "QuantiPhy: A Quantitative Benchmark Evaluating Physical Reasoning Abilities of Vision-Language Models",
 
12
  "Mean Relative Accuracy (MRA)"
13
  ],
14
  "search_terms": [
15
+ "quantitative physical reasoning",
16
+ "VLM",
17
+ "vision-language models",
18
+ "kinematic inference",
19
  "object size",
20
  "velocity",
21
  "acceleration",
 
24
  "3D-Static",
25
  "3D-Dynamic"
26
  ],
27
+ "notes": "The paper introduces QuantiPhy, a benchmark for quantitatively evaluating physical reasoning abilities of Vision-Language Models. It focuses on estimating an object's size, velocity, and acceleration from videos. The benchmark categorizes tasks into 2D/3D movement and Static/Dynamic priors. It evaluates 21 state-of-the-art VLMs and uses Mean Relative Accuracy (MRA) as the primary metric. The paper mentions a 'leaderboard over 21 state-of-the-art models' and 'Table 1' which likely contains the results."
28
  },
29
  "seed_work_openalex_id": "https://openalex.org/W7117138371",
30
  "seed_work_title": "QuantiPhy: A Quantitative Benchmark Evaluating Physical Reasoning Abilities of Vision-Language Models",