"""
FormationEval Leaderboard - Interactive benchmark results viewer
72 models evaluated on 505 petroleum geoscience MCQs. The DISKOS-QA and
SPE MCQ Dataset tracks have been imported into the public suite; a rerun
on the expanded suite is pending.
"""
import json
import gradio as gr
import pandas as pd
import plotly.express as px
from pathlib import Path
# Load data
DATA_PATH = Path(__file__).parent / "leaderboard_data.json"
with open(DATA_PATH) as f:
    data = json.load(f)
df = pd.DataFrame(data["models"])
metadata = data["metadata"]
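# Shape of leaderboard_data.json assumed by the code below (illustrative
# sketch; the field names are taken from the column lists that follow, the
# values are made up):
# {
#   "metadata": {"total_models": 72, "total_questions": 505},
#   "models": [
#     {"rank": 1, "model": "example-model", "company": "ExampleAI",
#      "open_weight": false, "price_input": 3.0, "price_output": 15.0,
#      "price_avg": 9.0, "accuracy": 87.5, "correct": 442, "total": 505,
#      "easy": 95.4, "medium": 88.0, "hard": 74.7,
#      "drilling": 83.3, "geophysics": 86.2, "petroleum_geology": 88.1,
#      "petrophysics": 89.0, "production": 78.6, "reservoir": 81.4,
#      "sedimentology": 85.7, "position_bias": 0.04, "length_bias": 0.31},
#     ...
#   ]
# }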
# Company list for filter
COMPANIES = sorted(df["company"].unique().tolist())
# Column configurations
OVERALL_COLS = ["rank", "model", "company", "open_weight", "price_input", "price_output", "accuracy", "correct", "total"]
DIFFICULTY_COLS = ["rank", "model", "company", "accuracy", "easy", "medium", "hard"]
DOMAIN_COLS = ["rank", "model", "drilling", "geophysics", "petroleum_geology", "petrophysics", "production", "reservoir", "sedimentology"]
BIAS_COLS = ["rank", "model", "company", "accuracy", "position_bias", "length_bias"]
# Display names for columns
COLUMN_NAMES = {
    "rank": "Rank",
    "model": "Model",
    "company": "Company",
    "open_weight": "Open",
    "price_input": "Input $/M",
    "price_output": "Output $/M",
    "price_avg": "Avg $/M",
    "accuracy": "Accuracy %",
    "correct": "Correct",
    "total": "Total",
    "easy": "Easy %",
    "medium": "Medium %",
    "hard": "Hard %",
    "drilling": "Drilling %",
    "geophysics": "Geophysics %",
    "petroleum_geology": "Petrol. Geo. %",
    "petrophysics": "Petrophysics %",
    "production": "Production %",
    "reservoir": "Reservoir %",
    "sedimentology": "Sediment. %",
    "position_bias": "Position bias",
    "length_bias": "Length bias",
}
def filter_dataframe(df_in, search_query, company_filter, open_weight_only):
    """Apply filters to dataframe."""
    df_out = df_in.copy()
    if search_query:
        df_out = df_out[df_out["model"].str.lower().str.contains(search_query.lower(), regex=False)]
    if company_filter and company_filter != "All":
        df_out = df_out[df_out["company"] == company_filter]
    if open_weight_only:
        df_out = df_out[df_out["open_weight"]]
    return df_out
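# Example (hypothetical values): filter_dataframe(df, "llama", "Meta", True)
# keeps the open-weight Meta rows whose model name contains "llama"; the
# three filters compose with AND semantics.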
def get_overall_table(search_query, company_filter, open_weight_only):
    """Get filtered overall leaderboard."""
    filtered = filter_dataframe(df, search_query, company_filter, open_weight_only)
    result = filtered[OVERALL_COLS].copy()
    result.columns = [COLUMN_NAMES.get(c, c) for c in OVERALL_COLS]
    return result

def get_difficulty_table(search_query, company_filter, open_weight_only):
    """Get filtered difficulty breakdown."""
    filtered = filter_dataframe(df, search_query, company_filter, open_weight_only)
    result = filtered[DIFFICULTY_COLS].copy()
    result.columns = [COLUMN_NAMES.get(c, c) for c in DIFFICULTY_COLS]
    return result

def get_domain_table(search_query, company_filter, open_weight_only):
    """Get filtered domain breakdown."""
    filtered = filter_dataframe(df, search_query, company_filter, open_weight_only)
    result = filtered[DOMAIN_COLS].copy()
    result.columns = [COLUMN_NAMES.get(c, c) for c in DOMAIN_COLS]
    return result

def get_bias_table(search_query, company_filter, open_weight_only):
    """Get filtered bias analysis."""
    filtered = filter_dataframe(df, search_query, company_filter, open_weight_only)
    result = filtered[BIAS_COLS].copy()
    result.columns = [COLUMN_NAMES.get(c, c) for c in BIAS_COLS]
    return result
def create_accuracy_vs_price_chart(open_weight_only=False):
    """Create interactive scatter plot of accuracy vs price."""
    # Local name avoids shadowing the module-level `data` dict
    chart_df = df.copy()
    if open_weight_only:
        chart_df = chart_df[chart_df["open_weight"]]
    fig = px.scatter(
        chart_df,
        x="price_avg",
        y="accuracy",
        color="open_weight",
        hover_name="model",
        hover_data={
            "company": True,
            "accuracy": ":.1f",
            "price_avg": ":.2f",
            "open_weight": True,
        },
        color_discrete_map={True: "#1f77b4", False: "#ff7f0e"},
        labels={
            "price_avg": "Average price ($/M tokens)",
            "accuracy": "Accuracy (%)",
            "open_weight": "Open weight",
        },
        title=f"Accuracy vs price ({len(chart_df)} models)",
    )
    fig.update_traces(marker=dict(size=10, opacity=0.8))
    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        legend_title_text="Model type",
        legend=dict(
            itemsizing="constant",
            title_font_size=12,
        ),
        hovermode="closest",
        height=800,
    )
    # Rename the boolean legend entries to human-readable labels
    fig.for_each_trace(
        lambda t: t.update(name="Open weight" if t.name == "True" else "Closed")
    )
    return fig
def create_all_models_chart(open_weight_only=False):
    """Create horizontal bar chart of all models."""
    chart_df = df.copy()
    if open_weight_only:
        chart_df = chart_df[chart_df["open_weight"]]
    all_models = chart_df.sort_values("accuracy", ascending=False)
    # Preserve sort order with category_orders
    model_order = all_models["model"].tolist()
    fig = px.bar(
        all_models,
        x="accuracy",
        y="model",
        orientation="h",
        color="open_weight",
        color_discrete_map={True: "#1f77b4", False: "#ff7f0e"},
        hover_data={"company": True, "accuracy": ":.1f"},
        labels={"accuracy": "Accuracy (%)", "model": "Model"},
        title=f"All models by accuracy ({len(all_models)} total)",
        category_orders={"model": model_order},
    )
    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        yaxis=dict(tickfont=dict(size=9)),
        legend_title_text="Model type",
        showlegend=True,
        height=max(400, len(all_models) * 20),
    )
    fig.for_each_trace(
        lambda t: t.update(name="Open weight" if t.name == "True" else "Closed")
    )
    return fig
def create_open_weight_chart():
    """Create bar chart of all open-weight models."""
    open_models = df[df["open_weight"]].sort_values("accuracy", ascending=False)
    model_order = open_models["model"].tolist()
    fig = px.bar(
        open_models,
        x="accuracy",
        y="model",
        orientation="h",
        color_discrete_sequence=["#1f77b4"],
        hover_data={"company": True, "accuracy": ":.1f"},
        labels={"accuracy": "Accuracy (%)", "model": "Model"},
        title=f"Open-weight models ({len(open_models)} total)",
        category_orders={"model": model_order},
    )
    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        yaxis=dict(tickfont=dict(size=10)),
        height=max(400, len(open_models) * 18),
    )
    return fig
def create_domain_heatmap(open_weight_only=False):
    """Create heatmap of domain performance for top 50 models."""
    chart_df = df.copy()
    if open_weight_only:
        chart_df = chart_df[chart_df["open_weight"]]
    # Limit to top 50 models by rank
    top_models = chart_df.sort_values("rank").head(50)
    domain_cols = ["drilling", "geophysics", "petroleum_geology", "petrophysics", "production", "reservoir", "sedimentology"]
    heatmap_data = top_models[["model"] + domain_cols].set_index("model")
    heatmap_data.columns = ["Drilling", "Geophysics", "Petrol. Geo.", "Petrophysics", "Production", "Reservoir", "Sediment."]
    fig = px.imshow(
        heatmap_data,
        color_continuous_scale="RdYlGn",
        aspect="auto",
        title=f"Domain performance heatmap (top {len(top_models)} models)",
        labels=dict(color="Accuracy %"),
    )
    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        height=max(400, len(top_models) * 20),
    )
    return fig
# About text
ABOUT_TEXT = """
## FormationEval Benchmark
**FormationEval** is an open multiple-choice question (MCQ) benchmark for evaluating language models on petroleum geoscience and subsurface disciplines.
### Key statistics
- **505 questions** across 7 domains
- **72 models** evaluated
- **3 authoritative sources**: Ellis & Singer (2007), Bjørlykke (2010), TU Delft OCW
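
Scoring is plain MCQ accuracy: the model is shown the question with lettered options and graded on the letter it returns. A minimal sketch of how one item could be scored (illustrative only; `ask_model` is a hypothetical prompt-to-reply callable, not the actual evaluation harness):

```python
import re

LETTERS = "ABCD"

def score_item(question, options, answer_key, ask_model):
    # Label the options A-D and ask the (hypothetical) model callable.
    labelled = [f"{letter}. {text}" for letter, text in zip(LETTERS, options)]
    reply = ask_model(question, labelled)
    # Grade the first answer letter found in the reply against the key.
    match = re.search("[" + LETTERS[: len(options)] + "]", reply.upper())
    return match is not None and match.group(0) == answer_key
```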
### Project context
FormationEval `v0.1` was the first 505-question version of the benchmark and formed a small part of the work presented at EAGE Digital 2026 in Stavanger, in the session *New Frontiers In Geomodelling: Recent Digital Advances*, under the title *Multi-Agent Framework for Subsurface Workflows: Petrophysicist, Geologist and Reservoir Engineer GenAI Agents*. It was built to compare models on oil and gas geoscience and subsurface tasks and to provide a public leaderboard that is useful in practice; at the time, I did not see any public benchmark or leaderboard in this area that met that need. DISKOS-QA and the SPE MCQ Dataset were added later as separate imported tracks in the same suite.
### Current suite status
- **MCQ v0.1**: evaluated and shown in this Space
- **DISKOS-QA**: imported into the public suite, not yet rerun in this Space
- **SPE MCQ Dataset**: imported into the public suite, not yet rerun in this Space
- The project's original model-comparison goal is already addressed by the published MCQ leaderboard
- A full rerun on the expanded suite is pending: this is a self-funded, one-person project, and evaluating the expanded suite requires materially more token spend
- To collaborate, support reruns, or discuss related research and engineering work, contact almaz.ermilov@gmail.com
- Imported track provenance and licensing notes: [main repository notices](https://github.com/AlmazErmilov/FormationEval-an-Open-Benchmark-for-Oil-Gas-Geoscience-MCQ-Evaluation/blob/main/THIRD_PARTY_NOTICES.md), [dataset notices](https://huggingface.co/datasets/AlmazErmilov/FormationEval/blob/main/THIRD_PARTY_NOTICES.md), the upstream [DISKOS-QA benchmark](https://github.com/georgeghon/DISKOS-QA), and the upstream [SPE MCQ dataset](https://huggingface.co/datasets/ynuwara/spe_mcq_dataset)
### Domains
| Domain | Questions |
|--------|-----------|
| Petrophysics | 272 |
| Petroleum Geology | 151 |
| Sedimentology | 98 |
| Geophysics | 80 |
| Reservoir Engineering | 43 |
| Drilling Engineering | 24 |
| Production Engineering | 14 |
### Difficulty distribution
| Level | Count | % |
|-------|-------|---|
| Easy | 132 | 26% |
| Medium | 274 | 54% |
| Hard | 99 | 20% |
### Links
- **Paper**: [arXiv:2601.02158](https://arxiv.org/abs/2601.02158)
- **Dataset**: [AlmazErmilov/FormationEval](https://huggingface.co/datasets/AlmazErmilov/FormationEval)
- **GitHub**: [FormationEval Repository](https://github.com/AlmazErmilov/FormationEval-an-Open-Benchmark-for-Oil-Gas-Geoscience-MCQ-Evaluation)
- **Website**: [formationeval.no](https://www.formationeval.no)
- **DISKOS-QA browser**: [formationeval.no/diskos-qa](https://www.formationeval.no/diskos-qa)
- **Unified question browser**: [formationeval.no/questions](https://www.formationeval.no/questions)
- **Third-party notices**: [main repository notices](https://github.com/AlmazErmilov/FormationEval-an-Open-Benchmark-for-Oil-Gas-Geoscience-MCQ-Evaluation/blob/main/THIRD_PARTY_NOTICES.md)
"""
CITATION_BIBTEX = """@misc{ermilov2026formationeval,
  title={FormationEval, an open multiple-choice benchmark for petroleum geoscience},
  author={Almaz Ermilov},
  year={2026},
  eprint={2601.02158},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2601.02158},
}"""
# Custom CSS to show more rows in tables (~20 rows visible)
CUSTOM_CSS = """
.tall-table {
    min-height: 700px !important;
}
.tall-table .table-wrap {
    max-height: 700px !important;
}
"""
# Build Gradio interface
with gr.Blocks(
    title="FormationEval Leaderboard",
    css=CUSTOM_CSS,
    # The theme belongs on Blocks; launch() has no theme parameter
    theme=gr.themes.Soft(font=gr.themes.GoogleFont("Inter")),
) as demo:
    gr.Markdown("# FormationEval Leaderboard")
    gr.Markdown(f"**{metadata['total_models']} models** evaluated on **{metadata['total_questions']} petroleum geoscience MCQs**")
    gr.Markdown(
        "> March 2026 update: FormationEval now also includes the imported DISKOS-QA and SPE MCQ tracks. "
        "This Space still displays results for the evaluated MCQ v0.1 track only. "
        "A full rerun on the expanded suite is pending because this is a self-funded, "
        "one-person project and evaluating the expanded suite requires materially more token spend."
    )
    # Filters row
    with gr.Row():
        search_input = gr.Textbox(
            label="Search model",
            placeholder="Type to search...",
            scale=2,
        )
        company_dropdown = gr.Dropdown(
            choices=["All"] + COMPANIES,
            value="All",
            label="Company",
            scale=1,
        )
        open_weight_checkbox = gr.Checkbox(
            label="Open-weight only",
            value=False,
            scale=1,
        )
    with gr.Tabs():
        # Overall tab
        with gr.Tab("Overall"):
            overall_table = gr.Dataframe(
                value=get_overall_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )
        # Difficulty tab
        with gr.Tab("By difficulty"):
            difficulty_table = gr.Dataframe(
                value=get_difficulty_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )
        # Domain tab
        with gr.Tab("By domain"):
            domain_table = gr.Dataframe(
                value=get_domain_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )
        # Bias tab
        with gr.Tab("Bias analysis"):
            gr.Markdown("**Position bias**: how much the model favors particular answer positions (A/B/C/D). Lower is better.")
            gr.Markdown("**Length bias**: how much the model favors longer answers. A high value means the model tends to pick the longest option.")
            bias_table = gr.Dataframe(
                value=get_bias_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )
        # Charts tab
        with gr.Tab("Charts"):
            with gr.Row():
                accuracy_price_plot = gr.Plot(value=create_accuracy_vs_price_chart(), label="Accuracy vs price")
            with gr.Row():
                with gr.Column():
                    all_models_plot = gr.Plot(value=create_all_models_chart(), label="All models")
                with gr.Column():
                    open_weight_plot = gr.Plot(value=create_open_weight_chart(), label="Open-weight models")
            with gr.Row():
                domain_heatmap_plot = gr.Plot(value=create_domain_heatmap(), label="Domain performance")
        # About tab
        with gr.Tab("About"):
            gr.Markdown(ABOUT_TEXT)
            gr.Markdown("### Citation")
            gr.Code(CITATION_BIBTEX, language=None, label="BibTeX")
    # Connect filters to tables: every filter control refreshes every table
    filter_inputs = [search_input, company_dropdown, open_weight_checkbox]
    table_updates = [
        (get_overall_table, overall_table),
        (get_difficulty_table, difficulty_table),
        (get_domain_table, domain_table),
        (get_bias_table, bias_table),
    ]
    for control in filter_inputs:
        for table_fn, table in table_updates:
            control.change(table_fn, filter_inputs, table)
    # Connect checkbox to charts (the open-weight chart is always filtered, so it needs no update)
    open_weight_checkbox.change(
        create_accuracy_vs_price_chart,
        inputs=[open_weight_checkbox],
        outputs=[accuracy_price_plot],
    )
    open_weight_checkbox.change(
        create_all_models_chart,
        inputs=[open_weight_checkbox],
        outputs=[all_models_plot],
    )
    open_weight_checkbox.change(
        create_domain_heatmap,
        inputs=[open_weight_checkbox],
        outputs=[domain_heatmap_plot],
    )
if __name__ == "__main__":
    demo.launch()  # theme is set on gr.Blocks above; launch() takes no theme argument