"""
FormationEval Leaderboard - Interactive benchmark results viewer
72 models evaluated on 505 petroleum geoscience MCQs. The DISKOS-QA and
SPE MCQ Dataset tracks have been imported into the public suite; a rerun
on the expanded suite is pending.
"""
import json
import gradio as gr
import pandas as pd
import plotly.express as px
from pathlib import Path
# Load data
DATA_PATH = Path(__file__).parent / "leaderboard_data.json"
with open(DATA_PATH) as f:
    data = json.load(f)
df = pd.DataFrame(data["models"])
metadata = data["metadata"]
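# Shape of leaderboard_data.json assumed by the code below (illustrative
# sketch; the field names are taken from the column lists that follow, the
# values are made up):
# {
#   "metadata": {"total_models": 72, "total_questions": 505},
#   "models": [
#     {"rank": 1, "model": "example-model", "company": "ExampleAI",
#      "open_weight": false, "price_input": 3.0, "price_output": 15.0,
#      "price_avg": 9.0, "accuracy": 87.5, "correct": 442, "total": 505,
#      "easy": 95.4, "medium": 88.0, "hard": 74.7,
#      "drilling": 83.3, "geophysics": 86.2, "petroleum_geology": 88.1,
#      "petrophysics": 89.0, "production": 78.6, "reservoir": 81.4,
#      "sedimentology": 85.7, "position_bias": 0.04, "length_bias": 0.31},
#     ...
#   ]
# }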
# Company list for filter
COMPANIES = sorted(df["company"].unique().tolist())
# Column configurations
OVERALL_COLS = ["rank", "model", "company", "open_weight", "price_input", "price_output", "accuracy", "correct", "total"]
DIFFICULTY_COLS = ["rank", "model", "company", "accuracy", "easy", "medium", "hard"]
DOMAIN_COLS = ["rank", "model", "drilling", "geophysics", "petroleum_geology", "petrophysics", "production", "reservoir", "sedimentology"]
BIAS_COLS = ["rank", "model", "company", "accuracy", "position_bias", "length_bias"]
# Display names for columns
COLUMN_NAMES = {
    "rank": "Rank",
    "model": "Model",
    "company": "Company",
    "open_weight": "Open",
    "price_input": "Input $/M",
    "price_output": "Output $/M",
    "price_avg": "Avg $/M",
    "accuracy": "Accuracy %",
    "correct": "Correct",
    "total": "Total",
    "easy": "Easy %",
    "medium": "Medium %",
    "hard": "Hard %",
    "drilling": "Drilling %",
    "geophysics": "Geophysics %",
    "petroleum_geology": "Petrol. Geo. %",
    "petrophysics": "Petrophysics %",
    "production": "Production %",
    "reservoir": "Reservoir %",
    "sedimentology": "Sediment. %",
    "position_bias": "Position bias",
    "length_bias": "Length bias",
}
def filter_dataframe(df_in, search_query, company_filter, open_weight_only):
    """Apply filters to dataframe."""
    df_out = df_in.copy()
    if search_query:
        df_out = df_out[df_out["model"].str.lower().str.contains(search_query.lower(), regex=False)]
    if company_filter and company_filter != "All":
        df_out = df_out[df_out["company"] == company_filter]
    if open_weight_only:
        df_out = df_out[df_out["open_weight"]]
    return df_out
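# Example (hypothetical values): filter_dataframe(df, "llama", "Meta", True)
# keeps the open-weight Meta rows whose model name contains "llama"; the
# three filters compose with AND semantics.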
def get_overall_table(search_query, company_filter, open_weight_only):
    """Get filtered overall leaderboard."""
    filtered = filter_dataframe(df, search_query, company_filter, open_weight_only)
    result = filtered[OVERALL_COLS].copy()
    result.columns = [COLUMN_NAMES.get(c, c) for c in OVERALL_COLS]
    return result

def get_difficulty_table(search_query, company_filter, open_weight_only):
    """Get filtered difficulty breakdown."""
    filtered = filter_dataframe(df, search_query, company_filter, open_weight_only)
    result = filtered[DIFFICULTY_COLS].copy()
    result.columns = [COLUMN_NAMES.get(c, c) for c in DIFFICULTY_COLS]
    return result

def get_domain_table(search_query, company_filter, open_weight_only):
    """Get filtered domain breakdown."""
    filtered = filter_dataframe(df, search_query, company_filter, open_weight_only)
    result = filtered[DOMAIN_COLS].copy()
    result.columns = [COLUMN_NAMES.get(c, c) for c in DOMAIN_COLS]
    return result

def get_bias_table(search_query, company_filter, open_weight_only):
    """Get filtered bias analysis."""
    filtered = filter_dataframe(df, search_query, company_filter, open_weight_only)
    result = filtered[BIAS_COLS].copy()
    result.columns = [COLUMN_NAMES.get(c, c) for c in BIAS_COLS]
    return result
def create_accuracy_vs_price_chart(open_weight_only=False):
    """Create interactive scatter plot of accuracy vs price."""
    # Local name avoids shadowing the module-level `data` dict
    chart_df = df.copy()
    if open_weight_only:
        chart_df = chart_df[chart_df["open_weight"]]
    fig = px.scatter(
        chart_df,
        x="price_avg",
        y="accuracy",
        color="open_weight",
        hover_name="model",
        hover_data={
            "company": True,
            "accuracy": ":.1f",
            "price_avg": ":.2f",
            "open_weight": True,
        },
        color_discrete_map={True: "#1f77b4", False: "#ff7f0e"},
        labels={
            "price_avg": "Average price ($/M tokens)",
            "accuracy": "Accuracy (%)",
            "open_weight": "Open weight",
        },
        title=f"Accuracy vs price ({len(chart_df)} models)",
    )
    fig.update_traces(marker=dict(size=10, opacity=0.8))
    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        legend_title_text="Model type",
        legend=dict(
            itemsizing="constant",
            title_font_size=12,
        ),
        hovermode="closest",
        height=800,
    )
    # Rename the boolean legend entries to human-readable labels
    fig.for_each_trace(
        lambda t: t.update(name="Open weight" if t.name == "True" else "Closed")
    )
    return fig
def create_all_models_chart(open_weight_only=False):
    """Create horizontal bar chart of all models."""
    chart_df = df.copy()
    if open_weight_only:
        chart_df = chart_df[chart_df["open_weight"]]
    all_models = chart_df.sort_values("accuracy", ascending=False)
    # Preserve sort order with category_orders
    model_order = all_models["model"].tolist()
    fig = px.bar(
        all_models,
        x="accuracy",
        y="model",
        orientation="h",
        color="open_weight",
        color_discrete_map={True: "#1f77b4", False: "#ff7f0e"},
        hover_data={"company": True, "accuracy": ":.1f"},
        labels={"accuracy": "Accuracy (%)", "model": "Model"},
        title=f"All models by accuracy ({len(all_models)} total)",
        category_orders={"model": model_order},
    )
    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        yaxis=dict(tickfont=dict(size=9)),
        legend_title_text="Model type",
        showlegend=True,
        height=max(400, len(all_models) * 20),
    )
    fig.for_each_trace(
        lambda t: t.update(name="Open weight" if t.name == "True" else "Closed")
    )
    return fig
def create_open_weight_chart():
    """Create bar chart of all open-weight models."""
    open_models = df[df["open_weight"]].sort_values("accuracy", ascending=False)
    model_order = open_models["model"].tolist()
    fig = px.bar(
        open_models,
        x="accuracy",
        y="model",
        orientation="h",
        color_discrete_sequence=["#1f77b4"],
        hover_data={"company": True, "accuracy": ":.1f"},
        labels={"accuracy": "Accuracy (%)", "model": "Model"},
        title=f"Open-weight models ({len(open_models)} total)",
        category_orders={"model": model_order},
    )
    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        yaxis=dict(tickfont=dict(size=10)),
        height=max(400, len(open_models) * 18),
    )
    return fig
def create_domain_heatmap(open_weight_only=False):
    """Create heatmap of domain performance for top 50 models."""
    chart_df = df.copy()
    if open_weight_only:
        chart_df = chart_df[chart_df["open_weight"]]
    # Limit to top 50 models by rank
    top_models = chart_df.sort_values("rank").head(50)
    domain_cols = ["drilling", "geophysics", "petroleum_geology", "petrophysics", "production", "reservoir", "sedimentology"]
    heatmap_data = top_models[["model"] + domain_cols].set_index("model")
    heatmap_data.columns = ["Drilling", "Geophysics", "Petrol. Geo.", "Petrophysics", "Production", "Reservoir", "Sediment."]
    fig = px.imshow(
        heatmap_data,
        color_continuous_scale="RdYlGn",
        aspect="auto",
        title=f"Domain performance heatmap (top {len(top_models)} models)",
        labels=dict(color="Accuracy %"),
    )
    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        height=max(400, len(top_models) * 20),
    )
    return fig
# About text
ABOUT_TEXT = """
## FormationEval Benchmark
**FormationEval** is an open multiple-choice question (MCQ) benchmark for evaluating language models on petroleum geoscience and subsurface disciplines.
### Key statistics
- **505 questions** across 7 domains
- **72 models** evaluated
- **3 authoritative sources**: Ellis & Singer (2007), Bjørlykke (2010), TU Delft OCW
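
Scoring is plain MCQ accuracy: the model is shown the question with lettered options and graded on the letter it returns. A minimal sketch of how one item could be scored (illustrative only; `ask_model` is a hypothetical prompt-to-reply callable, not the actual evaluation harness):

```python
import re

LETTERS = "ABCD"

def score_item(question, options, answer_key, ask_model):
    # Label the options A-D and ask the (hypothetical) model callable.
    labelled = [f"{letter}. {text}" for letter, text in zip(LETTERS, options)]
    reply = ask_model(question, labelled)
    # Grade the first answer letter found in the reply against the key.
    match = re.search("[" + LETTERS[: len(options)] + "]", reply.upper())
    return match is not None and match.group(0) == answer_key
```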
### Project context
FormationEval `v0.1` was the first 505-question version of the benchmark and formed a small part of the work presented at EAGE Digital 2026 in Stavanger, in the session *New Frontiers In Geomodelling: Recent Digital Advances*, under the title *Multi-Agent Framework for Subsurface Workflows: Petrophysicist, Geologist and Reservoir Engineer GenAI Agents*. It was built to compare models on oil and gas geoscience and subsurface tasks and to provide a public leaderboard that is useful in practice; at the time, I did not see any public benchmark or leaderboard in this area that met that need. DISKOS-QA and the SPE MCQ Dataset were added later as separate imported tracks in the same suite.
### Current suite status
- **MCQ v0.1**: evaluated and shown in this Space
- **DISKOS-QA**: imported into the public suite, not yet rerun in this Space
- **SPE MCQ Dataset**: imported into the public suite, not yet rerun in this Space
- The project's original model-comparison goal is already addressed by the published MCQ leaderboard
- A full rerun on the expanded suite is pending: this is a self-funded, one-person project, and evaluating the expanded suite requires materially more token spend
- To collaborate, support reruns, or discuss related research and engineering work, contact almaz.ermilov@gmail.com
- Imported track provenance and licensing notes: [main repository notices](https://github.com/AlmazErmilov/FormationEval-an-Open-Benchmark-for-Oil-Gas-Geoscience-MCQ-Evaluation/blob/main/THIRD_PARTY_NOTICES.md), [dataset notices](https://huggingface.co/datasets/AlmazErmilov/FormationEval/blob/main/THIRD_PARTY_NOTICES.md), the upstream [DISKOS-QA benchmark](https://github.com/georgeghon/DISKOS-QA), and the upstream [SPE MCQ dataset](https://huggingface.co/datasets/ynuwara/spe_mcq_dataset)
### Domains
| Domain | Questions |
|--------|-----------|
| Petrophysics | 272 |
| Petroleum Geology | 151 |
| Sedimentology | 98 |
| Geophysics | 80 |
| Reservoir Engineering | 43 |
| Drilling Engineering | 24 |
| Production Engineering | 14 |
### Difficulty distribution
| Level | Count | % |
|-------|-------|---|
| Easy | 132 | 26% |
| Medium | 274 | 54% |
| Hard | 99 | 20% |
### Links
- **Paper**: [arXiv:2601.02158](https://arxiv.org/abs/2601.02158)
- **Dataset**: [AlmazErmilov/FormationEval](https://huggingface.co/datasets/AlmazErmilov/FormationEval)
- **GitHub**: [FormationEval Repository](https://github.com/AlmazErmilov/FormationEval-an-Open-Benchmark-for-Oil-Gas-Geoscience-MCQ-Evaluation)
- **Website**: [formationeval.no](https://www.formationeval.no)
- **DISKOS-QA browser**: [formationeval.no/diskos-qa](https://www.formationeval.no/diskos-qa)
- **Unified question browser**: [formationeval.no/questions](https://www.formationeval.no/questions)
- **Third-party notices**: [main repository notices](https://github.com/AlmazErmilov/FormationEval-an-Open-Benchmark-for-Oil-Gas-Geoscience-MCQ-Evaluation/blob/main/THIRD_PARTY_NOTICES.md)
"""
CITATION_BIBTEX = """@misc{ermilov2026formationeval,
  title={FormationEval, an open multiple-choice benchmark for petroleum geoscience},
  author={Almaz Ermilov},
  year={2026},
  eprint={2601.02158},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2601.02158},
}"""
# Custom CSS to show more rows in tables (~20 rows visible)
CUSTOM_CSS = """
.tall-table {
    min-height: 700px !important;
}
.tall-table .table-wrap {
    max-height: 700px !important;
}
"""
# Build Gradio interface
with gr.Blocks(
    title="FormationEval Leaderboard",
    css=CUSTOM_CSS,
    # The theme belongs on Blocks; launch() has no theme parameter
    theme=gr.themes.Soft(font=gr.themes.GoogleFont("Inter")),
) as demo:
    gr.Markdown("# FormationEval Leaderboard")
    gr.Markdown(f"**{metadata['total_models']} models** evaluated on **{metadata['total_questions']} petroleum geoscience MCQs**")
    gr.Markdown(
        "> March 2026 update: FormationEval now also includes the imported DISKOS-QA and SPE MCQ tracks. "
        "This Space still displays results for the evaluated MCQ v0.1 track only. "
        "A full rerun on the expanded suite is pending because this is a self-funded, "
        "one-person project and evaluating the expanded suite requires materially more token spend."
    )
    # Filters row
    with gr.Row():
        search_input = gr.Textbox(
            label="Search model",
            placeholder="Type to search...",
            scale=2,
        )
        company_dropdown = gr.Dropdown(
            choices=["All"] + COMPANIES,
            value="All",
            label="Company",
            scale=1,
        )
        open_weight_checkbox = gr.Checkbox(
            label="Open-weight only",
            value=False,
            scale=1,
        )
    with gr.Tabs():
        # Overall tab
        with gr.Tab("Overall"):
            overall_table = gr.Dataframe(
                value=get_overall_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )
        # Difficulty tab
        with gr.Tab("By difficulty"):
            difficulty_table = gr.Dataframe(
                value=get_difficulty_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )
        # Domain tab
        with gr.Tab("By domain"):
            domain_table = gr.Dataframe(
                value=get_domain_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )
        # Bias tab
        with gr.Tab("Bias analysis"):
            gr.Markdown("**Position bias**: how much the model favors particular answer positions (A/B/C/D). Lower is better.")
            gr.Markdown("**Length bias**: how much the model favors longer answers. A high value means the model tends to pick the longest option.")
            bias_table = gr.Dataframe(
                value=get_bias_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )
        # Charts tab
        with gr.Tab("Charts"):
            with gr.Row():
                accuracy_price_plot = gr.Plot(value=create_accuracy_vs_price_chart(), label="Accuracy vs price")
            with gr.Row():
                with gr.Column():
                    all_models_plot = gr.Plot(value=create_all_models_chart(), label="All models")
                with gr.Column():
                    open_weight_plot = gr.Plot(value=create_open_weight_chart(), label="Open-weight models")
            with gr.Row():
                domain_heatmap_plot = gr.Plot(value=create_domain_heatmap(), label="Domain performance")
        # About tab
        with gr.Tab("About"):
            gr.Markdown(ABOUT_TEXT)
            gr.Markdown("### Citation")
            gr.Code(CITATION_BIBTEX, language=None, label="BibTeX")
    # Connect filters to tables: every filter control refreshes every table
    filter_inputs = [search_input, company_dropdown, open_weight_checkbox]
    table_updates = [
        (get_overall_table, overall_table),
        (get_difficulty_table, difficulty_table),
        (get_domain_table, domain_table),
        (get_bias_table, bias_table),
    ]
    for control in filter_inputs:
        for table_fn, table in table_updates:
            control.change(table_fn, filter_inputs, table)
    # Connect checkbox to charts (the open-weight chart is always filtered, so it needs no update)
    open_weight_checkbox.change(
        create_accuracy_vs_price_chart,
        inputs=[open_weight_checkbox],
        outputs=[accuracy_price_plot],
    )
    open_weight_checkbox.change(
        create_all_models_chart,
        inputs=[open_weight_checkbox],
        outputs=[all_models_plot],
    )
    open_weight_checkbox.change(
        create_domain_heatmap,
        inputs=[open_weight_checkbox],
        outputs=[domain_heatmap_plot],
    )
if __name__ == "__main__":
    demo.launch()  # theme is set on gr.Blocks above; launch() takes no theme argument