| """ |
| FormationEval Leaderboard - Interactive benchmark results viewer |
| |
| 72 models evaluated on 505 petroleum geoscience MCQs. DISKOS-QA and SPE MCQ |
| have been added to the public suite and the expanded rerun is pending. |
| """ |
|
|
| import json |
| import gradio as gr |
| import pandas as pd |
| import plotly.express as px |
| import plotly.graph_objects as go |
| from pathlib import Path |
|
|
| |
# Benchmark results ship as a JSON file next to this script; it is read
# once at import time and never re-read while the app runs.
DATA_PATH = Path(__file__).parent / "leaderboard_data.json"
with open(DATA_PATH) as f:
    data = json.load(f)


# "models" holds the per-model result records (one leaderboard row each);
# "metadata" holds suite-level counts used in the page header.
df = pd.DataFrame(data["models"])
metadata = data["metadata"]
|
|
| |
# Distinct company names, sorted for the "Company" dropdown filter.
COMPANIES = sorted(df["company"].unique().tolist())


# Raw dataframe column keys shown on each leaderboard tab, in display order.
OVERALL_COLS = ["rank", "model", "company", "open_weight", "price_input", "price_output", "accuracy", "correct", "total"]
DIFFICULTY_COLS = ["rank", "model", "company", "accuracy", "easy", "medium", "hard"]
DOMAIN_COLS = ["rank", "model", "drilling", "geophysics", "petroleum_geology", "petrophysics", "production", "reservoir", "sedimentology"]
BIAS_COLS = ["rank", "model", "company", "accuracy", "position_bias", "length_bias"]


# Maps raw column keys -> human-readable table headers. The get_*_table
# helpers fall back to the raw key when a column is missing from this map.
COLUMN_NAMES = {
    "rank": "Rank",
    "model": "Model",
    "company": "Company",
    "open_weight": "Open",
    "price_input": "Input $/M",
    "price_output": "Output $/M",
    "price_avg": "Avg $/M",
    "accuracy": "Accuracy %",
    "correct": "Correct",
    "total": "Total",
    "easy": "Easy %",
    "medium": "Medium %",
    "hard": "Hard %",
    "drilling": "Drilling %",
    "geophysics": "Geophysics %",
    "petroleum_geology": "Petrol. Geo. %",
    "petrophysics": "Petrophysics %",
    "production": "Production %",
    "reservoir": "Reservoir %",
    "sedimentology": "Sediment. %",
    "position_bias": "Position bias",
    "length_bias": "Length bias",
}
|
|
|
|
def filter_dataframe(df_in, search_query, company_filter, open_weight_only):
    """Return a filtered copy of the leaderboard dataframe.

    Args:
        df_in: Dataframe with at least ``model``, ``company`` and
            ``open_weight`` columns.
        search_query: Case-insensitive literal substring matched against
            ``model``. Falsy (empty) disables the filter.
        company_filter: Exact company name; "All" or falsy disables the filter.
        open_weight_only: When truthy, keep only open-weight models.

    Returns:
        A new dataframe; ``df_in`` is never mutated.
    """
    df_out = df_in.copy()

    if search_query:
        # `case=False` replaces the old double-.lower() dance;
        # regex=False keeps the query a literal substring.
        mask = df_out["model"].str.contains(search_query, case=False, regex=False)
        df_out = df_out[mask]

    if company_filter and company_filter != "All":
        df_out = df_out[df_out["company"] == company_filter]

    if open_weight_only:
        # astype(bool) instead of `== True` (E712): robust even if the
        # JSON-sourced column arrives as object dtype.
        df_out = df_out[df_out["open_weight"].astype(bool)]

    return df_out
|
|
|
|
def get_overall_table(search_query, company_filter, open_weight_only):
    """Overall leaderboard: filtered rows with human-readable headers."""
    subset = filter_dataframe(df, search_query, company_filter, open_weight_only)
    # rename() copies and leaves unmapped columns untouched, matching the
    # old COLUMN_NAMES.get(c, c) fallback.
    return subset[OVERALL_COLS].rename(columns=COLUMN_NAMES)
|
|
|
|
def get_difficulty_table(search_query, company_filter, open_weight_only):
    """Per-difficulty breakdown: filtered rows with human-readable headers."""
    subset = filter_dataframe(df, search_query, company_filter, open_weight_only)
    # rename() copies and leaves unmapped columns untouched, matching the
    # old COLUMN_NAMES.get(c, c) fallback.
    return subset[DIFFICULTY_COLS].rename(columns=COLUMN_NAMES)
|
|
|
|
def get_domain_table(search_query, company_filter, open_weight_only):
    """Per-domain breakdown: filtered rows with human-readable headers."""
    subset = filter_dataframe(df, search_query, company_filter, open_weight_only)
    # rename() copies and leaves unmapped columns untouched, matching the
    # old COLUMN_NAMES.get(c, c) fallback.
    return subset[DOMAIN_COLS].rename(columns=COLUMN_NAMES)
|
|
|
|
def get_bias_table(search_query, company_filter, open_weight_only):
    """Bias analysis: filtered rows with human-readable headers."""
    subset = filter_dataframe(df, search_query, company_filter, open_weight_only)
    # rename() copies and leaves unmapped columns untouched, matching the
    # old COLUMN_NAMES.get(c, c) fallback.
    return subset[BIAS_COLS].rename(columns=COLUMN_NAMES)
|
|
|
|
def create_accuracy_vs_price_chart(open_weight_only=False):
    """Build the accuracy-vs-price scatter chart.

    Args:
        open_weight_only: When True, plot only open-weight models.

    Returns:
        A plotly Figure (one point per model, colored by openness).
    """
    # Local renamed from `data` to avoid shadowing the module-level
    # `data` dict loaded from JSON.
    subset = df.copy()
    if open_weight_only:
        subset = subset[subset["open_weight"] == True]

    hover_fields = {
        "company": True,
        "accuracy": ":.1f",
        "price_avg": ":.2f",
        "open_weight": True,
    }
    axis_labels = {
        "price_avg": "Average price ($/M tokens)",
        "accuracy": "Accuracy (%)",
        "open_weight": "Open weight",
    }

    fig = px.scatter(
        subset,
        x="price_avg",
        y="accuracy",
        color="open_weight",
        hover_name="model",
        hover_data=hover_fields,
        color_discrete_map={True: "#1f77b4", False: "#ff7f0e"},
        labels=axis_labels,
        title=f"Accuracy vs price ({len(subset)} models)",
    )

    fig.update_traces(marker=dict(size=10, opacity=0.8))
    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        legend_title_text="Model type",
        legend=dict(
            itemsizing="constant",
            title_font_size=12,
        ),
        hovermode="closest",
        height=800,
    )

    # px names the color traces "True"/"False"; relabel them for humans.
    def _relabel(trace):
        trace.update(name="Open weight" if trace.name == "True" else "Closed")

    fig.for_each_trace(_relabel)

    return fig
|
|
|
|
def create_all_models_chart(open_weight_only=False):
    """Horizontal bar chart of every model, best accuracy first.

    Args:
        open_weight_only: When True, restrict to open-weight models.

    Returns:
        A plotly Figure whose height scales with the model count.
    """
    # Local renamed from `data` to avoid shadowing the module-level
    # `data` dict loaded from JSON.
    subset = df.copy()
    if open_weight_only:
        subset = subset[subset["open_weight"] == True]

    ranked = subset.sort_values("accuracy", ascending=False)

    fig = px.bar(
        ranked,
        x="accuracy",
        y="model",
        orientation="h",
        color="open_weight",
        color_discrete_map={True: "#1f77b4", False: "#ff7f0e"},
        hover_data={"company": True, "accuracy": ":.1f"},
        labels={"accuracy": "Accuracy (%)", "model": "Model"},
        title=f"All models by accuracy ({len(ranked)} total)",
        # Pin the y-axis order; otherwise plotly groups bars by color.
        category_orders={"model": ranked["model"].tolist()},
    )

    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        yaxis=dict(tickfont=dict(size=9)),
        legend_title_text="Model type",
        showlegend=True,
        # ~20px per bar, but never shorter than 400px.
        height=max(400, 20 * len(ranked)),
    )

    # px names the color traces "True"/"False"; relabel them for humans.
    fig.for_each_trace(
        lambda trace: trace.update(name="Open weight" if trace.name == "True" else "Closed")
    )

    return fig
|
|
|
|
def create_open_weight_chart():
    """Horizontal bar chart of open-weight models only, best first.

    Returns:
        A plotly Figure whose height scales with the model count.
    """
    ranked = df.loc[df["open_weight"] == True].sort_values("accuracy", ascending=False)

    fig = px.bar(
        ranked,
        x="accuracy",
        y="model",
        orientation="h",
        color_discrete_sequence=["#1f77b4"],
        hover_data={"company": True, "accuracy": ":.1f"},
        labels={"accuracy": "Accuracy (%)", "model": "Model"},
        title=f"Open-weight models ({len(ranked)} total)",
        # Pin the y-axis order to the sorted accuracy ranking.
        category_orders={"model": ranked["model"].tolist()},
    )

    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        yaxis=dict(tickfont=dict(size=10)),
        # ~18px per bar, but never shorter than 400px.
        height=max(400, 18 * len(ranked)),
    )

    return fig
|
|
|
|
def create_domain_heatmap(open_weight_only=False):
    """Heatmap of per-domain accuracy for the 50 best-ranked models.

    Args:
        open_weight_only: When True, rank only open-weight models.

    Returns:
        A plotly Figure (models on rows, domains on columns).
    """
    # Local renamed from `data` to avoid shadowing the module-level
    # `data` dict loaded from JSON.
    subset = df.copy()
    if open_weight_only:
        subset = subset[subset["open_weight"] == True]

    top = subset.sort_values("rank").head(50)
    domain_keys = [
        "drilling", "geophysics", "petroleum_geology", "petrophysics",
        "production", "reservoir", "sedimentology",
    ]

    grid = top[["model"] + domain_keys].set_index("model")
    grid.columns = [
        "Drilling", "Geophysics", "Petrol. Geo.", "Petrophysics",
        "Production", "Reservoir", "Sediment.",
    ]

    fig = px.imshow(
        grid,
        color_continuous_scale="RdYlGn",
        aspect="auto",
        title=f"Domain performance heatmap (top {len(top)} models)",
        labels=dict(color="Accuracy %"),
    )

    fig.update_layout(
        font=dict(family="Inter, sans-serif"),
        # ~20px per model row, but never shorter than 400px.
        height=max(400, 20 * len(top)),
    )

    return fig
|
|
|
|
| |
# Markdown body for the "About" tab; rendered verbatim by gr.Markdown.
ABOUT_TEXT = """
## FormationEval Benchmark

**FormationEval** is an open multiple-choice question (MCQ) benchmark for evaluating language models on petroleum geoscience and subsurface disciplines.

### Key statistics
- **505 questions** across 7 domains
- **72 models** evaluated
- **3 authoritative sources**: Ellis & Singer (2007), Bjørlykke (2010), TU Delft OCW

### Project context
FormationEval `v0.1` was the first 505-question benchmark version and formed a small part of the work presented at EAGE Digital 2026 in Stavanger in the session *New Frontiers In Geomodelling: Recent Digital Advances* under the title *Multi-Agent Framework for Subsurface Workflows: Petrophysicist, Geologist and Reservoir Engineer GenAI Agents*. It was built to compare models for oil and gas geoscience and subsurface tasks and to provide a public leaderboard that was useful in practice. At that point I did not see public benchmarks or leaderboards in that area that matched that need. DISKOS-QA and the SPE MCQ Dataset were added later as separate imported tracks in the same suite.

### Current suite status
- **MCQ v0.1**: evaluated and shown in this Space
- **DISKOS-QA**: imported into the public suite, not yet rerun in this Space
- **SPE MCQ Dataset**: imported into the public suite, not yet rerun in this Space
- The original model comparison goal of the project was already addressed by the published MCQ leaderboard
- A full rerun on the expanded suite is pending because this is a self funded one person project and expanded suite evaluation requires materially more token spend
- If you want to collaborate, support reruns or discuss related research and engineering work, contact almaz.ermilov@gmail.com
- Imported track provenance and licensing notes: [main repository notices](https://github.com/AlmazErmilov/FormationEval-an-Open-Benchmark-for-Oil-Gas-Geoscience-MCQ-Evaluation/blob/main/THIRD_PARTY_NOTICES.md), [dataset notices](https://huggingface.co/datasets/AlmazErmilov/FormationEval/blob/main/THIRD_PARTY_NOTICES.md), the upstream [DISKOS-QA benchmark](https://github.com/georgeghon/DISKOS-QA), and the upstream [SPE MCQ dataset](https://huggingface.co/datasets/ynuwara/spe_mcq_dataset)

### Domains
| Domain | Questions |
|--------|-----------|
| Petrophysics | 272 |
| Petroleum Geology | 151 |
| Sedimentology | 98 |
| Geophysics | 80 |
| Reservoir Engineering | 43 |
| Drilling Engineering | 24 |
| Production Engineering | 14 |

### Difficulty distribution
| Level | Count | % |
|-------|-------|---|
| Easy | 132 | 26% |
| Medium | 274 | 54% |
| Hard | 99 | 20% |

### Links
- **Paper**: [arXiv:2601.02158](https://arxiv.org/abs/2601.02158)
- **Dataset**: [AlmazErmilov/FormationEval](https://huggingface.co/datasets/AlmazErmilov/FormationEval)
- **GitHub**: [FormationEval Repository](https://github.com/AlmazErmilov/FormationEval-an-Open-Benchmark-for-Oil-Gas-Geoscience-MCQ-Evaluation)
- **Website**: [formationeval.no](https://www.formationeval.no)
- **DISKOS-QA browser**: [formationeval.no/diskos-qa](https://www.formationeval.no/diskos-qa)
- **Unified question browser**: [formationeval.no/questions](https://www.formationeval.no/questions)
- **Third party notices**: [main repository notices](https://github.com/AlmazErmilov/FormationEval-an-Open-Benchmark-for-Oil-Gas-Geoscience-MCQ-Evaluation/blob/main/THIRD_PARTY_NOTICES.md)
"""


# BibTeX entry surfaced in a copyable gr.Code box on the About tab.
CITATION_BIBTEX = """@misc{ermilov2026formationeval,
  title={FormationEval, an open multiple-choice benchmark for petroleum geoscience},
  author={Almaz Ermilov},
  year={2026},
  eprint={2601.02158},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2601.02158},
}"""


# CSS hook: components tagged elem_classes=["tall-table"] get a fixed,
# scrollable 700px viewport instead of Gradio's default table height.
CUSTOM_CSS = """
.tall-table {
    min-height: 700px !important;
}
.tall-table .table-wrap {
    max-height: 700px !important;
}
"""
|
|
| |
# Build the Gradio UI. The theme is configured here on the Blocks
# constructor — Blocks.launch() has no `theme` parameter, so passing it
# at launch time (as the old code did) raises a TypeError.
with gr.Blocks(
    title="FormationEval Leaderboard",
    theme=gr.themes.Soft(font=gr.themes.GoogleFont("Inter")),
    css=CUSTOM_CSS,
) as demo:

    gr.Markdown("# FormationEval Leaderboard")
    gr.Markdown(f"**{metadata['total_models']} models** evaluated on **{metadata['total_questions']} petroleum geoscience MCQs**")
    gr.Markdown(
        "> March 2026 update: FormationEval now also includes the imported DISKOS-QA and SPE MCQ tracks. "
        "This Space still displays results for the evaluated MCQ v0.1 track only. "
        "A full rerun on the expanded suite is pending because this is a self funded one "
        "person project and expanded suite evaluation requires materially more token spend."
    )

    # Shared filter controls; every table tab reacts to all three.
    with gr.Row():
        search_input = gr.Textbox(
            label="Search model",
            placeholder="Type to search...",
            scale=2,
        )
        company_dropdown = gr.Dropdown(
            choices=["All"] + COMPANIES,
            value="All",
            label="Company",
            scale=1,
        )
        open_weight_checkbox = gr.Checkbox(
            label="Open-weight only",
            value=False,
            scale=1,
        )

    with gr.Tabs():

        with gr.Tab("Overall"):
            overall_table = gr.Dataframe(
                value=get_overall_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )

        with gr.Tab("By difficulty"):
            difficulty_table = gr.Dataframe(
                value=get_difficulty_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )

        with gr.Tab("By domain"):
            domain_table = gr.Dataframe(
                value=get_domain_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )

        with gr.Tab("Bias analysis"):
            gr.Markdown("**Position bias**: How much the model favors certain answer positions (A/B/C/D). Low is better.")
            gr.Markdown("**Length bias**: How much the model favors longer answers. High means the model tends to pick the longest option.")
            bias_table = gr.Dataframe(
                value=get_bias_table("", "All", False),
                interactive=False,
                wrap=True,
                elem_classes=["tall-table"],
            )

        with gr.Tab("Charts"):
            with gr.Row():
                accuracy_price_plot = gr.Plot(value=create_accuracy_vs_price_chart(), label="Accuracy vs price")

            with gr.Row():
                with gr.Column():
                    all_models_plot = gr.Plot(value=create_all_models_chart(), label="All models")
                with gr.Column():
                    open_weight_plot = gr.Plot(value=create_open_weight_chart(), label="Open-weight models")

            with gr.Row():
                domain_heatmap_plot = gr.Plot(value=create_domain_heatmap(), label="Domain performance")

        with gr.Tab("About"):
            gr.Markdown(ABOUT_TEXT)
            gr.Markdown("### Citation")
            gr.Code(CITATION_BIBTEX, language=None, label="BibTeX")

    # Re-render every table whenever any filter control changes.
    # (Replaces 12 copy-pasted .change() registrations with one loop;
    # the wiring is identical.)
    filter_inputs = [search_input, company_dropdown, open_weight_checkbox]
    table_updates = [
        (get_overall_table, overall_table),
        (get_difficulty_table, difficulty_table),
        (get_domain_table, domain_table),
        (get_bias_table, bias_table),
    ]
    for control in filter_inputs:
        for update_fn, table in table_updates:
            control.change(update_fn, filter_inputs, table)

    # Charts honour only the open-weight toggle (no search/company filter).
    # The open-weight-only bar chart never changes, so it is not wired.
    open_weight_checkbox.change(
        create_accuracy_vs_price_chart,
        inputs=[open_weight_checkbox],
        outputs=[accuracy_price_plot],
    )
    open_weight_checkbox.change(
        create_all_models_chart,
        inputs=[open_weight_checkbox],
        outputs=[all_models_plot],
    )
    open_weight_checkbox.change(
        create_domain_heatmap,
        inputs=[open_weight_checkbox],
        outputs=[domain_heatmap_plot],
    )


if __name__ == "__main__":
    # Theme is set on gr.Blocks above; launch() accepts no theme kwarg.
    demo.launch()
|