Spaces:

macrocosm-os
/

finetuning-leaderboard

Runtime error

App Files Files Community

rusticluftig committed on Oct 22, 2024

Commit

9b87de8

1 Parent(s): 20e459a

Add benchmark data to the LB

Browse files

Files changed (3) hide show

app.py +76 -43
requirements.txt +1 -0
utils.py +94 -163

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import gradio as gr
 from apscheduler.schedulers.background import BackgroundScheduler
 from dotenv import load_dotenv
 from huggingface_hub import HfApi
 import competitions
 import utils
@@ -54,9 +55,8 @@ def main():
     vali_runs = state_vars["vali_runs"]
     scores = state_vars["scores"]
     validator_df = state_vars["validator_df"]
-    benchmarks = state_vars.get("benchmarks", None)
-    benchmark_timestamp = state_vars.get("benchmark_timestamp", None)
-    losses_1 = state_vars["losses_1"]
     losses_2 = state_vars["losses_2"]
     demo = gr.Blocks(css=".typewriter {font-family: 'JMH Typewriter', sans-serif;}")
@@ -74,51 +74,44 @@ def main():
             },
             num_top_classes=10,
         )
-        if benchmarks is not None:
-            with gr.Accordion("Top Model Benchmarks"):
-                gr.components.Dataframe(benchmarks)
-                gr.HTML("""<div>PPL computed using a stride of 512. See <a href='https://github.com/macrocosm-os/finetuning/blob/dev/scripts/run_benchmarks.py'>here</a> for the full code.</div>""")
-                gr.HTML(f"""<div>Last Updated: {benchmark_timestamp.strftime("%Y-%m-%d %H:%M:%S")} (UTC)</div>""")
         with gr.Accordion("Competition Results"):
             gr.HTML(EVALUATION_HEADER)
             show_stale = gr.Checkbox(label="Show Stale", interactive=True)
             competition_leaderboards = []
-            comp_1 = competitions.COMPETITION_DETAILS[1]
-            with gr.Accordion(f"{comp_1.name} Competition"):
-                gr.HTML(comp_1.html_description)
-                competition_leaderboards.append(gr.components.Dataframe(
-                    value=utils.leaderboard_data(model_data, scores, 1, show_stale.value),
-                    headers=["Name", "Win Rate", "Average Loss", "Weight", "UID", "Block"],
-                    datatype=["markdown", "number", "number", "number", "number", "number"],
-                    elem_id="comp1-table",
-                    interactive=False,
-                    visible=True,
-                ))
-                gr.LinePlot(
-                    losses_1,
-                    x="timestamp",
-                    x_title="Date",
-                    y="losses",
-                    y_title="Average Loss",
-                    interactive=True,
-                    visible=True,
-                    width=1024,
-                    title="Best Average Loss Over Time",
-                )
             comp_2 = competitions.COMPETITION_DETAILS[2]
             # Convert the losses into % of correct answers.
-            losses_2["losses"] = losses_2["losses"].apply(lambda x: 1 - x if x else None)
             with gr.Accordion(f"{comp_2.name} Competition"):
                 gr.HTML(comp_2.html_description)
-                competition_leaderboards.append(gr.components.Dataframe(
-                    value=utils.leaderboard_data(model_data, scores, 2, show_stale.value),
-                    headers=["Name", "Win Rate", "MC Score", "Weight", "UID", "Block"],
-                    datatype=["markdown", "number", "number", "number", "number", "number"],
-                    elem_id="comp2-table",
-                    interactive=False,
-                    visible=True,
-                ))
                 gr.LinePlot(
                     losses_2,
                     x="timestamp",
@@ -130,19 +123,59 @@ def main():
                     width=1024,
                     title="Best MC Score Over Time",
                 )
-            gr.HTML("""
                     <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
                     <li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
                     <li><b>Average Loss:</b> the last loss value on the evaluation data for the model as calculated by the OTF validator (lower is better)</li>
                     <li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
                     <li><b>UID:</b> the Bittensor UID of the miner</li>
                     <li><b>Weight:</b> the bittensor weight set for this model</li>
-                    <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>.""")
             show_stale.change(
-                lambda stale: [utils.leaderboard_data(model_data, scores, 1, stale), utils.leaderboard_data(model_data, scores, 2, stale)],
                 inputs=[show_stale],
                 outputs=competition_leaderboards,
-            )
         with gr.Accordion("Validator Stats"):
             gr.components.Dataframe(

 from apscheduler.schedulers.background import BackgroundScheduler
 from dotenv import load_dotenv
 from huggingface_hub import HfApi
+import matplotlib.pyplot as plt
 import competitions
 import utils
     vali_runs = state_vars["vali_runs"]
     scores = state_vars["scores"]
     validator_df = state_vars["validator_df"]
+    benchmarks_df = state_vars["benchmarks_df"]
+    benchmarks_targets = state_vars["benchmarks_targets"]
     losses_2 = state_vars["losses_2"]
     demo = gr.Blocks(css=".typewriter {font-family: 'JMH Typewriter', sans-serif;}")
             },
             num_top_classes=10,
         )
         with gr.Accordion("Competition Results"):
             gr.HTML(EVALUATION_HEADER)
             show_stale = gr.Checkbox(label="Show Stale", interactive=True)
             competition_leaderboards = []
             comp_2 = competitions.COMPETITION_DETAILS[2]
             # Convert the losses into % of correct answers.
+            losses_2["losses"] = losses_2["losses"].apply(
+                lambda x: 1 - x if x else None
+            )
             with gr.Accordion(f"{comp_2.name} Competition"):
                 gr.HTML(comp_2.html_description)
+                competition_leaderboards.append(
+                    gr.components.Dataframe(
+                        value=utils.leaderboard_data(
+                            model_data, scores, 2, show_stale.value
+                        ),
+                        headers=[
+                            "Name",
+                            "Win Rate",
+                            "MC Score",
+                            "Weight",
+                            "UID",
+                            "Block",
+                        ],
+                        datatype=[
+                            "markdown",
+                            "number",
+                            "number",
+                            "number",
+                            "number",
+                            "number",
+                        ],
+                        elem_id="comp2-table",
+                        interactive=False,
+                        visible=True,
+                    )
+                )
                 gr.LinePlot(
                     losses_2,
                     x="timestamp",
                     width=1024,
                     title="Best MC Score Over Time",
                 )
+            gr.HTML(
+                """
                     <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
                     <li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
                     <li><b>Average Loss:</b> the last loss value on the evaluation data for the model as calculated by the OTF validator (lower is better)</li>
                     <li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
                     <li><b>UID:</b> the Bittensor UID of the miner</li>
                     <li><b>Weight:</b> the bittensor weight set for this model</li>
+                    <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
+            )
             show_stale.change(
+                lambda stale: [utils.leaderboard_data(model_data, scores, 2, stale)],
                 inputs=[show_stale],
                 outputs=competition_leaderboards,
+            )
+        if benchmarks_df is not None:
+            def create_benchmark_plot(benchmark: str):
+                fig = plt.figure(figsize=(10, 8))
+                plt.plot(benchmarks_df["timestamp"], benchmarks_df[benchmark])
+                # Add a horizontal dashed line for each benchmark target (well-known models)
+                for model, score in benchmarks_targets[benchmark].items():
+                    plt.axhline(y=score, linestyle="--", label=f"{model}")
+                    plt.text(
+                        benchmarks_df["timestamp"].max(),
+                        score,
+                        f"{model}",
+                        va="center",
+                        ha="right",
+                        backgroundcolor="white",
+                    )
+                # Adding labels and title
+                plt.ylabel(benchmark.upper())
+                plt.title(f"{benchmark.upper()} Over Time")
+                plt.xticks(rotation=45)
+                return fig
+            with gr.Accordion("Top Model Benchmarks"):
+                mmlu = create_benchmark_plot("mmlu")
+                mmlu_pro = create_benchmark_plot("mmlu_pro")
+                gr.Plot(mmlu)
+                gr.Plot(mmlu_pro)
+                gr.HTML(
+                    """<div>Benchmarks computed using <a href='https://github.com/EleutherAI/lm-evaluation-harness'>lm-eval harness</a></div>"""
+                )
+                gr.HTML(
+                    """<ul><li>MMLU: Raw score</li><li>MMLU Pro: Normalized score using <a href='https://huggingface.co/docs/leaderboards/open_llm_leaderboard/normalization'>this</a> method</li></ul>"""
+                )
         with gr.Accordion("Validator Stats"):
             gr.components.Dataframe(

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ huggingface-hub
 gradio
 pandas
 flask

 gradio
 pandas
 flask
+matplotlib

utils.py CHANGED Viewed

@@ -15,7 +15,7 @@ import pandas as pd
 import wandb
 from bittensor.extrinsics.serving import get_metadata
 from dotenv import load_dotenv
-from wandb.apis.public.history import HistoryScan
 NETUID = 37
 DELAY_SECS = 3
@@ -26,8 +26,7 @@ load_dotenv()
 WANDB_TOKEN = os.environ.get("WANDB_API_KEY", None)
 SUBTENSOR_ENDPOINT = os.environ.get("SUBTENSOR_ENDPOINT", None)
 VALIDATOR_WANDB_PROJECT = "rusticluftig/finetuning"
-BENCHMARK_WANDB_PROJECT = ""
-BENCHMARK_FLAG = os.environ.get("BENCHMARK_FLAG", None)
 @dataclass(frozen=True)
@@ -146,19 +145,26 @@ def get_subnet_data(
     return result
-def get_wandb_runs(project: str, filters: Dict[str, Any]) -> List:
     """Get the latest runs from Wandb, retrying infinitely until we get them.
     Returns:
-        List: List of runs matching the provided filters, newest run (by creation time) first.
     """
     while True:
-        api = wandb.Api(api_key=WANDB_TOKEN)
         runs = list(
             api.runs(
                 project,
                 filters=filters,
-                order="-created_at",
             )
         )
         if len(runs) > 0:
@@ -178,12 +184,13 @@ def get_scores(
         uids (List[int]): List of UIDs to get scores for.
         wandb_runs (List): List of validator runs from Wandb. Requires the runs are provided in descending order.
     """
     def _maybe_convert_loss(loss: float, comp_id: int) -> float:
         """Converts loss to score for competitions that require it."""
         if comp_id == 2:
             return 1 - loss if loss else None
         return loss
     result = {}
     previous_timestamp = None
     seen_competitions = set()
@@ -209,7 +216,9 @@ def get_scores(
                 # Only the most recent run per competition is fresh.
                 is_fresh = comp_id not in seen_competitions
                 result[uid] = {
-                    "avg_loss": _maybe_convert_loss(uid_data.get("average_loss", None), comp_id),
                     "win_rate": uid_data.get("win_rate", None),
                     "win_total": uid_data.get("win_total", None),
                     "weight": uid_data.get("weight", None),
@@ -245,32 +254,35 @@ def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
     """Returns a dataframe of the best average model loss over time."""
     timestamps = []
     losses = []
     for run in wandb_runs:
         # For each run, check the 10 most recent steps.
         best_loss = math.inf
         should_add_datapoint = False
         min_step = max(0, run.lastHistoryStep - 10)
-        history_scan = HistoryScan(
-            run.client, run, min_step, run.lastHistoryStep, page_size=10
         )
         max_timestamp = None
         for step in history_scan:
-            if "original_format_json" not in step:
-                continue
             data = json.loads(step["original_format_json"])
             all_uid_data = data["uid_data"]
             timestamp = datetime.datetime.fromtimestamp(data["timestamp"])
             if max_timestamp is None:
                 max_timestamp = timestamp
             max_timestamp = max(max_timestamp, timestamp)
             for _, uid_data in all_uid_data.items():
                 loss = uid_data.get("average_loss", math.inf)
                 c_id = uid_data.get("competition_id", None)
                 if c_id is None or c_id != competition_id:
                     continue
                 if loss < best_loss:
                     best_loss = loss
                     should_add_datapoint = True
@@ -278,15 +290,8 @@ def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
         if should_add_datapoint:
             timestamps.append(max_timestamp)
             losses.append(best_loss)
-    return pd.DataFrame({"timestamp": timestamps, "losses": losses })
-def next_epoch(subtensor: bt.subtensor, block: int) -> int:
-    return (
-        block
-        + subtensor.get_subnet_hyperparameters(NETUID).tempo
-        - subtensor.blocks_since_epoch(NETUID, block)
-    )
 def is_floatable(x) -> bool:
@@ -321,26 +326,65 @@ def leaderboard_data(
             c.block,
         ]
         for c in leaderboard
-        if c.competition_id == competition_id and ((c.uid in scores and scores[c.uid]["fresh"]) or show_stale)
     ]
-def get_benchmarks() -> Tuple[pd.DataFrame, datetime.datetime]:
     """Returns the latest benchmarks and the time they were run."""
     if not BENCHMARK_WANDB_PROJECT:
         print("No benchmark project set.")
         return None, None
-    runs = get_wandb_runs(project=BENCHMARK_WANDB_PROJECT, filters=None)
     for run in runs:
-        artifacts = list(run.logged_artifacts())
-        if artifacts:
-            table = artifacts[-1].get("benchmarks")
-            if table:
-                return table.get_dataframe(), datetime.datetime.strptime(
-                    run.metadata["startedAt"], "%Y-%m-%dT%H:%M:%S.%f"
-                )
-    print("Failed to get benchmarks from Wandb.")
-    return None, None
 def make_validator_dataframe(
@@ -406,31 +450,32 @@ def load_state_vars() -> dict[Any]:
             model_data: List[ModelData] = get_subnet_data(subtensor, metagraph)
             model_data.sort(key=lambda x: x.incentive, reverse=True)
             print(f"Loaded {len(model_data)} models")
             vali_runs = get_wandb_runs(
-                project=VALIDATOR_WANDB_PROJECT,
-                filters={"$and": [{"config.type": "validator"}], "$or": [{"config.uid": 28}, {"config.uid": 16}]},
             )
             print(f"Loaded {len(vali_runs)} validator runs")
             scores = get_scores([x.uid for x in model_data], vali_runs)
             validator_df = get_validator_weights(metagraph)
             weight_keys = set()
             for uid, stats in validator_df.items():
                 weight_keys.update(stats[-1].keys())
             # Compute loss over time for all competitions.
-            losses_1 = get_losses_over_time(vali_runs, 1)
             losses_2 = get_losses_over_time(vali_runs, 2)
-            # Enable benchmark if the flag is set
-            if BENCHMARK_FLAG:
-                benchmarks, benchmark_timestamp = get_benchmarks()
-            else:
-                benchmarks, benchmark_timestamp = None, None
             break
         except KeyboardInterrupt:
@@ -447,121 +492,7 @@ def load_state_vars() -> dict[Any]:
         "vali_runs": vali_runs,
         "scores": scores,
         "validator_df": validator_df,
-        "benchmarks": benchmarks,
-        "benchmark_timestamp": benchmark_timestamp,
-        "losses_1": losses_1,
         "losses_2": losses_2,
     }
-def test_load_state_vars():
-    # TODO: Change to finetuning data.
-    subtensor = bt.subtensor("finney")
-    metagraph = subtensor.metagraph(NETUID, lite=True)
-    model_data = [
-        ModelData(
-            uid=253,
-            hotkey="5DjoPAgZ54Zf6NsuiVYh8RjonnWWWREE2iXBNzM2VDBMQDPm",
-            namespace="jw-hf-test",
-            name="jw2",
-            commit="aad131f6b02219964e6dcf749c2a23e75a7ceca8",
-            secure_hash="L1ImYzWJwV+9KSnZ2TYW0Iy2KMcVjJVTd30YJoRkpbw=",
-            block=3131103,
-            incentive=1.0,
-            emission=209.06051635742188,
-        ),
-        ModelData(
-            uid=1,
-            hotkey="5CccVtjk4yamCao6QYgEg7jc8vktdj16RbLKNUftHfEsjuJS",
-            namespace="borggAI",
-            name="bittensor-subnet9-models",
-            commit="d373864bc6c972872edb8db95eed570958054bac",
-            secure_hash="+drdTIKYEGYClW2FFVVID6A2Dh//4rLmExRFCJsH6Y4=",
-            block=2081837,
-            incentive=0.0,
-            emission=0.0,
-        ),
-        ModelData(
-            uid=2,
-            hotkey="5HYwoXaczs3jAptbb5mk4aUCkgZqeNcNzJKxSec97GwasfLy",
-            namespace="jungiebeen",
-            name="pretrain1",
-            commit="4c0c6bfd0f92e243d6c8a82209142e7204c852c3",
-            secure_hash="ld/agc0XIWICom/Cpj0fkQLcMogMNj/F65MJogK5RLY=",
-            block=2467482,
-            incentive=0.0,
-            emission=0.0,
-        ),
-        ModelData(
-            uid=3,
-            hotkey="5Dnb6edh9yTeEp5aasRPZVPRAkxvQ6qnERVcXw22awMZ5rxm",
-            namespace="jungiebeen",
-            name="pretrain2",
-            commit="e827b7281c92224adb11124489cc45356553a87a",
-            secure_hash="ld/agc0XIWICom/Cpj0fkQLcMogMNj/F65MJogK5RLY=",
-            block=2467497,
-            incentive=0.0,
-            emission=0.0,
-        ),
-        ModelData(
-            uid=4,
-            hotkey="5FRfca8NbnH424WaX43PMhKBnbLA1bZpRRoXXiVs6HgsxN4K",
-            namespace="ZainAli60",
-            name="mine_modeles",
-            commit="8a4ed4ad1f1fb58d424fd22e8e9874b87d32917c",
-            secure_hash="tVcbZAFoNIOF+Ntxq31OQ2NrLXf5iFCmmPUJlpkMYYo=",
-            block=2508509,
-            incentive=0.0,
-            emission=0.0,
-        ),
-    ]
-    vali_runs = get_wandb_runs(
-        project=VALIDATOR_WANDB_PROJECT,
-        filters={"config.type": "validator", "config.uid": 238},
-    )
-    scores = get_scores([x.uid for x in model_data], vali_runs)
-    validator_df = {
-        28: (1.0, 33273.4453125, {253: 1.0}),
-        49: (
-            0.9127794504165649,
-            10401.677734375,
-            {
-                7: 0.0867,
-                217: 0.0001,
-                219: 0.0001,
-                241: 0.0001,
-                248: 0.0001,
-                253: 0.9128,
-            },
-        ),
-        78: (1.0, 26730.37109375, {253: 1.0}),
-        116: (1.0, 629248.4375, {253: 1.0}),
-        150: (1.0, 272634.53125, {253: 1.0}),
-        161: (1.0, 280212.53125, {253: 1.0}),
-        180: (1.0, 16838.0, {253: 1.0}),
-        184: (1.0, 47969.3984375, {253: 1.0}),
-        210: (1.0, 262846.28125, {253: 1.0}),
-        213: (1.0, 119462.734375, {253: 1.0}),
-        215: (1.0, 274747.46875, {253: 1.0}),
-        234: (1.0, 38831.6953125, {253: 1.0}),
-        236: (1.0, 183966.9375, {253: 1.0}),
-        238: (1.0, 1293707.25, {253: 1.0}),
-        240: (1.0, 106461.6015625, {253: 1.0}),
-        243: (1.0, 320271.5, {253: 1.0}),
-        244: (1.0, 116138.9609375, {253: 1.0}),
-        247: (0.9527428150177002, 119812.390625, {7: 0.0472, 253: 0.9528}),
-        249: (1.0, 478127.3125, {253: 1.0}),
-        252: (1.0, 442395.03125, {253: 1.0}),
-        254: (1.0, 46845.2109375, {253: 1.0}),
-        255: (1.0, 28977.56640625, {253: 1.0}),
-    }
-    return {
-        "metagraph": metagraph,
-        "model_data": model_data,
-        "vali_runs": vali_runs,
-        "scores": scores,
-        "validator_df": validator_df,
-    }

 import wandb
 from bittensor.extrinsics.serving import get_metadata
 from dotenv import load_dotenv
+from wandb.apis.public.history import SampledHistoryScan
 NETUID = 37
 DELAY_SECS = 3
 WANDB_TOKEN = os.environ.get("WANDB_API_KEY", None)
 SUBTENSOR_ENDPOINT = os.environ.get("SUBTENSOR_ENDPOINT", None)
 VALIDATOR_WANDB_PROJECT = "rusticluftig/finetuning"
+BENCHMARK_WANDB_PROJECT = "rusticluftig/test-benchmarks"
 @dataclass(frozen=True)
     return result
+def get_wandb_runs(
+    project: str, filters: Dict[str, Any], order: str = "-created_at"
+) -> List:
     """Get the latest runs from Wandb, retrying infinitely until we get them.
+    Args:
+        project (str): The Wandb project to get runs from.
+        filters (Dict[str, Any]): Filters to apply to the runs.
+        order (str): Order to sort the runs by. Defaults to "-created_at" (newest first)
     Returns:
+        List: List of runs matching the provided filters
     """
     while True:
+        api = wandb.Api(api_key=WANDB_TOKEN, timeout=100)
         runs = list(
             api.runs(
                 project,
                 filters=filters,
+                order=order,
             )
         )
         if len(runs) > 0:
         uids (List[int]): List of UIDs to get scores for.
         wandb_runs (List): List of validator runs from Wandb. Requires the runs are provided in descending order.
     """
     def _maybe_convert_loss(loss: float, comp_id: int) -> float:
         """Converts loss to score for competitions that require it."""
         if comp_id == 2:
             return 1 - loss if loss else None
         return loss
     result = {}
     previous_timestamp = None
     seen_competitions = set()
                 # Only the most recent run per competition is fresh.
                 is_fresh = comp_id not in seen_competitions
                 result[uid] = {
+                    "avg_loss": _maybe_convert_loss(
+                        uid_data.get("average_loss", None), comp_id
+                    ),
                     "win_rate": uid_data.get("win_rate", None),
                     "win_total": uid_data.get("win_total", None),
                     "weight": uid_data.get("weight", None),
     """Returns a dataframe of the best average model loss over time."""
     timestamps = []
     losses = []
     for run in wandb_runs:
         # For each run, check the 10 most recent steps.
         best_loss = math.inf
         should_add_datapoint = False
         min_step = max(0, run.lastHistoryStep - 10)
+        history_scan = SampledHistoryScan(
+            run.client,
+            run,
+            ["original_format_json"],
+            min_step,
+            run.lastHistoryStep,
+            page_size=10,
         )
         max_timestamp = None
         for step in history_scan:
             data = json.loads(step["original_format_json"])
             all_uid_data = data["uid_data"]
             timestamp = datetime.datetime.fromtimestamp(data["timestamp"])
             if max_timestamp is None:
                 max_timestamp = timestamp
             max_timestamp = max(max_timestamp, timestamp)
             for _, uid_data in all_uid_data.items():
                 loss = uid_data.get("average_loss", math.inf)
                 c_id = uid_data.get("competition_id", None)
                 if c_id is None or c_id != competition_id:
                     continue
                 if loss < best_loss:
                     best_loss = loss
                     should_add_datapoint = True
         if should_add_datapoint:
             timestamps.append(max_timestamp)
             losses.append(best_loss)
+    return pd.DataFrame({"timestamp": timestamps, "losses": losses})
 def is_floatable(x) -> bool:
             c.block,
         ]
         for c in leaderboard
+        if c.competition_id == competition_id
+        and ((c.uid in scores and scores[c.uid]["fresh"]) or show_stale)
     ]
+def get_benchmarks() -> Tuple[pd.DataFrame, Dict[str, Dict[str, float]]]:
     """Returns a dataframe of benchmark scores per run (oldest first) and a dict of reference scores for well-known models, keyed by benchmark."""
     if not BENCHMARK_WANDB_PROJECT:
         print("No benchmark project set.")
         return None, None
+    runs = get_wandb_runs(
+        project=BENCHMARK_WANDB_PROJECT, filters=None, order="+created_at"
+    )
+    timestamps, uids, models, mmlu, mmlu_pro = [], [], [], [], []
     for run in runs:
+        uid = run.config.get("uid", None)
+        model = run.config.get("model", None)
+        if not uid or not model:
+            continue
+        samples = list(
+            SampledHistoryScan(
+                run.client,
+                run,
+                ["_timestamp", "mmlu.acc,none", "mmlu_pro"],
+                0,
+                1,
+            )
+        )
+        if not samples:
+            continue
+        sample = samples[0]
+        timestamps.append(datetime.datetime.fromtimestamp(sample["_timestamp"]))
+        mmlu.append(sample["mmlu.acc,none"])
+        mmlu_pro.append(sample["mmlu_pro"])
+        uids.append(uid)
+        models.append(model)
+    return (
+        pd.DataFrame(
+            {
+                "timestamp": timestamps,
+                "uid": uids,
+                "model": models,
+                "mmlu": mmlu,
+                "mmlu_pro": mmlu_pro,
+            }
+        ),
+        {
+            "mmlu": {
+                "Llama-3.1-8B-Instruct": 0.681,
+                "Mistral-7B-Instruct-v0.3": 0.597,
+                "gemma-2-9b-it": 0.719,
+            },
+            "mmlu_pro": {
+                "Llama-3.1-8B-Instruct": 30.68,
+                "Mistral-7B-Instruct-v0.3": 23.06,
+                "gemma-2-9b-it": 31.95,
+            },
+        },
+    )
 def make_validator_dataframe(
             model_data: List[ModelData] = get_subnet_data(subtensor, metagraph)
             model_data.sort(key=lambda x: x.incentive, reverse=True)
             print(f"Loaded {len(model_data)} models")
             vali_runs = get_wandb_runs(
+                project=VALIDATOR_WANDB_PROJECT,
+                filters={
+                    "$and": [{"config.type": "validator"}],
+                    "$or": [{"config.uid": 28}, {"config.uid": 16}],
+                },
             )
             print(f"Loaded {len(vali_runs)} validator runs")
             scores = get_scores([x.uid for x in model_data], vali_runs)
+            print(f"Loaded {len(scores)} scores")
             validator_df = get_validator_weights(metagraph)
             weight_keys = set()
             for uid, stats in validator_df.items():
                 weight_keys.update(stats[-1].keys())
+            print("Loaded validator weights")
             # Compute loss over time for all competitions.
             losses_2 = get_losses_over_time(vali_runs, 2)
+            print("Loaded losses over time for comp 2")
+            benchmarks_df, benchmarks_targets = get_benchmarks()
+            print("Loaded benchmarks")
             break
         except KeyboardInterrupt:
         "vali_runs": vali_runs,
         "scores": scores,
         "validator_df": validator_df,
+        "benchmarks_df": benchmarks_df,
+        "benchmarks_targets": benchmarks_targets,
         "losses_2": losses_2,
     }