import json
import os

import pandas as pd
from datasets import load_dataset, get_dataset_config_names
from datasets.exceptions import DatasetNotFoundError
from tqdm.auto import tqdm

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.envs import TOKEN
from src.leaderboard.read_evals import get_raw_eval_results
from src.logger import get_logger

logger = get_logger(__name__)


def get_leaderboard_df(results_dataset_name: str) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results."""
    leaderboard_cols = [
        "System Name",
        "System Type",
        "Organization",
        "Success Rate (%)",
        "Problems Solved",
        "Submitted On",
    ]
    try:
        configs = get_dataset_config_names(results_dataset_name, token=TOKEN)
    except (DatasetNotFoundError, FileNotFoundError):
        # No results dataset yet: return an empty DataFrame with the expected columns.
        return pd.DataFrame(columns=leaderboard_cols)
    rows = []
    for submission_id in tqdm(configs, desc="Processing Submission Results"):
        submission_ds = load_dataset(results_dataset_name, submission_id, split="train", token=TOKEN)
        submission_df = pd.DataFrame(submission_ds)

        # Every row of a submission must carry a valid did_pass flag.
        if submission_df.empty or "did_pass" not in submission_df.columns or submission_df["did_pass"].isna().any():
            logger.warning(f"Skipping {submission_id} due to missing or invalid did_pass values")
            continue

        success_rate = 100 * submission_df["did_pass"].mean()
        num_solved = submission_df["did_pass"].sum()

        # Submission-level metadata is assumed constant within a config, so read it from the first row.
        first_row = submission_df.iloc[0]
        rows.append(
            {
                "System Name": first_row["system_name"],
                "System Type": first_row["system_type"],
                "Organization": first_row["organization"],
                "Success Rate (%)": success_rate,
                "Problems Solved": num_solved,
                "Submitted On": pd.to_datetime(first_row["submission_ts"]).strftime("%Y-%m-%d %H:%M"),
            }
        )
    if not rows:
        # All submissions were skipped: fall back to an empty leaderboard.
        return pd.DataFrame(columns=leaderboard_cols)

    full_df = pd.DataFrame(rows)

    # TODO: forbid multiple submissions under the same name?
    # Keep only the latest entry per unique (System Name, System Type, Organization) triplet
    final_df = (
        full_df.sort_values("Submitted On", ascending=False)
        .drop_duplicates(subset=["System Name", "System Type", "Organization"], keep="first")
        .sort_values(by=[AutoEvalColumn.success_rate.name], ascending=False)
        .reset_index(drop=True)
    )

    cols_to_round = ["Success Rate (%)"]
    final_df[cols_to_round] = final_df[cols_to_round].round(decimals=2)
    return final_df
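

# A minimal sketch of the per-row schema that get_leaderboard_df assumes each
# submission config provides. The field names are inferred from the code above;
# the exact dataset layout is an assumption, not a documented contract:
#
#     {
#         "system_name": "my-agent",               # hypothetical example values
#         "system_type": "open-source",
#         "organization": "ACME Labs",
#         "submission_ts": "2025-01-15T09:30:00",
#         "did_pass": True,                        # one row per evaluated problem
#     }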


def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Creates the different dataframes for the evaluation queue requests."""
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if entry.endswith(".json"):
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
            all_evals.append(data)
        elif not entry.endswith(".md"):
            # This is a folder: collect every request file inside it.
            folder_path = os.path.join(save_path, entry)
            sub_entries = [
                e
                for e in os.listdir(folder_path)
                if os.path.isfile(os.path.join(folder_path, e)) and not e.startswith(".")
            ]
            for sub_entry in sub_entries:
                file_path = os.path.join(folder_path, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    # Split requests into the three queues shown in the UI.
    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
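

if __name__ == "__main__":
    # Minimal usage sketch, assuming a results dataset and a local requests
    # directory with these hypothetical names; adjust both to your setup.
    leaderboard_df = get_leaderboard_df("my-org/leaderboard-results")
    print(leaderboard_df.head())

    finished_df, running_df, pending_df = get_evaluation_queue_df(
        save_path="eval-queue",  # hypothetical local clone of the requests repo
        cols=["model", "revision", "status"],  # hypothetical column set; a real app passes EvalQueueColumn names
    )
    print(f"{len(finished_df)} finished, {len(running_df)} running, {len(pending_df)} pending")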