Victor Dieguez committed
Commit · 781d4b0
1 Parent(s): fd29588

Removing env variables

src/populate.py  +112  -2

src/populate.py
CHANGED
@@ -50,7 +50,7 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
     df = df[has_no_nan_values(df, existing_benchmarks)]
 
     return df
-
+
 
 def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
     """
@@ -93,7 +93,7 @@ def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
     df = df.dropna(subset=existing_benchmarks, how="any")
 
     return df
-
+
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
@@ -129,6 +129,116 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
     return df_finished[cols], df_running[cols], df_pending[cols]
 '''
+def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
+    """
+    Build the leaderboard dataframe directly from JSON files in eval_results_path.
+
+    We completely bypass get_raw_eval_results because our JSONs are already in
+    a simple schema:
+      - config.model_name
+      - results[benchmark_name]["acc"] in [0, 1]
+
+    We:
+      - create a row for each *.json
+      - fill all AutoEvalColumn fields with None, then overwrite the ones we know:
+          * model -> clickable HF link
+          * Average ⬆️ -> mean of all metrics (in percentage)
+          * each benchmark col in `benchmark_cols` -> metric * 100
+    """
+    # 1) Collect all .json files under eval_results_path
+    if not os.path.isdir(eval_results_path):
+        print(f"Results path '{eval_results_path}' does not exist.")
+        return pd.DataFrame(columns=cols)
+
+    json_files = [
+        f for f in os.listdir(eval_results_path)
+        if f.endswith(".json") and not f.startswith(".")
+    ]
+
+    if not json_files:
+        print(f"No JSON result files found in '{eval_results_path}'.")
+        return pd.DataFrame(columns=cols)
+
+    rows = []
+
+    for fname in json_files:
+        fpath = os.path.join(eval_results_path, fname)
+        try:
+            with open(fpath, "r", encoding="utf-8") as fp:
+                data = json.load(fp)
+        except Exception as e:
+            print(f"Failed to read '{fpath}': {e}")
+            continue
+
+        # Start with all columns set to None so the DF matches AutoEvalColumn
+        row = {field.name: None for field in fields(AutoEvalColumn)}
+
+        # ---- model column ----
+        config = data.get("config", {})
+        model_id = (
+            config.get("model_name")
+            or config.get("model_id")
+            or config.get("model")  # just in case
+        )
+
+        if model_id is None:
+            # skip weird files without model info
+            print(f"Skipping '{fname}' – no model_name in config.")
+            continue
+
+        # Fill the "model" column (clickable markdown link)
+        row[AutoEvalColumn.model.name] = make_clickable_model(model_id)
+
+        # ---- metrics ----
+        results = data.get("results", {})
+        scores = []
+
+        for bench in benchmark_cols:
+            bench_result = results.get(bench, None)
+            if not isinstance(bench_result, dict):
+                continue
+
+            # We agreed on metric key "acc" in your JSONs
+            val = bench_result.get("acc", None)
+            if val is None:
+                continue
+
+            # Convert to percentage (e.g. 0.747 -> 74.7)
+            score = float(val) * 100.0
+            row[bench] = score
+            scores.append(score)
+
+        # ---- Average ⬆️ ----
+        avg_col = AutoEvalColumn.average.name
+        if scores:
+            row[avg_col] = sum(scores) / len(scores)
+        else:
+            row[avg_col] = None
+
+        rows.append(row)
+
+    if not rows:
+        print("No valid evaluation rows constructed – returning empty leaderboard.")
+        return pd.DataFrame(columns=cols)
+
+    df = pd.DataFrame(rows)
+
+    # Keep column ordering consistent with COLS
+    existing_cols = [c for c in cols if c in df.columns]
+    df = df[existing_cols]
+
+    # Round numeric columns
+    num_cols = df.select_dtypes(include="number").columns
+    if len(num_cols) > 0:
+        df[num_cols] = df[num_cols].round(2)
+
+    # Optional: drop rows with NaNs in benchmark columns
+    existing_benchmarks = [c for c in benchmark_cols if c in df.columns]
+    if existing_benchmarks:
+        df = df.dropna(subset=existing_benchmarks, how="any")
+
+    return df
+'''
 def get_evaluation_queue_df(save_path: str, cols: list):
     """
     Stubbed evaluation queue.
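
For reference (not part of the commit), the sketch below shows how the rewritten get_leaderboard_df could be exercised end to end. The directory names, file name, and model id are made-up placeholders, and it assumes the Space still exposes the leaderboard template's COLS / BENCHMARK_COLS constants in src.display.utils; the eval_requests_path argument is accepted by the new function but never used.

# Hypothetical sketch only: placeholder paths and names; assumes the template's
# COLS / BENCHMARK_COLS are still defined in src.display.utils.
import json
import os

from src.display.utils import BENCHMARK_COLS, COLS
from src.populate import get_leaderboard_df

# One result file in the schema the new function reads:
#   config.model_name           -> rendered as a clickable model link
#   results[<benchmark>]["acc"] -> fraction in [0, 1], shown as a percentage
sample = {
    "config": {"model_name": "my-org/my-model"},
    "results": {BENCHMARK_COLS[0]: {"acc": 0.747}},
}

os.makedirs("demo-results", exist_ok=True)
with open(os.path.join("demo-results", "my-org__my-model.json"), "w", encoding="utf-8") as fp:
    json.dump(sample, fp)

# eval_requests_path is still part of the signature but no longer consulted.
df = get_leaderboard_df(
    eval_results_path="demo-results",
    eval_requests_path="demo-requests",
    cols=COLS,
    benchmark_cols=BENCHMARK_COLS,
)
print(df)  # one row: clickable model link, average and benchmark score both 74.7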