| | import json |
| | import os |
| | from dataclasses import fields |
| |
|
| | import pandas as pd |
| |
|
| | from src.display.formatting import make_clickable_model |
| | from src.display.utils import AutoEvalColumn |
| |
|
| |
|
def get_leaderboard_df(eval_results_path, eval_requests_path, cols, benchmark_cols):
    """
    Build the leaderboard dataframe directly from JSON files in eval_results_path.

    We completely bypass get_raw_eval_results because our JSONs are already in
    a simple schema:
        - config.model_name
        - results[benchmark_name]["acc"] in [0, 1]

    For each *.json file we create one row, fill all AutoEvalColumn fields with
    None, then overwrite the ones we know:
        * model        -> clickable HF link
        * Average ⬆️   -> mean of the benchmark scores that were found (percent)
        * each benchmark col in `benchmark_cols` -> metric * 100

    Args:
        eval_results_path: directory containing one result JSON per model.
        eval_requests_path: accepted for interface compatibility; not used
            (we do not read a requests/queue dataset).
        cols: full ordered list of leaderboard column names.
        benchmark_cols: subset of `cols` holding per-benchmark scores.

    Returns:
        A pandas DataFrame with the columns from `cols` that could be filled.
        Rows missing any benchmark score are dropped. Empty DataFrame (with
        `cols` as columns) when the path is missing or yields no valid rows.
    """
    if not os.path.isdir(eval_results_path):
        print(f"Results path '{eval_results_path}' does not exist.")
        return pd.DataFrame(columns=cols)

    json_files = [
        f for f in os.listdir(eval_results_path)
        if f.endswith(".json") and not f.startswith(".")
    ]
    if not json_files:
        print(f"No JSON result files found in '{eval_results_path}'.")
        return pd.DataFrame(columns=cols)

    rows = []
    for fname in json_files:
        row = _build_leaderboard_row(
            os.path.join(eval_results_path, fname), cols, benchmark_cols
        )
        if row is not None:
            rows.append(row)

    if not rows:
        print("No valid evaluation rows constructed – returning empty leaderboard.")
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame(rows)

    # Keep only the requested columns, in the requested order.
    df = df[[c for c in cols if c in df.columns]]

    # Round numeric columns for display.
    num_cols = df.select_dtypes(include="number").columns
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].round(2)

    # Drop models that are missing any benchmark score.
    existing_benchmarks = [c for c in benchmark_cols if c in df.columns]
    if existing_benchmarks:
        df = df.dropna(subset=existing_benchmarks, how="any")

    return df


def _build_leaderboard_row(fpath, cols, benchmark_cols):
    """Parse one result JSON file into a leaderboard row dict, or None if unusable.

    Unreadable files, files without a model name, and non-numeric metric
    values are reported on stdout and skipped rather than aborting the
    whole leaderboard build.
    """
    try:
        with open(fpath, "r", encoding="utf-8") as fp:
            data = json.load(fp)
    except Exception as e:
        # Best-effort: a single corrupt file must not kill the leaderboard.
        print(f"Failed to read '{fpath}': {e}")
        return None

    # Start with every known column blank so the DataFrame stays aligned.
    row = {c: None for c in cols}

    config = data.get("config", {})
    model_id = (
        config.get("model_name")
        or config.get("model_id")
        or config.get("model")
    )
    if model_id is None:
        print(f"Skipping '{os.path.basename(fpath)}' – no model_name in config.")
        return None

    row[AutoEvalColumn.model.name] = make_clickable_model(model_id)

    results = data.get("results", {})
    scores = []
    for bench in benchmark_cols:
        bench_result = results.get(bench)
        if not isinstance(bench_result, dict):
            continue

        val = bench_result.get("acc")
        if val is None:
            continue

        # Guard the conversion: a malformed value (string, list, ...) in one
        # file previously raised and crashed the entire build.
        try:
            score = float(val) * 100.0
        except (TypeError, ValueError):
            print(f"Skipping non-numeric acc for '{bench}' in '{fpath}': {val!r}")
            continue

        row[bench] = score
        scores.append(score)

    # Average over the benchmarks actually present; None when none were found.
    row[AutoEvalColumn.average.name] = sum(scores) / len(scores) if scores else None
    return row
| |
|
| |
|
def get_evaluation_queue_df(save_path: str, cols: list):
    """
    Stubbed evaluation queue.

    No requests dataset / eval queue is in use, so this simply:
        - ensures the directory exists, and
        - returns three empty dataframes (finished, running, pending)
          with the expected columns.
    """
    os.makedirs(save_path, exist_ok=True)

    # Three independent empty frames sharing the same column layout.
    finished_df = pd.DataFrame(columns=cols)
    running_df = finished_df.copy()
    pending_df = finished_df.copy()
    return finished_df, running_df, pending_df
| |
|
| |
|