# Leaderboard judge-output aggregation helpers.
from __future__ import annotations

import json
from pathlib import Path

import pandas as pd
# Canonical short display names for model run directories, keyed by the
# directory's model identifier (after prefix/suffix stripping).
# Naming follows ArtificialAnalysis.ai conventions to keep figure legends compact.
MODEL_DISPLAY_NAMES = {
    # OpenAI / Azure
    "Azure_gpt-5.1-2025-11-13": "GPT-5.1",
    "Azure_gpt-5.1-chat-2025-11-13": "GPT-5.1",
    "Azure_o4-mini": "o4-mini",
    "Azure_gpt-4o": "GPT-4o",
    "openai_gpt-oss-120b": "GPT-OSS-120B",
    "openai_gpt-oss-20b": "GPT-OSS-20B",
    # Google / GCP
    "GCP_gemini-2.5-pro": "Gemini 2.5 Pro",
    "gemini-2.5-pro": "Gemini 2.5 Pro",
    "gcp_gemini-3-pro-preview": "Gemini 3 Pro",
    "gemini-3-pro-preview": "Gemini 3 Pro",
    "gemini-3-flash-preview": "Gemini 3 Flash",
    "google_gemini-3-flash-preview": "Gemini 3 Flash",
    # Moonshot AI
    "moonshotai_kimi-k2-thinking": "Kimi K2",
    "kimi-k2-thinking": "Kimi K2",
    # Anthropic / AWS
    "aws_claude-opus-4-5": "Claude Opus 4.5",
    # Mistral AI
    "mistralai_mistral-large-2512": "Mistral Large",
    # Alibaba / Qwen
    "qwen_qwen3-vl-32b-instruct": "Qwen3-VL-32B",
    # ServiceNow
    "ServiceNow-AI_Apriel-1.6-15b-Thinker": "Apriel-1.6-15B",
}
def get_model_name(dirname: str) -> str:
    """Map a results directory name to its short model display name.

    Strips the agent prefix and commit-hash suffix from *dirname*, then looks
    the remainder up in MODEL_DISPLAY_NAMES, falling back to the stripped
    name itself when no mapping exists.
    """
    key = dirname
    for fragment in ("react with code_", "_07ccdb1"):
        key = key.replace(fragment, "")
    return MODEL_DISPLAY_NAMES.get(key, key)
def find_react_with_code_dirs(leaderboard_dir: Path) -> list[Path]:
    """Return sorted agent run directories under *leaderboard_dir*.

    Hidden (``.``-prefixed) and ``backup_``-prefixed directories are skipped;
    non-directory entries are ignored.
    """
    return sorted(
        entry
        for entry in leaderboard_dir.iterdir()
        if entry.is_dir() and not entry.name.startswith((".", "backup_"))
    )
def read_judge_outputs_from_dir(agent_dir: Path) -> dict[str, list[dict]]:
    """Collect all judge_output.json payloads beneath *agent_dir*.

    Returns:
        Mapping of scenario_id -> list of judge outputs (one per trial).
        Scenarios with no readable judge output are omitted.
    """

    def _has_scenarios(root: Path) -> bool:
        # True when at least one Scenario-* folder sits directly in *root*.
        return any(
            child.name.startswith("Scenario")
            for child in root.iterdir()
            if child.is_dir()
        )

    # Some layouts nest scenarios one level down (e.g. agent_dir/sre/Scenario-1,
    # agent_dir/finops/Scenario-1). Descend into a suitable subdirectory when
    # no Scenario folder is found directly under agent_dir.
    if not _has_scenarios(agent_dir):
        candidates = [
            child
            for child in agent_dir.iterdir()
            if child.is_dir() and not child.name.startswith(".")
        ]
        if len(candidates) == 1:
            # A single subdirectory: assume it holds the scenarios.
            agent_dir = candidates[0]
        elif candidates:
            # Multiple subdirectories: pick the first one containing scenarios.
            for candidate in candidates:
                if _has_scenarios(candidate):
                    agent_dir = candidate
                    break

    results: dict[str, list[dict]] = {}
    for scenario_dir in agent_dir.iterdir():
        if not (scenario_dir.is_dir() and scenario_dir.name.startswith("Scenario")):
            continue
        trials: list[dict] = []
        # Trial subdirectories are numbered folders (1, 2, 3, ...).
        for trial_dir in sorted(scenario_dir.iterdir()):
            if not trial_dir.is_dir():
                continue
            judge_file = trial_dir / "judge_output.json"
            if not judge_file.exists():
                continue
            try:
                trials.append(json.loads(judge_file.read_text()))
            except Exception as e:
                print(f" Warning: Error reading {judge_file}: {e}")
        if trials:
            results[scenario_dir.name] = trials
    return results
def extract_trial_scores_from_judge_outputs(
    scenario_data: dict[str, list[dict]],
    metric: str,
) -> dict[str, list[float]]:
    """Pull per-trial scores for one metric out of raw judge outputs.

    Args:
        scenario_data: Mapping of scenario_id -> list of judge outputs.
        metric: Name of the metric to read from each trial's ``flat_scores``.

    Returns:
        Mapping of scenario_id -> list of trial scores. Scenarios with no
        trials are dropped; missing/null metric values count as 0.0.
    """
    result: dict[str, list[float]] = {}
    for scenario_id, trials in scenario_data.items():
        scores = [
            0.0 if (raw := trial.get("flat_scores", {}).get(metric)) is None else float(raw)
            for trial in trials
        ]
        if scores:
            result[scenario_id] = scores
    return result
def get_runs_stats(scenario_data: dict[str, list], min_runs_required: int) -> tuple[int, int, int, int]:
    """Summarize run counts: (n_scenarios, min_runs, max_runs, n_qualifying)."""
    if not scenario_data:
        return 0, 0, 0, 0
    counts = sorted(len(trials) for trials in scenario_data.values())
    # Scenarios with at least min_runs_required completed runs.
    qualifying = sum(c >= min_runs_required for c in counts)
    return len(counts), counts[0], counts[-1], qualifying
def filter_scenarios_with_min_runs(scenario_data: dict[str, list], min_runs_required: int) -> dict[str, list]:
    """Keep only scenarios that have at least *min_runs_required* runs."""
    kept: dict[str, list] = {}
    for scenario_id, trials in scenario_data.items():
        if len(trials) >= min_runs_required:
            kept[scenario_id] = trials
    return kept
def find_latest_rollout_file(trial_dir: Path) -> Path | None:
    """Locate the most relevant session/rollout file for a trial.

    Prefers ``trial_dir/session.jsonl`` (new layout); otherwise falls back to
    the most recently modified ``sessions/**/rollout-*.jsonl`` (old layout).

    Args:
        trial_dir: Directory of a single trial run.

    Returns:
        The chosen file path, or ``None`` when neither layout is present.
        (Fixed: the annotation previously claimed ``Path`` even though two
        branches return ``None``.)
    """
    # New structure: a single session.jsonl directly in the trial directory.
    session_file = trial_dir / "session.jsonl"
    if session_file.exists():
        return session_file

    # Old structure: one or more rollout-*.jsonl files under sessions/.
    sessions_dir = trial_dir / "sessions"
    if not sessions_dir.exists():
        return None
    rollout_files = list(sessions_dir.rglob("rollout-*.jsonl"))
    if not rollout_files:
        return None
    # Several rollouts may exist (e.g. retries/resumes); take the newest by mtime.
    return max(rollout_files, key=lambda p: p.stat().st_mtime)
def json_to_filtered_df(path: str) -> pd.DataFrame:
    """Load a .json/.jsonl file into a tidy, time-sorted event DataFrame.

    Keeps only rows whose ``payload.type`` is one of the desired event types,
    guarantees a fixed set of useful columns, and sorts by timestamp ascending.

    Parameters
    ----------
    path : str
        Path to the JSON or JSON Lines file.

    Returns
    -------
    pd.DataFrame
        Tidied DataFrame ready for analysis/labs.

    Raises
    ------
    FileNotFoundError
        If *path* does not exist.
    KeyError
        If the flattened data has no ``payload.type`` column.
    """
    DESIRED_TYPES = {"agent_message", "function_call", "function_call_output"}
    # Union of all "useful" columns across the desired event types.
    USEFUL_COLS = [
        "timestamp",
        "payload.type",
        "payload.message",
        "payload.role",
        "payload.content",
        "payload.name",
        "payload.arguments",
        "payload.call_id",
        "payload.output",
    ]

    src = Path(path)
    if not src.exists():
        raise FileNotFoundError(f"{src} does not exist")

    # Load records: JSON Lines (one object per non-blank line) or a regular
    # JSON document (a list, or a single object wrapped in a list).
    if src.suffix.lower() in (".jsonl", ".ndjson"):
        with src.open("r", encoding="utf-8") as fh:
            records = [json.loads(line) for line in fh if line.strip()]
    else:
        with src.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)
        records = payload if isinstance(payload, list) else [payload]

    # Flatten nested objects into dotted column names (payload.type, ...).
    df = pd.json_normalize(records)

    # Keep only the event types of interest.
    if "payload.type" not in df.columns:
        raise KeyError("'payload.type' column missing from data")
    df = df[df["payload.type"].isin(DESIRED_TYPES)].copy()

    # Guarantee every useful column exists, then project down to exactly them.
    for column in USEFUL_COLS:
        if column not in df.columns:
            df[column] = pd.NA
    df = df[USEFUL_COLS]

    # Chronological order; unparseable timestamps become NaT.
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    return df.sort_values("timestamp", ignore_index=True)