llm-agent-factory / experiments /run_experiments.py

Initial commit

505aa09 3 months ago

29.4 kB

	"""
	Main experiment runner.

	Orchestrates all experiments:
	1. Retrieval-based agent generation + evaluation on MMLU/BBH/BIG-bench
	2. RAG-based agent generation + evaluation on MMLU/BBH/BIG-bench
	3. AutoGen AgentBuilder baseline (simulated with a direct LLM prompt)
	4. Baseline: pure model with generic system prompt (no agent generation)

	Features:
	- Adaptive parallel execution:
	  * Datasets with > LARGE_DATASET_THRESHOLD samples use MAX_PARALLEL_LARGE threads
	  * Smaller datasets use the regular MAX_PARALLEL threads
	- Progress bars via tqdm
	- Checkpoint/resume support
	- Comprehensive metrics collection
	"""

	import json
	import sys
	import time
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from dataclasses import dataclass
	from pathlib import Path
	from typing import TYPE_CHECKING

	from openai import OpenAI
	from tqdm import tqdm

	if TYPE_CHECKING:
	from retrieval.retriever import AgentRetriever

	# Add project root to path
	sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

	from experiments.agent_runner import AgentAnswer, retry_on_network_error, run_agent_on_sample
	from experiments.benchmark_data import BenchmarkSample, load_bbh, load_bigbench, load_mmlu
	from experiments.checkpoint import CheckpointManager, get_checkpoint_id
	from experiments.experiment_configs import (
	ALL_CONFIGS,
	AUTOGEN_CONFIG,
	BASELINE_CONFIG,
	RAG_CONFIGS,
	RETRIEVAL_CONFIGS,
	ExperimentConfig,
	ExperimentMode,
	)
	from experiments.metrics import compute_metrics, print_metrics_table, save_metrics_report

	# ── Constants ─────────────────────────────────────────────────────────────────

	# Thread-pool sizes. These are the defaults for the ``max_parallel`` /
	# ``max_parallel_large`` parameters and the ``--parallel`` / ``--parallel-large``
	# CLI options defined further down in this file.
	MAX_PARALLEL = 20  # Max concurrent threads for small datasets (≤ threshold)
	MAX_PARALLEL_LARGE = 120  # Max concurrent threads for large datasets (> threshold)
	# Each thread runs the FULL pipeline end-to-end:
	# 1. Retrieval / RAG search (embedding lookup, optional rerank)
	# 2. Agent generation via LLM (RAG / AutoGen modes only)
	# 3. Agent execution — LLM answers the benchmark question
	# The comparison in run_all_experiments is strict: a dataset with exactly
	# this many samples still uses MAX_PARALLEL.
	LARGE_DATASET_THRESHOLD = 40  # Sample count above which to use MAX_PARALLEL_LARGE
	# Default for load_datasets(max_samples=...); forwarded unchanged to the loaders.
	MAX_SAMPLES_PER_SUBJECT = None  # None = load ALL samples (no limit)
	# MMLU subjects passed to load_mmlu().
	MMLU_SUBJECTS_SUBSET = [
	    "abstract_algebra",
	    "college_computer_science",
	    "college_mathematics",
	    "conceptual_physics",
	    "formal_logic",
	    "high_school_biology",
	    "high_school_chemistry",
	    "high_school_mathematics",
	    "high_school_physics",
	    "machine_learning",
	    "logical_fallacies",
	    "global_facts",
	    "computer_security",
	]
	# BBH tasks passed to load_bbh().
	BBH_TASKS_SUBSET = [
	    "boolean_expressions",
	    "causal_judgement",
	    "date_understanding",
	    "disambiguation_qa",
	    "formal_fallacies",
	    "logical_deduction_three_objects",
	    "navigate",
	    "sports_understanding",
	    "web_of_lies",
	    "word_sorting",
	]
	# BIG-bench tasks passed to load_bigbench().
	# NOTE(review): "causal_judgment" is spelled differently from the BBH entry
	# "causal_judgement" above — presumably each matches its upstream task name;
	# confirm before normalising.
	BIGBENCH_TASKS_SUBSET = [
	    "abstract_narrative_understanding",
	    "anachronisms",
	    "causal_judgment",
	    "cause_and_effect",
	    "elementary_math_qa",
	    "epistemic_reasoning",
	    "general_knowledge",
	    "logical_fallacy_detection",
	    "odd_one_out",
	    "strategyqa",
	]

	import os

	# LLM endpoint settings, overridable through environment variables.
	# NOTE(review): none of these three names is referenced elsewhere in the
	# visible part of this file (the clients below are built from the experiment
	# config instead) — presumably they are imported by other modules; confirm.
	LLM_MODEL = os.getenv("LLM_MODEL", "gpt-4")
	LLM_BASE_URL = os.getenv("LLM_BASE_URL", "https://api.openai.com/v1")
	LLM_API_KEY = os.getenv("LLM_API_KEY", "")  # Empty string when the variable is unset


	# ── Generation result ─────────────────────────────────────────────────────────


	@dataclass
	class AgentGenResult:
	    """Agent spec plus the time and LLM tokens spent producing it.

	    Returned by each ``get_agent_via_*`` function in this module. The token
	    counters keep their zero defaults when the producer made no LLM call.
	    """

	    # Agent specification. The producers in this module always provide at
	    # least the keys agent_id, display_name, persona, description and tools.
	    agent_spec: dict
	    # Wall-clock seconds (time.perf_counter delta) spent obtaining the spec.
	    gen_time: float = 0.0
	    # Token usage reported by the agent-generation LLM call, if any.
	    gen_prompt_tokens: int = 0
	    gen_completion_tokens: int = 0
	    gen_total_tokens: int = 0


	# ── Retrieval-based agent generation ─────────────────────────────────────────


	def get_agent_via_retrieval(
	    query: str,
	    config: ExperimentConfig,
	    shared_retriever: "AgentRetriever | None" = None,
	) -> AgentGenResult:
	    """
	    Retrieve the single best-matching agent spec using the retrieval system.

	    No LLM call is made here, so the token counters of the result keep their
	    zero defaults.

	    Args:
	        query: Text used to search the agent index.
	        config: Experiment configuration supplying the retrieval settings
	            (dataset type, embedding model, top_k and reranker options).
	        shared_retriever: Optional pre-initialized retriever. When given it is
	            used as-is and is not initialized again. When omitted, a new
	            retriever is built from ``config`` and initialized on every call.

	    Returns:
	        AgentGenResult whose ``agent_spec`` is built from the top search hit,
	        or a generic "fallback" spec when the search returns nothing.
	        ``gen_time`` covers the search, plus ``initialize()`` when this
	        function had to create the retriever itself.
	    """
	    retriever = shared_retriever
	    needs_init = retriever is None

	    if needs_init:
	        # Imported lazily so the retrieval stack is only loaded when needed.
	        from retrieval.config import RetrievalConfig
	        from retrieval.retriever import AgentRetriever

	        retrieval_config = RetrievalConfig(
	            dataset_type=config.dataset_type,
	            embedding_model=config.embedding_model,
	            top_k=config.top_k,
	            use_reranker=config.use_reranker,
	            reranker_model=config.reranker_model,
	            rerank_top_k=config.rerank_top_k,
	        )
	        retriever = AgentRetriever(retrieval_config, verbose=False)

	    # Construction is deliberately outside the timed region; initialization
	    # and the search itself are inside it.
	    t0 = time.perf_counter()
	    if needs_init:
	        retriever.initialize()
	    results = retriever.search(query, top_k=1)
	    retrieval_time = time.perf_counter() - t0

	    if results:
	        agent = results[0].agent
	        spec = {
	            "agent_id": agent.agent_id,
	            "display_name": agent.display_name,
	            "persona": agent.persona,
	            "description": agent.description,
	            "tools": agent.tools,
	        }
	    else:
	        # Nothing retrieved: fall back to a generic assistant so the sample
	        # can still be executed.
	        spec = {
	            "agent_id": "fallback",
	            "display_name": "General Assistant",
	            "persona": "A helpful AI assistant.",
	            "description": "Answers questions accurately.",
	            "tools": [],
	        }

	    return AgentGenResult(agent_spec=spec, gen_time=retrieval_time)


	# ── RAG-based agent generation (with token counting) ─────────────────────────


	def get_agent_via_rag(
	    query: str,
	    config: ExperimentConfig,
	    shared_retriever: "AgentRetriever | None" = None,
	    shared_gen_client: "OpenAI | None" = None,
	) -> AgentGenResult:
	    """
	    Generate an agent spec using the RAG system.
	    Counts LLM tokens spent on agent generation.

	    Args:
	        query: The query to generate an agent for.
	        config: Experiment configuration.
	        shared_retriever: Pre-initialized retriever (avoids reloading embeddings).
	            When omitted a new retriever is built and initialized inside the
	            timed region.
	        shared_gen_client: Pre-initialized OpenAI client for generation. When
	            omitted a client is created from ``config`` with a 120 s timeout.

	    Returns:
	        AgentGenResult with the parsed agent dict (missing keys filled with
	        defaults; an empty dict plus defaults when parsing fails), the token
	        usage of the generation call, and ``gen_time`` covering retrieval
	        plus the LLM call.

	    Raises:
	        Whatever ``retry_on_network_error`` lets through for the LLM call;
	        unlike response parsing, the call itself is not wrapped in a fallback.
	    """
	    from retrieval.rag import SYSTEM_PROMPT, build_prompt, parse_agent_response

	    # --- Step 1: Retrieval using shared retriever (no LLM tokens) ---
	    t0 = time.perf_counter()

	    retriever = shared_retriever
	    if retriever is None:
	        # Fallback: create new retriever (slow, should not happen in normal flow)
	        from retrieval.config import RetrievalConfig
	        from retrieval.retriever import AgentRetriever

	        retrieval_kwargs = {
	            "dataset_type": config.dataset_type,
	            "embedding_model": config.embedding_model,
	            "top_k": config.top_k,
	            "use_reranker": config.use_reranker,
	        }
	        if config.use_reranker:
	            # Forward the reranker settings so this fallback retriever is
	            # configured like the ones built elsewhere in this module.
	            retrieval_kwargs["reranker_model"] = config.reranker_model
	            retrieval_kwargs["rerank_top_k"] = config.rerank_top_k

	        retriever = AgentRetriever(RetrievalConfig(**retrieval_kwargs), verbose=False)
	        retriever.initialize()

	    examples = retriever.search(query, top_k=config.num_retrieved_for_context)

	    # --- Step 2: LLM generation (count tokens!) ---
	    prompt = build_prompt(
	        query=query,
	        examples=examples if config.include_examples_in_prompt else [],
	        num_agents=1,
	    )

	    gen_client = shared_gen_client
	    if gen_client is None:
	        gen_client = OpenAI(
	            base_url=config.agent_base_url,
	            api_key=config.agent_api_key,
	            timeout=120,
	        )

	    response = retry_on_network_error(
	        gen_client.chat.completions.create,
	        model=config.agent_model,
	        messages=[
	            {"role": "system", "content": SYSTEM_PROMPT},
	            {"role": "user", "content": prompt},
	        ],
	        temperature=config.llm_temperature,
	        max_tokens=2048,
	    )
	    # Timing stops here: parsing the response is not counted as generation time.
	    gen_time = time.perf_counter() - t0

	    # Count generation tokens (usage may be absent on the response)
	    usage = response.usage
	    gen_prompt_tokens = usage.prompt_tokens if usage else 0
	    gen_completion_tokens = usage.completion_tokens if usage else 0
	    gen_total_tokens = usage.total_tokens if usage else 0

	    message = response.choices[0].message
	    # Fall back to ``reasoning_content`` when the regular content is empty.
	    response_text = message.content or getattr(message, "reasoning_content", None) or ""

	    try:
	        agents = parse_agent_response(response_text, 1)
	        agent = agents[0] if agents else {}
	    except Exception:
	        # Deliberate best-effort: an unparseable reply degrades to an empty
	        # spec (filled with defaults below) instead of failing the sample.
	        agent = {}

	    agent.setdefault("agent_id", "rag_generated")
	    agent.setdefault("display_name", "RAG Agent")
	    agent.setdefault("persona", "")
	    agent.setdefault("description", "")
	    agent.setdefault("tools", [])

	    return AgentGenResult(
	        agent_spec=agent,
	        gen_time=gen_time,
	        gen_prompt_tokens=gen_prompt_tokens,
	        gen_completion_tokens=gen_completion_tokens,
	        gen_total_tokens=gen_total_tokens,
	    )


	# ── AutoGen-style agent generation (LLM-prompted simulation of AgentBuilder) ──


	def _first_json_object(text: str) -> dict:
	    """Return the first complete JSON object embedded in *text*.

	    Scans from each ``{`` and lets the JSON decoder find the matching end, so
	    nested objects and surrounding prose or code fences are handled.

	    Raises:
	        ValueError: if no position in *text* starts a valid JSON object.
	    """
	    decoder = json.JSONDecoder()
	    start = text.find("{")
	    while start != -1:
	        try:
	            candidate, _end = decoder.raw_decode(text, start)
	        except json.JSONDecodeError:
	            start = text.find("{", start + 1)
	        else:
	            return candidate
	    raise ValueError("no JSON object found in model response")


	def get_agent_via_autogen(
	    query: str,
	    config: ExperimentConfig,
	    shared_gen_client: "OpenAI | None" = None,
	) -> AgentGenResult:
	    """
	    Generate an agent spec by prompting the LLM to act as AutoGen's AgentBuilder.

	    The AutoGen library itself is not invoked here; the builder is simulated
	    with a single chat completion against the same LLM API. Tokens spent on
	    that call are counted.

	    Args:
	        query: Task description inserted into the builder prompt.
	        config: Experiment configuration (model name and API endpoint).
	        shared_gen_client: Pre-initialized OpenAI client. When omitted a
	            client is created from ``config`` with a 120 s timeout.

	    Returns:
	        AgentGenResult with the parsed agent dict (missing keys filled with
	        defaults). If the LLM call or the parsing fails, a fixed
	        "autogen_fallback" spec is returned instead; token counters then hold
	        whatever was recorded before the failure. ``gen_time`` covers the
	        whole function body, including parsing.
	    """
	    t0 = time.perf_counter()

	    client = shared_gen_client
	    if client is None:
	        client = OpenAI(
	            base_url=config.agent_base_url,
	            api_key=config.agent_api_key,
	            # Same timeout as every other client created in this module.
	            timeout=120,
	        )

	    # Step 1: Use LLM to generate agent spec (simulating AutoGen AgentBuilder)
	    gen_prompt = (
	        "You are AutoGen AgentBuilder. Given a task description, create a specialized "
	        "AI agent configuration.\n\n"
	        f"Task: {query}\n\n"
	        "Generate a JSON agent specification with these fields:\n"
	        "- agent_id: snake_case identifier\n"
	        "- display_name: human-readable name\n"
	        "- persona: detailed personality and expertise description\n"
	        "- description: what the agent does and its capabilities\n"
	        "- system_message: the system prompt this agent should use\n"
	        "- tools: list of tool names (can be empty)\n\n"
	        "Output ONLY valid JSON, no other text."
	    )

	    gen_prompt_tokens = 0
	    gen_completion_tokens = 0
	    gen_total_tokens = 0

	    try:
	        response = retry_on_network_error(
	            client.chat.completions.create,
	            model=config.agent_model,
	            messages=[
	                {
	                    "role": "system",
	                    "content": "You are AutoGen AgentBuilder - a system that creates specialized AI agents.",
	                },
	                {"role": "user", "content": gen_prompt},
	            ],
	            temperature=0.7,
	            max_tokens=1024,
	        )

	        # Count generation tokens (usage may be absent on the response)
	        usage = response.usage
	        gen_prompt_tokens = usage.prompt_tokens if usage else 0
	        gen_completion_tokens = usage.completion_tokens if usage else 0
	        gen_total_tokens = usage.total_tokens if usage else 0

	        message = response.choices[0].message
	        # Fall back to ``reasoning_content`` when the regular content is empty.
	        text = message.content or getattr(message, "reasoning_content", None) or ""

	        # Parse JSON: take the first complete object in the reply.
	        agent = _first_json_object(text)

	        agent.setdefault("agent_id", "autogen_generated")
	        agent.setdefault("display_name", "AutoGen Agent")
	        agent.setdefault("persona", "")
	        agent.setdefault("description", "")
	        agent.setdefault("tools", [])

	    except Exception:
	        # Deliberate best-effort: any failure (API error, empty or malformed
	        # reply) degrades to a generic spec so the sample is still executed.
	        agent = {
	            "agent_id": "autogen_fallback",
	            "display_name": "AutoGen Assistant",
	            "persona": "A helpful AI assistant created by AutoGen.",
	            "description": "Answers questions accurately and helpfully.",
	            "tools": [],
	        }

	    gen_time = time.perf_counter() - t0
	    return AgentGenResult(
	        agent_spec=agent,
	        gen_time=gen_time,
	        gen_prompt_tokens=gen_prompt_tokens,
	        gen_completion_tokens=gen_completion_tokens,
	        gen_total_tokens=gen_total_tokens,
	    )


	# ── Baseline: pure model with generic system prompt ──────────────────────────


	def get_agent_via_baseline(
	    query: str,
	    config: ExperimentConfig,
	) -> AgentGenResult:
	    """
	    Build the fixed, generic agent spec used for the pure-model baseline.

	    Neither ``query`` nor ``config`` is consulted; they are accepted so this
	    function has the same call shape as the other ``get_agent_via_*``
	    producers. Nothing is retrieved or generated, so the result carries a
	    generation time of 0.0 and zero generation tokens.
	    """
	    persona_text = (
	        "You are a helpful, accurate, and concise AI assistant. You answer questions to the best of your knowledge."
	    )
	    description_text = (
	        "A general-purpose AI assistant with no specialized knowledge retrieval. "
	        "Answers questions using only the base model capabilities."
	    )

	    generic_spec = dict(
	        agent_id="baseline_agent",
	        display_name="Baseline Assistant",
	        persona=persona_text,
	        description=description_text,
	        tools=[],
	    )
	    return AgentGenResult(agent_spec=generic_spec, gen_time=0.0)


	# ── Process a single sample ───────────────────────────────────────────────────


	def _generate_agent(
	    sample: BenchmarkSample,
	    config: ExperimentConfig,
	    shared_retriever: "AgentRetriever | None" = None,
	    shared_gen_client: "OpenAI | None" = None,
	) -> AgentGenResult:
	    """Generate/retrieve an agent spec for a sample.

	    Dispatches on ``config.mode`` to the matching ``get_agent_via_*``
	    function. Only the first 200 characters of the question are used as the
	    agent query.

	    Args:
	        sample: Benchmark sample whose question drives agent selection.
	        config: Experiment configuration; ``config.mode`` picks the producer.
	        shared_retriever: Forwarded to the RAG producer only.
	        shared_gen_client: Forwarded to the RAG and AutoGen producers.

	    Raises:
	        ValueError: if ``config.mode`` is not one of the four known modes.
	    """
	    query_for_agent = sample.question[:200]
	    mode = config.mode

	    if mode == ExperimentMode.RETRIEVAL:
	        return get_agent_via_retrieval(query_for_agent, config)
	    if mode == ExperimentMode.RAG:
	        return get_agent_via_rag(query_for_agent, config, shared_retriever, shared_gen_client)
	    if mode == ExperimentMode.AUTOGEN:
	        return get_agent_via_autogen(query_for_agent, config, shared_gen_client)
	    if mode == ExperimentMode.BASELINE:
	        return get_agent_via_baseline(query_for_agent, config)

	    msg = f"Unknown mode: {config.mode}"
	    raise ValueError(msg)


	def process_sample(
	    sample: BenchmarkSample,
	    config: ExperimentConfig,
	    client: OpenAI,
	    shared_retriever: "AgentRetriever | None" = None,
	    shared_gen_client: "OpenAI | None" = None,
	) -> AgentAnswer:
	    """
	    Full pipeline for a single sample:
	    1. Generate/retrieve agent spec (count gen tokens)
	    2. Run agent on the benchmark question (count exec tokens)
	    3. Return answer with all metrics

	    Args:
	        sample: Benchmark sample to answer.
	        config: Experiment configuration (mode, model, temperature, token limit).
	        client: OpenAI client used for the agent-execution call.
	        shared_retriever: Optional pre-initialized retriever, passed on to
	            agent generation.
	        shared_gen_client: Optional pre-initialized client for the
	            agent-generation call.

	    Returns:
	        The AgentAnswer from ``run_agent_on_sample`` with the generation
	        token counts, ``retrieval_time`` and ``latency_seconds`` filled in.
	    """
	    # Step 1: Get agent spec
	    gen_result = _generate_agent(sample, config, shared_retriever, shared_gen_client)

	    # Step 2: Run agent on sample
	    answer = run_agent_on_sample(
	        agent_spec=gen_result.agent_spec,
	        sample=sample,
	        client=client,
	        model=config.agent_model,
	        temperature=config.agent_temperature,
	        max_tokens=config.agent_max_tokens,
	    )

	    # Step 3: Merge generation metrics into answer
	    answer.gen_prompt_tokens = gen_result.gen_prompt_tokens
	    answer.gen_completion_tokens = gen_result.gen_completion_tokens
	    answer.gen_total_tokens = gen_result.gen_total_tokens
	    # ``retrieval_time`` receives the whole generation time, which for the
	    # RAG and AutoGen producers includes their LLM call, not just a search.
	    answer.retrieval_time = gen_result.gen_time
	    answer.latency_seconds = gen_result.gen_time + answer.execution_time

	    return answer


	# ── Run a single experiment config on a dataset ──────────────────────────────


	def run_single_experiment(
	    config: ExperimentConfig,
	    samples: list[BenchmarkSample],
	    dataset_name: str,
	    max_parallel: int = MAX_PARALLEL,
	    shared_retriever: "AgentRetriever | None" = None,
	    shared_gen_client: "OpenAI | None" = None,
	) -> list[AgentAnswer]:
	    """
	    Run a single experiment configuration on a dataset.

	    Features:
	    - Parallel execution with ThreadPoolExecutor
	    - Progress bar via tqdm
	    - Checkpoint/resume support

	    Args:
	        config: Experiment configuration to run.
	        samples: All samples of the dataset; those whose ``sample_id`` is
	            already in the checkpoint are skipped.
	        dataset_name: Dataset label, used for the checkpoint id and the
	            progress-bar description.
	        max_parallel: Number of worker threads.
	        shared_retriever: Pre-initialized retriever (reused across datasets).
	        shared_gen_client: Pre-initialized OpenAI client for agent generation.

	    Returns:
	        Every result stored in the checkpoint after the run (previously saved
	        results plus the ones produced now), rebuilt as AgentAnswer objects.

	    Raises:
	        Errors raised while writing to the checkpoint propagate to the caller.
	        Errors raised while processing a sample do not: they are stored as a
	        failed AgentAnswer for that sample.
	    """
	    checkpoint_id = get_checkpoint_id(config.config_id, dataset_name)
	    ckpt = CheckpointManager(checkpoint_id)

	    # Save experiment metadata
	    ckpt.set_metadata("config_id", config.config_id)
	    ckpt.set_metadata("dataset_name", dataset_name)
	    ckpt.set_metadata("mode", config.mode.value)
	    ckpt.set_metadata("description", config.description)
	    ckpt.set_metadata("total_samples", len(samples))

	    # Filter out already completed samples
	    completed_ids = ckpt.get_completed_ids()
	    remaining = [s for s in samples if s.sample_id not in completed_ids]

	    if not remaining:
	        return [AgentAnswer(**r) for r in ckpt.get_all_results()]

	    # Create OpenAI client for agent execution (shared across all threads)
	    client = OpenAI(
	        base_url=config.agent_base_url,
	        api_key=config.agent_api_key,
	        timeout=120,
	    )

	    def _process_one(sample: BenchmarkSample) -> AgentAnswer:
	        """
	        Full end-to-end pipeline for one sample — runs entirely inside a single thread.

	        Steps (all in this thread, nothing split out):
	        1. Agent generation:
	           - RETRIEVAL : search on the shared retriever (no LLM tokens)
	           - RAG       : search → LLM call to synthesise agent spec
	           - AutoGen   : LLM call to build agent spec
	           - Baseline  : fixed generic spec, no calls
	        2. Agent execution: LLM call with the generated agent's spec
	           to answer the actual benchmark question.
	        """
	        if config.mode != ExperimentMode.RETRIEVAL or shared_retriever is None:
	            # RAG / AutoGen / Baseline (and RETRIEVAL without a shared
	            # retriever): generation + execution in one call.
	            return process_sample(
	                sample,
	                config,
	                client,
	                shared_retriever=shared_retriever,
	                shared_gen_client=shared_gen_client,
	            )

	        # Step 1 — retrieval search on the shared retriever (no LLM tokens).
	        # NOTE(review): assumes ``search`` may be called from several
	        # threads at once — confirm against the retriever implementation.
	        query = sample.question[:200]
	        t0 = time.perf_counter()
	        search_results = shared_retriever.search(query, top_k=1)
	        # The measured time is kept even when nothing is found, matching
	        # get_agent_via_retrieval; the search still contributed to latency.
	        retrieval_time = time.perf_counter() - t0

	        if search_results:
	            agent = search_results[0].agent
	            agent_spec = {
	                "agent_id": agent.agent_id,
	                "display_name": agent.display_name,
	                "persona": agent.persona,
	                "description": agent.description,
	                "tools": agent.tools,
	            }
	        else:
	            agent_spec = {
	                "agent_id": "fallback",
	                "display_name": "General Assistant",
	                "persona": "A helpful AI assistant.",
	                "description": "Answers questions accurately.",
	                "tools": [],
	            }

	        # Step 2 — agent execution (LLM call)
	        answer = run_agent_on_sample(
	            agent_spec=agent_spec,
	            sample=sample,
	            client=client,
	            model=config.agent_model,
	            temperature=config.agent_temperature,
	            max_tokens=config.agent_max_tokens,
	        )
	        answer.retrieval_time = retrieval_time
	        answer.latency_seconds = retrieval_time + answer.execution_time
	        return answer

	    # Running tallies for the progress-bar postfix (this run only).
	    done = 0
	    correct = 0
	    errors = 0

	    desc = f"{config.config_id}/{dataset_name}"

	    # Run in parallel. The context managers close the bar and shut the pool
	    # down even if something below raises.
	    with tqdm(total=len(remaining), desc=desc, unit="sample", leave=True) as pbar:
	        with ThreadPoolExecutor(max_workers=max_parallel) as executor:
	            future_to_sample = {executor.submit(_process_one, sample): sample for sample in remaining}

	            # Results are collected and checkpointed in the calling thread.
	            for future in as_completed(future_to_sample):
	                # Only the worker outcome is guarded; a checkpoint write
	                # failure must not be recorded as a sample error.
	                try:
	                    answer = future.result()
	                except Exception as exc:
	                    sample = future_to_sample[future]
	                    answer = AgentAnswer(
	                        sample_id=sample.sample_id,
	                        predicted_answer="",
	                        correct_answer=sample.correct_answer,
	                        is_correct=False,
	                        # Never store an empty error text for a failure.
	                        error=str(exc) or repr(exc),
	                    )
	                    failed = True
	                else:
	                    failed = bool(answer.error)

	                ckpt.save_result(answer)

	                done += 1
	                if failed:
	                    errors += 1
	                if answer.is_correct:
	                    correct += 1

	                pbar.update(1)
	                # Update postfix with running accuracy
	                pbar.set_postfix(
	                    {
	                        "acc": f"{correct / done:.1%}",
	                        "err": errors,
	                    }
	                )

	    # Combine with previously cached results
	    return [AgentAnswer(**r) for r in ckpt.get_all_results()]


	# ── Main orchestrator ─────────────────────────────────────────────────────────


	def load_datasets(max_samples: int | None = MAX_SAMPLES_PER_SUBJECT) -> dict[str, list[BenchmarkSample]]:
	    """Load all benchmark datasets.

	    Args:
	        max_samples: Per-subject (MMLU) / per-task (BBH, BIG-bench) sample
	            limit forwarded to each loader. Defaults to
	            MAX_SAMPLES_PER_SUBJECT, which is ``None`` (no limit).

	    Returns:
	        Dict with the keys "mmlu", "bbh" and "bigbench", each mapping to the
	        samples returned by the corresponding loader.
	    """
	    return {
	        "mmlu": load_mmlu(
	            subjects=MMLU_SUBJECTS_SUBSET,
	            max_samples_per_subject=max_samples,
	        ),
	        "bbh": load_bbh(
	            tasks=BBH_TASKS_SUBSET,
	            max_samples_per_task=max_samples,
	        ),
	        "bigbench": load_bigbench(
	            tasks=BIGBENCH_TASKS_SUBSET,
	            max_samples_per_task=max_samples,
	        ),
	    }


	def run_all_experiments(
	    configs: list[ExperimentConfig] | None = None,
	    datasets: dict[str, list[BenchmarkSample]] | None = None,
	    max_parallel: int = MAX_PARALLEL,
	    max_parallel_large: int = MAX_PARALLEL_LARGE,
	    large_dataset_threshold: int = LARGE_DATASET_THRESHOLD,
	) -> dict[str, list[AgentAnswer]]:
	    """
	    Run all experiment configurations on all datasets.

	    Adaptive parallelism: datasets with more than ``large_dataset_threshold``
	    samples are processed with ``max_parallel_large`` threads; smaller ones
	    use ``max_parallel``. Checkpoint ids are derived from config_id and
	    dataset_name only, so changing the parallelism does not change which
	    checkpoint a run resumes from.

	    Args:
	        configs: Configurations to run; ``None`` means ALL_CONFIGS.
	        datasets: Mapping of dataset name to samples; ``None`` means
	            ``load_datasets()`` with its defaults.
	        max_parallel: Thread count for datasets at or below the threshold.
	        max_parallel_large: Thread count for datasets above the threshold.
	        large_dataset_threshold: Sample count above which the larger pool is used.

	    Returns:
	        Dict mapping the checkpoint id of each (config, dataset) pair, as
	        produced by ``get_checkpoint_id``, to its list of AgentAnswer.
	    """
	    if datasets is None:
	        datasets = load_datasets()

	    if configs is None:
	        configs = ALL_CONFIGS

	    all_results: dict[str, list[AgentAnswer]] = {}

	    for config in configs:
	        # ── Pre-initialize shared resources ONCE per config ──────────
	        # The retriever and the generation client are created here and
	        # reused across ALL datasets for this config.
	        shared_retriever = None
	        shared_gen_client = None

	        if config.mode in (ExperimentMode.RETRIEVAL, ExperimentMode.RAG):
	            # Imported lazily so non-retrieval runs never load this stack.
	            from retrieval.config import RetrievalConfig
	            from retrieval.retriever import AgentRetriever

	            retrieval_config = RetrievalConfig(
	                dataset_type=config.dataset_type,
	                embedding_model=config.embedding_model,
	                top_k=config.top_k,
	                use_reranker=config.use_reranker,
	                # A concrete model name is always passed, even when the
	                # reranker is disabled.
	                reranker_model=config.reranker_model if config.use_reranker else "BAAI/bge-reranker-base",
	                rerank_top_k=config.rerank_top_k,
	            )
	            shared_retriever = AgentRetriever(retrieval_config, verbose=False)
	            shared_retriever.initialize()

	        if config.mode in (ExperimentMode.RAG, ExperimentMode.AUTOGEN):
	            shared_gen_client = OpenAI(
	                base_url=config.agent_base_url,
	                api_key=config.agent_api_key,
	                timeout=120,
	            )

	        # ── Run on all datasets with shared resources ────────────────
	        for ds_name, samples in datasets.items():
	            # Adaptive parallelism: use a larger thread pool for big datasets
	            if len(samples) > large_dataset_threshold:
	                effective_parallel = max_parallel_large
	            else:
	                effective_parallel = max_parallel

	            key = get_checkpoint_id(config.config_id, ds_name)
	            all_results[key] = run_single_experiment(
	                config=config,
	                samples=samples,
	                dataset_name=ds_name,
	                max_parallel=effective_parallel,
	                shared_retriever=shared_retriever,
	                shared_gen_client=shared_gen_client,
	            )

	    return all_results


	# ── Entry point ───────────────────────────────────────────────────────────────


	def _report_results(all_results: dict) -> None:
	    """Print one metrics table for all non-empty result sets and save the report."""
	    metrics_dict = {key: compute_metrics(results) for key, results in all_results.items() if results}
	    print_metrics_table(metrics_dict)
	    save_metrics_report(all_results, Path("experiments/results"))


	def main():
	    """Main entry point for running all experiments.

	    Parses the command line, then either rebuilds the report from existing
	    checkpoints (``--report-only``) or loads the datasets, runs the selected
	    configurations and reports on the fresh results. Both paths print a
	    single combined metrics table and save the report under
	    ``experiments/results`` (relative to the working directory).
	    """
	    import argparse

	    def _positive_int(raw: str) -> int:
	        """argparse type: accept only integers >= 1 (thread counts)."""
	        value = int(raw)
	        if value < 1:
	            raise argparse.ArgumentTypeError(f"must be a positive integer, got {raw}")
	        return value

	    parser = argparse.ArgumentParser(description="Run benchmark experiments")
	    parser.add_argument(
	        "--mode",
	        choices=["all", "retrieval", "rag", "autogen", "baseline"],
	        default="all",
	        help="Which experiments to run",
	    )
	    parser.add_argument(
	        "--parallel",
	        type=_positive_int,
	        default=MAX_PARALLEL,
	        help=f"Max parallel tasks for small datasets (≤{LARGE_DATASET_THRESHOLD} samples). Default: {MAX_PARALLEL}",
	    )
	    parser.add_argument(
	        "--parallel-large",
	        type=_positive_int,
	        default=MAX_PARALLEL_LARGE,
	        help=f"Max parallel tasks for large datasets (>{LARGE_DATASET_THRESHOLD} samples). "
	        f"Default: {MAX_PARALLEL_LARGE}",
	    )
	    parser.add_argument(
	        "--large-threshold",
	        type=int,
	        default=LARGE_DATASET_THRESHOLD,
	        help=f"Sample count threshold that triggers higher parallelism. Default: {LARGE_DATASET_THRESHOLD}",
	    )
	    parser.add_argument("--max-samples", type=int, default=None, help="Max samples per subject/task (default: all)")
	    parser.add_argument("--report-only", action="store_true", help="Only generate report from existing checkpoints")
	    args = parser.parse_args()

	    if args.report_only:
	        # Just generate report from existing checkpoints
	        from experiments.checkpoint import list_checkpoints

	        checkpoints = list_checkpoints()
	        if not checkpoints:
	            # Say why nothing happens instead of exiting silently.
	            print("No checkpoints found; nothing to report.", file=sys.stderr)
	            return

	        all_results = {}
	        for ckpt_info in checkpoints:
	            experiment_id = ckpt_info["experiment_id"]
	            stored = CheckpointManager(experiment_id).get_all_results()
	            all_results[experiment_id] = [AgentAnswer(**r) for r in stored]

	        _report_results(all_results)
	        return

	    # Load datasets
	    datasets = load_datasets(max_samples=args.max_samples)

	    # Select configs based on mode ("all" is the only choice not in the table)
	    configs_by_mode = {
	        "retrieval": RETRIEVAL_CONFIGS,
	        "rag": RAG_CONFIGS,
	        "autogen": [AUTOGEN_CONFIG],
	        "baseline": [BASELINE_CONFIG],
	    }
	    configs = configs_by_mode.get(args.mode, ALL_CONFIGS)

	    # Run experiments
	    all_results = run_all_experiments(
	        configs=configs,
	        datasets=datasets,
	        max_parallel=args.parallel,
	        max_parallel_large=args.parallel_large,
	        large_dataset_threshold=args.large_threshold,
	    )

	    # Generate report
	    _report_results(all_results)


	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()