# NOTE: extraction artifact — "Spaces: Running" is Hugging Face Spaces page chrome, not part of the script.
| import argparse | |
| import gc | |
| import logging | |
| from datetime import datetime | |
| import torch | |
| import wandb | |
| from tqdm import tqdm | |
| from predictions.all_llms import small_llm_2 | |
| from src import WANDB_PROJECT | |
| from src.evaluation.llm_evaluator import ModelEvaluator | |
| from src.evaluation.llm_factory import model_factory | |
| from src.task.task_factory import tasks_factory | |
| from src.task.task_names import Tasks | |
# Command-line interface for the evaluation script.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--test",
    # BUG FIX: the original used `type=bool`, which converts ANY non-empty
    # string (including "False") to True. A boolean flag must use
    # `action="store_true"`: absent -> False, `--test` -> True.
    help="If set, the system will default to testing only a small model with a few examples.",
    action="store_true",
)
parser.add_argument(
    "--max_examples",
    "-m",
    help="The maximum number of examples to use, defaults to None.",
    type=int,
    default=None,  # None means "use every example"
)
parser.add_argument(
    "--token",
    "-t",
    help="Input your HuggingFace token to fetch models.",
    type=str,
    default=None,
)
parser.add_argument(
    "--models_name",
    "-mn",
    help="The name of the model(s) to load.",
    type=str,
    default=None,  # None -> fall back to the full small_llm_2["all"] list
)
parser.add_argument(
    "--batch_size",
    help="The batch size to use during the evaluation.",
    type=int,
    default=256,
)
parser.add_argument(
    "--skip_first_n",
    help="The number of LLM to skip in the list of split",
    type=int,
    default=None,  # useful to resume an interrupted sweep
)
# Parse CLI arguments and resolve which models to evaluate.
args = parser.parse_args()

# Instantiate every known task up front.
tasks_names = list(Tasks)
tasks = tasks_factory(tasks_names)

# Resolve the model list: a named group from small_llm_2, the random
# baseline, an explicit comma-separated list, or (default) all models.
if args.models_name is None:
    models = small_llm_2["all"]
elif args.models_name in small_llm_2:
    models = small_llm_2[args.models_name]
elif args.models_name == "RandomBaselineModel":
    models = ["RandomBaselineModel"]
else:
    models = args.models_name.split(",")

# Allow resuming an interrupted sweep by dropping the first N models.
if args.skip_first_n is not None:
    models = models[args.skip_first_n :]
| logging.info("Starting Evaluation") | |
| time_start = datetime.now() | |
| for model_name in tqdm( | |
| models, total=len(models), desc="Processing LLM inference on tasks." | |
| ): | |
| try: | |
| model = model_factory(model_name, batch_size=args.batch_size) | |
| logging.info("Creating model") | |
| evaluator = ModelEvaluator() | |
| logging.info("Evaluating model") | |
| exp_name = f"{model_name}" | |
| wandb.init( | |
| project=WANDB_PROJECT, | |
| entity="doctorate", | |
| config={"model_name": model_name, "tasks": "; ".join(tasks_names)}, | |
| name=exp_name, | |
| ) | |
| predictions_payload = evaluator.evaluate_subset(model, tasks, args.max_examples) | |
| wandb.log(predictions_payload) | |
| logging.info("Saving results") | |
| evaluator.save_results("./results") | |
| metrics_payload = evaluator.compute_metrics() | |
| evaluator.save_metrics("./results") | |
| wandb.log(metrics_payload) | |
| except Exception as e: | |
| error_message = f"Evaluation failed for model {model_name}: {e}" | |
| logging.error(error_message) | |
| wandb.finish(exit_code=1) | |
| continue | |
| finally: | |
| # Memory cleaning | |
| if "model" in locals(): | |
| del model | |
| if "evaluator" in locals(): | |
| del evaluator | |
| gc.collect() | |
| torch.cuda.empty_cache() | |
| wandb.finish(exit_code=0) | |
# Report the wall-clock duration of the whole sweep.
time_end = datetime.now()
logging.info(f"End time: {time_end}")
elapsed_time = time_end - time_start
logging.info(f"Elapsed time: {elapsed_time}")