Add files using upload-large-folder tool

14c9c2b verified about 1 month ago

5.42 kB

	import os
	import json
	import time
	import random
	import argparse
	import importlib.util
	from pathlib import Path
	from typing import Optional, Tuple, Callable

	import numpy as np

	from shinka.llm import LLMClient
	from lm_judge_prompt import make_lm_input_and_output_processors


	def evaluate_with_lm_judge(
	    program_path: str,
	    results_dir: str,
	    lm_input_and_output_processors: Callable | Tuple[Callable, Callable] = (
	        make_lm_input_and_output_processors),
	    llm_judge_names=[
	        "azure-gpt-4.1",
	        "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0",
	        "gemini-2.5-pro",
	    ],
	    llm_judge_kwargs=dict(
	        temperatures=0.0,
	        max_tokens=8196,
	        reasoning_efforts="low",
	        model_sample_probs=None,
	        output_model=None,
	        verbose=True
	    ),
	    limit_max_characters: Optional[int] = None,
	    num_samples: int = 20,
	    seed: int = 42,
	):
	    """Run a program's experiment and score its outputs with LLM judges.

	    The module at ``program_path`` is loaded and its ``run_experiment`` is
	    called with ``num_samples`` seeded random integers. The outputs are
	    turned into a prompt, every judge is queried once, and the per-judge
	    scores plus their mean are written to ``results_dir``.

	    Args:
	        program_path: Path of the Python file exposing ``run_experiment``.
	        results_dir: Directory receiving ``correct.json`` and
	            ``metrics.json``; created if it does not exist.
	        lm_input_and_output_processors: Either a
	            ``(get_evaluation_prompt, extract_results)`` tuple, or a factory
	            called with ``number_of_samples=num_samples`` returning such a
	            pair.
	        llm_judge_names: One judge model name or a list of them. The default
	            list is only read, never mutated.
	        llm_judge_kwargs: Keyword arguments for ``LLMClient``; a single
	            mapping is reused for every judge, a list is indexed per judge.
	            The default mapping is only read, never mutated.
	        limit_max_characters: If set, each output is truncated to this many
	            characters before judging.
	        num_samples: Number of random integers passed to ``run_experiment``.
	        seed: Seed for the global ``random`` generator.

	    Returns:
	        The metrics dict (``runtime``, ``public``, ``private``,
	        ``combined_score``), or ``None`` when the program module cannot be
	        located (in which case no result files are written).
	    """
	    spec = importlib.util.spec_from_file_location("program", program_path)
	    if spec is None:
	        print(f"Error: Could not load spec for module at {program_path}")
	        return
	    if spec.loader is None:
	        print(f"Error: No loader found for module at {program_path}")
	        return

	    module = importlib.util.module_from_spec(spec)
	    spec.loader.exec_module(module)

	    # Seed the global generator so the experiment inputs are reproducible.
	    random.seed(seed)
	    random_ints = [random.randint(0, 10000) for _ in range(num_samples)]

	    if isinstance(lm_input_and_output_processors, tuple):
	        get_evaluation_prompt, extract_results = lm_input_and_output_processors
	    else:
	        get_evaluation_prompt, extract_results = (
	            lm_input_and_output_processors(number_of_samples=num_samples))

	    try:
	        start_t = time.time()
	        novel_outputs = module.run_experiment(random_ints)
	        if limit_max_characters is not None:
	            novel_outputs = [
	                output[:limit_max_characters] for output in novel_outputs]

	        # Normalise the judge configuration into parallel lists without
	        # rebinding (or mutating) the caller-visible parameters.
	        judge_names = (
	            llm_judge_names if isinstance(llm_judge_names, list)
	            else [llm_judge_names])
	        judge_kwargs = (
	            llm_judge_kwargs if isinstance(llm_judge_kwargs, list)
	            else [llm_judge_kwargs] * len(judge_names))

	        llm_judges = [
	            LLMClient(model_names=judge_name, **judge_kwargs[idx])
	            for idx, judge_name in enumerate(judge_names)
	        ]

	        lm_judge_sys_prompt, lm_judge_message = get_evaluation_prompt(
	            novel_outputs)

	        results_dict = {}
	        all_final_scores = []
	        total_cost = 0.0

	        for judge_number, llm_judge in enumerate(llm_judges, start=1):
	            query_kwargs = llm_judge.get_kwargs()
	            llm_judge_response = llm_judge.query(
	                msg=lm_judge_message,
	                system_msg=lm_judge_sys_prompt,
	                llm_kwargs=query_kwargs,
	            )

	            judge_cost = llm_judge_response.cost or 0
	            llm_judge_scores: dict = extract_results(
	                llm_judge_response.content)

	            total_cost += judge_cost
	            for key, value in llm_judge_scores.items():
	                results_dict[f'judge{judge_number}_{key}'] = value
	            all_final_scores.append(llm_judge_scores.get(
	                'final_novelty_score', 0.0))

	        # np.mean of an empty list is NaN (and NaN is not valid JSON), so
	        # fall back to 0.0 whenever no usable mean can be computed.
	        combined_score = (
	            float(np.mean(all_final_scores)) if all_final_scores else 0.0)
	        if not np.isfinite(combined_score):
	            combined_score = 0.0
	        results_dict['combined_score'] = combined_score

	        metrics = {
	            "runtime": time.time() - start_t,
	            "public": results_dict,
	            "private": {"evaluation_cost": total_cost},
	            "combined_score": combined_score,
	        }
	        error = ""
	        correct = True
	    except Exception as e:
	        # Any failure while running or judging is recorded as an incorrect
	        # evaluation with a zero score instead of propagating.
	        print(f"Error: {e}")
	        metrics = {
	            "combined_score": 0,
	            "public": {},
	            "private": {},
	            "runtime": 0,
	        }
	        error = str(e)
	        correct = False

	    print(metrics)
	    elapsed = metrics["runtime"]
	    hours = int(elapsed // 3600)
	    minutes = int((elapsed % 3600) // 60)
	    seconds = int(elapsed % 60)
	    print(f"Completed after {hours}h {minutes}m {seconds}s")

	    # Make sure the output directory exists before writing into it.
	    if results_dir:
	        os.makedirs(results_dir, exist_ok=True)

	    # Save correct to JSON file
	    correct_file = os.path.join(results_dir, "correct.json")
	    with open(correct_file, "w", encoding="utf-8") as f:
	        json.dump({"correct": correct, "error": error}, f, indent=4)
	    print(f"Correct saved to {correct_file}")

	    # Save metrics to JSON file
	    metrics_file = os.path.join(results_dir, "metrics.json")
	    with open(metrics_file, "w", encoding="utf-8") as f:
	        json.dump(metrics, f, indent=4)
	    print(f"Metrics saved to {metrics_file}")
	    return metrics


	# Script entry point: parse the CLI options, make sure the results
	# directory exists, then evaluate the given program with the default judges.
	if __name__ == "__main__":
	    cli = argparse.ArgumentParser(
	        description="Novelty evaluation functions and default script")
	    cli.add_argument(
	        "--program_path", type=str, default="initial.py",
	        help="Path to the program to evaluate")
	    cli.add_argument(
	        "--results_dir", type=str, default="results",
	        help="Directory to save results and logs")
	    cli_args = cli.parse_args()

	    output_dir = Path(cli_args.results_dir)
	    output_dir.mkdir(parents=True, exist_ok=True)

	    evaluate_with_lm_judge(cli_args.program_path, cli_args.results_dir)