JustinTX's picture
Add files using upload-large-folder tool
14c9c2b verified
import os
import json
import time
import random
import argparse
import importlib.util
from pathlib import Path
from typing import Optional, Tuple, Callable
import numpy as np
from shinka.llm import LLMClient
from lm_judge_prompt import make_lm_input_and_output_processors
# Judge models queried when the caller does not pass ``llm_judge_names``.
# Kept immutable so the default can never be altered between calls.
_DEFAULT_LLM_JUDGE_NAMES = (
    "azure-gpt-4.1",
    "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0",
    "gemini-2.5-pro",
)

# ``LLMClient`` keyword arguments used when the caller does not pass
# ``llm_judge_kwargs``. Copied before use; never handed out directly.
_DEFAULT_LLM_JUDGE_KWARGS = dict(
    temperatures=0.0,
    max_tokens=8196,
    reasoning_efforts="low",
    model_sample_probs=None,
    output_model=None,
    verbose=True,
)


def evaluate_with_lm_judge(
    program_path: str,
    results_dir: str,
    lm_input_and_output_processors: Callable | Tuple[Callable, Callable] = (
        make_lm_input_and_output_processors),
    llm_judge_names=None,
    llm_judge_kwargs=None,
    limit_max_characters: Optional[int] = None,
    num_samples: int = 20,
    seed: int = 42,
):
    """Run a program's ``run_experiment`` and score its outputs with LLM judges.

    The module at ``program_path`` is loaded, its ``run_experiment`` function
    is called with ``num_samples`` seeded random integers, and the returned
    outputs are turned into a single prompt that is sent to every judge.
    Per-judge scores and their mean are written to ``metrics.json`` and the
    success flag to ``correct.json``, both inside ``results_dir``.

    Args:
        program_path: Path of the Python file to load and evaluate.
        results_dir: Existing directory that receives ``correct.json`` and
            ``metrics.json``.
        lm_input_and_output_processors: Either a ``(get_evaluation_prompt,
            extract_results)`` tuple, or a factory that is called with
            ``number_of_samples=num_samples`` and returns such a pair.
        llm_judge_names: A model name or a list of model names, one
            ``LLMClient`` per entry. ``None`` selects the three built-in
            default judges.
        llm_judge_kwargs: A dict of ``LLMClient`` keyword arguments shared by
            all judges, or a list with one dict per judge. ``None`` selects
            the built-in defaults.
        limit_max_characters: If given, every output is truncated to this
            many characters before being judged.
        num_samples: Number of random integers passed to ``run_experiment``.
        seed: Seed for the ``random`` module used to draw those integers.

    Returns:
        The metrics dict (keys ``runtime``, ``public``, ``private``,
        ``combined_score``), or ``None`` when no import spec or loader could
        be obtained for ``program_path`` (nothing is written in that case).

    Note:
        Exceptions raised while importing the program or while building the
        prompt/result processors propagate to the caller. Failures from
        ``run_experiment`` onwards are caught and recorded as an incorrect
        run with a ``combined_score`` of 0.
    """
    spec = importlib.util.spec_from_file_location("program", program_path)
    if spec is None:
        print(f"Error: Could not load spec for module at {program_path}")
        return
    if spec.loader is None:
        print(f"Error: No loader found for module at {program_path}")
        return
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # Seed after the program has been imported so that the sampled inputs do
    # not depend on any use of ``random`` during the import.
    random.seed(seed)
    random_ints = [random.randint(0, 10000) for _ in range(num_samples)]

    if isinstance(lm_input_and_output_processors, tuple):
        get_evaluation_prompt, extract_results = lm_input_and_output_processors
    else:
        get_evaluation_prompt, extract_results = lm_input_and_output_processors(
            number_of_samples=num_samples)

    error = ""
    correct = True
    try:
        # Runtime covers the experiment and the judging, not the import.
        start_t = time.time()
        novel_outputs = module.run_experiment(random_ints)
        if limit_max_characters is not None:
            novel_outputs = [
                output[:limit_max_characters] for output in novel_outputs]

        judge_names = (
            list(_DEFAULT_LLM_JUDGE_NAMES)
            if llm_judge_names is None else llm_judge_names
        )
        if not isinstance(judge_names, list):
            judge_names = [judge_names]

        judge_kwargs = (
            dict(_DEFAULT_LLM_JUDGE_KWARGS)
            if llm_judge_kwargs is None else llm_judge_kwargs
        )
        if not isinstance(judge_kwargs, list):
            judge_kwargs = [judge_kwargs] * len(judge_names)
        if len(judge_kwargs) < len(judge_names):
            # Fail with a clear message instead of an opaque IndexError.
            raise ValueError(
                f"Got {len(judge_kwargs)} llm_judge_kwargs entries for "
                f"{len(judge_names)} judges"
            )

        # zip stops at the last judge name, so surplus kwargs are ignored.
        llm_judges = [
            LLMClient(model_names=name, **kwargs)
            for name, kwargs in zip(judge_names, judge_kwargs)
        ]

        lm_judge_sys_prompt, lm_judge_message = get_evaluation_prompt(
            novel_outputs)

        results_dict = {}
        all_final_scores = []
        total_cost = 0.0
        for judge_number, llm_judge in enumerate(llm_judges, start=1):
            query_kwargs = llm_judge.get_kwargs()
            llm_judge_response = llm_judge.query(
                msg=lm_judge_message,
                system_msg=lm_judge_sys_prompt,
                llm_kwargs=query_kwargs,
            )
            # A missing (falsy) cost counts as zero.
            judge_cost = llm_judge_response.cost or 0
            llm_judge_scores: dict = extract_results(
                llm_judge_response.content)
            total_cost += judge_cost
            for k, v in llm_judge_scores.items():
                results_dict[f"judge{judge_number}_{k}"] = v
            all_final_scores.append(llm_judge_scores.get(
                'final_novelty_score', 0.0))

        # The mean of an empty list is NaN, which is not valid JSON; with no
        # judges configured the combined score falls back to 0.0.
        if all_final_scores:
            results_dict['combined_score'] = float(np.mean(all_final_scores))
        else:
            results_dict['combined_score'] = 0.0

        metrics = {
            "runtime": time.time() - start_t,
            "public": results_dict,
            "private": {"evaluation_cost": total_cost},
            "combined_score": results_dict['combined_score'],
        }
    except Exception as e:
        print(f"Error: {e}")
        metrics = {
            "combined_score": 0,
            "public": {},
            "private": {},
            "runtime": 0,
        }
        error = str(e)
        correct = False

    print(metrics)
    elapsed = metrics["runtime"]
    hours = int(elapsed // 3600)
    minutes = int((elapsed % 3600) // 60)
    seconds = int(elapsed % 60)
    print(f"Completed after {hours}h {minutes}m {seconds}s")

    # Save the success flag and error message to a JSON file.
    correct_file = os.path.join(results_dir, "correct.json")
    with open(correct_file, "w", encoding="utf-8") as f:
        json.dump({"correct": correct, "error": error}, f, indent=4)
    print(f"Correct saved to {correct_file}")

    # Save the metrics to a JSON file.
    metrics_file = os.path.join(results_dir, "metrics.json")
    with open(metrics_file, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4)
    print(f"Metrics saved to {metrics_file}")

    return metrics
if __name__ == "__main__":
    # Command-line entry point: parse the two path options, make sure the
    # results directory exists, then run the evaluation with its defaults.
    cli = argparse.ArgumentParser(
        description="Novelty evaluation functions and default script"
    )
    # (flag, default value, help text) for each string-valued option.
    option_table = (
        ("--program_path", "initial.py", "Path to the program to evaluate"),
        ("--results_dir", "results", "Directory to save results and logs"),
    )
    for flag, default_value, help_text in option_table:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)
    cli_args = cli.parse_args()

    # Create the output directory (and any missing parents) up front.
    output_dir = Path(cli_args.results_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    evaluate_with_lm_judge(cli_args.program_path, cli_args.results_dir)