"""Score a candidate program's outputs with one or more LLM judges."""

import os
import json
import time
import random
import argparse
import importlib.util
from pathlib import Path
from typing import Optional, Tuple, Callable

import numpy as np

from shinka.llm import LLMClient
from lm_judge_prompt import make_lm_input_and_output_processors


# NOTE: the list/dict defaults below are only ever read or rebound inside the
# function, never mutated in place, so sharing them across calls is safe.
def evaluate_with_lm_judge(
    program_path: str,
    results_dir: str,
    lm_input_and_output_processors: Callable | Tuple[Callable, Callable] = (
        make_lm_input_and_output_processors),
    llm_judge_names=[
        "azure-gpt-4.1",
        "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0",
        "gemini-2.5-pro",
    ],
    llm_judge_kwargs=dict(
        temperatures=0.0,
        max_tokens=8196,
        reasoning_efforts="low",
        model_sample_probs=None,
        output_model=None,
        verbose=True
    ),
    limit_max_characters: Optional[int] = None,
    num_samples: int = 20,
    seed: int = 42,
):
    """Run a program's ``run_experiment`` and score its outputs with LLM judges.

    The module at ``program_path`` is executed, its ``run_experiment`` function
    is called with ``num_samples`` seeded random integers, and the returned
    outputs are sent to every judge. The per-judge ``final_novelty_score``
    values are averaged into ``combined_score``. ``correct.json`` and
    ``metrics.json`` are written into ``results_dir``.

    Any exception raised while loading the program, building the prompt
    processors, running the experiment or querying the judges is caught and
    recorded as ``correct=False`` with zeroed metrics instead of propagating.

    Args:
        program_path: Path of the Python file to load and evaluate.
        results_dir: Directory receiving ``correct.json`` and ``metrics.json``;
            created if it does not exist.
        lm_input_and_output_processors: Either a ``(get_evaluation_prompt,
            extract_results)`` tuple, or a factory called with
            ``number_of_samples=num_samples`` that returns such a pair.
            ``get_evaluation_prompt(outputs)`` must return
            ``(system_prompt, message)`` and ``extract_results(content)`` must
            return a dict of scores.
        llm_judge_names: Model name(s), one ``LLMClient`` per list entry. A
            non-list value is treated as a single judge.
        llm_judge_kwargs: Keyword arguments for ``LLMClient``. A single dict is
            reused for every judge; a list is indexed per judge.
        limit_max_characters: If given, each output is truncated to this many
            characters before judging.
        num_samples: Number of random integers passed to ``run_experiment``.
        seed: Seed for the ``random`` module used to draw those integers.

    Returns:
        The metrics dict (keys ``runtime``, ``public``, ``private``,
        ``combined_score``), or ``None`` when no import spec/loader could be
        created for ``program_path`` (in which case nothing is written).
    """
    spec = importlib.util.spec_from_file_location("program", program_path)
    if spec is None:
        print(f"Error: Could not load spec for module at {program_path}")
        return None
    if spec.loader is None:
        print(f"Error: No loader found for module at {program_path}")
        return None

    error = ""
    correct = True
    try:
        # Executing the candidate program is inside the try so that a program
        # which fails at import time is recorded as incorrect rather than
        # crashing the evaluator before any result file is written.
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)

        # Seed only after the module has executed so that import-time use of
        # ``random`` by the program cannot change the drawn inputs.
        random.seed(seed)
        random_ints = [random.randint(0, 10000) for _ in range(num_samples)]

        if isinstance(lm_input_and_output_processors, tuple):
            get_evaluation_prompt, extract_results = (
                lm_input_and_output_processors)
        else:
            get_evaluation_prompt, extract_results = (
                lm_input_and_output_processors(number_of_samples=num_samples))

        # Runtime covers the experiment and the judge queries.
        start_t = time.time()
        novel_outputs = module.run_experiment(random_ints)
        if limit_max_characters is not None:
            novel_outputs = [
                output[:limit_max_characters] for output in novel_outputs]

        judge_names = (
            llm_judge_names if isinstance(llm_judge_names, list)
            else [llm_judge_names])
        judge_kwargs = (
            llm_judge_kwargs if isinstance(llm_judge_kwargs, list)
            else [llm_judge_kwargs] * len(judge_names))
        # Indexing (rather than zip) keeps a too-short kwargs list an error
        # instead of silently dropping judges.
        llm_judges = [
            LLMClient(model_names=name, **judge_kwargs[i])
            for i, name in enumerate(judge_names)
        ]

        lm_judge_sys_prompt, lm_judge_message = get_evaluation_prompt(
            novel_outputs)

        results_dict = {}
        all_final_scores = []
        total_cost = 0.0
        for judge_number, llm_judge in enumerate(llm_judges, start=1):
            response = llm_judge.query(
                msg=lm_judge_message,
                system_msg=lm_judge_sys_prompt,
                llm_kwargs=llm_judge.get_kwargs(),
            )
            judge_scores: dict = extract_results(response.content)
            # A missing/None cost is counted as zero.
            total_cost += response.cost or 0
            for key, value in judge_scores.items():
                results_dict[f"judge{judge_number}_{key}"] = value
            all_final_scores.append(
                judge_scores.get('final_novelty_score', 0.0))

        # np.mean of an empty list is NaN (with a RuntimeWarning), and NaN is
        # not valid JSON, so fall back to 0.0 in both cases.
        combined_score = (
            float(np.mean(all_final_scores)) if all_final_scores else 0.0)
        if np.isnan(combined_score):
            combined_score = 0.0
        results_dict['combined_score'] = combined_score

        metrics = {
            "runtime": time.time() - start_t,
            "public": results_dict,
            "private": {"evaluation_cost": total_cost},
            "combined_score": combined_score,
        }
    except Exception as e:
        print(f"Error: {e}")
        metrics = {
            "combined_score": 0,
            "public": {},
            "private": {},
            "runtime": 0,
        }
        error = str(e)
        correct = False

    print(metrics)
    elapsed = metrics["runtime"]
    hours = int(elapsed // 3600)
    minutes = int((elapsed % 3600) // 60)
    seconds = int(elapsed % 60)
    print(f"Completed after {hours}h {minutes}m {seconds}s")

    # Library callers may pass a directory that does not exist yet; an empty
    # string means the current directory and needs no creation.
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    # Save correct to JSON file
    correct_file = os.path.join(results_dir, "correct.json")
    with open(correct_file, "w", encoding="utf-8") as f:
        json.dump({"correct": correct, "error": error}, f, indent=4)
    print(f"Correct saved to {correct_file}")

    # Save metrics to JSON file
    metrics_file = os.path.join(results_dir, "metrics.json")
    with open(metrics_file, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4)
    print(f"Metrics saved to {metrics_file}")

    return metrics


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Novelty evaluation functions and default script"
    )
    parser.add_argument(
        "--program_path",
        type=str,
        default="initial.py",
        help="Path to the program to evaluate",
    )
    parser.add_argument(
        "--results_dir",
        type=str,
        default="results",
        help="Directory to save results and logs",
    )
    parsed_args = parser.parse_args()
    Path(parsed_args.results_dir).mkdir(parents=True, exist_ok=True)
    evaluate_with_lm_judge(parsed_args.program_path, parsed_args.results_dir)