| import os |
| import json |
| import time |
| import random |
| import argparse |
| import importlib.util |
| from pathlib import Path |
| from typing import Optional, Tuple, Callable |
|
|
| import numpy as np |
|
|
| from shinka.llm import LLMClient |
| from lm_judge_prompt import make_lm_input_and_output_processors |
|
|
|
|
def evaluate_with_lm_judge(
    program_path: str,
    results_dir: str,
    lm_input_and_output_processors: Callable | Tuple[Callable, Callable] = (
        make_lm_input_and_output_processors),
    llm_judge_names=[
        "azure-gpt-4.1",
        "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0",
        "gemini-2.5-pro",
    ],
    llm_judge_kwargs=dict(
        temperatures=0.0,
        max_tokens=8196,
        reasoning_efforts="low",
        model_sample_probs=None,
        output_model=None,
        verbose=True
    ),
    limit_max_characters: Optional[int] = None,
    num_samples: int = 20,
    seed: int = 42,
) -> Optional[dict]:
    """Run a program and score its outputs with one or more LLM judges.

    The module at ``program_path`` is loaded and its ``run_experiment`` is
    called with ``num_samples`` random integers in ``[0, 10000]``. The
    outputs are turned into a judge prompt, every judge is queried once, and
    the per-judge ``final_novelty_score`` values are averaged into
    ``combined_score``. Two files are written into ``results_dir``:
    ``correct.json`` (success flag and error text) and ``metrics.json``.

    Args:
        program_path: Path of the Python file that defines ``run_experiment``.
        results_dir: Directory receiving ``correct.json`` and
            ``metrics.json``; it is created if it does not exist.
        lm_input_and_output_processors: Either a ready
            ``(get_evaluation_prompt, extract_results)`` tuple, or a factory
            called as ``factory(number_of_samples=num_samples)`` that
            returns such a pair.
        llm_judge_names: A list with one entry per judge; a non-list value
            is treated as a single judge. Each entry is passed to
            ``LLMClient`` as ``model_names``. The default list is never
            mutated here.
        llm_judge_kwargs: Keyword arguments for ``LLMClient``. A non-list
            value is shared by all judges; a list supplies one mapping per
            judge. The default mapping is never mutated here.
        limit_max_characters: If given, each output is sliced to at most
            this many characters before being shown to the judges.
        num_samples: How many random integers are handed to
            ``run_experiment``.
        seed: Seed applied to the global ``random`` module before sampling.

    Returns:
        The metrics dict (keys ``combined_score``, ``public``, ``private``,
        ``runtime``), or ``None`` when no import spec/loader could be
        obtained for ``program_path`` (in that case no files are written).
    """
    spec = importlib.util.spec_from_file_location("program", program_path)
    if spec is None:
        print(f"Error: Could not load spec for module at {program_path}")
        return None
    if spec.loader is None:
        print(f"Error: No loader found for module at {program_path}")
        return None

    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # The global RNG is seeded on purpose (rather than using a private
    # random.Random) so the evaluated program sees the same global state
    # as before if it draws from ``random`` itself.
    random.seed(seed)
    random_ints = [random.randint(0, 10000) for _ in range(num_samples)]

    if isinstance(lm_input_and_output_processors, tuple):
        get_evaluation_prompt, extract_results = lm_input_and_output_processors
    else:
        get_evaluation_prompt, extract_results = lm_input_and_output_processors(
            number_of_samples=num_samples)

    try:
        start_t = time.time()
        novel_outputs = module.run_experiment(random_ints)
        if limit_max_characters is not None:
            novel_outputs = [
                output[:limit_max_characters] for output in novel_outputs]

        # Normalise the judge configuration into two parallel lists without
        # rebinding (or mutating) the caller-visible parameters.
        judge_names = (
            llm_judge_names if isinstance(llm_judge_names, list)
            else [llm_judge_names])
        judge_kwargs = (
            llm_judge_kwargs if isinstance(llm_judge_kwargs, list)
            else [llm_judge_kwargs] * len(judge_names))
        if len(judge_kwargs) < len(judge_names):
            raise ValueError(
                f"Got {len(judge_kwargs)} llm_judge_kwargs entries for "
                f"{len(judge_names)} judges")

        llm_judges = [
            LLMClient(model_names=name, **kwargs)
            for name, kwargs in zip(judge_names, judge_kwargs)
        ]

        lm_judge_sys_prompt, lm_judge_message = get_evaluation_prompt(
            novel_outputs)

        results_dict = {}
        all_final_scores = []
        total_cost = 0.0

        for judge_number, llm_judge in enumerate(llm_judges, start=1):
            query_kwargs = llm_judge.get_kwargs()
            llm_judge_response = llm_judge.query(
                msg=lm_judge_message,
                system_msg=lm_judge_sys_prompt,
                llm_kwargs=query_kwargs,
            )

            # A missing/None cost is counted as zero.
            total_cost += llm_judge_response.cost or 0

            llm_judge_scores: dict = extract_results(
                llm_judge_response.content)
            for key, value in llm_judge_scores.items():
                results_dict[f"judge{judge_number}_{key}"] = value
            all_final_scores.append(
                llm_judge_scores.get('final_novelty_score', 0.0))

        # np.mean([]) is NaN (never None), and NaN is not valid JSON, so
        # fall back to 0.0 for an empty judge list or a non-finite mean.
        combined_score = (
            float(np.mean(all_final_scores)) if all_final_scores else 0.0)
        if not np.isfinite(combined_score):
            combined_score = 0.0
        results_dict['combined_score'] = combined_score

        metrics = {
            "runtime": time.time() - start_t,
            "public": results_dict,
            "private": {"evaluation_cost": total_cost},
            "combined_score": combined_score,
        }
        error = ""
        correct = True
    except Exception as e:
        # Top-level boundary: any failure of the program or of a judge is
        # recorded in correct.json instead of crashing the evaluator.
        print(f"Error: {e}")
        metrics = {
            "combined_score": 0,
            "public": {},
            "private": {},
            "runtime": 0,
        }
        error = str(e)
        correct = False

    print(metrics)
    elapsed = metrics["runtime"]
    hours = int(elapsed // 3600)
    minutes = int((elapsed % 3600) // 60)
    seconds = int(elapsed % 60)
    print(f"Completed after {hours}h {minutes}m {seconds}s")

    # Make sure the target directory exists so a finished evaluation is not
    # lost at write time; an empty results_dir means the current directory.
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    correct_file = os.path.join(results_dir, "correct.json")
    with open(correct_file, "w", encoding="utf-8") as f:
        json.dump({"correct": correct, "error": error}, f, indent=4)
    print(f"Correct saved to {correct_file}")

    metrics_file = os.path.join(results_dir, "metrics.json")
    with open(metrics_file, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4)
    print(f"Metrics saved to {metrics_file}")
    return metrics
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: evaluate one program file and store the
    # resulting JSON files in the chosen results directory.
    cli = argparse.ArgumentParser(
        description="Novelty evaluation functions and default script",
    )
    cli.add_argument(
        "--program_path",
        type=str,
        default="initial.py",
        help="Path to the program to evaluate",
    )
    cli.add_argument(
        "--results_dir",
        type=str,
        default="results",
        help="Directory to save results and logs",
    )
    options = cli.parse_args()

    # Create the output directory (and any missing parents) up front.
    output_dir = Path(options.results_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    evaluate_with_lm_judge(options.program_path, options.results_dir)
|
|