"""
Baseline Prompt Evaluation Script
This script evaluates baseline performance using simple prompts without optimization.
It serves as a baseline comparison for evolutionary prompt optimization results.
"""
import asyncio
import os
import re
import csv
from typing import Dict
from datetime import datetime
from dotenv import load_dotenv
from tqdm.asyncio import tqdm as aio_tqdm
from evoagentx.benchmark.bigbenchhard import BIGBenchHard
from evoagentx.models import OpenAILLM, OpenAILLMConfig
class SinglePromptClassifier:
"""
A simple classifier using a single fixed prompt for task processing.
This serves as a baseline for comparison with evolved prompts.
"""
def __init__(self, model: OpenAILLM):
"""
Initialize the baseline classifier.
Args:
model: The language model to use for inference
"""
self.model = model
        self.task_instruction = "After your reasoning, respond with only the chosen option, formatted like this: the answer is (A)"
self.chain_of_thought_prefix = "Let's think step by step."
def __call__(self, input: str) -> tuple[str, dict]:
"""
Process input with the fixed prompt.
Args:
input: The input text to process
Returns:
Tuple of (answer, metadata)
"""
        # Compose the prompt: question, chain-of-thought trigger, then the answer-format instruction
        full_prompt = f"Question: {input}\n{self.chain_of_thought_prefix}\n{self.task_instruction}"
response = self.model.generate(prompt=full_prompt)
prediction = response.content.strip()
# Extract answer using regex pattern
pattern = r"the answer is\s*(.*)"
match = re.search(pattern, prediction, re.IGNORECASE)
answer = match.group(1).strip().rstrip('.') if match else "N/A"
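        # Example (illustrative): a response ending in "... the answer is (B)."
        # matches the pattern and the trailing period is stripped, yielding "(B)";
        # if the phrase never appears, the answer falls back to "N/A".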
return answer, {"full_prompt": full_prompt}
async def main():
"""Main execution function for baseline evaluation."""
# Load environment configuration
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
raise ValueError("OPENAI_API_KEY not found in environment variables.")
# Models to evaluate
model_list = [
"gpt-4.1-nano-2025-04-14",
]
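    # Append further model names here; each one is evaluated in turn below.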
# Evaluate each model
for model_name in model_list:
# Configure language model
llm_config = OpenAILLMConfig(
model=model_name,
openai_key=OPENAI_API_KEY,
stream=False,
)
llm = OpenAILLM(config=llm_config)
# Set up benchmark and classifier
benchmark = BIGBenchHard("geometric_shapes", dev_sample_num=50, seed=10)
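        # Assumption: dev_sample_num presumably caps how many dev examples are
        # sampled, and seed fixes that split for reproducibility; check the
        # EvoAgentX BIGBenchHard documentation to confirm.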
        program = SinglePromptClassifier(model=llm)
print(f"\n--- Evaluating on Test Set with model {model_name} ---")
test_data = benchmark.get_test_data()
results_list = []
task_name = benchmark.task
async def evaluate_example_concurrently(example: Dict) -> float:
"""
Evaluate a single example asynchronously.
Args:
example: The example to evaluate
Returns:
The evaluation score (0.0 or 1.0)
"""
prediction, meta = await asyncio.to_thread(
program,
input=example["input"]
)
score_dict = benchmark.evaluate(prediction, benchmark.get_label(example))
# Save detailed results for each sample
results_list.append({
"input": example["input"],
"label": benchmark.get_label(example),
"prediction": prediction,
"em": score_dict.get("em", 0.0),
"prompt": meta["full_prompt"],
"model": model_name,
"task": task_name
})
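            # The append executes on the event loop thread after the await
            # completes, so concurrent tasks cannot corrupt results_list.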
return score_dict.get("em", 0.0)
# Run evaluation on test set
if test_data:
tasks = [evaluate_example_concurrently(ex) for ex in test_data]
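            # tqdm's asyncio-aware gather behaves like asyncio.gather but draws a
            # progress bar; results come back in the same order as the tasks.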
results = await aio_tqdm.gather(*tasks, desc="Evaluating on Test Set")
correct_count = sum(results)
test_accuracy = correct_count / len(test_data)
print(f"Test Accuracy: {test_accuracy:.4f}")
# Save results to CSV with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_name = f"results_{model_name}_{task_name}_{timestamp}.csv"
csv_path = os.path.join(os.path.dirname(__file__), csv_name)
with open(csv_path, "w", encoding="utf-8", newline="") as f:
writer = csv.DictWriter(f, fieldnames=[
"input", "label", "prediction", "em", "prompt", "model", "task"
])
# Write average score at the top
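                # (This summary row makes the file non-standard CSV; readers such
                # as pandas.read_csv would need to skip the first line.)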
                f.write(f"average_score,{test_accuracy:.4f}\n")
writer.writeheader()
writer.writerows(results_list)
            print(f"Detailed results saved to: {csv_path}")
else:
test_accuracy = 0.0
return test_accuracy
if __name__ == "__main__":
asyncio.run(main()) |