"""
Baseline Prompt Evaluation Script

This script evaluates baseline performance using simple prompts without optimization.
It serves as a baseline comparison for evolutionary prompt optimization results.
"""

import asyncio
import os
import re
import csv
from typing import Dict
from datetime import datetime

from dotenv import load_dotenv
from tqdm.asyncio import tqdm as aio_tqdm

from evoagentx.benchmark.bigbenchhard import BIGBenchHard
from evoagentx.models import OpenAILLM, OpenAILLMConfig


class SinglePromptClassifier:
    """
    A simple classifier using a single fixed prompt for task processing.
    
    This serves as a baseline for comparison with evolved prompts.
    """
    
    def __init__(self, model: OpenAILLM):
        """
        Initialize the baseline classifier.
        
        Args:
            model: The language model to use for inference
        """
        self.model = model
        self.task_instruction = "After your reasoning, respond with only the answer option, like this: the answer is (A)"
        self.chain_of_thought_prefix = "Let's think step by step."

    def __call__(self, input: str) -> tuple[str, dict]:
        """
        Process input with the fixed prompt.
        
        Args:
            input: The input text to process
            
        Returns:
            Tuple of (answer, metadata)
        """
        full_prompt = f"Question: {input}\n{self.chain_of_thought_prefix}\n{self.task_instruction}"
        response = self.model.generate(prompt=full_prompt)
        prediction = response.content.strip()
        
        # Extract answer using regex pattern
        pattern = r"the answer is\s*(.*)"
        match = re.search(pattern, prediction, re.IGNORECASE)
        answer = match.group(1).strip().rstrip('.') if match else "N/A"
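        # Example (hypothetical reply): "...so the figure is a circle. The answer is (A)." -> answer "(A)"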
        
        return answer, {"full_prompt": full_prompt}
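# Example call (a sketch with hypothetical values, assuming `llm` is an OpenAILLM instance):
#
#   classifier = SinglePromptClassifier(model=llm)
#   answer, meta = classifier(input="This SVG path element ... Options: (A) circle (B) line")
#   # If the model replies "... the answer is (A).", `answer` is "(A)" and
#   # meta["full_prompt"] holds the exact prompt that was sent.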


async def main():
    """Main execution function for baseline evaluation."""
    
    # Load environment configuration
    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    if not OPENAI_API_KEY:
        raise ValueError("OPENAI_API_KEY not found in environment variables.")
    
    # Models to evaluate
    model_list = [
        "gpt-4.1-nano-2025-04-14",
    ]

    # Evaluate each model
    for model_name in model_list:
        # Configure language model
        llm_config = OpenAILLMConfig(
            model=model_name,
            openai_key=OPENAI_API_KEY,
            stream=False,
        )
        llm = OpenAILLM(config=llm_config)
        
        # Set up benchmark and classifier
        benchmark = BIGBenchHard("geometric_shapes", dev_sample_num=50, seed=10)
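        # dev_sample_num/seed presumably control a reproducible 50-example dev split;
        # the remaining examples form the test split retrieved via get_test_data() below.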
        program = SinglePromptClassifier(model=llm)
        
        print(f"\n--- Evaluating on Test Set with model {model_name} ---")
        test_data = benchmark.get_test_data()
        results_list = []
        task_name = benchmark.task

        async def evaluate_example_concurrently(example: Dict) -> float:
            """
            Evaluate a single example asynchronously.
            
            Args:
                example: The example to evaluate
                
            Returns:
                The evaluation score (0.0 or 1.0)
            """
            prediction, meta = await asyncio.to_thread(
                program,
                input=example["input"]
            )
            score_dict = benchmark.evaluate(prediction, benchmark.get_label(example))
            
            # Save detailed results for each sample
            results_list.append({
                "input": example["input"],
                "label": benchmark.get_label(example),
                "prediction": prediction,
                "em": score_dict.get("em", 0.0),
                "prompt": meta["full_prompt"],
                "model": model_name,
                "task": task_name
            })
            return score_dict.get("em", 0.0)

        # Run evaluation on test set
        if test_data:
            tasks = [evaluate_example_concurrently(ex) for ex in test_data]
            results = await aio_tqdm.gather(*tasks, desc="Evaluating on Test Set")
            correct_count = sum(results)
            test_accuracy = correct_count / len(test_data)
            print(f"Test Accuracy: {test_accuracy:.4f}")
            
            # Save results to CSV with timestamp
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_name = f"results_{model_name}_{task_name}_{timestamp}.csv"
            csv_path = os.path.join(os.path.dirname(__file__), csv_name)
            
            with open(csv_path, "w", encoding="utf-8", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=[
                    "input", "label", "prediction", "em", "prompt", "model", "task"
                ])
                
                # Write average score at the top
                f.write(f"Average Score,{test_accuracy:.4f}\n")
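                # NOTE: this summary line precedes the header, so downstream readers
                # should skip the first row (e.g. pandas.read_csv(..., skiprows=1)).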
                writer.writeheader()
                writer.writerows(results_list)
                
            print(f"Detailed results saved to: {csv_path}")
        else:
            test_accuracy = 0.0
            
    return test_accuracy


if __name__ == "__main__":
    asyncio.run(main())