|
|
|
|
|
|
|
|
from ...evaluators.aflow_evaluator import AFlowEvaluator |
|
|
from ...core.callbacks import suppress_logger_info |
|
|
from ...core.logging import logger |
|
|
|
|
|
class EvaluationUtils:
    """Helpers for scoring an optimizer's workflow graph on the validation and test splits."""

    def __init__(self, root_path: str):
        # Directory under which round-result files are read/written.
        self.root_path = root_path

    async def evaluate_graph_async(self, optimizer, validation_n, data, initial=False):
        """Evaluate the optimizer's current graph `validation_n` times and return the mean score.

        Each run appends a result record to `data` and persists the full list to
        disk, so partial progress survives interruption.

        Args:
            optimizer: Object providing `executor_llm`, `benchmark`, `graph`,
                `round`, and `data_utils` (project type — assumed interface).
            validation_n: Number of validation runs to average over. Must be > 0
                (a value of 0 raises ZeroDivisionError, as in the original).
            data: Mutable list of result records; new entries are appended in place.
            initial: When True, score the optimizer's current round; otherwise
                score the upcoming round (`optimizer.round + 1`).

        Returns:
            The sum of run scores divided by `validation_n`.
        """
        evaluator = AFlowEvaluator(llm=optimizer.executor_llm)
        sum_score = 0

        # Loop-invariant: which round these results belong to. Hoisted out of
        # the loop — it depends only on `initial` and `optimizer.round`.
        cur_round = optimizer.round if initial else optimizer.round + 1

        for _ in range(validation_n):
            with suppress_logger_info():
                score, avg_cost, total_cost, all_failed = await evaluator.graph_evaluate_async(
                    optimizer.benchmark, optimizer.graph, is_test=False
                )

            new_data = optimizer.data_utils.create_result_data(cur_round, score, avg_cost, total_cost)
            data.append(new_data)

            # Persist after every run (kept inside the loop, as in the original,
            # so each completed run is saved immediately).
            result_path = optimizer.data_utils.get_results_file_path(self.root_path)
            optimizer.data_utils.save_results(result_path, data)

            sum_score += score

            if all_failed:
                logger.warning(f"All test cases failed in round {cur_round}. Stopping evaluation for this round.")
                break

        # NOTE(review): on an early break the mean still divides by the full
        # `validation_n`, effectively penalizing rounds whose runs all failed.
        # Preserved from the original behavior — confirm this is intended.
        return sum_score / validation_n

    async def evaluate_graph_test_async(self, optimizer):
        """Evaluate the optimizer's graph once on the test split.

        Args:
            optimizer: Object providing `executor_llm`, `benchmark`, and `graph`.

        Returns:
            Tuple of (score, avg_cost, total_cost). The evaluator's
            `all_failed` flag is intentionally discarded here.
        """
        evaluator = AFlowEvaluator(llm=optimizer.executor_llm)
        with suppress_logger_info():
            score, avg_cost, total_cost, _all_failed = await evaluator.graph_evaluate_async(
                optimizer.benchmark, optimizer.graph, is_test=True
            )
        return score, avg_cost, total_cost