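# NOTE: the next three lines are repository-page residue, kept as comments so the module parses.
# iLOVE2D's picture
# Upload 2846 files
# 5374a2d verified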
# Acknowledgement: Modified from AFlow (https://github.com/geekan/MetaGPT/blob/main/metagpt/ext/aflow/scripts/optimizer_utils/evaluation_utils.py) under MIT License
from ...evaluators.aflow_evaluator import AFlowEvaluator
from ...core.callbacks import suppress_logger_info
from ...core.logging import logger
class EvaluationUtils:
    """Run an optimizer's graph through ``AFlowEvaluator`` and record the scores."""

    def __init__(self, root_path: str):
        """Store the path later handed to ``data_utils.get_results_file_path``."""
        self.root_path = root_path

    async def evaluate_graph_async(self, optimizer, validation_n: int, data, initial: bool = False) -> float:
        """Evaluate ``optimizer.graph`` up to ``validation_n`` times and average the scores.

        After every run a result record is appended to ``data`` and the whole
        list is saved through ``optimizer.data_utils``, so partial progress is
        persisted even if a later run fails.

        Args:
            optimizer: Object providing ``executor_llm``, ``benchmark``,
                ``graph``, ``round`` and ``data_utils``.
            validation_n: Number of evaluation runs to perform; must be >= 1.
            data: Mutable sequence of result records; extended in place.
            initial: When exactly ``False`` the records are tagged with
                ``optimizer.round + 1``; otherwise with ``optimizer.round``.

        Returns:
            The sum of the collected scores divided by ``validation_n``.

        Raises:
            ValueError: If ``validation_n`` is smaller than 1.
        """
        # Fail fast with a clear message: zero would otherwise end in a
        # ZeroDivisionError, and a negative count would silently yield -0.0
        # without running a single evaluation.
        if validation_n < 1:
            raise ValueError(f"validation_n must be a positive integer, got {validation_n!r}")

        evaluator = AFlowEvaluator(llm=optimizer.executor_llm)
        sum_score = 0

        for _ in range(validation_n):
            # Only the evaluation call itself runs with logger output suppressed.
            with suppress_logger_info():
                score, avg_cost, total_cost, all_failed = await evaluator.graph_evaluate_async(
                    optimizer.benchmark, optimizer.graph, is_test=False
                )

            cur_round = optimizer.round + 1 if initial is False else optimizer.round
            new_data = optimizer.data_utils.create_result_data(cur_round, score, avg_cost, total_cost)
            data.append(new_data)

            # Persist after every run so completed results are never lost.
            result_path = optimizer.data_utils.get_results_file_path(self.root_path)
            optimizer.data_utils.save_results(result_path, data)

            sum_score += score

            if all_failed:
                logger.warning(f"All test cases failed in round {cur_round}. Stopping evaluation for this round.")
                break

        # NOTE(review): after an early break the divisor is still validation_n,
        # so skipped runs effectively count as zero -- confirm this is intended.
        return sum_score / validation_n

    async def evaluate_graph_test_async(self, optimizer):
        """Evaluate ``optimizer.graph`` once with ``is_test=True``.

        Args:
            optimizer: Object providing ``executor_llm``, ``benchmark`` and ``graph``.

        Returns:
            A ``(score, avg_cost, total_cost)`` tuple; the fourth value returned
            by the evaluator (the all-failed flag) is discarded.
        """
        evaluator = AFlowEvaluator(llm=optimizer.executor_llm)
        with suppress_logger_info():
            score, avg_cost, total_cost, _ = await evaluator.graph_evaluate_async(
                optimizer.benchmark, optimizer.graph, is_test=True
            )
        return score, avg_cost, total_cost