File size: 4,242 Bytes

5374a2d

from dotenv import load_dotenv

from evoagentx.agents.agent_manager import AgentManager
from evoagentx.benchmark import HotPotQA
from evoagentx.core.callbacks import suppress_logger_info
from evoagentx.core.logging import logger
from evoagentx.evaluators import Evaluator
from evoagentx.models import OpenAILLM, OpenAILLMConfig
from evoagentx.optimizers import TextGradOptimizer
from evoagentx.prompts import StringTemplate
from evoagentx.workflow import SequentialWorkFlowGraph

load_dotenv()

class HotPotQASplits(HotPotQA):

    def _load_data(self):
        # load the original test data 
        super()._load_data()
        # split the data into train, dev and test
        import numpy as np 
        np.random.seed(42)
        permutation = np.random.permutation(len(self._dev_data))
        full_test_data = self._dev_data 
        # randomly select 10 samples for train, 40 for dev, and 100 for test
        self._train_data = [full_test_data[idx] for idx in permutation[:10]]
        self._dev_data = [full_test_data[idx] for idx in permutation[10:50]]
        self._test_data = [full_test_data[idx] for idx in permutation[50:150]]


def collate_func(example: dict) -> dict:
    context_list = []
    for item in example["context"]:
        context = "Title: {}\nText: {}".format(item[0], " ".join([t.strip() for t in item[1]]))
        context_list.append(context)
    context = "\n\n".join(context_list)
    problem = "Context: {}\n\nQuestion: {}\n\nAnswer:".format(context, example["question"])
    return {"problem": problem}


hotpotqa_graph_data = {
    "goal": "Answer the question based on the context. The answer should be a direct response to the question, without including explanations or reasoning.",
    "tasks": [
        {
            "name": "answer_generate",
            "description": "Answer the question based on the context.",
            "inputs": [
                {"name": "problem", "type": "str", "required": True, "description": "The problem to solve."}
            ],
            "outputs": [
                {"name": "answer", "type": "str", "required": True, "description": "The answer to the problem."}
            ],
            "prompt_template": StringTemplate(instruction="Think step by step to answer the question. You should explain your thinking process in the 'thought' field, and provide the final answer in the 'answer' field.\nFormat your output in xml format, such as <thought>xxx</thought> and <answer>xxx</answer>."),
            "parse_mode": "xml"
        }
    ] 
}

def main(): 

    executor_config = OpenAILLMConfig(model="gpt-4o-mini")
    executor_llm = OpenAILLM(config=executor_config)

    optimizer_config = OpenAILLMConfig(model="gpt-4o")
    optimizer_llm = OpenAILLM(config=optimizer_config)

    benchmark = HotPotQASplits()
    workflow_graph = SequentialWorkFlowGraph.from_dict(hotpotqa_graph_data)
    agent_manager = AgentManager()
    agent_manager.add_agents_from_workflow(workflow_graph, executor_llm.config)

    evaluator = Evaluator(
        llm=executor_llm, 
        agent_manager=agent_manager, 
        collate_func=collate_func, 
        num_workers=20, 
        verbose=True
    )

    textgrad_optimizer = TextGradOptimizer(
        graph=workflow_graph, 
        optimize_mode="all",
        executor_llm=executor_llm, 
        optimizer_llm=optimizer_llm,
        batch_size=3,
        max_steps=20,
        evaluator=evaluator,
        eval_every_n_steps=1,
        eval_rounds=1,
        save_interval=None,
        save_path="./",
        rollback=True,
        constraints=[]
    )

    logger.info("Evaluating workflow on test set...")
    with suppress_logger_info():
        results = textgrad_optimizer.evaluate(dataset=benchmark, eval_mode="test")
    logger.info(f"Evaluation metrics (before optimization): {results}")

    logger.info("Optimizing workflow...")
    textgrad_optimizer.optimize(benchmark, seed=8)
    textgrad_optimizer.restore_best_graph()

    logger.info("Evaluating workflow on test set...")
    with suppress_logger_info():
        results = textgrad_optimizer.evaluate(dataset=benchmark, eval_mode="test")
    logger.info(f"Evaluation metrics (after optimization): {results}")


if __name__ == "__main__":
    main()